From: Pierre-Eric Pelloux-Prayer Date: Fri, 27 Mar 2020 18:32:38 +0000 (+0100) Subject: radeonsi: switch to 3-spaces style X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=d7008fe46a8f689ce4ee2b14b61dc39baebccaa8;p=mesa.git radeonsi: switch to 3-spaces style Generated automatically using clang-format and the following config: AlignAfterOpenBracket: true AlignConsecutiveMacros: true AllowAllArgumentsOnNextLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: false AlwaysBreakAfterReturnType: None BasedOnStyle: LLVM BraceWrapping: AfterControlStatement: false AfterEnum: true AfterFunction: true AfterStruct: false BeforeElse: false SplitEmptyFunction: true BinPackArguments: true BinPackParameters: true BreakBeforeBraces: Custom ColumnLimit: 100 ContinuationIndentWidth: 3 Cpp11BracedListStyle: false Cpp11BracedListStyle: true ForEachMacros: - LIST_FOR_EACH_ENTRY - LIST_FOR_EACH_ENTRY_SAFE - util_dynarray_foreach - nir_foreach_variable - nir_foreach_variable_safe - nir_foreach_register - nir_foreach_register_safe - nir_foreach_use - nir_foreach_use_safe - nir_foreach_if_use - nir_foreach_if_use_safe - nir_foreach_def - nir_foreach_def_safe - nir_foreach_phi_src - nir_foreach_phi_src_safe - nir_foreach_parallel_copy_entry - nir_foreach_instr - nir_foreach_instr_reverse - nir_foreach_instr_safe - nir_foreach_instr_reverse_safe - nir_foreach_function - nir_foreach_block - nir_foreach_block_safe - nir_foreach_block_reverse - nir_foreach_block_reverse_safe - nir_foreach_block_in_cf_node IncludeBlocks: Regroup IncludeCategories: - Regex: '<[[:alnum:].]+>' Priority: 2 - Regex: '.*' Priority: 1 IndentWidth: 3 PenaltyBreakBeforeFirstCallParameter: 1 PenaltyExcessCharacter: 100 SpaceAfterCStyleCast: false SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: false SpacesInContainerLiterals: false Reviewed-by: Marek Olšák Part-of: --- diff --git a/src/gallium/drivers/radeonsi/.editorconfig b/src/gallium/drivers/radeonsi/.editorconfig deleted file mode 100644 index 21a3c7d1274..00000000000 --- a/src/gallium/drivers/radeonsi/.editorconfig +++ /dev/null @@ -1,3 +0,0 @@ -[*.{c,h}] -indent_style = tab -indent_size = tab diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c index df8a2fcd577..74c289b0134 100644 --- a/src/gallium/drivers/radeonsi/cik_sdma.c +++ b/src/gallium/drivers/radeonsi/cik_sdma.c @@ -23,643 +23,531 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "sid.h" #include "si_pipe.h" +#include "sid.h" static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w) { - width = u_minify(width, level); - return DIV_ROUND_UP(width, blk_w); + width = u_minify(width, level); + return DIV_ROUND_UP(width, blk_w); } -static unsigned encode_tile_info(struct si_context *sctx, - struct si_texture *tex, unsigned level, - bool set_bpp) +static unsigned encode_tile_info(struct si_context *sctx, struct si_texture *tex, unsigned level, + bool set_bpp) { - struct radeon_info *info = &sctx->screen->info; - unsigned tile_index = tex->surface.u.legacy.tiling_index[level]; - unsigned macro_tile_index = tex->surface.u.legacy.macro_tile_index; - unsigned tile_mode = info->si_tile_mode_array[tile_index]; - unsigned macro_tile_mode = info->cik_macrotile_mode_array[macro_tile_index]; - - return (set_bpp ? util_logbase2(tex->surface.bpe) : 0) | - (G_009910_ARRAY_MODE(tile_mode) << 3) | - (G_009910_MICRO_TILE_MODE_NEW(tile_mode) << 8) | - /* Non-depth modes don't have TILE_SPLIT set. 
*/ - ((util_logbase2(tex->surface.u.legacy.tile_split >> 6)) << 11) | - (G_009990_BANK_WIDTH(macro_tile_mode) << 15) | - (G_009990_BANK_HEIGHT(macro_tile_mode) << 18) | - (G_009990_NUM_BANKS(macro_tile_mode) << 21) | - (G_009990_MACRO_TILE_ASPECT(macro_tile_mode) << 24) | - (G_009910_PIPE_CONFIG(tile_mode) << 26); + struct radeon_info *info = &sctx->screen->info; + unsigned tile_index = tex->surface.u.legacy.tiling_index[level]; + unsigned macro_tile_index = tex->surface.u.legacy.macro_tile_index; + unsigned tile_mode = info->si_tile_mode_array[tile_index]; + unsigned macro_tile_mode = info->cik_macrotile_mode_array[macro_tile_index]; + + return (set_bpp ? util_logbase2(tex->surface.bpe) : 0) | (G_009910_ARRAY_MODE(tile_mode) << 3) | + (G_009910_MICRO_TILE_MODE_NEW(tile_mode) << 8) | + /* Non-depth modes don't have TILE_SPLIT set. */ + ((util_logbase2(tex->surface.u.legacy.tile_split >> 6)) << 11) | + (G_009990_BANK_WIDTH(macro_tile_mode) << 15) | + (G_009990_BANK_HEIGHT(macro_tile_mode) << 18) | + (G_009990_NUM_BANKS(macro_tile_mode) << 21) | + (G_009990_MACRO_TILE_ASPECT(macro_tile_mode) << 24) | + (G_009910_PIPE_CONFIG(tile_mode) << 26); } - -static bool si_sdma_v4_copy_texture(struct si_context *sctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box) +static bool si_sdma_v4_copy_texture(struct si_context *sctx, struct pipe_resource *dst, + unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, unsigned src_level, + const struct pipe_box *src_box) { - struct si_texture *ssrc = (struct si_texture*)src; - struct si_texture *sdst = (struct si_texture*)dst; - - unsigned bpp = sdst->surface.bpe; - uint64_t dst_address = sdst->buffer.gpu_address + - sdst->surface.u.gfx9.surf_offset; - uint64_t src_address = ssrc->buffer.gpu_address + - ssrc->surface.u.gfx9.surf_offset; - unsigned dst_pitch = sdst->surface.u.gfx9.surf_pitch; - unsigned src_pitch = ssrc->surface.u.gfx9.surf_pitch; - uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.gfx9.surf_slice_size) / bpp; - uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.gfx9.surf_slice_size) / bpp; - unsigned srcx = src_box->x / ssrc->surface.blk_w; - unsigned srcy = src_box->y / ssrc->surface.blk_h; - unsigned srcz = src_box->z; - unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w); - unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h); - unsigned copy_depth = src_box->depth; - unsigned xalign = MAX2(1, 4 / bpp); - - assert(src_level <= src->last_level); - assert(dst_level <= dst->last_level); - assert(sdst->surface.u.gfx9.surf_offset + - dst_slice_pitch * bpp * (dstz + src_box->depth) <= - sdst->buffer.buf->size); - assert(ssrc->surface.u.gfx9.surf_offset + - src_slice_pitch * bpp * (srcz + src_box->depth) <= - ssrc->buffer.buf->size); - - if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, - dstz, ssrc, src_level, src_box)) - return false; - - dstx /= sdst->surface.blk_w; - dsty /= sdst->surface.blk_h; - - if (srcx >= (1 << 14) || - srcy >= (1 << 14) || - srcz >= (1 << 11) || - dstx >= (1 << 14) || - dsty >= (1 << 14) || - dstz >= (1 << 11)) - return false; - - /* Linear -> linear sub-window copy. 
*/ - if (ssrc->surface.is_linear && - sdst->surface.is_linear) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; - - /* Check if everything fits into the bitfields */ - if (!(src_pitch <= (1 << 19) && - dst_pitch <= (1 << 19) && - src_slice_pitch <= (1 << 28) && - dst_slice_pitch <= (1 << 28) && - copy_width <= (1 << 14) && - copy_height <= (1 << 14) && - copy_depth <= (1 << 11))) - return false; - - si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer); - - src_address += ssrc->surface.u.gfx9.offset[src_level]; - dst_address += sdst->surface.u.gfx9.offset[dst_level]; - - /* Check alignments */ - if ((src_address % 4) != 0 || - (dst_address % 4) != 0 || - (src_pitch % xalign) != 0) - return false; - - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, - CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) | - (util_logbase2(bpp) << 29)); - radeon_emit(cs, src_address); - radeon_emit(cs, src_address >> 32); - radeon_emit(cs, srcx | (srcy << 16)); - radeon_emit(cs, srcz | ((src_pitch - 1) << 13)); - radeon_emit(cs, src_slice_pitch - 1); - radeon_emit(cs, dst_address); - radeon_emit(cs, dst_address >> 32); - radeon_emit(cs, dstx | (dsty << 16)); - radeon_emit(cs, dstz | ((dst_pitch - 1) << 13)); - radeon_emit(cs, dst_slice_pitch - 1); - radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); - radeon_emit(cs, (copy_depth - 1)); - return true; - } - - /* Linear <-> Tiled sub-window copy */ - if (ssrc->surface.is_linear != sdst->surface.is_linear) { - struct si_texture *tiled = ssrc->surface.is_linear ? sdst : ssrc; - struct si_texture *linear = tiled == ssrc ? sdst : ssrc; - unsigned tiled_level = tiled == ssrc ? src_level : dst_level; - unsigned linear_level = linear == ssrc ? src_level : dst_level; - unsigned tiled_x = tiled == ssrc ? srcx : dstx; - unsigned linear_x = linear == ssrc ? srcx : dstx; - unsigned tiled_y = tiled == ssrc ? srcy : dsty; - unsigned linear_y = linear == ssrc ? srcy : dsty; - unsigned tiled_z = tiled == ssrc ? srcz : dstz; - unsigned linear_z = linear == ssrc ? srcz : dstz; - unsigned tiled_width = tiled == ssrc ? - DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w) : - DIV_ROUND_UP(sdst->buffer.b.b.width0, sdst->surface.blk_w); - unsigned tiled_height = tiled == ssrc ? - DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h) : - DIV_ROUND_UP(sdst->buffer.b.b.height0, sdst->surface.blk_h); - unsigned tiled_depth = tiled == ssrc ? - ssrc->buffer.b.b.depth0 : - sdst->buffer.b.b.depth0; - unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch; - unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch; - uint64_t tiled_address = tiled == ssrc ? src_address : dst_address; - uint64_t linear_address = linear == ssrc ? 
src_address : dst_address; - struct radeon_cmdbuf *cs = sctx->sdma_cs; - - linear_address += linear->surface.u.gfx9.offset[linear_level]; - - /* Check if everything fits into the bitfields */ - if (!(tiled_x <= (1 << 14) && - tiled_y <= (1 << 14) && - tiled_z <= (1 << 11) && - tiled_width <= (1 << 14) && - tiled_height <= (1 << 14) && - tiled_depth <= (1 << 11) && - tiled->surface.u.gfx9.surf.epitch <= (1 << 16) && - linear_x <= (1 << 14) && - linear_y <= (1 << 14) && - linear_z <= (1 << 11) && - linear_pitch <= (1 << 14) && - linear_slice_pitch <= (1 << 28) && - copy_width <= (1 << 14) && - copy_height <= (1 << 14) && - copy_depth <= (1 << 11))) - return false; - - /* Check alignments */ - if ((tiled_address % 256 != 0) || - (linear_address % 4 != 0) || - (linear_pitch % xalign != 0) || - (linear_slice_pitch % xalign != 0)) - return false; - - si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer); - - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, - CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) | - tiled->buffer.b.b.last_level << 20 | - tiled_level << 24 | - (linear == sdst ? 1u : 0) << 31); - radeon_emit(cs, (uint32_t) tiled_address); - radeon_emit(cs, (uint32_t) (tiled_address >> 32)); - radeon_emit(cs, tiled_x | (tiled_y << 16)); - radeon_emit(cs, tiled_z | ((tiled_width - 1) << 16)); - radeon_emit(cs, (tiled_height - 1) | (tiled_depth - 1) << 16); - radeon_emit(cs, util_logbase2(bpp) | - tiled->surface.u.gfx9.surf.swizzle_mode << 3 | - tiled->surface.u.gfx9.resource_type << 9 | - tiled->surface.u.gfx9.surf.epitch << 16); - radeon_emit(cs, (uint32_t) linear_address); - radeon_emit(cs, (uint32_t) (linear_address >> 32)); - radeon_emit(cs, linear_x | (linear_y << 16)); - radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16)); - radeon_emit(cs, linear_slice_pitch - 1); - radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); - radeon_emit(cs, (copy_depth - 1)); - return true; - } - - return false; + struct si_texture *ssrc = (struct si_texture *)src; + struct si_texture *sdst = (struct si_texture *)dst; + + unsigned bpp = sdst->surface.bpe; + uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset; + uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.gfx9.surf_offset; + unsigned dst_pitch = sdst->surface.u.gfx9.surf_pitch; + unsigned src_pitch = ssrc->surface.u.gfx9.surf_pitch; + uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.gfx9.surf_slice_size) / bpp; + uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.gfx9.surf_slice_size) / bpp; + unsigned srcx = src_box->x / ssrc->surface.blk_w; + unsigned srcy = src_box->y / ssrc->surface.blk_h; + unsigned srcz = src_box->z; + unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w); + unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h); + unsigned copy_depth = src_box->depth; + unsigned xalign = MAX2(1, 4 / bpp); + + assert(src_level <= src->last_level); + assert(dst_level <= dst->last_level); + assert(sdst->surface.u.gfx9.surf_offset + dst_slice_pitch * bpp * (dstz + src_box->depth) <= + sdst->buffer.buf->size); + assert(ssrc->surface.u.gfx9.surf_offset + src_slice_pitch * bpp * (srcz + src_box->depth) <= + ssrc->buffer.buf->size); + + if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, dstz, ssrc, src_level, src_box)) + return false; + + dstx /= sdst->surface.blk_w; + dsty /= sdst->surface.blk_h; + + if (srcx >= (1 << 14) || srcy >= (1 << 14) || srcz >= (1 << 11) || dstx >= (1 << 14) || + dsty >= (1 << 14) || dstz >= (1 << 
11)) + return false; + + /* Linear -> linear sub-window copy. */ + if (ssrc->surface.is_linear && sdst->surface.is_linear) { + struct radeon_cmdbuf *cs = sctx->sdma_cs; + + /* Check if everything fits into the bitfields */ + if (!(src_pitch <= (1 << 19) && dst_pitch <= (1 << 19) && src_slice_pitch <= (1 << 28) && + dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) && + copy_depth <= (1 << 11))) + return false; + + si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer); + + src_address += ssrc->surface.u.gfx9.offset[src_level]; + dst_address += sdst->surface.u.gfx9.offset[dst_level]; + + /* Check alignments */ + if ((src_address % 4) != 0 || (dst_address % 4) != 0 || (src_pitch % xalign) != 0) + return false; + + radeon_emit( + cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) | + (util_logbase2(bpp) << 29)); + radeon_emit(cs, src_address); + radeon_emit(cs, src_address >> 32); + radeon_emit(cs, srcx | (srcy << 16)); + radeon_emit(cs, srcz | ((src_pitch - 1) << 13)); + radeon_emit(cs, src_slice_pitch - 1); + radeon_emit(cs, dst_address); + radeon_emit(cs, dst_address >> 32); + radeon_emit(cs, dstx | (dsty << 16)); + radeon_emit(cs, dstz | ((dst_pitch - 1) << 13)); + radeon_emit(cs, dst_slice_pitch - 1); + radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); + radeon_emit(cs, (copy_depth - 1)); + return true; + } + + /* Linear <-> Tiled sub-window copy */ + if (ssrc->surface.is_linear != sdst->surface.is_linear) { + struct si_texture *tiled = ssrc->surface.is_linear ? sdst : ssrc; + struct si_texture *linear = tiled == ssrc ? sdst : ssrc; + unsigned tiled_level = tiled == ssrc ? src_level : dst_level; + unsigned linear_level = linear == ssrc ? src_level : dst_level; + unsigned tiled_x = tiled == ssrc ? srcx : dstx; + unsigned linear_x = linear == ssrc ? srcx : dstx; + unsigned tiled_y = tiled == ssrc ? srcy : dsty; + unsigned linear_y = linear == ssrc ? srcy : dsty; + unsigned tiled_z = tiled == ssrc ? srcz : dstz; + unsigned linear_z = linear == ssrc ? srcz : dstz; + unsigned tiled_width = tiled == ssrc + ? DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w) + : DIV_ROUND_UP(sdst->buffer.b.b.width0, sdst->surface.blk_w); + unsigned tiled_height = tiled == ssrc + ? DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h) + : DIV_ROUND_UP(sdst->buffer.b.b.height0, sdst->surface.blk_h); + unsigned tiled_depth = tiled == ssrc ? ssrc->buffer.b.b.depth0 : sdst->buffer.b.b.depth0; + unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch; + unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch; + uint64_t tiled_address = tiled == ssrc ? src_address : dst_address; + uint64_t linear_address = linear == ssrc ? 
src_address : dst_address; + struct radeon_cmdbuf *cs = sctx->sdma_cs; + + linear_address += linear->surface.u.gfx9.offset[linear_level]; + + /* Check if everything fits into the bitfields */ + if (!(tiled_x <= (1 << 14) && tiled_y <= (1 << 14) && tiled_z <= (1 << 11) && + tiled_width <= (1 << 14) && tiled_height <= (1 << 14) && tiled_depth <= (1 << 11) && + tiled->surface.u.gfx9.surf.epitch <= (1 << 16) && linear_x <= (1 << 14) && + linear_y <= (1 << 14) && linear_z <= (1 << 11) && linear_pitch <= (1 << 14) && + linear_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && + copy_height <= (1 << 14) && copy_depth <= (1 << 11))) + return false; + + /* Check alignments */ + if ((tiled_address % 256 != 0) || (linear_address % 4 != 0) || (linear_pitch % xalign != 0) || + (linear_slice_pitch % xalign != 0)) + return false; + + si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer); + + radeon_emit( + cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) | + tiled->buffer.b.b.last_level << 20 | tiled_level << 24 | + (linear == sdst ? 1u : 0) << 31); + radeon_emit(cs, (uint32_t)tiled_address); + radeon_emit(cs, (uint32_t)(tiled_address >> 32)); + radeon_emit(cs, tiled_x | (tiled_y << 16)); + radeon_emit(cs, tiled_z | ((tiled_width - 1) << 16)); + radeon_emit(cs, (tiled_height - 1) | (tiled_depth - 1) << 16); + radeon_emit(cs, util_logbase2(bpp) | tiled->surface.u.gfx9.surf.swizzle_mode << 3 | + tiled->surface.u.gfx9.resource_type << 9 | + tiled->surface.u.gfx9.surf.epitch << 16); + radeon_emit(cs, (uint32_t)linear_address); + radeon_emit(cs, (uint32_t)(linear_address >> 32)); + radeon_emit(cs, linear_x | (linear_y << 16)); + radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16)); + radeon_emit(cs, linear_slice_pitch - 1); + radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); + radeon_emit(cs, (copy_depth - 1)); + return true; + } + + return false; } -static bool cik_sdma_copy_texture(struct si_context *sctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box) +static bool cik_sdma_copy_texture(struct si_context *sctx, struct pipe_resource *dst, + unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, unsigned src_level, + const struct pipe_box *src_box) { - struct radeon_info *info = &sctx->screen->info; - struct si_texture *ssrc = (struct si_texture*)src; - struct si_texture *sdst = (struct si_texture*)dst; - unsigned bpp = sdst->surface.bpe; - uint64_t dst_address = sdst->buffer.gpu_address + - sdst->surface.u.legacy.level[dst_level].offset; - uint64_t src_address = ssrc->buffer.gpu_address + - ssrc->surface.u.legacy.level[src_level].offset; - unsigned dst_mode = sdst->surface.u.legacy.level[dst_level].mode; - unsigned src_mode = ssrc->surface.u.legacy.level[src_level].mode; - unsigned dst_tile_index = sdst->surface.u.legacy.tiling_index[dst_level]; - unsigned src_tile_index = ssrc->surface.u.legacy.tiling_index[src_level]; - unsigned dst_tile_mode = info->si_tile_mode_array[dst_tile_index]; - unsigned src_tile_mode = info->si_tile_mode_array[src_tile_index]; - unsigned dst_micro_mode = G_009910_MICRO_TILE_MODE_NEW(dst_tile_mode); - unsigned src_micro_mode = G_009910_MICRO_TILE_MODE_NEW(src_tile_mode); - unsigned dst_tile_swizzle = dst_mode == RADEON_SURF_MODE_2D ? - sdst->surface.tile_swizzle : 0; - unsigned src_tile_swizzle = src_mode == RADEON_SURF_MODE_2D ? 
- ssrc->surface.tile_swizzle : 0; - unsigned dst_pitch = sdst->surface.u.legacy.level[dst_level].nblk_x; - unsigned src_pitch = ssrc->surface.u.legacy.level[src_level].nblk_x; - uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.legacy.level[dst_level].slice_size_dw * 4) / bpp; - uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4) / bpp; - unsigned dst_width = minify_as_blocks(sdst->buffer.b.b.width0, - dst_level, sdst->surface.blk_w); - unsigned src_width = minify_as_blocks(ssrc->buffer.b.b.width0, - src_level, ssrc->surface.blk_w); - unsigned dst_height = minify_as_blocks(sdst->buffer.b.b.height0, - dst_level, sdst->surface.blk_h); - unsigned src_height = minify_as_blocks(ssrc->buffer.b.b.height0, - src_level, ssrc->surface.blk_h); - unsigned srcx = src_box->x / ssrc->surface.blk_w; - unsigned srcy = src_box->y / ssrc->surface.blk_h; - unsigned srcz = src_box->z; - unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w); - unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h); - unsigned copy_depth = src_box->depth; - - assert(src_level <= src->last_level); - assert(dst_level <= dst->last_level); - assert(sdst->surface.u.legacy.level[dst_level].offset + - dst_slice_pitch * bpp * (dstz + src_box->depth) <= - sdst->buffer.buf->size); - assert(ssrc->surface.u.legacy.level[src_level].offset + - src_slice_pitch * bpp * (srcz + src_box->depth) <= - ssrc->buffer.buf->size); - - if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, - dstz, ssrc, src_level, src_box)) - return false; - - dstx /= sdst->surface.blk_w; - dsty /= sdst->surface.blk_h; - - if (srcx >= (1 << 14) || - srcy >= (1 << 14) || - srcz >= (1 << 11) || - dstx >= (1 << 14) || - dsty >= (1 << 14) || - dstz >= (1 << 11)) - return false; - - dst_address |= dst_tile_swizzle << 8; - src_address |= src_tile_swizzle << 8; - - /* Linear -> linear sub-window copy. 
*/ - if (dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && - src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && - /* check if everything fits into the bitfields */ - src_pitch <= (1 << 14) && - dst_pitch <= (1 << 14) && - src_slice_pitch <= (1 << 28) && - dst_slice_pitch <= (1 << 28) && - copy_width <= (1 << 14) && - copy_height <= (1 << 14) && - copy_depth <= (1 << 11) && - /* HW limitation - GFX7: */ - (sctx->chip_class != GFX7 || - (copy_width < (1 << 14) && - copy_height < (1 << 14) && - copy_depth < (1 << 11))) && - /* HW limitation - some GFX7 parts: */ - ((sctx->family != CHIP_BONAIRE && - sctx->family != CHIP_KAVERI) || - (srcx + copy_width != (1 << 14) && - srcy + copy_height != (1 << 14)))) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; - - si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer); - - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, - CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) | - (util_logbase2(bpp) << 29)); - radeon_emit(cs, src_address); - radeon_emit(cs, src_address >> 32); - radeon_emit(cs, srcx | (srcy << 16)); - radeon_emit(cs, srcz | ((src_pitch - 1) << 16)); - radeon_emit(cs, src_slice_pitch - 1); - radeon_emit(cs, dst_address); - radeon_emit(cs, dst_address >> 32); - radeon_emit(cs, dstx | (dsty << 16)); - radeon_emit(cs, dstz | ((dst_pitch - 1) << 16)); - radeon_emit(cs, dst_slice_pitch - 1); - if (sctx->chip_class == GFX7) { - radeon_emit(cs, copy_width | (copy_height << 16)); - radeon_emit(cs, copy_depth); - } else { - radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); - radeon_emit(cs, (copy_depth - 1)); - } - return true; - } - - /* Tiled <-> linear sub-window copy. */ - if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) { - struct si_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? ssrc : sdst; - struct si_texture *linear = tiled == ssrc ? sdst : ssrc; - unsigned tiled_level = tiled == ssrc ? src_level : dst_level; - unsigned linear_level = linear == ssrc ? src_level : dst_level; - unsigned tiled_x = tiled == ssrc ? srcx : dstx; - unsigned linear_x = linear == ssrc ? srcx : dstx; - unsigned tiled_y = tiled == ssrc ? srcy : dsty; - unsigned linear_y = linear == ssrc ? srcy : dsty; - unsigned tiled_z = tiled == ssrc ? srcz : dstz; - unsigned linear_z = linear == ssrc ? srcz : dstz; - unsigned tiled_width = tiled == ssrc ? src_width : dst_width; - unsigned linear_width = linear == ssrc ? src_width : dst_width; - unsigned tiled_pitch = tiled == ssrc ? src_pitch : dst_pitch; - unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch; - unsigned tiled_slice_pitch = tiled == ssrc ? src_slice_pitch : dst_slice_pitch; - unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch; - uint64_t tiled_address = tiled == ssrc ? src_address : dst_address; - uint64_t linear_address = linear == ssrc ? src_address : dst_address; - unsigned tiled_micro_mode = tiled == ssrc ? src_micro_mode : dst_micro_mode; - - assert(tiled_pitch % 8 == 0); - assert(tiled_slice_pitch % 64 == 0); - unsigned pitch_tile_max = tiled_pitch / 8 - 1; - unsigned slice_tile_max = tiled_slice_pitch / 64 - 1; - unsigned xalign = MAX2(1, 4 / bpp); - unsigned copy_width_aligned = copy_width; - - /* If the region ends at the last pixel and is unaligned, we - * can copy the remainder of the line that is not visible to - * make it aligned. 
- */ - if (copy_width % xalign != 0 && - linear_x + copy_width == linear_width && - tiled_x + copy_width == tiled_width && - linear_x + align(copy_width, xalign) <= linear_pitch && - tiled_x + align(copy_width, xalign) <= tiled_pitch) - copy_width_aligned = align(copy_width, xalign); - - /* HW limitations. */ - if ((sctx->family == CHIP_BONAIRE || - sctx->family == CHIP_KAVERI) && - linear_pitch - 1 == 0x3fff && - bpp == 16) - return false; - - if (sctx->chip_class == GFX7 && - (copy_width_aligned == (1 << 14) || - copy_height == (1 << 14) || - copy_depth == (1 << 11))) - return false; - - if ((sctx->family == CHIP_BONAIRE || - sctx->family == CHIP_KAVERI || - sctx->family == CHIP_KABINI) && - (tiled_x + copy_width == (1 << 14) || - tiled_y + copy_height == (1 << 14))) - return false; - - /* The hw can read outside of the given linear buffer bounds, - * or access those pages but not touch the memory in case - * of writes. (it still causes a VM fault) - * - * Out-of-bounds memory access or page directory access must - * be prevented. - */ - int64_t start_linear_address, end_linear_address; - unsigned granularity; - - /* Deduce the size of reads from the linear surface. */ - switch (tiled_micro_mode) { - case V_009910_ADDR_SURF_DISPLAY_MICRO_TILING: - granularity = bpp == 1 ? 64 / (8*bpp) : - 128 / (8*bpp); - break; - case V_009910_ADDR_SURF_THIN_MICRO_TILING: - case V_009910_ADDR_SURF_DEPTH_MICRO_TILING: - if (0 /* TODO: THICK microtiling */) - granularity = bpp == 1 ? 32 / (8*bpp) : - bpp == 2 ? 64 / (8*bpp) : - bpp <= 8 ? 128 / (8*bpp) : - 256 / (8*bpp); - else - granularity = bpp <= 2 ? 64 / (8*bpp) : - bpp <= 8 ? 128 / (8*bpp) : - 256 / (8*bpp); - break; - default: - return false; - } - - /* The linear reads start at tiled_x & ~(granularity - 1). - * If linear_x == 0 && tiled_x % granularity != 0, the hw - * starts reading from an address preceding linear_address!!! - */ - start_linear_address = - linear->surface.u.legacy.level[linear_level].offset + - bpp * (linear_z * linear_slice_pitch + - linear_y * linear_pitch + - linear_x); - start_linear_address -= (int)(bpp * (tiled_x % granularity)); - - end_linear_address = - linear->surface.u.legacy.level[linear_level].offset + - bpp * ((linear_z + copy_depth - 1) * linear_slice_pitch + - (linear_y + copy_height - 1) * linear_pitch + - (linear_x + copy_width)); - - if ((tiled_x + copy_width) % granularity) - end_linear_address += granularity - - (tiled_x + copy_width) % granularity; - - if (start_linear_address < 0 || - end_linear_address > linear->surface.surf_size) - return false; - - /* Check requirements. */ - if (tiled_address % 256 == 0 && - linear_address % 4 == 0 && - linear_pitch % xalign == 0 && - linear_x % xalign == 0 && - tiled_x % xalign == 0 && - copy_width_aligned % xalign == 0 && - tiled_micro_mode != V_009910_ADDR_SURF_ROTATED_MICRO_TILING && - /* check if everything fits into the bitfields */ - tiled->surface.u.legacy.tile_split <= 4096 && - pitch_tile_max < (1 << 11) && - slice_tile_max < (1 << 22) && - linear_pitch <= (1 << 14) && - linear_slice_pitch <= (1 << 28) && - copy_width_aligned <= (1 << 14) && - copy_height <= (1 << 14) && - copy_depth <= (1 << 11)) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; - uint32_t direction = linear == sdst ? 
1u << 31 : 0; - - si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer); - - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, - CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) | - direction); - radeon_emit(cs, tiled_address); - radeon_emit(cs, tiled_address >> 32); - radeon_emit(cs, tiled_x | (tiled_y << 16)); - radeon_emit(cs, tiled_z | (pitch_tile_max << 16)); - radeon_emit(cs, slice_tile_max); - radeon_emit(cs, encode_tile_info(sctx, tiled, tiled_level, true)); - radeon_emit(cs, linear_address); - radeon_emit(cs, linear_address >> 32); - radeon_emit(cs, linear_x | (linear_y << 16)); - radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16)); - radeon_emit(cs, linear_slice_pitch - 1); - if (sctx->chip_class == GFX7) { - radeon_emit(cs, copy_width_aligned | (copy_height << 16)); - radeon_emit(cs, copy_depth); - } else { - radeon_emit(cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16)); - radeon_emit(cs, (copy_depth - 1)); - } - return true; - } - } - - /* Tiled -> Tiled sub-window copy. */ - if (dst_mode >= RADEON_SURF_MODE_1D && - src_mode >= RADEON_SURF_MODE_1D && - /* check if these fit into the bitfields */ - src_address % 256 == 0 && - dst_address % 256 == 0 && - ssrc->surface.u.legacy.tile_split <= 4096 && - sdst->surface.u.legacy.tile_split <= 4096 && - dstx % 8 == 0 && - dsty % 8 == 0 && - srcx % 8 == 0 && - srcy % 8 == 0 && - /* this can either be equal, or display->rotated (GFX8+ only) */ - (src_micro_mode == dst_micro_mode || - (sctx->chip_class >= GFX8 && - src_micro_mode == V_009910_ADDR_SURF_DISPLAY_MICRO_TILING && - dst_micro_mode == V_009910_ADDR_SURF_ROTATED_MICRO_TILING))) { - assert(src_pitch % 8 == 0); - assert(dst_pitch % 8 == 0); - assert(src_slice_pitch % 64 == 0); - assert(dst_slice_pitch % 64 == 0); - unsigned src_pitch_tile_max = src_pitch / 8 - 1; - unsigned dst_pitch_tile_max = dst_pitch / 8 - 1; - unsigned src_slice_tile_max = src_slice_pitch / 64 - 1; - unsigned dst_slice_tile_max = dst_slice_pitch / 64 - 1; - unsigned copy_width_aligned = copy_width; - unsigned copy_height_aligned = copy_height; - - /* If the region ends at the last pixel and is unaligned, we - * can copy the remainder of the tile that is not visible to - * make it aligned. 
- */ - if (copy_width % 8 != 0 && - srcx + copy_width == src_width && - dstx + copy_width == dst_width) - copy_width_aligned = align(copy_width, 8); - - if (copy_height % 8 != 0 && - srcy + copy_height == src_height && - dsty + copy_height == dst_height) - copy_height_aligned = align(copy_height, 8); - - /* check if these fit into the bitfields */ - if (src_pitch_tile_max < (1 << 11) && - dst_pitch_tile_max < (1 << 11) && - src_slice_tile_max < (1 << 22) && - dst_slice_tile_max < (1 << 22) && - copy_width_aligned <= (1 << 14) && - copy_height_aligned <= (1 << 14) && - copy_depth <= (1 << 11) && - copy_width_aligned % 8 == 0 && - copy_height_aligned % 8 == 0 && - /* HW limitation - GFX7: */ - (sctx->chip_class != GFX7 || - (copy_width_aligned < (1 << 14) && - copy_height_aligned < (1 << 14) && - copy_depth < (1 << 11))) && - /* HW limitation - some GFX7 parts: */ - ((sctx->family != CHIP_BONAIRE && - sctx->family != CHIP_KAVERI && - sctx->family != CHIP_KABINI) || - (srcx + copy_width_aligned != (1 << 14) && - srcy + copy_height_aligned != (1 << 14) && - dstx + copy_width != (1 << 14)))) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; - - si_need_dma_space(sctx, 15, &sdst->buffer, &ssrc->buffer); - - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, - CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW, 0)); - radeon_emit(cs, src_address); - radeon_emit(cs, src_address >> 32); - radeon_emit(cs, srcx | (srcy << 16)); - radeon_emit(cs, srcz | (src_pitch_tile_max << 16)); - radeon_emit(cs, src_slice_tile_max); - radeon_emit(cs, encode_tile_info(sctx, ssrc, src_level, true)); - radeon_emit(cs, dst_address); - radeon_emit(cs, dst_address >> 32); - radeon_emit(cs, dstx | (dsty << 16)); - radeon_emit(cs, dstz | (dst_pitch_tile_max << 16)); - radeon_emit(cs, dst_slice_tile_max); - radeon_emit(cs, encode_tile_info(sctx, sdst, dst_level, false)); - if (sctx->chip_class == GFX7) { - radeon_emit(cs, copy_width_aligned | - (copy_height_aligned << 16)); - radeon_emit(cs, copy_depth); - } else { - radeon_emit(cs, (copy_width_aligned - 8) | - ((copy_height_aligned - 8) << 16)); - radeon_emit(cs, (copy_depth - 1)); - } - return true; - } - } - - return false; + struct radeon_info *info = &sctx->screen->info; + struct si_texture *ssrc = (struct si_texture *)src; + struct si_texture *sdst = (struct si_texture *)dst; + unsigned bpp = sdst->surface.bpe; + uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.legacy.level[dst_level].offset; + uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.legacy.level[src_level].offset; + unsigned dst_mode = sdst->surface.u.legacy.level[dst_level].mode; + unsigned src_mode = ssrc->surface.u.legacy.level[src_level].mode; + unsigned dst_tile_index = sdst->surface.u.legacy.tiling_index[dst_level]; + unsigned src_tile_index = ssrc->surface.u.legacy.tiling_index[src_level]; + unsigned dst_tile_mode = info->si_tile_mode_array[dst_tile_index]; + unsigned src_tile_mode = info->si_tile_mode_array[src_tile_index]; + unsigned dst_micro_mode = G_009910_MICRO_TILE_MODE_NEW(dst_tile_mode); + unsigned src_micro_mode = G_009910_MICRO_TILE_MODE_NEW(src_tile_mode); + unsigned dst_tile_swizzle = dst_mode == RADEON_SURF_MODE_2D ? sdst->surface.tile_swizzle : 0; + unsigned src_tile_swizzle = src_mode == RADEON_SURF_MODE_2D ? 
ssrc->surface.tile_swizzle : 0; + unsigned dst_pitch = sdst->surface.u.legacy.level[dst_level].nblk_x; + unsigned src_pitch = ssrc->surface.u.legacy.level[src_level].nblk_x; + uint64_t dst_slice_pitch = + ((uint64_t)sdst->surface.u.legacy.level[dst_level].slice_size_dw * 4) / bpp; + uint64_t src_slice_pitch = + ((uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4) / bpp; + unsigned dst_width = minify_as_blocks(sdst->buffer.b.b.width0, dst_level, sdst->surface.blk_w); + unsigned src_width = minify_as_blocks(ssrc->buffer.b.b.width0, src_level, ssrc->surface.blk_w); + unsigned dst_height = minify_as_blocks(sdst->buffer.b.b.height0, dst_level, sdst->surface.blk_h); + unsigned src_height = minify_as_blocks(ssrc->buffer.b.b.height0, src_level, ssrc->surface.blk_h); + unsigned srcx = src_box->x / ssrc->surface.blk_w; + unsigned srcy = src_box->y / ssrc->surface.blk_h; + unsigned srcz = src_box->z; + unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w); + unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h); + unsigned copy_depth = src_box->depth; + + assert(src_level <= src->last_level); + assert(dst_level <= dst->last_level); + assert(sdst->surface.u.legacy.level[dst_level].offset + + dst_slice_pitch * bpp * (dstz + src_box->depth) <= + sdst->buffer.buf->size); + assert(ssrc->surface.u.legacy.level[src_level].offset + + src_slice_pitch * bpp * (srcz + src_box->depth) <= + ssrc->buffer.buf->size); + + if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, dstz, ssrc, src_level, src_box)) + return false; + + dstx /= sdst->surface.blk_w; + dsty /= sdst->surface.blk_h; + + if (srcx >= (1 << 14) || srcy >= (1 << 14) || srcz >= (1 << 11) || dstx >= (1 << 14) || + dsty >= (1 << 14) || dstz >= (1 << 11)) + return false; + + dst_address |= dst_tile_swizzle << 8; + src_address |= src_tile_swizzle << 8; + + /* Linear -> linear sub-window copy. 
*/ + if (dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && + /* check if everything fits into the bitfields */ + src_pitch <= (1 << 14) && dst_pitch <= (1 << 14) && src_slice_pitch <= (1 << 28) && + dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) && + copy_depth <= (1 << 11) && + /* HW limitation - GFX7: */ + (sctx->chip_class != GFX7 || + (copy_width < (1 << 14) && copy_height < (1 << 14) && copy_depth < (1 << 11))) && + /* HW limitation - some GFX7 parts: */ + ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI) || + (srcx + copy_width != (1 << 14) && srcy + copy_height != (1 << 14)))) { + struct radeon_cmdbuf *cs = sctx->sdma_cs; + + si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer); + + radeon_emit( + cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) | + (util_logbase2(bpp) << 29)); + radeon_emit(cs, src_address); + radeon_emit(cs, src_address >> 32); + radeon_emit(cs, srcx | (srcy << 16)); + radeon_emit(cs, srcz | ((src_pitch - 1) << 16)); + radeon_emit(cs, src_slice_pitch - 1); + radeon_emit(cs, dst_address); + radeon_emit(cs, dst_address >> 32); + radeon_emit(cs, dstx | (dsty << 16)); + radeon_emit(cs, dstz | ((dst_pitch - 1) << 16)); + radeon_emit(cs, dst_slice_pitch - 1); + if (sctx->chip_class == GFX7) { + radeon_emit(cs, copy_width | (copy_height << 16)); + radeon_emit(cs, copy_depth); + } else { + radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); + radeon_emit(cs, (copy_depth - 1)); + } + return true; + } + + /* Tiled <-> linear sub-window copy. */ + if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) { + struct si_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? ssrc : sdst; + struct si_texture *linear = tiled == ssrc ? sdst : ssrc; + unsigned tiled_level = tiled == ssrc ? src_level : dst_level; + unsigned linear_level = linear == ssrc ? src_level : dst_level; + unsigned tiled_x = tiled == ssrc ? srcx : dstx; + unsigned linear_x = linear == ssrc ? srcx : dstx; + unsigned tiled_y = tiled == ssrc ? srcy : dsty; + unsigned linear_y = linear == ssrc ? srcy : dsty; + unsigned tiled_z = tiled == ssrc ? srcz : dstz; + unsigned linear_z = linear == ssrc ? srcz : dstz; + unsigned tiled_width = tiled == ssrc ? src_width : dst_width; + unsigned linear_width = linear == ssrc ? src_width : dst_width; + unsigned tiled_pitch = tiled == ssrc ? src_pitch : dst_pitch; + unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch; + unsigned tiled_slice_pitch = tiled == ssrc ? src_slice_pitch : dst_slice_pitch; + unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch; + uint64_t tiled_address = tiled == ssrc ? src_address : dst_address; + uint64_t linear_address = linear == ssrc ? src_address : dst_address; + unsigned tiled_micro_mode = tiled == ssrc ? src_micro_mode : dst_micro_mode; + + assert(tiled_pitch % 8 == 0); + assert(tiled_slice_pitch % 64 == 0); + unsigned pitch_tile_max = tiled_pitch / 8 - 1; + unsigned slice_tile_max = tiled_slice_pitch / 64 - 1; + unsigned xalign = MAX2(1, 4 / bpp); + unsigned copy_width_aligned = copy_width; + + /* If the region ends at the last pixel and is unaligned, we + * can copy the remainder of the line that is not visible to + * make it aligned. 
+ */ + if (copy_width % xalign != 0 && linear_x + copy_width == linear_width && + tiled_x + copy_width == tiled_width && + linear_x + align(copy_width, xalign) <= linear_pitch && + tiled_x + align(copy_width, xalign) <= tiled_pitch) + copy_width_aligned = align(copy_width, xalign); + + /* HW limitations. */ + if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI) && + linear_pitch - 1 == 0x3fff && bpp == 16) + return false; + + if (sctx->chip_class == GFX7 && + (copy_width_aligned == (1 << 14) || copy_height == (1 << 14) || copy_depth == (1 << 11))) + return false; + + if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI || + sctx->family == CHIP_KABINI) && + (tiled_x + copy_width == (1 << 14) || tiled_y + copy_height == (1 << 14))) + return false; + + /* The hw can read outside of the given linear buffer bounds, + * or access those pages but not touch the memory in case + * of writes. (it still causes a VM fault) + * + * Out-of-bounds memory access or page directory access must + * be prevented. + */ + int64_t start_linear_address, end_linear_address; + unsigned granularity; + + /* Deduce the size of reads from the linear surface. */ + switch (tiled_micro_mode) { + case V_009910_ADDR_SURF_DISPLAY_MICRO_TILING: + granularity = bpp == 1 ? 64 / (8 * bpp) : 128 / (8 * bpp); + break; + case V_009910_ADDR_SURF_THIN_MICRO_TILING: + case V_009910_ADDR_SURF_DEPTH_MICRO_TILING: + if (0 /* TODO: THICK microtiling */) + granularity = + bpp == 1 ? 32 / (8 * bpp) + : bpp == 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp); + else + granularity = bpp <= 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp); + break; + default: + return false; + } + + /* The linear reads start at tiled_x & ~(granularity - 1). + * If linear_x == 0 && tiled_x % granularity != 0, the hw + * starts reading from an address preceding linear_address!!! + */ + start_linear_address = + linear->surface.u.legacy.level[linear_level].offset + + bpp * (linear_z * linear_slice_pitch + linear_y * linear_pitch + linear_x); + start_linear_address -= (int)(bpp * (tiled_x % granularity)); + + end_linear_address = + linear->surface.u.legacy.level[linear_level].offset + + bpp * ((linear_z + copy_depth - 1) * linear_slice_pitch + + (linear_y + copy_height - 1) * linear_pitch + (linear_x + copy_width)); + + if ((tiled_x + copy_width) % granularity) + end_linear_address += granularity - (tiled_x + copy_width) % granularity; + + if (start_linear_address < 0 || end_linear_address > linear->surface.surf_size) + return false; + + /* Check requirements. */ + if (tiled_address % 256 == 0 && linear_address % 4 == 0 && linear_pitch % xalign == 0 && + linear_x % xalign == 0 && tiled_x % xalign == 0 && copy_width_aligned % xalign == 0 && + tiled_micro_mode != V_009910_ADDR_SURF_ROTATED_MICRO_TILING && + /* check if everything fits into the bitfields */ + tiled->surface.u.legacy.tile_split <= 4096 && pitch_tile_max < (1 << 11) && + slice_tile_max < (1 << 22) && linear_pitch <= (1 << 14) && + linear_slice_pitch <= (1 << 28) && copy_width_aligned <= (1 << 14) && + copy_height <= (1 << 14) && copy_depth <= (1 << 11)) { + struct radeon_cmdbuf *cs = sctx->sdma_cs; + uint32_t direction = linear == sdst ? 
1u << 31 : 0; + + si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer); + + radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, + CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) | + direction); + radeon_emit(cs, tiled_address); + radeon_emit(cs, tiled_address >> 32); + radeon_emit(cs, tiled_x | (tiled_y << 16)); + radeon_emit(cs, tiled_z | (pitch_tile_max << 16)); + radeon_emit(cs, slice_tile_max); + radeon_emit(cs, encode_tile_info(sctx, tiled, tiled_level, true)); + radeon_emit(cs, linear_address); + radeon_emit(cs, linear_address >> 32); + radeon_emit(cs, linear_x | (linear_y << 16)); + radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16)); + radeon_emit(cs, linear_slice_pitch - 1); + if (sctx->chip_class == GFX7) { + radeon_emit(cs, copy_width_aligned | (copy_height << 16)); + radeon_emit(cs, copy_depth); + } else { + radeon_emit(cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16)); + radeon_emit(cs, (copy_depth - 1)); + } + return true; + } + } + + /* Tiled -> Tiled sub-window copy. */ + if (dst_mode >= RADEON_SURF_MODE_1D && src_mode >= RADEON_SURF_MODE_1D && + /* check if these fit into the bitfields */ + src_address % 256 == 0 && dst_address % 256 == 0 && + ssrc->surface.u.legacy.tile_split <= 4096 && sdst->surface.u.legacy.tile_split <= 4096 && + dstx % 8 == 0 && dsty % 8 == 0 && srcx % 8 == 0 && srcy % 8 == 0 && + /* this can either be equal, or display->rotated (GFX8+ only) */ + (src_micro_mode == dst_micro_mode || + (sctx->chip_class >= GFX8 && src_micro_mode == V_009910_ADDR_SURF_DISPLAY_MICRO_TILING && + dst_micro_mode == V_009910_ADDR_SURF_ROTATED_MICRO_TILING))) { + assert(src_pitch % 8 == 0); + assert(dst_pitch % 8 == 0); + assert(src_slice_pitch % 64 == 0); + assert(dst_slice_pitch % 64 == 0); + unsigned src_pitch_tile_max = src_pitch / 8 - 1; + unsigned dst_pitch_tile_max = dst_pitch / 8 - 1; + unsigned src_slice_tile_max = src_slice_pitch / 64 - 1; + unsigned dst_slice_tile_max = dst_slice_pitch / 64 - 1; + unsigned copy_width_aligned = copy_width; + unsigned copy_height_aligned = copy_height; + + /* If the region ends at the last pixel and is unaligned, we + * can copy the remainder of the tile that is not visible to + * make it aligned. 
+ */ + if (copy_width % 8 != 0 && srcx + copy_width == src_width && dstx + copy_width == dst_width) + copy_width_aligned = align(copy_width, 8); + + if (copy_height % 8 != 0 && srcy + copy_height == src_height && + dsty + copy_height == dst_height) + copy_height_aligned = align(copy_height, 8); + + /* check if these fit into the bitfields */ + if (src_pitch_tile_max < (1 << 11) && dst_pitch_tile_max < (1 << 11) && + src_slice_tile_max < (1 << 22) && dst_slice_tile_max < (1 << 22) && + copy_width_aligned <= (1 << 14) && copy_height_aligned <= (1 << 14) && + copy_depth <= (1 << 11) && copy_width_aligned % 8 == 0 && copy_height_aligned % 8 == 0 && + /* HW limitation - GFX7: */ + (sctx->chip_class != GFX7 || + (copy_width_aligned < (1 << 14) && copy_height_aligned < (1 << 14) && + copy_depth < (1 << 11))) && + /* HW limitation - some GFX7 parts: */ + ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI && + sctx->family != CHIP_KABINI) || + (srcx + copy_width_aligned != (1 << 14) && srcy + copy_height_aligned != (1 << 14) && + dstx + copy_width != (1 << 14)))) { + struct radeon_cmdbuf *cs = sctx->sdma_cs; + + si_need_dma_space(sctx, 15, &sdst->buffer, &ssrc->buffer); + + radeon_emit( + cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW, 0)); + radeon_emit(cs, src_address); + radeon_emit(cs, src_address >> 32); + radeon_emit(cs, srcx | (srcy << 16)); + radeon_emit(cs, srcz | (src_pitch_tile_max << 16)); + radeon_emit(cs, src_slice_tile_max); + radeon_emit(cs, encode_tile_info(sctx, ssrc, src_level, true)); + radeon_emit(cs, dst_address); + radeon_emit(cs, dst_address >> 32); + radeon_emit(cs, dstx | (dsty << 16)); + radeon_emit(cs, dstz | (dst_pitch_tile_max << 16)); + radeon_emit(cs, dst_slice_tile_max); + radeon_emit(cs, encode_tile_info(sctx, sdst, dst_level, false)); + if (sctx->chip_class == GFX7) { + radeon_emit(cs, copy_width_aligned | (copy_height_aligned << 16)); + radeon_emit(cs, copy_depth); + } else { + radeon_emit(cs, (copy_width_aligned - 8) | ((copy_height_aligned - 8) << 16)); + radeon_emit(cs, (copy_depth - 1)); + } + return true; + } + } + + return false; } -static void cik_sdma_copy(struct pipe_context *ctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box) +static void cik_sdma_copy(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, struct pipe_resource *src, + unsigned src_level, const struct pipe_box *src_box) { - struct si_context *sctx = (struct si_context *)ctx; - - assert(src->target != PIPE_BUFFER); - - if (!sctx->sdma_cs || - src->flags & PIPE_RESOURCE_FLAG_SPARSE || - dst->flags & PIPE_RESOURCE_FLAG_SPARSE) - goto fallback; - - /* SDMA causes corruption. See: - * https://bugs.freedesktop.org/show_bug.cgi?id=110575 - * https://bugs.freedesktop.org/show_bug.cgi?id=110635 - * - * Keep SDMA enabled on APUs. 
- */ - if (sctx->screen->debug_flags & DBG(FORCE_SDMA) || - (!sctx->screen->info.has_dedicated_vram && - !(sctx->screen->debug_flags & DBG(NO_SDMA_COPY_IMAGE)))) { - if ((sctx->chip_class == GFX7 || sctx->chip_class == GFX8) && - cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz, - src, src_level, src_box)) - return; - else if (sctx->chip_class == GFX9 && - si_sdma_v4_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz, - src, src_level, src_box)) - return; - } + struct si_context *sctx = (struct si_context *)ctx; + + assert(src->target != PIPE_BUFFER); + + if (!sctx->sdma_cs || src->flags & PIPE_RESOURCE_FLAG_SPARSE || + dst->flags & PIPE_RESOURCE_FLAG_SPARSE) + goto fallback; + + /* SDMA causes corruption. See: + * https://bugs.freedesktop.org/show_bug.cgi?id=110575 + * https://bugs.freedesktop.org/show_bug.cgi?id=110635 + * + * Keep SDMA enabled on APUs. + */ + if (sctx->screen->debug_flags & DBG(FORCE_SDMA) || + (!sctx->screen->info.has_dedicated_vram && + !(sctx->screen->debug_flags & DBG(NO_SDMA_COPY_IMAGE)))) { + if ((sctx->chip_class == GFX7 || sctx->chip_class == GFX8) && + cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box)) + return; + else if (sctx->chip_class == GFX9 && si_sdma_v4_copy_texture(sctx, dst, dst_level, dstx, dsty, + dstz, src, src_level, src_box)) + return; + } fallback: - si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, - src, src_level, src_box); + si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box); } void cik_init_sdma_functions(struct si_context *sctx) { - sctx->dma_copy = cik_sdma_copy; + sctx->dma_copy = cik_sdma_copy; } diff --git a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h index 59b3d0a6b49..1570f286053 100644 --- a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h +++ b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h @@ -1,18 +1,18 @@ // DriConf options specific to radeonsi DRI_CONF_SECTION_PERFORMANCE - DRI_CONF_ADAPTIVE_SYNC("true") - DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false") - DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false") - DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS("false") +DRI_CONF_ADAPTIVE_SYNC("true") +DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false") +DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false") +DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS("false") DRI_CONF_SECTION_END DRI_CONF_SECTION_DEBUG //= BEGIN VERBATIM -#define OPT_BOOL(name, dflt, description) \ - DRI_CONF_OPT_BEGIN_B(radeonsi_##name, #dflt) \ - DRI_CONF_DESC(en, description) \ - DRI_CONF_OPT_END +#define OPT_BOOL(name, dflt, description) \ + DRI_CONF_OPT_BEGIN_B(radeonsi_##name, #dflt) \ + DRI_CONF_DESC(en, description) \ + DRI_CONF_OPT_END #include "radeonsi/si_debug_options.h" //= END VERBATIM diff --git a/src/gallium/drivers/radeonsi/gfx10_query.c b/src/gallium/drivers/radeonsi/gfx10_query.c index c0a0bc8ce57..aedf5090eed 100644 --- a/src/gallium/drivers/radeonsi/gfx10_query.c +++ b/src/gallium/drivers/radeonsi/gfx10_query.c @@ -22,13 +22,13 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include - #include "si_pipe.h" #include "si_query.h" +#include "sid.h" #include "util/u_memory.h" #include "util/u_suballoc.h" -#include "sid.h" + +#include /** * The query buffer is written to by ESGS NGG shaders with statistics about @@ -39,12 +39,12 @@ * without additional GPU cost. 
*/ struct gfx10_sh_query_buffer { - struct list_head list; - struct si_resource *buf; - unsigned refcount; + struct list_head list; + struct si_resource *buf; + unsigned refcount; - /* Offset into the buffer in bytes; points at the first un-emitted entry. */ - unsigned head; + /* Offset into the buffer in bytes; points at the first un-emitted entry. */ + unsigned head; }; /* Memory layout of the query buffer. Must be kept in sync with shaders @@ -55,469 +55,454 @@ struct gfx10_sh_query_buffer { * of all those values unconditionally. */ struct gfx10_sh_query_buffer_mem { - struct { - uint64_t generated_primitives_start_dummy; - uint64_t emitted_primitives_start_dummy; - uint64_t generated_primitives; - uint64_t emitted_primitives; - } stream[4]; - uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */ - uint32_t pad[31]; + struct { + uint64_t generated_primitives_start_dummy; + uint64_t emitted_primitives_start_dummy; + uint64_t generated_primitives; + uint64_t emitted_primitives; + } stream[4]; + uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */ + uint32_t pad[31]; }; /* Shader-based queries. */ struct gfx10_sh_query { - struct si_query b; + struct si_query b; - struct gfx10_sh_query_buffer *first; - struct gfx10_sh_query_buffer *last; - unsigned first_begin; - unsigned last_end; + struct gfx10_sh_query_buffer *first; + struct gfx10_sh_query_buffer *last; + unsigned first_begin; + unsigned last_end; - unsigned stream; + unsigned stream; }; static void emit_shader_query(struct si_context *sctx) { - assert(!list_is_empty(&sctx->shader_query_buffers)); + assert(!list_is_empty(&sctx->shader_query_buffers)); - struct gfx10_sh_query_buffer *qbuf = list_last_entry(&sctx->shader_query_buffers, - struct gfx10_sh_query_buffer, list); - qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem); + struct gfx10_sh_query_buffer *qbuf = + list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem); } static void gfx10_release_query_buffers(struct si_context *sctx, - struct gfx10_sh_query_buffer *first, - struct gfx10_sh_query_buffer *last) + struct gfx10_sh_query_buffer *first, + struct gfx10_sh_query_buffer *last) { - while (first) { - struct gfx10_sh_query_buffer *qbuf = first; - if (first != last) - first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list); - else - first = NULL; - - qbuf->refcount--; - if (qbuf->refcount) - continue; - - if (qbuf->list.next == &sctx->shader_query_buffers) - continue; /* keep the most recent buffer; it may not be full yet */ - if (qbuf->list.prev == &sctx->shader_query_buffers) - continue; /* keep the oldest buffer for recycling */ - - list_del(&qbuf->list); - si_resource_reference(&qbuf->buf, NULL); - FREE(qbuf); - } + while (first) { + struct gfx10_sh_query_buffer *qbuf = first; + if (first != last) + first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list); + else + first = NULL; + + qbuf->refcount--; + if (qbuf->refcount) + continue; + + if (qbuf->list.next == &sctx->shader_query_buffers) + continue; /* keep the most recent buffer; it may not be full yet */ + if (qbuf->list.prev == &sctx->shader_query_buffers) + continue; /* keep the oldest buffer for recycling */ + + list_del(&qbuf->list); + si_resource_reference(&qbuf->buf, NULL); + FREE(qbuf); + } } static bool gfx10_alloc_query_buffer(struct si_context *sctx) { - if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query)) - return true; - - struct 
gfx10_sh_query_buffer *qbuf = NULL; - - if (!list_is_empty(&sctx->shader_query_buffers)) { - qbuf = list_last_entry(&sctx->shader_query_buffers, - struct gfx10_sh_query_buffer, list); - if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0) - goto success; - - qbuf = list_first_entry(&sctx->shader_query_buffers, - struct gfx10_sh_query_buffer, list); - if (!qbuf->refcount && - !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) && - sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) { - /* Can immediately re-use the oldest buffer */ - list_del(&qbuf->list); - } else { - qbuf = NULL; - } - } - - if (!qbuf) { - qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer); - if (unlikely(!qbuf)) - return false; - - struct si_screen *screen = sctx->screen; - unsigned buf_size = MAX2(sizeof(struct gfx10_sh_query_buffer_mem), - screen->info.min_alloc_size); - qbuf->buf = si_resource( - pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); - if (unlikely(!qbuf->buf)) { - FREE(qbuf); - return false; - } - } - - /* The buffer is currently unused by the GPU. Initialize it. - * - * We need to set the high bit of all the primitive counters for - * compatibility with the SET_PREDICATION packet. - */ - uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL, - PIPE_TRANSFER_WRITE | - PIPE_TRANSFER_UNSYNCHRONIZED); - assert(results); - - for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); - i < e; ++i) { - for (unsigned j = 0; j < 16; ++j) - results[32 * i + j] = (uint64_t)1 << 63; - results[32 * i + 16] = 0; - } - - list_addtail(&qbuf->list, &sctx->shader_query_buffers); - qbuf->head = 0; - qbuf->refcount = sctx->num_active_shader_queries; + if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query)) + return true; + + struct gfx10_sh_query_buffer *qbuf = NULL; + + if (!list_is_empty(&sctx->shader_query_buffers)) { + qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0) + goto success; + + qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + if (!qbuf->refcount && + !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) && + sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) { + /* Can immediately re-use the oldest buffer */ + list_del(&qbuf->list); + } else { + qbuf = NULL; + } + } + + if (!qbuf) { + qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer); + if (unlikely(!qbuf)) + return false; + + struct si_screen *screen = sctx->screen; + unsigned buf_size = + MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size); + qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); + if (unlikely(!qbuf->buf)) { + FREE(qbuf); + return false; + } + } + + /* The buffer is currently unused by the GPU. Initialize it. + * + * We need to set the high bit of all the primitive counters for + * compatibility with the SET_PREDICATION packet. 
+ */ + uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL, + PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED); + assert(results); + + for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e; + ++i) { + for (unsigned j = 0; j < 16; ++j) + results[32 * i + j] = (uint64_t)1 << 63; + results[32 * i + 16] = 0; + } + + list_addtail(&qbuf->list, &sctx->shader_query_buffers); + qbuf->head = 0; + qbuf->refcount = sctx->num_active_shader_queries; success:; - struct pipe_shader_buffer sbuf; - sbuf.buffer = &qbuf->buf->b.b; - sbuf.buffer_offset = qbuf->head; - sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem); - si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf); - sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1); - - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query); - return true; + struct pipe_shader_buffer sbuf; + sbuf.buffer = &qbuf->buf->b.b; + sbuf.buffer_offset = qbuf->head; + sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem); + si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf); + sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1); + + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query); + return true; } static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - gfx10_release_query_buffers(sctx, query->first, query->last); - FREE(query); + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + gfx10_release_query_buffers(sctx, query->first, query->last); + FREE(query); } static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - gfx10_release_query_buffers(sctx, query->first, query->last); - query->first = query->last = NULL; + gfx10_release_query_buffers(sctx, query->first, query->last); + query->first = query->last = NULL; - if (unlikely(!gfx10_alloc_query_buffer(sctx))) - return false; + if (unlikely(!gfx10_alloc_query_buffer(sctx))) + return false; - query->first = list_last_entry(&sctx->shader_query_buffers, - struct gfx10_sh_query_buffer, list); - query->first_begin = query->first->head; + query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + query->first_begin = query->first->head; - sctx->num_active_shader_queries++; - query->first->refcount++; + sctx->num_active_shader_queries++; + query->first->refcount++; - return true; + return true; } static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - - if (unlikely(!query->first)) - return false; /* earlier out of memory error */ - - query->last = list_last_entry(&sctx->shader_query_buffers, - struct gfx10_sh_query_buffer, list); - query->last_end = query->last->head; - - /* Signal the fence of the previous chunk */ - if (query->last_end != 0) { - uint64_t fence_va = query->last->buf->gpu_address; - fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem); - fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence); - si_cp_release_mem(sctx, sctx->gfx_cs, - V_028A90_BOTTOM_OF_PIPE_TS, 0, - EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, - query->last->buf, fence_va, 0xffffffff, - PIPE_QUERY_GPU_FINISHED); - } - - sctx->num_active_shader_queries--; - - if (sctx->num_active_shader_queries > 
0) { - gfx10_alloc_query_buffer(sctx); - } else { - si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL); - sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED; - - /* If a query_begin is followed by a query_end without a draw - * in-between, we need to clear the atom to ensure that the - * next query_begin will re-initialize the shader buffer. */ - si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false); - } - - return true; + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + + if (unlikely(!query->first)) + return false; /* earlier out of memory error */ + + query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + query->last_end = query->last->head; + + /* Signal the fence of the previous chunk */ + if (query->last_end != 0) { + uint64_t fence_va = query->last->buf->gpu_address; + fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem); + fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence); + si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, + EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va, + 0xffffffff, PIPE_QUERY_GPU_FINISHED); + } + + sctx->num_active_shader_queries--; + + if (sctx->num_active_shader_queries > 0) { + gfx10_alloc_query_buffer(sctx); + } else { + si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL); + sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED; + + /* If a query_begin is followed by a query_end without a draw + * in-between, we need to clear the atom to ensure that the + * next query_begin will re-initialize the shader buffer. */ + si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false); + } + + return true; } static void gfx10_sh_query_add_result(struct gfx10_sh_query *query, - struct gfx10_sh_query_buffer_mem *qmem, - union pipe_query_result *result) + struct gfx10_sh_query_buffer_mem *qmem, + union pipe_query_result *result) { - static const uint64_t mask = ((uint64_t)1 << 63) - 1; - - switch (query->b.type) { - case PIPE_QUERY_PRIMITIVES_EMITTED: - result->u64 += qmem->stream[query->stream].emitted_primitives & mask; - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - result->u64 += qmem->stream[query->stream].generated_primitives & mask; - break; - case PIPE_QUERY_SO_STATISTICS: - result->so_statistics.num_primitives_written += - qmem->stream[query->stream].emitted_primitives & mask; - result->so_statistics.primitives_storage_needed += - qmem->stream[query->stream].generated_primitives & mask; - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - result->b |= qmem->stream[query->stream].emitted_primitives != - qmem->stream[query->stream].generated_primitives; - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { - result->b |= qmem->stream[query->stream].emitted_primitives != - qmem->stream[query->stream].generated_primitives; - } - break; - default: - assert(0); - } + static const uint64_t mask = ((uint64_t)1 << 63) - 1; + + switch (query->b.type) { + case PIPE_QUERY_PRIMITIVES_EMITTED: + result->u64 += qmem->stream[query->stream].emitted_primitives & mask; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + result->u64 += qmem->stream[query->stream].generated_primitives & mask; + break; + case PIPE_QUERY_SO_STATISTICS: + result->so_statistics.num_primitives_written += + qmem->stream[query->stream].emitted_primitives & mask; + result->so_statistics.primitives_storage_needed += + 
qmem->stream[query->stream].generated_primitives & mask; + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + result->b |= qmem->stream[query->stream].emitted_primitives != + qmem->stream[query->stream].generated_primitives; + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { + result->b |= qmem->stream[query->stream].emitted_primitives != + qmem->stream[query->stream].generated_primitives; + } + break; + default: + assert(0); + } } -static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, - bool wait, union pipe_query_result *result) +static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait, + union pipe_query_result *result) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - util_query_clear_result(result, query->b.type); + util_query_clear_result(result, query->b.type); - if (unlikely(!query->first)) - return false; /* earlier out of memory error */ - assert(query->last); + if (unlikely(!query->first)) + return false; /* earlier out of memory error */ + assert(query->last); - for (struct gfx10_sh_query_buffer *qbuf = query->last;; - qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) { - unsigned usage = PIPE_TRANSFER_READ | - (wait ? 0 : PIPE_TRANSFER_DONTBLOCK); - void *map; + for (struct gfx10_sh_query_buffer *qbuf = query->last;; + qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) { + unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK); + void *map; - if (rquery->b.flushed) - map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage); - else - map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage); + if (rquery->b.flushed) + map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage); + else + map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage); - if (!map) - return false; + if (!map) + return false; - unsigned results_begin = 0; - unsigned results_end = qbuf->head; - if (qbuf == query->first) - results_begin = query->first_begin; - if (qbuf == query->last) - results_end = query->last_end; + unsigned results_begin = 0; + unsigned results_end = qbuf->head; + if (qbuf == query->first) + results_begin = query->first_begin; + if (qbuf == query->last) + results_end = query->last_end; - while (results_begin != results_end) { - struct gfx10_sh_query_buffer_mem *qmem = map + results_begin; - results_begin += sizeof(*qmem); + while (results_begin != results_end) { + struct gfx10_sh_query_buffer_mem *qmem = map + results_begin; + results_begin += sizeof(*qmem); - gfx10_sh_query_add_result(query, qmem, result); - } + gfx10_sh_query_add_result(query, qmem, result); + } - if (qbuf == query->first) - break; - } + if (qbuf == query->first) + break; + } - return true; + return true; } -static void gfx10_sh_query_get_result_resource(struct si_context *sctx, - struct si_query *rquery, - bool wait, - enum pipe_query_value_type result_type, - int index, - struct pipe_resource *resource, - unsigned offset) +static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery, + bool wait, enum pipe_query_value_type result_type, + int index, struct pipe_resource *resource, + unsigned offset) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - struct si_qbo_state saved_state = {}; - struct pipe_resource *tmp_buffer = NULL; - unsigned tmp_buffer_offset = 0; - - if 
(!sctx->sh_query_result_shader) { - sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx); - if (!sctx->sh_query_result_shader) - return; - } - - if (query->first != query->last) { - u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, - &tmp_buffer_offset, &tmp_buffer); - if (!tmp_buffer) - return; - } - - si_save_qbo_state(sctx, &saved_state); - - /* Pre-fill the constants configuring the shader behavior. */ - struct { - uint32_t config; - uint32_t offset; - uint32_t chain; - uint32_t result_count; - } consts; - struct pipe_constant_buffer constant_buffer = {}; - - if (index >= 0) { - switch (query->b.type) { - case PIPE_QUERY_PRIMITIVES_GENERATED: - consts.offset = sizeof(uint32_t) * query->stream; - consts.config = 0; - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - consts.offset = sizeof(uint32_t) * (4 + query->stream); - consts.config = 0; - break; - case PIPE_QUERY_SO_STATISTICS: - consts.offset = sizeof(uint32_t) * (4 * index + query->stream); - consts.config = 0; - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - consts.offset = sizeof(uint32_t) * query->stream; - consts.config = 2; - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - consts.offset = 0; - consts.config = 3; - break; - default: unreachable("bad query type"); - } - } else { - /* Check result availability. */ - consts.offset = 0; - consts.config = 1; - } - - if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64) - consts.config |= 8; - - constant_buffer.buffer_size = sizeof(consts); - constant_buffer.user_buffer = &consts; - - /* Pre-fill the SSBOs and grid. */ - struct pipe_shader_buffer ssbo[3]; - struct pipe_grid_info grid = {}; - - ssbo[1].buffer = tmp_buffer; - ssbo[1].buffer_offset = tmp_buffer_offset; - ssbo[1].buffer_size = 16; - - ssbo[2] = ssbo[1]; - - sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader); - - grid.block[0] = 1; - grid.block[1] = 1; - grid.block[2] = 1; - grid.grid[0] = 1; - grid.grid[1] = 1; - grid.grid[2] = 1; - - struct gfx10_sh_query_buffer *qbuf = query->first; - for (;;) { - unsigned begin = qbuf == query->first ? query->first_begin : 0; - unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0; - if (!end) - continue; - - ssbo[0].buffer = &qbuf->buf->b.b; - ssbo[0].buffer_offset = begin; - ssbo[0].buffer_size = end - begin; - - consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem); - consts.chain = 0; - if (qbuf != query->first) - consts.chain |= 1; - if (qbuf != query->last) - consts.chain |= 2; - - if (qbuf == query->last) { - ssbo[2].buffer = resource; - ssbo[2].buffer_offset = offset; - ssbo[2].buffer_size = 8; - } - - sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer); - sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6); - - if (wait) { - uint64_t va; - - /* Wait for result availability. Wait only for readiness - * of the last entry, since the fence writes should be - * serialized in the CP. 
- */ - va = qbuf->buf->gpu_address; - va += end - sizeof(struct gfx10_sh_query_buffer_mem); - va += offsetof(struct gfx10_sh_query_buffer_mem, fence); - - si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0); - } - - sctx->b.launch_grid(&sctx->b, &grid); - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; - - if (qbuf == query->last) - break; - qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list); - } - - si_restore_qbo_state(sctx, &saved_state); - pipe_resource_reference(&tmp_buffer, NULL); + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + struct si_qbo_state saved_state = {}; + struct pipe_resource *tmp_buffer = NULL; + unsigned tmp_buffer_offset = 0; + + if (!sctx->sh_query_result_shader) { + sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx); + if (!sctx->sh_query_result_shader) + return; + } + + if (query->first != query->last) { + u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer); + if (!tmp_buffer) + return; + } + + si_save_qbo_state(sctx, &saved_state); + + /* Pre-fill the constants configuring the shader behavior. */ + struct { + uint32_t config; + uint32_t offset; + uint32_t chain; + uint32_t result_count; + } consts; + struct pipe_constant_buffer constant_buffer = {}; + + if (index >= 0) { + switch (query->b.type) { + case PIPE_QUERY_PRIMITIVES_GENERATED: + consts.offset = sizeof(uint32_t) * query->stream; + consts.config = 0; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + consts.offset = sizeof(uint32_t) * (4 + query->stream); + consts.config = 0; + break; + case PIPE_QUERY_SO_STATISTICS: + consts.offset = sizeof(uint32_t) * (4 * index + query->stream); + consts.config = 0; + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + consts.offset = sizeof(uint32_t) * query->stream; + consts.config = 2; + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + consts.offset = 0; + consts.config = 3; + break; + default: + unreachable("bad query type"); + } + } else { + /* Check result availability. */ + consts.offset = 0; + consts.config = 1; + } + + if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64) + consts.config |= 8; + + constant_buffer.buffer_size = sizeof(consts); + constant_buffer.user_buffer = &consts; + + /* Pre-fill the SSBOs and grid. */ + struct pipe_shader_buffer ssbo[3]; + struct pipe_grid_info grid = {}; + + ssbo[1].buffer = tmp_buffer; + ssbo[1].buffer_offset = tmp_buffer_offset; + ssbo[1].buffer_size = 16; + + ssbo[2] = ssbo[1]; + + sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader); + + grid.block[0] = 1; + grid.block[1] = 1; + grid.block[2] = 1; + grid.grid[0] = 1; + grid.grid[1] = 1; + grid.grid[2] = 1; + + struct gfx10_sh_query_buffer *qbuf = query->first; + for (;;) { + unsigned begin = qbuf == query->first ? query->first_begin : 0; + unsigned end = qbuf == query->last ? 
query->last_end : qbuf->buf->b.b.width0; + if (!end) + continue; + + ssbo[0].buffer = &qbuf->buf->b.b; + ssbo[0].buffer_offset = begin; + ssbo[0].buffer_size = end - begin; + + consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem); + consts.chain = 0; + if (qbuf != query->first) + consts.chain |= 1; + if (qbuf != query->last) + consts.chain |= 2; + + if (qbuf == query->last) { + ssbo[2].buffer = resource; + ssbo[2].buffer_offset = offset; + ssbo[2].buffer_size = 8; + } + + sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer); + sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6); + + if (wait) { + uint64_t va; + + /* Wait for result availability. Wait only for readiness + * of the last entry, since the fence writes should be + * serialized in the CP. + */ + va = qbuf->buf->gpu_address; + va += end - sizeof(struct gfx10_sh_query_buffer_mem); + va += offsetof(struct gfx10_sh_query_buffer_mem, fence); + + si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0); + } + + sctx->b.launch_grid(&sctx->b, &grid); + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + + if (qbuf == query->last) + break; + qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list); + } + + si_restore_qbo_state(sctx, &saved_state); + pipe_resource_reference(&tmp_buffer, NULL); } static const struct si_query_ops gfx10_sh_query_ops = { - .destroy = gfx10_sh_query_destroy, - .begin = gfx10_sh_query_begin, - .end = gfx10_sh_query_end, - .get_result = gfx10_sh_query_get_result, - .get_result_resource = gfx10_sh_query_get_result_resource, + .destroy = gfx10_sh_query_destroy, + .begin = gfx10_sh_query_begin, + .end = gfx10_sh_query_end, + .get_result = gfx10_sh_query_get_result, + .get_result_resource = gfx10_sh_query_get_result_resource, }; -struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, - enum pipe_query_type query_type, - unsigned index) +struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type, + unsigned index) { - struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query); - if (unlikely(!query)) - return NULL; + struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query); + if (unlikely(!query)) + return NULL; - query->b.ops = &gfx10_sh_query_ops; - query->b.type = query_type; - query->stream = index; + query->b.ops = &gfx10_sh_query_ops; + query->b.type = query_type; + query->stream = index; - return (struct pipe_query *)query; + return (struct pipe_query *)query; } void gfx10_init_query(struct si_context *sctx) { - list_inithead(&sctx->shader_query_buffers); - sctx->atoms.s.shader_query.emit = emit_shader_query; + list_inithead(&sctx->shader_query_buffers); + sctx->atoms.s.shader_query.emit = emit_shader_query; } void gfx10_destroy_query(struct si_context *sctx) { - while (!list_is_empty(&sctx->shader_query_buffers)) { - struct gfx10_sh_query_buffer *qbuf = - list_first_entry(&sctx->shader_query_buffers, - struct gfx10_sh_query_buffer, list); - list_del(&qbuf->list); - - assert(!qbuf->refcount); - si_resource_reference(&qbuf->buf, NULL); - FREE(qbuf); - } + while (!list_is_empty(&sctx->shader_query_buffers)) { + struct gfx10_sh_query_buffer *qbuf = + list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + list_del(&qbuf->list); + + assert(!qbuf->refcount); + si_resource_reference(&qbuf->buf, NULL); + FREE(qbuf); + } } diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 
63439733507..06eba4a1f61 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -21,250 +21,239 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "ac_llvm_cull.h" #include "si_pipe.h" #include "si_shader_internal.h" - #include "sid.h" - #include "util/u_memory.h" #include "util/u_prim.h" -#include "ac_llvm_cull.h" static LLVMValueRef get_wave_id_in_tg(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->merged_wave_info, 24, 4); + return si_unpack_param(ctx, ctx->merged_wave_info, 24, 4); } static LLVMValueRef get_tgsize(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->merged_wave_info, 28, 4); + return si_unpack_param(ctx, ctx->merged_wave_info, 28, 4); } static LLVMValueRef get_thread_id_in_tg(struct si_shader_context *ctx) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; - tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), - LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""); - return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), ""); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp; + tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""); + return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), ""); } static LLVMValueRef ngg_get_vtx_cnt(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->gs_tg_info, 12, 9); + return si_unpack_param(ctx, ctx->gs_tg_info, 12, 9); } static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->gs_tg_info, 22, 9); + return si_unpack_param(ctx, ctx->gs_tg_info, 22, 9); } static LLVMValueRef ngg_get_ordered_id(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->gs_tg_info, 0, 12); + return si_unpack_param(ctx, ctx->gs_tg_info, 0, 12); } static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx) { - LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); - return ac_build_load_to_sgpr(&ctx->ac, buf_ptr, - LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false)); + return ac_build_load_to_sgpr(&ctx->ac, buf_ptr, + LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false)); } static LLVMValueRef ngg_get_initial_edgeflag(struct si_shader_context *ctx, unsigned index) { - if (ctx->type == PIPE_SHADER_VERTEX) { - LLVMValueRef tmp; - tmp = LLVMBuildLShr(ctx->ac.builder, - ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id), - LLVMConstInt(ctx->ac.i32, 8 + index, false), ""); - return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, ""); - } - return ctx->ac.i1false; + if (ctx->type == PIPE_SHADER_VERTEX) { + LLVMValueRef tmp; + tmp = LLVMBuildLShr(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id), + LLVMConstInt(ctx->ac.i32, 8 + index, false), ""); + return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, ""); + } + return ctx->ac.i1false; } /** * Return the number of vertices as a constant in \p num_vertices, * and return a more precise value as LLVMValueRef from the function. 
*/ -static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, - unsigned *num_vertices) +static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, unsigned *num_vertices) { - const struct si_shader_info *info = &ctx->shader->selector->info; - - if (ctx->type == PIPE_SHADER_VERTEX) { - if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { - /* Blits always use axis-aligned rectangles with 3 vertices. */ - *num_vertices = 3; - return LLVMConstInt(ctx->ac.i32, 3, 0); - } else { - /* We always build up all three indices for the prim export - * independent of the primitive type. The additional garbage - * data shouldn't hurt. This number doesn't matter with - * NGG passthrough. - */ - *num_vertices = 3; - - /* Extract OUTPRIM field. */ - LLVMValueRef num = si_unpack_param(ctx, ctx->vs_state_bits, 2, 2); - return LLVMBuildAdd(ctx->ac.builder, num, ctx->ac.i32_1, ""); - } - } else { - assert(ctx->type == PIPE_SHADER_TESS_EVAL); - - if (info->properties[TGSI_PROPERTY_TES_POINT_MODE]) - *num_vertices = 1; - else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES) - *num_vertices = 2; - else - *num_vertices = 3; - - return LLVMConstInt(ctx->ac.i32, *num_vertices, false); - } + const struct si_shader_info *info = &ctx->shader->selector->info; + + if (ctx->type == PIPE_SHADER_VERTEX) { + if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { + /* Blits always use axis-aligned rectangles with 3 vertices. */ + *num_vertices = 3; + return LLVMConstInt(ctx->ac.i32, 3, 0); + } else { + /* We always build up all three indices for the prim export + * independent of the primitive type. The additional garbage + * data shouldn't hurt. This number doesn't matter with + * NGG passthrough. + */ + *num_vertices = 3; + + /* Extract OUTPRIM field. 
*/ + LLVMValueRef num = si_unpack_param(ctx, ctx->vs_state_bits, 2, 2); + return LLVMBuildAdd(ctx->ac.builder, num, ctx->ac.i32_1, ""); + } + } else { + assert(ctx->type == PIPE_SHADER_TESS_EVAL); + + if (info->properties[TGSI_PROPERTY_TES_POINT_MODE]) + *num_vertices = 1; + else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES) + *num_vertices = 2; + else + *num_vertices = 3; + + return LLVMConstInt(ctx->ac.i32, *num_vertices, false); + } } bool gfx10_ngg_export_prim_early(struct si_shader *shader) { - struct si_shader_selector *sel = shader->selector; + struct si_shader_selector *sel = shader->selector; - assert(shader->key.as_ngg && !shader->key.as_es); + assert(shader->key.as_ngg && !shader->key.as_es); - return sel->type != PIPE_SHADER_GEOMETRY && - !sel->info.writes_edgeflag; + return sel->type != PIPE_SHADER_GEOMETRY && !sel->info.writes_edgeflag; } void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx) { - ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), - ngg_get_vtx_cnt(ctx), - ngg_get_prim_cnt(ctx)); + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ngg_get_vtx_cnt(ctx), + ngg_get_prim_cnt(ctx)); } -void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, - LLVMValueRef user_edgeflags[3], - LLVMValueRef prim_passthrough) +void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3], + LLVMValueRef prim_passthrough) { - LLVMBuilderRef builder = ctx->ac.builder; - - if (gfx10_is_ngg_passthrough(ctx->shader) || - ctx->shader->key.opt.ngg_culling) { - ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001); - { - struct ac_ngg_prim prim = {}; - - if (prim_passthrough) - prim.passthrough = prim_passthrough; - else - prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset); - - /* This is only used with NGG culling, which returns the NGG - * passthrough prim export encoding. 
- */ - if (ctx->shader->selector->info.writes_edgeflag) { - unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS; - LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0); - - unsigned num_vertices; - ngg_get_vertices_per_prim(ctx, &num_vertices); - - for (unsigned i = 0; i < num_vertices; i++) { - unsigned shift = 9 + i*10; - LLVMValueRef edge; - - edge = LLVMBuildLoad(builder, user_edgeflags[i], ""); - edge = LLVMBuildZExt(builder, edge, ctx->ac.i32, ""); - edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->ac.i32, shift, 0), ""); - edgeflags = LLVMBuildOr(builder, edgeflags, edge, ""); - } - prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, ""); - } - - ac_build_export_prim(&ctx->ac, &prim); - } - ac_build_endif(&ctx->ac, 6001); - return; - } - - ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001); - { - struct ac_ngg_prim prim = {}; - - ngg_get_vertices_per_prim(ctx, &prim.num_vertices); - - prim.isnull = ctx->ac.i1false; - prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); - prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); - prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); - - for (unsigned i = 0; i < prim.num_vertices; ++i) { - prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i); - - if (ctx->shader->selector->info.writes_edgeflag) { - LLVMValueRef edge; - - edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], ""); - edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, ""); - prim.edgeflag[i] = edge; - } - } - - ac_build_export_prim(&ctx->ac, &prim); - } - ac_build_endif(&ctx->ac, 6001); + LLVMBuilderRef builder = ctx->ac.builder; + + if (gfx10_is_ngg_passthrough(ctx->shader) || ctx->shader->key.opt.ngg_culling) { + ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001); + { + struct ac_ngg_prim prim = {}; + + if (prim_passthrough) + prim.passthrough = prim_passthrough; + else + prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset); + + /* This is only used with NGG culling, which returns the NGG + * passthrough prim export encoding. 
+ */ + if (ctx->shader->selector->info.writes_edgeflag) { + unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS; + LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0); + + unsigned num_vertices; + ngg_get_vertices_per_prim(ctx, &num_vertices); + + for (unsigned i = 0; i < num_vertices; i++) { + unsigned shift = 9 + i * 10; + LLVMValueRef edge; + + edge = LLVMBuildLoad(builder, user_edgeflags[i], ""); + edge = LLVMBuildZExt(builder, edge, ctx->ac.i32, ""); + edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->ac.i32, shift, 0), ""); + edgeflags = LLVMBuildOr(builder, edgeflags, edge, ""); + } + prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, ""); + } + + ac_build_export_prim(&ctx->ac, &prim); + } + ac_build_endif(&ctx->ac, 6001); + return; + } + + ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001); + { + struct ac_ngg_prim prim = {}; + + ngg_get_vertices_per_prim(ctx, &prim.num_vertices); + + prim.isnull = ctx->ac.i1false; + prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); + prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); + prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); + + for (unsigned i = 0; i < prim.num_vertices; ++i) { + prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i); + + if (ctx->shader->selector->info.writes_edgeflag) { + LLVMValueRef edge; + + edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], ""); + edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, ""); + prim.edgeflag[i] = edge; + } + } + + ac_build_export_prim(&ctx->ac, &prim); + } + ac_build_endif(&ctx->ac, 6001); } -static void build_streamout_vertex(struct si_shader_context *ctx, - LLVMValueRef *so_buffer, LLVMValueRef *wg_offset_dw, - unsigned stream, LLVMValueRef offset_vtx, - LLVMValueRef vertexptr) +static void build_streamout_vertex(struct si_shader_context *ctx, LLVMValueRef *so_buffer, + LLVMValueRef *wg_offset_dw, unsigned stream, + LLVMValueRef offset_vtx, LLVMValueRef vertexptr) { - struct si_shader_info *info = &ctx->shader->selector->info; - struct pipe_stream_output_info *so = &ctx->shader->selector->so; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef offset[4] = {}; - LLVMValueRef tmp; - - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (!wg_offset_dw[buffer]) - continue; - - tmp = LLVMBuildMul(builder, offset_vtx, - LLVMConstInt(ctx->ac.i32, so->stride[buffer], false), ""); - tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, ""); - offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), ""); - } - - for (unsigned i = 0; i < so->num_outputs; ++i) { - if (so->output[i].stream != stream) - continue; - - unsigned reg = so->output[i].register_index; - struct si_shader_output_values out; - out.semantic_name = info->output_semantic_name[reg]; - out.semantic_index = info->output_semantic_index[reg]; - - for (unsigned comp = 0; comp < 4; comp++) { - tmp = ac_build_gep0(&ctx->ac, vertexptr, - LLVMConstInt(ctx->ac.i32, 4 * reg + comp, false)); - out.values[comp] = LLVMBuildLoad(builder, tmp, ""); - out.vertex_stream[comp] = - (info->output_streams[reg] >> (2 * comp)) & 3; - } - - si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out); - } + struct si_shader_info *info = &ctx->shader->selector->info; + struct pipe_stream_output_info *so = &ctx->shader->selector->so; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef offset[4] = {}; + LLVMValueRef tmp; + + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if 
(!wg_offset_dw[buffer]) + continue; + + tmp = LLVMBuildMul(builder, offset_vtx, LLVMConstInt(ctx->ac.i32, so->stride[buffer], false), + ""); + tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, ""); + offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), ""); + } + + for (unsigned i = 0; i < so->num_outputs; ++i) { + if (so->output[i].stream != stream) + continue; + + unsigned reg = so->output[i].register_index; + struct si_shader_output_values out; + out.semantic_name = info->output_semantic_name[reg]; + out.semantic_index = info->output_semantic_index[reg]; + + for (unsigned comp = 0; comp < 4; comp++) { + tmp = ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, 4 * reg + comp, false)); + out.values[comp] = LLVMBuildLoad(builder, tmp, ""); + out.vertex_stream[comp] = (info->output_streams[reg] >> (2 * comp)) & 3; + } + + si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out); + } } struct ngg_streamout { - LLVMValueRef num_vertices; + LLVMValueRef num_vertices; - /* per-thread data */ - LLVMValueRef prim_enable[4]; /* i1 per stream */ - LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */ + /* per-thread data */ + LLVMValueRef prim_enable[4]; /* i1 per stream */ + LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */ - /* Output */ - LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */ + /* Output */ + LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */ }; /** @@ -276,427 +265,405 @@ struct ngg_streamout { * * Clobbers gs_ngg_scratch[8:]. */ -static void build_streamout(struct si_shader_context *ctx, - struct ngg_streamout *nggso) +static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout *nggso) { - struct si_shader_info *info = &ctx->shader->selector->info; - struct pipe_stream_output_info *so = &ctx->shader->selector->so; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); - LLVMValueRef tid = get_thread_id_in_tg(ctx); - LLVMValueRef tmp, tmp2; - LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false); - LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false); - LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false); - LLVMValueRef so_buffer[4] = {}; - unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + - (nggso->vertices[2] ? 1 : 0); - LLVMValueRef prim_stride_dw[4] = {}; - LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32); - int stream_for_buffer[4] = { -1, -1, -1, -1 }; - unsigned bufmask_for_stream[4] = {}; - bool isgs = ctx->type == PIPE_SHADER_GEOMETRY; - unsigned scratch_emit_base = isgs ? 4 : 0; - LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0; - unsigned scratch_offset_base = isgs ? 8 : 4; - LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4; - - ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256); - - /* Determine the mapping of streamout buffers to vertex streams. 
*/ - for (unsigned i = 0; i < so->num_outputs; ++i) { - unsigned buf = so->output[i].output_buffer; - unsigned stream = so->output[i].stream; - assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream); - stream_for_buffer[buf] = stream; - bufmask_for_stream[stream] |= 1 << buf; - } - - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (stream_for_buffer[buffer] == -1) - continue; - - assert(so->stride[buffer]); - - tmp = LLVMConstInt(ctx->ac.i32, so->stride[buffer], false); - prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, ""); - prim_stride_dw_vgpr = ac_build_writelane( - &ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer], - LLVMConstInt(ctx->ac.i32, buffer, false)); - - so_buffer[buffer] = ac_build_load_to_sgpr( - &ctx->ac, buf_ptr, - LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + buffer, false)); - } - - tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); - ac_build_ifcc(&ctx->ac, tmp, 5200); - { - LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); - LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, ""); - - /* Advance the streamout offsets in GDS. */ - LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - - tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); - ac_build_ifcc(&ctx->ac, tmp, 5210); - { - if (isgs) { - tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid); - tmp = LLVMBuildLoad(builder, tmp, ""); - } else { - tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, - ngg_get_prim_cnt(ctx), ctx->ac.i32_0); - } - LLVMBuildStore(builder, tmp, generated_by_stream_vgpr); - - unsigned swizzle[4]; - int unused_stream = -1; - for (unsigned stream = 0; stream < 4; ++stream) { - if (!info->num_stream_output_components[stream]) { - unused_stream = stream; - break; - } - } - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (stream_for_buffer[buffer] >= 0) { - swizzle[buffer] = stream_for_buffer[buffer]; - } else { - assert(unused_stream >= 0); - swizzle[buffer] = unused_stream; - } - } - - tmp = ac_build_quad_swizzle(&ctx->ac, tmp, - swizzle[0], swizzle[1], swizzle[2], swizzle[3]); - tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); - - LLVMValueRef args[] = { - LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""), - tmp, - ctx->ac.i32_0, // ordering - ctx->ac.i32_0, // scope - ctx->ac.i1false, // isVolatile - LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index - ctx->ac.i1true, // wave release - ctx->ac.i1true, // wave done - }; - tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", - ctx->ac.i32, args, ARRAY_SIZE(args), 0); - - /* Keep offsets in a VGPR for quick retrieval via readlane by - * the first wave for bounds checking, and also store in LDS - * for retrieval by all waves later. */ - LLVMBuildStore(builder, tmp, offsets_vgpr); - - tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), - scratch_offset_basev, ""); - tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2); - LLVMBuildStore(builder, tmp, tmp2); - } - ac_build_endif(&ctx->ac, 5210); - - /* Determine the max emit per buffer. This is done via the SALU, in part - * because LLVM can't generate divide-by-multiply if we try to do this - * via VALU with one lane per buffer. 
- */ - LLVMValueRef max_emit[4] = {}; - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (stream_for_buffer[buffer] == -1) - continue; - - LLVMValueRef bufsize_dw = - LLVMBuildLShr(builder, - LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), - i32_2, ""); - - tmp = LLVMBuildLoad(builder, offsets_vgpr, ""); - LLVMValueRef offset_dw = - ac_build_readlane(&ctx->ac, tmp, - LLVMConstInt(ctx->ac.i32, buffer, false)); - - tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, ""); - tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], ""); - - tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, ""); - max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->ac.i32_0, tmp, ""); - } - - /* Determine the number of emitted primitives per stream and fixup the - * GDS counter if necessary. - * - * This is complicated by the fact that a single stream can emit to - * multiple buffers (but luckily not vice versa). - */ - LLVMValueRef emit_vgpr = ctx->ac.i32_0; - - for (unsigned stream = 0; stream < 4; ++stream) { - if (!info->num_stream_output_components[stream]) - continue; - - tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, ""); - LLVMValueRef generated = - ac_build_readlane(&ctx->ac, tmp, - LLVMConstInt(ctx->ac.i32, stream, false)); - - LLVMValueRef emit = generated; - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (stream_for_buffer[buffer] == stream) - emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]); - } - - emit_vgpr = ac_build_writelane(&ctx->ac, emit_vgpr, emit, - LLVMConstInt(ctx->ac.i32, stream, false)); - - /* Fixup the offset using a plain GDS atomic if we overflowed. */ - tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, ""); - ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */ - tmp = LLVMBuildLShr(builder, - LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false), - ac_get_thread_id(&ctx->ac), ""); - tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - ac_build_ifcc(&ctx->ac, tmp, 5222); - { - tmp = LLVMBuildSub(builder, generated, emit, ""); - tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); - tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, ""); - LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp, - LLVMAtomicOrderingMonotonic, false); - } - ac_build_endif(&ctx->ac, 5222); - ac_build_endif(&ctx->ac, 5221); - } - - tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); - ac_build_ifcc(&ctx->ac, tmp, 5225); - { - tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), - scratch_emit_basev, ""); - tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp); - LLVMBuildStore(builder, emit_vgpr, tmp); - } - ac_build_endif(&ctx->ac, 5225); - } - ac_build_endif(&ctx->ac, 5200); - - /* Determine the workgroup-relative per-thread / primitive offset into - * the streamout buffers */ - struct ac_wg_scan primemit_scan[4] = {}; - - if (isgs) { - for (unsigned stream = 0; stream < 4; ++stream) { - if (!info->num_stream_output_components[stream]) - continue; - - primemit_scan[stream].enable_exclusive = true; - primemit_scan[stream].op = nir_op_iadd; - primemit_scan[stream].src = nggso->prim_enable[stream]; - primemit_scan[stream].scratch = - ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, - LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false)); - primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx); - primemit_scan[stream].numwaves = get_tgsize(ctx); - primemit_scan[stream].maxwaves = 8; - ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]); - } - } - - ac_build_s_barrier(&ctx->ac); - - /* 
Fetch the per-buffer offsets and per-stream emit counts in all waves. */ - LLVMValueRef wgoffset_dw[4] = {}; - - { - LLVMValueRef scratch_vgpr; - - tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac)); - scratch_vgpr = LLVMBuildLoad(builder, tmp, ""); - - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (stream_for_buffer[buffer] >= 0) { - wgoffset_dw[buffer] = ac_build_readlane( - &ctx->ac, scratch_vgpr, - LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false)); - } - } - - for (unsigned stream = 0; stream < 4; ++stream) { - if (info->num_stream_output_components[stream]) { - nggso->emit[stream] = ac_build_readlane( - &ctx->ac, scratch_vgpr, - LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false)); - } - } - } - - /* Write out primitive data */ - for (unsigned stream = 0; stream < 4; ++stream) { - if (!info->num_stream_output_components[stream]) - continue; - - if (isgs) { - ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]); - } else { - primemit_scan[stream].result_exclusive = tid; - } - - tmp = LLVMBuildICmp(builder, LLVMIntULT, - primemit_scan[stream].result_exclusive, - nggso->emit[stream], ""); - tmp = LLVMBuildAnd(builder, tmp, nggso->prim_enable[stream], ""); - ac_build_ifcc(&ctx->ac, tmp, 5240); - { - LLVMValueRef offset_vtx = - LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, - nggso->num_vertices, ""); - - for (unsigned i = 0; i < max_num_vertices; ++i) { - tmp = LLVMBuildICmp(builder, LLVMIntULT, - LLVMConstInt(ctx->ac.i32, i, false), - nggso->num_vertices, ""); - ac_build_ifcc(&ctx->ac, tmp, 5241); - build_streamout_vertex(ctx, so_buffer, wgoffset_dw, - stream, offset_vtx, nggso->vertices[i]); - ac_build_endif(&ctx->ac, 5241); - offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, ""); - } - } - ac_build_endif(&ctx->ac, 5240); - } + struct si_shader_info *info = &ctx->shader->selector->info; + struct pipe_stream_output_info *so = &ctx->shader->selector->so; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + LLVMValueRef tid = get_thread_id_in_tg(ctx); + LLVMValueRef tmp, tmp2; + LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false); + LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false); + LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false); + LLVMValueRef so_buffer[4] = {}; + unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + (nggso->vertices[2] ? 1 : 0); + LLVMValueRef prim_stride_dw[4] = {}; + LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32); + int stream_for_buffer[4] = {-1, -1, -1, -1}; + unsigned bufmask_for_stream[4] = {}; + bool isgs = ctx->type == PIPE_SHADER_GEOMETRY; + unsigned scratch_emit_base = isgs ? 4 : 0; + LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0; + unsigned scratch_offset_base = isgs ? 8 : 4; + LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4; + + ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256); + + /* Determine the mapping of streamout buffers to vertex streams. 
*/ + for (unsigned i = 0; i < so->num_outputs; ++i) { + unsigned buf = so->output[i].output_buffer; + unsigned stream = so->output[i].stream; + assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream); + stream_for_buffer[buf] = stream; + bufmask_for_stream[stream] |= 1 << buf; + } + + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] == -1) + continue; + + assert(so->stride[buffer]); + + tmp = LLVMConstInt(ctx->ac.i32, so->stride[buffer], false); + prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, ""); + prim_stride_dw_vgpr = + ac_build_writelane(&ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer], + LLVMConstInt(ctx->ac.i32, buffer, false)); + + so_buffer[buffer] = ac_build_load_to_sgpr( + &ctx->ac, buf_ptr, LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + buffer, false)); + } + + tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, tmp, 5200); + { + LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); + LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, ""); + + /* Advance the streamout offsets in GDS. */ + LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); + ac_build_ifcc(&ctx->ac, tmp, 5210); + { + if (isgs) { + tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid); + tmp = LLVMBuildLoad(builder, tmp, ""); + } else { + tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, ngg_get_prim_cnt(ctx), ctx->ac.i32_0); + } + LLVMBuildStore(builder, tmp, generated_by_stream_vgpr); + + unsigned swizzle[4]; + int unused_stream = -1; + for (unsigned stream = 0; stream < 4; ++stream) { + if (!info->num_stream_output_components[stream]) { + unused_stream = stream; + break; + } + } + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] >= 0) { + swizzle[buffer] = stream_for_buffer[buffer]; + } else { + assert(unused_stream >= 0); + swizzle[buffer] = unused_stream; + } + } + + tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]); + tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); + + LLVMValueRef args[] = { + LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""), + tmp, + ctx->ac.i32_0, // ordering + ctx->ac.i32_0, // scope + ctx->ac.i1false, // isVolatile + LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index + ctx->ac.i1true, // wave release + ctx->ac.i1true, // wave done + }; + tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, args, + ARRAY_SIZE(args), 0); + + /* Keep offsets in a VGPR for quick retrieval via readlane by + * the first wave for bounds checking, and also store in LDS + * for retrieval by all waves later. */ + LLVMBuildStore(builder, tmp, offsets_vgpr); + + tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_offset_basev, ""); + tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2); + LLVMBuildStore(builder, tmp, tmp2); + } + ac_build_endif(&ctx->ac, 5210); + + /* Determine the max emit per buffer. This is done via the SALU, in part + * because LLVM can't generate divide-by-multiply if we try to do this + * via VALU with one lane per buffer. 
+ */ + LLVMValueRef max_emit[4] = {}; + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] == -1) + continue; + + LLVMValueRef bufsize_dw = LLVMBuildLShr( + builder, LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), i32_2, ""); + + tmp = LLVMBuildLoad(builder, offsets_vgpr, ""); + LLVMValueRef offset_dw = + ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, buffer, false)); + + tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, ""); + tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], ""); + + tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, ""); + max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->ac.i32_0, tmp, ""); + } + + /* Determine the number of emitted primitives per stream and fixup the + * GDS counter if necessary. + * + * This is complicated by the fact that a single stream can emit to + * multiple buffers (but luckily not vice versa). + */ + LLVMValueRef emit_vgpr = ctx->ac.i32_0; + + for (unsigned stream = 0; stream < 4; ++stream) { + if (!info->num_stream_output_components[stream]) + continue; + + tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, ""); + LLVMValueRef generated = + ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, stream, false)); + + LLVMValueRef emit = generated; + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] == stream) + emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]); + } + + emit_vgpr = + ac_build_writelane(&ctx->ac, emit_vgpr, emit, LLVMConstInt(ctx->ac.i32, stream, false)); + + /* Fixup the offset using a plain GDS atomic if we overflowed. */ + tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, ""); + ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */ + tmp = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false), + ac_get_thread_id(&ctx->ac), ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5222); + { + tmp = LLVMBuildSub(builder, generated, emit, ""); + tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); + tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, ""); + LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp, + LLVMAtomicOrderingMonotonic, false); + } + ac_build_endif(&ctx->ac, 5222); + ac_build_endif(&ctx->ac, 5221); + } + + tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); + ac_build_ifcc(&ctx->ac, tmp, 5225); + { + tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_emit_basev, ""); + tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp); + LLVMBuildStore(builder, emit_vgpr, tmp); + } + ac_build_endif(&ctx->ac, 5225); + } + ac_build_endif(&ctx->ac, 5200); + + /* Determine the workgroup-relative per-thread / primitive offset into + * the streamout buffers */ + struct ac_wg_scan primemit_scan[4] = {}; + + if (isgs) { + for (unsigned stream = 0; stream < 4; ++stream) { + if (!info->num_stream_output_components[stream]) + continue; + + primemit_scan[stream].enable_exclusive = true; + primemit_scan[stream].op = nir_op_iadd; + primemit_scan[stream].src = nggso->prim_enable[stream]; + primemit_scan[stream].scratch = ac_build_gep0( + &ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false)); + primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx); + primemit_scan[stream].numwaves = get_tgsize(ctx); + primemit_scan[stream].maxwaves = 8; + ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]); + } + } + + ac_build_s_barrier(&ctx->ac); + + /* Fetch the 
per-buffer offsets and per-stream emit counts in all waves. */ + LLVMValueRef wgoffset_dw[4] = {}; + + { + LLVMValueRef scratch_vgpr; + + tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac)); + scratch_vgpr = LLVMBuildLoad(builder, tmp, ""); + + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] >= 0) { + wgoffset_dw[buffer] = + ac_build_readlane(&ctx->ac, scratch_vgpr, + LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false)); + } + } + + for (unsigned stream = 0; stream < 4; ++stream) { + if (info->num_stream_output_components[stream]) { + nggso->emit[stream] = + ac_build_readlane(&ctx->ac, scratch_vgpr, + LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false)); + } + } + } + + /* Write out primitive data */ + for (unsigned stream = 0; stream < 4; ++stream) { + if (!info->num_stream_output_components[stream]) + continue; + + if (isgs) { + ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]); + } else { + primemit_scan[stream].result_exclusive = tid; + } + + tmp = LLVMBuildICmp(builder, LLVMIntULT, primemit_scan[stream].result_exclusive, + nggso->emit[stream], ""); + tmp = LLVMBuildAnd(builder, tmp, nggso->prim_enable[stream], ""); + ac_build_ifcc(&ctx->ac, tmp, 5240); + { + LLVMValueRef offset_vtx = + LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, nggso->num_vertices, ""); + + for (unsigned i = 0; i < max_num_vertices; ++i) { + tmp = LLVMBuildICmp(builder, LLVMIntULT, LLVMConstInt(ctx->ac.i32, i, false), + nggso->num_vertices, ""); + ac_build_ifcc(&ctx->ac, tmp, 5241); + build_streamout_vertex(ctx, so_buffer, wgoffset_dw, stream, offset_vtx, + nggso->vertices[i]); + ac_build_endif(&ctx->ac, 5241); + offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, ""); + } + } + ac_build_endif(&ctx->ac, 5240); + } } /* LDS layout of ES vertex data for NGG culling. */ -enum { - /* Byte 0: Boolean ES thread accepted (unculled) flag, and later the old - * ES thread ID. After vertex compaction, compacted ES threads - * store the old thread ID here to copy input VGPRs from uncompacted - * ES threads. - * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value. - * Byte 2: TES rel patch ID - * Byte 3: Unused - */ - lds_byte0_accept_flag = 0, - lds_byte0_old_thread_id = 0, - lds_byte1_new_thread_id, - lds_byte2_tes_rel_patch_id, - lds_byte3_unused, - - lds_packed_data = 0, /* lds_byteN_... */ - - lds_pos_x, - lds_pos_y, - lds_pos_z, - lds_pos_w, - lds_pos_x_div_w, - lds_pos_y_div_w, - /* If VS: */ - lds_vertex_id, - lds_instance_id, /* optional */ - /* If TES: */ - lds_tes_u = lds_vertex_id, - lds_tes_v = lds_instance_id, - lds_tes_patch_id, /* optional */ +enum +{ + /* Byte 0: Boolean ES thread accepted (unculled) flag, and later the old + * ES thread ID. After vertex compaction, compacted ES threads + * store the old thread ID here to copy input VGPRs from uncompacted + * ES threads. + * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value. + * Byte 2: TES rel patch ID + * Byte 3: Unused + */ + lds_byte0_accept_flag = 0, + lds_byte0_old_thread_id = 0, + lds_byte1_new_thread_id, + lds_byte2_tes_rel_patch_id, + lds_byte3_unused, + + lds_packed_data = 0, /* lds_byteN_... 
*/ + + lds_pos_x, + lds_pos_y, + lds_pos_z, + lds_pos_w, + lds_pos_x_div_w, + lds_pos_y_div_w, + /* If VS: */ + lds_vertex_id, + lds_instance_id, /* optional */ + /* If TES: */ + lds_tes_u = lds_vertex_id, + lds_tes_v = lds_instance_id, + lds_tes_patch_id, /* optional */ }; -static LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx, - LLVMValueRef ptr, unsigned byte_index) +static LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx, LLVMValueRef ptr, + unsigned byte_index) { - assert(byte_index < 4); - LLVMTypeRef pi8 = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS); - LLVMValueRef index = LLVMConstInt(ctx->ac.i32, byte_index, 0); + assert(byte_index < 4); + LLVMTypeRef pi8 = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS); + LLVMValueRef index = LLVMConstInt(ctx->ac.i32, byte_index, 0); - return LLVMBuildGEP(ctx->ac.builder, - LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""), - &index, 1, ""); + return LLVMBuildGEP(ctx->ac.builder, LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""), &index, + 1, ""); } static unsigned ngg_nogs_vertex_size(struct si_shader *shader) { - unsigned lds_vertex_size = 0; - - /* The edgeflag is always stored in the last element that's also - * used for padding to reduce LDS bank conflicts. */ - if (shader->selector->so.num_outputs) - lds_vertex_size = 4 * shader->selector->info.num_outputs + 1; - if (shader->selector->info.writes_edgeflag) - lds_vertex_size = MAX2(lds_vertex_size, 1); - - /* LDS size for passing data from GS to ES. - * GS stores Primitive IDs into LDS at the address corresponding - * to the ES thread of the provoking vertex. All ES threads - * load and export PrimitiveID for their thread. - */ - if (shader->selector->type == PIPE_SHADER_VERTEX && - shader->key.mono.u.vs_export_prim_id) - lds_vertex_size = MAX2(lds_vertex_size, 1); - - if (shader->key.opt.ngg_culling) { - if (shader->selector->type == PIPE_SHADER_VERTEX) { - STATIC_ASSERT(lds_instance_id + 1 == 9); - lds_vertex_size = MAX2(lds_vertex_size, 9); - } else { - assert(shader->selector->type == PIPE_SHADER_TESS_EVAL); - - if (shader->selector->info.uses_primid || - shader->key.mono.u.vs_export_prim_id) { - STATIC_ASSERT(lds_tes_patch_id + 2 == 11); - lds_vertex_size = MAX2(lds_vertex_size, 11); - } else { - STATIC_ASSERT(lds_tes_v + 1 == 9); - lds_vertex_size = MAX2(lds_vertex_size, 9); - } - } - } - - return lds_vertex_size; + unsigned lds_vertex_size = 0; + + /* The edgeflag is always stored in the last element that's also + * used for padding to reduce LDS bank conflicts. */ + if (shader->selector->so.num_outputs) + lds_vertex_size = 4 * shader->selector->info.num_outputs + 1; + if (shader->selector->info.writes_edgeflag) + lds_vertex_size = MAX2(lds_vertex_size, 1); + + /* LDS size for passing data from GS to ES. + * GS stores Primitive IDs into LDS at the address corresponding + * to the ES thread of the provoking vertex. All ES threads + * load and export PrimitiveID for their thread. 
+ */ + if (shader->selector->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id) + lds_vertex_size = MAX2(lds_vertex_size, 1); + + if (shader->key.opt.ngg_culling) { + if (shader->selector->type == PIPE_SHADER_VERTEX) { + STATIC_ASSERT(lds_instance_id + 1 == 9); + lds_vertex_size = MAX2(lds_vertex_size, 9); + } else { + assert(shader->selector->type == PIPE_SHADER_TESS_EVAL); + + if (shader->selector->info.uses_primid || shader->key.mono.u.vs_export_prim_id) { + STATIC_ASSERT(lds_tes_patch_id + 2 == 11); + lds_vertex_size = MAX2(lds_vertex_size, 11); + } else { + STATIC_ASSERT(lds_tes_v + 1 == 9); + lds_vertex_size = MAX2(lds_vertex_size, 9); + } + } + } + + return lds_vertex_size; } /** * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage * for the vertex outputs. */ -static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx, - LLVMValueRef vtxid) +static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vtxid) { - /* The extra dword is used to avoid LDS bank conflicts. */ - unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader); - LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size); - LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS); - LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, ""); - return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, ""); + /* The extra dword is used to avoid LDS bank conflicts. */ + unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader); + LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size); + LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS); + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, ""); + return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, ""); } -static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx, - LLVMValueRef ret, struct ac_arg param, - unsigned return_index) +static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx, LLVMValueRef ret, + struct ac_arg param, unsigned return_index) { - LLVMValueRef v = ac_get_arg(&ctx->ac, param); - - for (unsigned i = 0; i < 4; i++) { - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - ac_llvm_extract_elem(&ctx->ac, v, i), - return_index + i, ""); - } - return ret; + LLVMValueRef v = ac_get_arg(&ctx->ac, param); + + for (unsigned i = 0; i < 4; i++) { + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, ac_llvm_extract_elem(&ctx->ac, v, i), + return_index + i, ""); + } + return ret; } -static void load_bitmasks_2x64(struct si_shader_context *ctx, - LLVMValueRef lds_ptr, unsigned dw_offset, - LLVMValueRef mask[2], LLVMValueRef *total_bitcount) +static void load_bitmasks_2x64(struct si_shader_context *ctx, LLVMValueRef lds_ptr, + unsigned dw_offset, LLVMValueRef mask[2], + LLVMValueRef *total_bitcount) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef ptr64 = LLVMBuildPointerCast(builder, lds_ptr, - LLVMPointerType(LLVMArrayType(ctx->ac.i64, 2), - AC_ADDR_SPACE_LDS), ""); - for (unsigned i = 0; i < 2; i++) { - LLVMValueRef index = LLVMConstInt(ctx->ac.i32, dw_offset / 2 + i, 0); - mask[i] = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ptr64, index), ""); - } - - /* We get better code if we don't use the 128-bit bitcount. 
*/ - *total_bitcount = LLVMBuildAdd(builder, ac_build_bit_count(&ctx->ac, mask[0]), - ac_build_bit_count(&ctx->ac, mask[1]), ""); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef ptr64 = LLVMBuildPointerCast( + builder, lds_ptr, LLVMPointerType(LLVMArrayType(ctx->ac.i64, 2), AC_ADDR_SPACE_LDS), ""); + for (unsigned i = 0; i < 2; i++) { + LLVMValueRef index = LLVMConstInt(ctx->ac.i32, dw_offset / 2 + i, 0); + mask[i] = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ptr64, index), ""); + } + + /* We get better code if we don't use the 128-bit bitcount. */ + *total_bitcount = LLVMBuildAdd(builder, ac_build_bit_count(&ctx->ac, mask[0]), + ac_build_bit_count(&ctx->ac, mask[1]), ""); } /** @@ -711,38 +678,33 @@ static void load_bitmasks_2x64(struct si_shader_context *ctx, * \param wave_info_num_bits the bit size of thread count field in merged_wave_info * \param wave_info_shift the bit offset of the thread count field in merged_wave_info */ -static void update_thread_counts(struct si_shader_context *ctx, - LLVMValueRef *new_num_threads, - LLVMValueRef *tg_info, - unsigned tg_info_num_bits, - unsigned tg_info_shift, - LLVMValueRef *wave_info, - unsigned wave_info_num_bits, - unsigned wave_info_shift) +static void update_thread_counts(struct si_shader_context *ctx, LLVMValueRef *new_num_threads, + LLVMValueRef *tg_info, unsigned tg_info_num_bits, + unsigned tg_info_shift, LLVMValueRef *wave_info, + unsigned wave_info_num_bits, unsigned wave_info_shift) { - LLVMBuilderRef builder = ctx->ac.builder; - - /* Update the total thread count. */ - unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift); - *tg_info = LLVMBuildAnd(builder, *tg_info, - LLVMConstInt(ctx->ac.i32, tg_info_mask, 0), ""); - *tg_info = LLVMBuildOr(builder, *tg_info, - LLVMBuildShl(builder, *new_num_threads, - LLVMConstInt(ctx->ac.i32, tg_info_shift, 0), ""), ""); - - /* Update the per-wave thread count. */ - LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), - LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""); - *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, ""); - *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->ac.i32_0); - *new_num_threads = ac_build_imin(&ctx->ac, *new_num_threads, - LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0)); - unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift); - *wave_info = LLVMBuildAnd(builder, *wave_info, - LLVMConstInt(ctx->ac.i32, wave_info_mask, 0), ""); - *wave_info = LLVMBuildOr(builder, *wave_info, - LLVMBuildShl(builder, *new_num_threads, - LLVMConstInt(ctx->ac.i32, wave_info_shift, 0), ""), ""); + LLVMBuilderRef builder = ctx->ac.builder; + + /* Update the total thread count. */ + unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift); + *tg_info = LLVMBuildAnd(builder, *tg_info, LLVMConstInt(ctx->ac.i32, tg_info_mask, 0), ""); + *tg_info = LLVMBuildOr( + builder, *tg_info, + LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, tg_info_shift, 0), ""), ""); + + /* Update the per-wave thread count. 
*/ + LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""); + *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, ""); + *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->ac.i32_0); + *new_num_threads = + ac_build_imin(&ctx->ac, *new_num_threads, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0)); + unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift); + *wave_info = LLVMBuildAnd(builder, *wave_info, LLVMConstInt(ctx->ac.i32, wave_info_mask, 0), ""); + *wave_info = LLVMBuildOr( + builder, *wave_info, + LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, wave_info_shift, 0), ""), + ""); } /** @@ -751,759 +713,719 @@ static void update_thread_counts(struct si_shader_context *ctx, * Also return the position, which is passed to the shader as an input, * so that we don't compute it twice. */ -void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) +void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader *shader = ctx->shader; - struct si_shader_selector *sel = shader->selector; - struct si_shader_info *info = &sel->info; - LLVMBuilderRef builder = ctx->ac.builder; - - assert(shader->key.opt.ngg_culling); - assert(shader->key.as_ngg); - assert(sel->type == PIPE_SHADER_VERTEX || - (sel->type == PIPE_SHADER_TESS_EVAL && !shader->key.as_es)); - - LLVMValueRef position[4] = {}; - for (unsigned i = 0; i < info->num_outputs; i++) { - switch (info->output_semantic_name[i]) { - case TGSI_SEMANTIC_POSITION: - for (unsigned j = 0; j < 4; j++) { - position[j] = LLVMBuildLoad(ctx->ac.builder, - addrs[4 * i + j], ""); - } - break; - } - } - assert(position[0]); - - /* Store Position.XYZW into LDS. */ - LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); - for (unsigned chan = 0; chan < 4; chan++) { - LLVMBuildStore(builder, ac_to_integer(&ctx->ac, position[chan]), - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0))); - } - /* Store Position.XY / W into LDS. */ - for (unsigned chan = 0; chan < 2; chan++) { - LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]); - LLVMBuildStore(builder, ac_to_integer(&ctx->ac, val), - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0))); - } - - /* Store VertexID and InstanceID. ES threads will have to load them - * from LDS after vertex compaction and use them instead of their own - * system values. 
- */ - bool uses_instance_id = false; - bool uses_tes_prim_id = false; - LLVMValueRef packed_data = ctx->ac.i32_0; - - if (ctx->type == PIPE_SHADER_VERTEX) { - uses_instance_id = sel->info.uses_instanceid || - shader->key.part.vs.prolog.instance_divisor_is_one || - shader->key.part.vs.prolog.instance_divisor_is_fetched; - - LLVMBuildStore(builder, ctx->abi.vertex_id, - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0))); - if (uses_instance_id) { - LLVMBuildStore(builder, ctx->abi.instance_id, - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_instance_id, 0))); - } - } else { - uses_tes_prim_id = sel->info.uses_primid || - shader->key.mono.u.vs_export_prim_id; - - assert(ctx->type == PIPE_SHADER_TESS_EVAL); - LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_u)), - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_tes_u, 0))); - LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_v)), - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_tes_v, 0))); - packed_data = LLVMBuildShl(builder, ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id), - LLVMConstInt(ctx->ac.i32, lds_byte2_tes_rel_patch_id * 8, 0), ""); - if (uses_tes_prim_id) { - LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id), - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0))); - } - } - /* Initialize the packed data. */ - LLVMBuildStore(builder, packed_data, - ac_build_gep0(&ctx->ac, es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_packed_data, 0))); - ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); - - LLVMValueRef tid = ac_get_thread_id(&ctx->ac); - - /* Initialize the last 3 gs_ngg_scratch dwords to 0, because we may have less - * than 4 waves, but we always read all 4 values. This is where the thread - * bitmasks of unculled threads will be stored. - * - * gs_ngg_scratch layout: esmask[0..3] - */ - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(builder, LLVMIntULT, get_thread_id_in_tg(ctx), - LLVMConstInt(ctx->ac.i32, 3, 0), ""), 16101); - { - LLVMValueRef index = LLVMBuildAdd(builder, tid, ctx->ac.i32_1, ""); - LLVMBuildStore(builder, ctx->ac.i32_0, - ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, index)); - } - ac_build_endif(&ctx->ac, 16101); - ac_build_s_barrier(&ctx->ac); - - /* The hardware requires that there are no holes between unculled vertices, - * which means we have to pack ES threads, i.e. reduce the ES thread count - * and move ES input VGPRs to lower threads. The upside is that varyings - * are only fetched and computed for unculled vertices. - * - * Vertex compaction in GS threads: - * - * Part 1: Compute the surviving vertex mask in GS threads: - * - Compute 4 32-bit surviving vertex masks in LDS. (max 4 waves) - * - In GS, notify ES threads whether the vertex survived. - * - Barrier - * - ES threads will create the mask and store it in LDS. - * - Barrier - * - Each GS thread loads the vertex masks from LDS. - * - * Part 2: Compact ES threads in GS threads: - * - Compute the prefix sum for all 3 vertices from the masks. These are the new - * thread IDs for each vertex within the primitive. - * - Write the value of the old thread ID into the LDS address of the new thread ID. - * The ES thread will load the old thread ID and use it to load the position, VertexID, - * and InstanceID. - * - Update vertex indices and null flag in the GS input VGPRs. 
- * - Barrier - * - * Part 3: Update inputs GPRs - * - For all waves, update per-wave thread counts in input SGPRs. - * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs). - */ - - LLVMValueRef vtxindex[3]; - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) { - /* For the GS fast launch, the VS prologs simply puts the Vertex IDs - * into these VGPRs. - */ - vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset); - vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset); - vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset); - } else { - vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); - vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); - vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); - }; - LLVMValueRef gs_vtxptr[] = { - ngg_nogs_vertex_ptr(ctx, vtxindex[0]), - ngg_nogs_vertex_ptr(ctx, vtxindex[1]), - ngg_nogs_vertex_ptr(ctx, vtxindex[2]), - }; - es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); - - LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); - - /* Do culling in GS threads. */ - ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002); - { - /* Load positions. */ - LLVMValueRef pos[3][4] = {}; - for (unsigned vtx = 0; vtx < 3; vtx++) { - for (unsigned chan = 0; chan < 4; chan++) { - unsigned index; - if (chan == 0 || chan == 1) - index = lds_pos_x_div_w + chan; - else if (chan == 3) - index = lds_pos_w; - else - continue; - - LLVMValueRef addr = ac_build_gep0(&ctx->ac, gs_vtxptr[vtx], - LLVMConstInt(ctx->ac.i32, index, 0)); - pos[vtx][chan] = LLVMBuildLoad(builder, addr, ""); - pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]); - } - } - - /* Load the viewport state for small prim culling. */ - LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, - ac_get_arg(&ctx->ac, ctx->small_prim_cull_info), - ctx->ac.i32_0); - vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, ""); - LLVMValueRef vp_scale[2], vp_translate[2]; - vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); - vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); - vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); - vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); - - /* Get the small prim filter precision. */ - LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4); - small_prim_precision = LLVMBuildOr(builder, small_prim_precision, - LLVMConstInt(ctx->ac.i32, 0x70, 0), ""); - small_prim_precision = LLVMBuildShl(builder, small_prim_precision, - LLVMConstInt(ctx->ac.i32, 23, 0), ""); - small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, ""); - - /* Execute culling code. */ - struct ac_cull_options options = {}; - options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE; - options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE; - options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS; - options.cull_small_prims = options.cull_view_xy; - options.cull_zero_area = options.cull_front || options.cull_back; - options.cull_w = true; - - /* Tell ES threads whether their vertex survived. 
*/ - ac_build_ifcc(&ctx->ac, ac_cull_triangle(&ctx->ac, pos, ctx->ac.i1true, - vp_scale, vp_translate, - small_prim_precision, &options), 16003); - { - LLVMBuildStore(builder, ctx->ac.i32_1, gs_accepted); - for (unsigned vtx = 0; vtx < 3; vtx++) { - LLVMBuildStore(builder, ctx->ac.i8_1, - si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag)); - } - } - ac_build_endif(&ctx->ac, 16003); - } - ac_build_endif(&ctx->ac, 16002); - ac_build_s_barrier(&ctx->ac); - - gs_accepted = LLVMBuildLoad(builder, gs_accepted, ""); - - LLVMValueRef es_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i1, ""); - - /* Convert the per-vertex flag to a thread bitmask in ES threads and store it in LDS. */ - ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007); - { - LLVMValueRef es_accepted_flag = - LLVMBuildLoad(builder, - si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), ""); - - LLVMValueRef es_accepted_bool = LLVMBuildICmp(builder, LLVMIntNE, - es_accepted_flag, ctx->ac.i8_0, ""); - LLVMValueRef es_mask = ac_get_i1_sgpr_mask(&ctx->ac, es_accepted_bool); - - LLVMBuildStore(builder, es_accepted_bool, es_accepted); - - ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, - tid, ctx->ac.i32_0, ""), 16008); - { - LLVMBuildStore(builder, es_mask, - ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, - get_wave_id_in_tg(ctx))); - } - ac_build_endif(&ctx->ac, 16008); - } - ac_build_endif(&ctx->ac, 16007); - ac_build_s_barrier(&ctx->ac); - - /* Load the vertex masks and compute the new ES thread count. */ - LLVMValueRef es_mask[2], new_num_es_threads, kill_wave; - load_bitmasks_2x64(ctx, ctx->gs_ngg_scratch, 0, es_mask, &new_num_es_threads); - new_num_es_threads = ac_build_readlane_no_opt_barrier(&ctx->ac, new_num_es_threads, NULL); - - /* ES threads compute their prefix sum, which is the new ES thread ID. - * Then they write the value of the old thread ID into the LDS address - * of the new thread ID. It will be used it to load input VGPRs from - * the old thread's LDS location. - */ - ac_build_ifcc(&ctx->ac, LLVMBuildLoad(builder, es_accepted, ""), 16009); - { - LLVMValueRef old_id = get_thread_id_in_tg(ctx); - LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id); - - LLVMBuildStore(builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""), - si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id), - lds_byte0_old_thread_id)); - LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""), - si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id)); - } - ac_build_endif(&ctx->ac, 16009); - - /* Kill waves that have inactive threads. */ - kill_wave = LLVMBuildICmp(builder, LLVMIntULE, - ac_build_imax(&ctx->ac, new_num_es_threads, ngg_get_prim_cnt(ctx)), - LLVMBuildMul(builder, get_wave_id_in_tg(ctx), - LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""), ""); - ac_build_ifcc(&ctx->ac, kill_wave, 19202); - { - /* If we are killing wave 0, send that there are no primitives - * in this threadgroup. - */ - ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), - ctx->ac.i32_0, ctx->ac.i32_0); - ac_build_s_endpgm(&ctx->ac); - } - ac_build_endif(&ctx->ac, 19202); - ac_build_s_barrier(&ctx->ac); - - /* Send the final vertex and primitive counts. */ - ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), - new_num_es_threads, ngg_get_prim_cnt(ctx)); - - /* Update thread counts in SGPRs. 
*/ - LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->gs_tg_info); - LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->merged_wave_info); - - /* This also converts the thread count from the total count to the per-wave count. */ - update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12, - &new_merged_wave_info, 8, 0); - - /* Update vertex indices in VGPR0 (same format as NGG passthrough). */ - LLVMValueRef new_vgpr0 = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - - /* Set the null flag at the beginning (culled), and then - * overwrite it for accepted primitives. - */ - LLVMBuildStore(builder, LLVMConstInt(ctx->ac.i32, 1u << 31, 0), new_vgpr0); - - /* Get vertex indices after vertex compaction. */ - ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011); - { - struct ac_ngg_prim prim = {}; - prim.num_vertices = 3; - prim.isnull = ctx->ac.i1false; - - for (unsigned vtx = 0; vtx < 3; vtx++) { - prim.index[vtx] = - LLVMBuildLoad(builder, - si_build_gep_i8(ctx, gs_vtxptr[vtx], - lds_byte1_new_thread_id), ""); - prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, ""); - prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx); - } - - /* Set the new GS input VGPR. */ - LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0); - } - ac_build_endif(&ctx->ac, 16011); - - if (gfx10_ngg_export_prim_early(shader)) - gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, "")); - - /* Set the new ES input VGPRs. */ - LLVMValueRef es_data[4]; - LLVMValueRef old_thread_id = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - - for (unsigned i = 0; i < 4; i++) - es_data[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - - ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, - new_num_es_threads, ""), 16012); - { - LLVMValueRef old_id, old_es_vtxptr, tmp; - - /* Load ES input VGPRs from the ES thread before compaction. */ - old_id = LLVMBuildLoad(builder, - si_build_gep_i8(ctx, es_vtxptr, lds_byte0_old_thread_id), ""); - old_id = LLVMBuildZExt(builder, old_id, ctx->ac.i32, ""); - - LLVMBuildStore(builder, old_id, old_thread_id); - old_es_vtxptr = ngg_nogs_vertex_ptr(ctx, old_id); - - for (unsigned i = 0; i < 2; i++) { - tmp = LLVMBuildLoad(builder, - ac_build_gep0(&ctx->ac, old_es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_vertex_id + i, 0)), ""); - LLVMBuildStore(builder, tmp, es_data[i]); - } - - if (ctx->type == PIPE_SHADER_TESS_EVAL) { - tmp = LLVMBuildLoad(builder, - si_build_gep_i8(ctx, old_es_vtxptr, - lds_byte2_tes_rel_patch_id), ""); - tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); - LLVMBuildStore(builder, tmp, es_data[2]); - - if (uses_tes_prim_id) { - tmp = LLVMBuildLoad(builder, - ac_build_gep0(&ctx->ac, old_es_vtxptr, - LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)), ""); - LLVMBuildStore(builder, tmp, es_data[3]); - } - } - } - ac_build_endif(&ctx->ac, 16012); - - /* Return values for the main function. 
*/ - LLVMValueRef ret = ctx->return_value; - LLVMValueRef val; - - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, ""); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, ""); - if (ctx->type == PIPE_SHADER_TESS_EVAL) - ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 4); - - ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, - 8 + SI_SGPR_RW_BUFFERS); - ret = si_insert_input_ptr(ctx, ret, - ctx->bindless_samplers_and_images, - 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); - ret = si_insert_input_ptr(ctx, ret, - ctx->const_and_shader_buffers, - 8 + SI_SGPR_CONST_AND_SHADER_BUFFERS); - ret = si_insert_input_ptr(ctx, ret, - ctx->samplers_and_images, - 8 + SI_SGPR_SAMPLERS_AND_IMAGES); - ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, - 8 + SI_SGPR_VS_STATE_BITS); - - if (ctx->type == PIPE_SHADER_VERTEX) { - ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex, - 8 + SI_SGPR_BASE_VERTEX); - ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance, - 8 + SI_SGPR_START_INSTANCE); - ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id, - 8 + SI_SGPR_DRAWID); - ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers, - 8 + SI_VS_NUM_USER_SGPR); - - for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) { - ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i], - 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4); - } - } else { - assert(ctx->type == PIPE_SHADER_TESS_EVAL); - ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout, - 8 + SI_SGPR_TES_OFFCHIP_LAYOUT); - ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr, - 8 + SI_SGPR_TES_OFFCHIP_ADDR); - } - - unsigned vgpr; - if (ctx->type == PIPE_SHADER_VERTEX) { - if (shader->selector->num_vbos_in_user_sgprs) { - vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + - shader->selector->num_vbos_in_user_sgprs * 4; - } else { - vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1; - } - } else { - vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; - } - - val = LLVMBuildLoad(builder, new_vgpr0, ""); - ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), - vgpr++, ""); - vgpr++; /* gs_vtx23_offset */ - - ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++); - vgpr++; /* gs_vtx45_offset */ - - if (ctx->type == PIPE_SHADER_VERTEX) { - val = LLVMBuildLoad(builder, es_data[0], ""); - ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), - vgpr++, ""); /* VGPR5 - VertexID */ - vgpr += 2; - if (uses_instance_id) { - val = LLVMBuildLoad(builder, es_data[1], ""); - ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), - vgpr++, ""); /* VGPR8 - InstanceID */ - } else { - vgpr++; - } - } else { - assert(ctx->type == PIPE_SHADER_TESS_EVAL); - unsigned num_vgprs = uses_tes_prim_id ? 4 : 3; - for (unsigned i = 0; i < num_vgprs; i++) { - val = LLVMBuildLoad(builder, es_data[i], ""); - ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), - vgpr++, ""); - } - if (num_vgprs == 3) - vgpr++; - } - /* Return the old thread ID. */ - val = LLVMBuildLoad(builder, old_thread_id, ""); - ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, ""); - - /* These two also use LDS. 
*/ - if (sel->info.writes_edgeflag || - (ctx->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)) - ac_build_s_barrier(&ctx->ac); - - ctx->return_value = ret; + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *shader = ctx->shader; + struct si_shader_selector *sel = shader->selector; + struct si_shader_info *info = &sel->info; + LLVMBuilderRef builder = ctx->ac.builder; + + assert(shader->key.opt.ngg_culling); + assert(shader->key.as_ngg); + assert(sel->type == PIPE_SHADER_VERTEX || + (sel->type == PIPE_SHADER_TESS_EVAL && !shader->key.as_es)); + + LLVMValueRef position[4] = {}; + for (unsigned i = 0; i < info->num_outputs; i++) { + switch (info->output_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + for (unsigned j = 0; j < 4; j++) { + position[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], ""); + } + break; + } + } + assert(position[0]); + + /* Store Position.XYZW into LDS. */ + LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); + for (unsigned chan = 0; chan < 4; chan++) { + LLVMBuildStore( + builder, ac_to_integer(&ctx->ac, position[chan]), + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0))); + } + /* Store Position.XY / W into LDS. */ + for (unsigned chan = 0; chan < 2; chan++) { + LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]); + LLVMBuildStore( + builder, ac_to_integer(&ctx->ac, val), + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0))); + } + + /* Store VertexID and InstanceID. ES threads will have to load them + * from LDS after vertex compaction and use them instead of their own + * system values. + */ + bool uses_instance_id = false; + bool uses_tes_prim_id = false; + LLVMValueRef packed_data = ctx->ac.i32_0; + + if (ctx->type == PIPE_SHADER_VERTEX) { + uses_instance_id = sel->info.uses_instanceid || + shader->key.part.vs.prolog.instance_divisor_is_one || + shader->key.part.vs.prolog.instance_divisor_is_fetched; + + LLVMBuildStore( + builder, ctx->abi.vertex_id, + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0))); + if (uses_instance_id) { + LLVMBuildStore( + builder, ctx->abi.instance_id, + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_instance_id, 0))); + } + } else { + uses_tes_prim_id = sel->info.uses_primid || shader->key.mono.u.vs_export_prim_id; + + assert(ctx->type == PIPE_SHADER_TESS_EVAL); + LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_u)), + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_u, 0))); + LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_v)), + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_v, 0))); + packed_data = LLVMBuildShl(builder, ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id), + LLVMConstInt(ctx->ac.i32, lds_byte2_tes_rel_patch_id * 8, 0), ""); + if (uses_tes_prim_id) { + LLVMBuildStore( + builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id), + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0))); + } + } + /* Initialize the packed data. 
*/ + LLVMBuildStore( + builder, packed_data, + ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0))); + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + + LLVMValueRef tid = ac_get_thread_id(&ctx->ac); + + /* Initialize the last 3 gs_ngg_scratch dwords to 0, because we may have less + * than 4 waves, but we always read all 4 values. This is where the thread + * bitmasks of unculled threads will be stored. + * + * gs_ngg_scratch layout: esmask[0..3] + */ + ac_build_ifcc(&ctx->ac, + LLVMBuildICmp(builder, LLVMIntULT, get_thread_id_in_tg(ctx), + LLVMConstInt(ctx->ac.i32, 3, 0), ""), + 16101); + { + LLVMValueRef index = LLVMBuildAdd(builder, tid, ctx->ac.i32_1, ""); + LLVMBuildStore(builder, ctx->ac.i32_0, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, index)); + } + ac_build_endif(&ctx->ac, 16101); + ac_build_s_barrier(&ctx->ac); + + /* The hardware requires that there are no holes between unculled vertices, + * which means we have to pack ES threads, i.e. reduce the ES thread count + * and move ES input VGPRs to lower threads. The upside is that varyings + * are only fetched and computed for unculled vertices. + * + * Vertex compaction in GS threads: + * + * Part 1: Compute the surviving vertex mask in GS threads: + * - Compute 4 32-bit surviving vertex masks in LDS. (max 4 waves) + * - In GS, notify ES threads whether the vertex survived. + * - Barrier + * - ES threads will create the mask and store it in LDS. + * - Barrier + * - Each GS thread loads the vertex masks from LDS. + * + * Part 2: Compact ES threads in GS threads: + * - Compute the prefix sum for all 3 vertices from the masks. These are the new + * thread IDs for each vertex within the primitive. + * - Write the value of the old thread ID into the LDS address of the new thread ID. + * The ES thread will load the old thread ID and use it to load the position, VertexID, + * and InstanceID. + * - Update vertex indices and null flag in the GS input VGPRs. + * - Barrier + * + * Part 3: Update inputs GPRs + * - For all waves, update per-wave thread counts in input SGPRs. + * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs). + */ + + LLVMValueRef vtxindex[3]; + if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) { + /* For the GS fast launch, the VS prologs simply puts the Vertex IDs + * into these VGPRs. + */ + vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset); + vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset); + vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset); + } else { + vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); + vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); + vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); + }; + LLVMValueRef gs_vtxptr[] = { + ngg_nogs_vertex_ptr(ctx, vtxindex[0]), + ngg_nogs_vertex_ptr(ctx, vtxindex[1]), + ngg_nogs_vertex_ptr(ctx, vtxindex[2]), + }; + es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); + + LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); + + /* Do culling in GS threads. */ + ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002); + { + /* Load positions. 
*/ + LLVMValueRef pos[3][4] = {}; + for (unsigned vtx = 0; vtx < 3; vtx++) { + for (unsigned chan = 0; chan < 4; chan++) { + unsigned index; + if (chan == 0 || chan == 1) + index = lds_pos_x_div_w + chan; + else if (chan == 3) + index = lds_pos_w; + else + continue; + + LLVMValueRef addr = + ac_build_gep0(&ctx->ac, gs_vtxptr[vtx], LLVMConstInt(ctx->ac.i32, index, 0)); + pos[vtx][chan] = LLVMBuildLoad(builder, addr, ""); + pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]); + } + } + + /* Load the viewport state for small prim culling. */ + LLVMValueRef vp = ac_build_load_invariant( + &ctx->ac, ac_get_arg(&ctx->ac, ctx->small_prim_cull_info), ctx->ac.i32_0); + vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, ""); + LLVMValueRef vp_scale[2], vp_translate[2]; + vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); + vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); + vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); + vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); + + /* Get the small prim filter precision. */ + LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4); + small_prim_precision = + LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), ""); + small_prim_precision = + LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), ""); + small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, ""); + + /* Execute culling code. */ + struct ac_cull_options options = {}; + options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE; + options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE; + options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS; + options.cull_small_prims = options.cull_view_xy; + options.cull_zero_area = options.cull_front || options.cull_back; + options.cull_w = true; + + /* Tell ES threads whether their vertex survived. */ + ac_build_ifcc(&ctx->ac, + ac_cull_triangle(&ctx->ac, pos, ctx->ac.i1true, vp_scale, vp_translate, + small_prim_precision, &options), + 16003); + { + LLVMBuildStore(builder, ctx->ac.i32_1, gs_accepted); + for (unsigned vtx = 0; vtx < 3; vtx++) { + LLVMBuildStore(builder, ctx->ac.i8_1, + si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag)); + } + } + ac_build_endif(&ctx->ac, 16003); + } + ac_build_endif(&ctx->ac, 16002); + ac_build_s_barrier(&ctx->ac); + + gs_accepted = LLVMBuildLoad(builder, gs_accepted, ""); + + LLVMValueRef es_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i1, ""); + + /* Convert the per-vertex flag to a thread bitmask in ES threads and store it in LDS. */ + ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007); + { + LLVMValueRef es_accepted_flag = + LLVMBuildLoad(builder, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), ""); + + LLVMValueRef es_accepted_bool = + LLVMBuildICmp(builder, LLVMIntNE, es_accepted_flag, ctx->ac.i8_0, ""); + LLVMValueRef es_mask = ac_get_i1_sgpr_mask(&ctx->ac, es_accepted_bool); + + LLVMBuildStore(builder, es_accepted_bool, es_accepted); + + ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, tid, ctx->ac.i32_0, ""), 16008); + { + LLVMBuildStore(builder, es_mask, + ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, get_wave_id_in_tg(ctx))); + } + ac_build_endif(&ctx->ac, 16008); + } + ac_build_endif(&ctx->ac, 16007); + ac_build_s_barrier(&ctx->ac); + + /* Load the vertex masks and compute the new ES thread count. 
*/ + LLVMValueRef es_mask[2], new_num_es_threads, kill_wave; + load_bitmasks_2x64(ctx, ctx->gs_ngg_scratch, 0, es_mask, &new_num_es_threads); + new_num_es_threads = ac_build_readlane_no_opt_barrier(&ctx->ac, new_num_es_threads, NULL); + + /* ES threads compute their prefix sum, which is the new ES thread ID. + * Then they write the value of the old thread ID into the LDS address + * of the new thread ID. It will be used it to load input VGPRs from + * the old thread's LDS location. + */ + ac_build_ifcc(&ctx->ac, LLVMBuildLoad(builder, es_accepted, ""), 16009); + { + LLVMValueRef old_id = get_thread_id_in_tg(ctx); + LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id); + + LLVMBuildStore( + builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""), + si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id), lds_byte0_old_thread_id)); + LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""), + si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id)); + } + ac_build_endif(&ctx->ac, 16009); + + /* Kill waves that have inactive threads. */ + kill_wave = LLVMBuildICmp(builder, LLVMIntULE, + ac_build_imax(&ctx->ac, new_num_es_threads, ngg_get_prim_cnt(ctx)), + LLVMBuildMul(builder, get_wave_id_in_tg(ctx), + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""), + ""); + ac_build_ifcc(&ctx->ac, kill_wave, 19202); + { + /* If we are killing wave 0, send that there are no primitives + * in this threadgroup. + */ + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ctx->ac.i32_0); + ac_build_s_endpgm(&ctx->ac); + } + ac_build_endif(&ctx->ac, 19202); + ac_build_s_barrier(&ctx->ac); + + /* Send the final vertex and primitive counts. */ + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), new_num_es_threads, + ngg_get_prim_cnt(ctx)); + + /* Update thread counts in SGPRs. */ + LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->gs_tg_info); + LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->merged_wave_info); + + /* This also converts the thread count from the total count to the per-wave count. */ + update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12, &new_merged_wave_info, 8, + 0); + + /* Update vertex indices in VGPR0 (same format as NGG passthrough). */ + LLVMValueRef new_vgpr0 = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + /* Set the null flag at the beginning (culled), and then + * overwrite it for accepted primitives. + */ + LLVMBuildStore(builder, LLVMConstInt(ctx->ac.i32, 1u << 31, 0), new_vgpr0); + + /* Get vertex indices after vertex compaction. */ + ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011); + { + struct ac_ngg_prim prim = {}; + prim.num_vertices = 3; + prim.isnull = ctx->ac.i1false; + + for (unsigned vtx = 0; vtx < 3; vtx++) { + prim.index[vtx] = LLVMBuildLoad( + builder, si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte1_new_thread_id), ""); + prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, ""); + prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx); + } + + /* Set the new GS input VGPR. */ + LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0); + } + ac_build_endif(&ctx->ac, 16011); + + if (gfx10_ngg_export_prim_early(shader)) + gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, "")); + + /* Set the new ES input VGPRs. 
*/ + LLVMValueRef es_data[4]; + LLVMValueRef old_thread_id = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + for (unsigned i = 0; i < 4; i++) + es_data[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, new_num_es_threads, ""), + 16012); + { + LLVMValueRef old_id, old_es_vtxptr, tmp; + + /* Load ES input VGPRs from the ES thread before compaction. */ + old_id = LLVMBuildLoad(builder, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_old_thread_id), ""); + old_id = LLVMBuildZExt(builder, old_id, ctx->ac.i32, ""); + + LLVMBuildStore(builder, old_id, old_thread_id); + old_es_vtxptr = ngg_nogs_vertex_ptr(ctx, old_id); + + for (unsigned i = 0; i < 2; i++) { + tmp = LLVMBuildLoad( + builder, + ac_build_gep0(&ctx->ac, old_es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id + i, 0)), + ""); + LLVMBuildStore(builder, tmp, es_data[i]); + } + + if (ctx->type == PIPE_SHADER_TESS_EVAL) { + tmp = LLVMBuildLoad(builder, + si_build_gep_i8(ctx, old_es_vtxptr, lds_byte2_tes_rel_patch_id), ""); + tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); + LLVMBuildStore(builder, tmp, es_data[2]); + + if (uses_tes_prim_id) { + tmp = LLVMBuildLoad(builder, + ac_build_gep0(&ctx->ac, old_es_vtxptr, + LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)), + ""); + LLVMBuildStore(builder, tmp, es_data[3]); + } + } + } + ac_build_endif(&ctx->ac, 16012); + + /* Return values for the main function. */ + LLVMValueRef ret = ctx->return_value; + LLVMValueRef val; + + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, ""); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, ""); + if (ctx->type == PIPE_SHADER_TESS_EVAL) + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 4); + + ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS); + ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images, + 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); + ret = si_insert_input_ptr(ctx, ret, ctx->const_and_shader_buffers, + 8 + SI_SGPR_CONST_AND_SHADER_BUFFERS); + ret = si_insert_input_ptr(ctx, ret, ctx->samplers_and_images, 8 + SI_SGPR_SAMPLERS_AND_IMAGES); + ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS); + + if (ctx->type == PIPE_SHADER_VERTEX) { + ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex, 8 + SI_SGPR_BASE_VERTEX); + ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance, 8 + SI_SGPR_START_INSTANCE); + ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id, 8 + SI_SGPR_DRAWID); + ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers, 8 + SI_VS_NUM_USER_SGPR); + + for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) { + ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i], + 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4); + } + } else { + assert(ctx->type == PIPE_SHADER_TESS_EVAL); + ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout, 8 + SI_SGPR_TES_OFFCHIP_LAYOUT); + ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr, 8 + SI_SGPR_TES_OFFCHIP_ADDR); + } + + unsigned vgpr; + if (ctx->type == PIPE_SHADER_VERTEX) { + if (shader->selector->num_vbos_in_user_sgprs) { + vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4; + } else { + vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1; + } + } else { + vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; + } + + val = LLVMBuildLoad(builder, new_vgpr0, ""); + ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, 
""); + vgpr++; /* gs_vtx23_offset */ + + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++); + vgpr++; /* gs_vtx45_offset */ + + if (ctx->type == PIPE_SHADER_VERTEX) { + val = LLVMBuildLoad(builder, es_data[0], ""); + ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, + ""); /* VGPR5 - VertexID */ + vgpr += 2; + if (uses_instance_id) { + val = LLVMBuildLoad(builder, es_data[1], ""); + ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, + ""); /* VGPR8 - InstanceID */ + } else { + vgpr++; + } + } else { + assert(ctx->type == PIPE_SHADER_TESS_EVAL); + unsigned num_vgprs = uses_tes_prim_id ? 4 : 3; + for (unsigned i = 0; i < num_vgprs; i++) { + val = LLVMBuildLoad(builder, es_data[i], ""); + ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, ""); + } + if (num_vgprs == 3) + vgpr++; + } + /* Return the old thread ID. */ + val = LLVMBuildLoad(builder, old_thread_id, ""); + ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, ""); + + /* These two also use LDS. */ + if (sel->info.writes_edgeflag || + (ctx->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)) + ac_build_s_barrier(&ctx->ac); + + ctx->return_value = ret; } /** * Emit the epilogue of an API VS or TES shader compiled as ESGS shader. */ -void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) +void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_selector *sel = ctx->shader->selector; - struct si_shader_info *info = &sel->info; - struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS]; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp, tmp2; - - assert(!ctx->shader->is_gs_copy_shader); - assert(info->num_outputs <= max_outputs); - - LLVMValueRef vertex_ptr = NULL; - - if (sel->so.num_outputs || sel->info.writes_edgeflag) - vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); - - for (unsigned i = 0; i < info->num_outputs; i++) { - outputs[i].semantic_name = info->output_semantic_name[i]; - outputs[i].semantic_index = info->output_semantic_index[i]; - - for (unsigned j = 0; j < 4; j++) { - outputs[i].vertex_stream[j] = - (info->output_streams[i] >> (2 * j)) & 3; - - /* TODO: we may store more outputs than streamout needs, - * but streamout performance isn't that important. - */ - if (sel->so.num_outputs) { - tmp = ac_build_gep0(&ctx->ac, vertex_ptr, - LLVMConstInt(ctx->ac.i32, 4 * i + j, false)); - tmp2 = LLVMBuildLoad(builder, addrs[4 * i + j], ""); - tmp2 = ac_to_integer(&ctx->ac, tmp2); - LLVMBuildStore(builder, tmp2, tmp); - } - } - - /* Store the edgeflag at the end (if streamout is enabled) */ - if (info->output_semantic_name[i] == TGSI_SEMANTIC_EDGEFLAG && - sel->info.writes_edgeflag) { - LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], ""); - /* The output is a float, but the hw expects a 1-bit integer. 
*/ - edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, ""); - edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->ac.i32_1); - - tmp = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0); - tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp); - LLVMBuildStore(builder, edgeflag, tmp); - } - } - - bool unterminated_es_if_block = - !sel->so.num_outputs && - !sel->info.writes_edgeflag && - !ctx->screen->use_ngg_streamout && /* no query buffer */ - (ctx->type != PIPE_SHADER_VERTEX || - !ctx->shader->key.mono.u.vs_export_prim_id); - - if (!unterminated_es_if_block) - ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); - - LLVMValueRef is_gs_thread = si_is_gs_thread(ctx); - LLVMValueRef is_es_thread = si_is_es_thread(ctx); - LLVMValueRef vtxindex[3]; - - if (ctx->shader->key.opt.ngg_culling) { - vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9); - vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9); - vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9); - } else { - vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); - vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); - vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); - } - - /* Determine the number of vertices per primitive. */ - unsigned num_vertices; - LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices); - - /* Streamout */ - LLVMValueRef emitted_prims = NULL; - - if (sel->so.num_outputs) { - assert(!unterminated_es_if_block); - - struct ngg_streamout nggso = {}; - nggso.num_vertices = num_vertices_val; - nggso.prim_enable[0] = is_gs_thread; - - for (unsigned i = 0; i < num_vertices; ++i) - nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); - - build_streamout(ctx, &nggso); - emitted_prims = nggso.emit[0]; - } - - LLVMValueRef user_edgeflags[3] = {}; - - if (sel->info.writes_edgeflag) { - assert(!unterminated_es_if_block); - - /* Streamout already inserted the barrier, so don't insert it again. */ - if (!sel->so.num_outputs) - ac_build_s_barrier(&ctx->ac); - - ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); - /* Load edge flags from ES threads and store them into VGPRs in GS threads. */ - for (unsigned i = 0; i < num_vertices; i++) { - tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); - tmp2 = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0); - tmp = ac_build_gep0(&ctx->ac, tmp, tmp2); - tmp = LLVMBuildLoad(builder, tmp, ""); - tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - - user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, ""); - LLVMBuildStore(builder, tmp, user_edgeflags[i]); - } - ac_build_endif(&ctx->ac, 5400); - } - - /* Copy Primitive IDs from GS threads to the LDS address corresponding - * to the ES thread of the provoking vertex. - */ - if (ctx->type == PIPE_SHADER_VERTEX && - ctx->shader->key.mono.u.vs_export_prim_id) { - assert(!unterminated_es_if_block); - - /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */ - if (sel->so.num_outputs || sel->info.writes_edgeflag) - ac_build_s_barrier(&ctx->ac); - - ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); - /* Extract the PROVOKING_VTX_INDEX field. 
*/ - LLVMValueRef provoking_vtx_in_prim = - si_unpack_param(ctx, ctx->vs_state_bits, 4, 2); - - /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */ - LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3); - LLVMValueRef provoking_vtx_index = - LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, ""); - LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index); - - LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id), - ac_build_gep0(&ctx->ac, vertex_ptr, ctx->ac.i32_0)); - ac_build_endif(&ctx->ac, 5400); - } - - /* Update query buffer */ - if (ctx->screen->use_ngg_streamout && - !info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { - assert(!unterminated_es_if_block); - - tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1); - tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */ - tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); - ac_build_ifcc(&ctx->ac, tmp, 5030); - tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac), - sel->so.num_outputs ? ctx->ac.i32_1 : ctx->ac.i32_0, ""); - ac_build_ifcc(&ctx->ac, tmp, 5031); - { - LLVMValueRef args[] = { - ngg_get_prim_cnt(ctx), - ngg_get_query_buf(ctx), - LLVMConstInt(ctx->ac.i32, 16, false), /* offset of stream[0].generated_primitives */ - ctx->ac.i32_0, /* soffset */ - ctx->ac.i32_0, /* cachepolicy */ - }; - - if (sel->so.num_outputs) { - args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->ac.i32_1); - args[2] = ac_build_writelane(&ctx->ac, args[2], - LLVMConstInt(ctx->ac.i32, 24, false), ctx->ac.i32_1); - } - - /* TODO: should this be 64-bit atomics? */ - ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", - ctx->ac.i32, args, 5, 0); - } - ac_build_endif(&ctx->ac, 5031); - ac_build_endif(&ctx->ac, 5030); - ac_build_endif(&ctx->ac, 5029); - } - - /* Build the primitive export. */ - if (!gfx10_ngg_export_prim_early(ctx->shader)) { - assert(!unterminated_es_if_block); - gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL); - } - - /* Export per-vertex data (positions and parameters). */ - if (!unterminated_es_if_block) - ac_build_ifcc(&ctx->ac, is_es_thread, 6002); - { - unsigned i; - - /* Unconditionally (re-)load the values for proper SSA form. */ - for (i = 0; i < info->num_outputs; i++) { - /* If the NGG cull shader part computed the position, don't - * use the position from the current shader part. Instead, - * load it from LDS. - */ - if (info->output_semantic_name[i] == TGSI_SEMANTIC_POSITION && - ctx->shader->key.opt.ngg_culling) { - vertex_ptr = ngg_nogs_vertex_ptr(ctx, - ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id)); - - for (unsigned j = 0; j < 4; j++) { - tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0); - tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp); - tmp = LLVMBuildLoad(builder, tmp, ""); - outputs[i].values[j] = ac_to_float(&ctx->ac, tmp); - } - } else { - for (unsigned j = 0; j < 4; j++) { - outputs[i].values[j] = - LLVMBuildLoad(builder, - addrs[4 * i + j], ""); - } - } - } - - if (ctx->shader->key.mono.u.vs_export_prim_id) { - outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; - outputs[i].semantic_index = 0; - - if (ctx->type == PIPE_SHADER_VERTEX) { - /* Wait for GS stores to finish. 
*/ - ac_build_s_barrier(&ctx->ac); - - tmp = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); - tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0); - outputs[i].values[0] = LLVMBuildLoad(builder, tmp, ""); - } else { - assert(ctx->type == PIPE_SHADER_TESS_EVAL); - outputs[i].values[0] = si_get_primitive_id(ctx, 0); - } - - outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]); - for (unsigned j = 1; j < 4; j++) - outputs[i].values[j] = LLVMGetUndef(ctx->ac.f32); - - memset(outputs[i].vertex_stream, 0, - sizeof(outputs[i].vertex_stream)); - i++; - } - - si_llvm_build_vs_exports(ctx, outputs, i); - } - ac_build_endif(&ctx->ac, 6002); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_selector *sel = ctx->shader->selector; + struct si_shader_info *info = &sel->info; + struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS]; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp, tmp2; + + assert(!ctx->shader->is_gs_copy_shader); + assert(info->num_outputs <= max_outputs); + + LLVMValueRef vertex_ptr = NULL; + + if (sel->so.num_outputs || sel->info.writes_edgeflag) + vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); + + for (unsigned i = 0; i < info->num_outputs; i++) { + outputs[i].semantic_name = info->output_semantic_name[i]; + outputs[i].semantic_index = info->output_semantic_index[i]; + + for (unsigned j = 0; j < 4; j++) { + outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3; + + /* TODO: we may store more outputs than streamout needs, + * but streamout performance isn't that important. + */ + if (sel->so.num_outputs) { + tmp = ac_build_gep0(&ctx->ac, vertex_ptr, LLVMConstInt(ctx->ac.i32, 4 * i + j, false)); + tmp2 = LLVMBuildLoad(builder, addrs[4 * i + j], ""); + tmp2 = ac_to_integer(&ctx->ac, tmp2); + LLVMBuildStore(builder, tmp2, tmp); + } + } + + /* Store the edgeflag at the end (if streamout is enabled) */ + if (info->output_semantic_name[i] == TGSI_SEMANTIC_EDGEFLAG && sel->info.writes_edgeflag) { + LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], ""); + /* The output is a float, but the hw expects a 1-bit integer. */ + edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, ""); + edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->ac.i32_1); + + tmp = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0); + tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp); + LLVMBuildStore(builder, edgeflag, tmp); + } + } + + bool unterminated_es_if_block = + !sel->so.num_outputs && !sel->info.writes_edgeflag && + !ctx->screen->use_ngg_streamout && /* no query buffer */ + (ctx->type != PIPE_SHADER_VERTEX || !ctx->shader->key.mono.u.vs_export_prim_id); + + if (!unterminated_es_if_block) + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + + LLVMValueRef is_gs_thread = si_is_gs_thread(ctx); + LLVMValueRef is_es_thread = si_is_es_thread(ctx); + LLVMValueRef vtxindex[3]; + + if (ctx->shader->key.opt.ngg_culling) { + vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9); + vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9); + vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9); + } else { + vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); + vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); + vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); + } + + /* Determine the number of vertices per primitive. 
*/ + unsigned num_vertices; + LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices); + + /* Streamout */ + LLVMValueRef emitted_prims = NULL; + + if (sel->so.num_outputs) { + assert(!unterminated_es_if_block); + + struct ngg_streamout nggso = {}; + nggso.num_vertices = num_vertices_val; + nggso.prim_enable[0] = is_gs_thread; + + for (unsigned i = 0; i < num_vertices; ++i) + nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); + + build_streamout(ctx, &nggso); + emitted_prims = nggso.emit[0]; + } + + LLVMValueRef user_edgeflags[3] = {}; + + if (sel->info.writes_edgeflag) { + assert(!unterminated_es_if_block); + + /* Streamout already inserted the barrier, so don't insert it again. */ + if (!sel->so.num_outputs) + ac_build_s_barrier(&ctx->ac); + + ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); + /* Load edge flags from ES threads and store them into VGPRs in GS threads. */ + for (unsigned i = 0; i < num_vertices; i++) { + tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); + tmp2 = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0); + tmp = ac_build_gep0(&ctx->ac, tmp, tmp2); + tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + + user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, ""); + LLVMBuildStore(builder, tmp, user_edgeflags[i]); + } + ac_build_endif(&ctx->ac, 5400); + } + + /* Copy Primitive IDs from GS threads to the LDS address corresponding + * to the ES thread of the provoking vertex. + */ + if (ctx->type == PIPE_SHADER_VERTEX && ctx->shader->key.mono.u.vs_export_prim_id) { + assert(!unterminated_es_if_block); + + /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */ + if (sel->so.num_outputs || sel->info.writes_edgeflag) + ac_build_s_barrier(&ctx->ac); + + ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); + /* Extract the PROVOKING_VTX_INDEX field. */ + LLVMValueRef provoking_vtx_in_prim = si_unpack_param(ctx, ctx->vs_state_bits, 4, 2); + + /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */ + LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3); + LLVMValueRef provoking_vtx_index = + LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, ""); + LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index); + + LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id), + ac_build_gep0(&ctx->ac, vertex_ptr, ctx->ac.i32_0)); + ac_build_endif(&ctx->ac, 5400); + } + + /* Update query buffer */ + if (ctx->screen->use_ngg_streamout && !info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { + assert(!unterminated_es_if_block); + + tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */ + tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, tmp, 5030); + tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac), + sel->so.num_outputs ? 
ctx->ac.i32_1 : ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, tmp, 5031); + { + LLVMValueRef args[] = { + ngg_get_prim_cnt(ctx), + ngg_get_query_buf(ctx), + LLVMConstInt(ctx->ac.i32, 16, false), /* offset of stream[0].generated_primitives */ + ctx->ac.i32_0, /* soffset */ + ctx->ac.i32_0, /* cachepolicy */ + }; + + if (sel->so.num_outputs) { + args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->ac.i32_1); + args[2] = ac_build_writelane(&ctx->ac, args[2], LLVMConstInt(ctx->ac.i32, 24, false), + ctx->ac.i32_1); + } + + /* TODO: should this be 64-bit atomics? */ + ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5, + 0); + } + ac_build_endif(&ctx->ac, 5031); + ac_build_endif(&ctx->ac, 5030); + ac_build_endif(&ctx->ac, 5029); + } + + /* Build the primitive export. */ + if (!gfx10_ngg_export_prim_early(ctx->shader)) { + assert(!unterminated_es_if_block); + gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL); + } + + /* Export per-vertex data (positions and parameters). */ + if (!unterminated_es_if_block) + ac_build_ifcc(&ctx->ac, is_es_thread, 6002); + { + unsigned i; + + /* Unconditionally (re-)load the values for proper SSA form. */ + for (i = 0; i < info->num_outputs; i++) { + /* If the NGG cull shader part computed the position, don't + * use the position from the current shader part. Instead, + * load it from LDS. + */ + if (info->output_semantic_name[i] == TGSI_SEMANTIC_POSITION && + ctx->shader->key.opt.ngg_culling) { + vertex_ptr = ngg_nogs_vertex_ptr(ctx, ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id)); + + for (unsigned j = 0; j < 4; j++) { + tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0); + tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp); + tmp = LLVMBuildLoad(builder, tmp, ""); + outputs[i].values[j] = ac_to_float(&ctx->ac, tmp); + } + } else { + for (unsigned j = 0; j < 4; j++) { + outputs[i].values[j] = LLVMBuildLoad(builder, addrs[4 * i + j], ""); + } + } + } + + if (ctx->shader->key.mono.u.vs_export_prim_id) { + outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; + outputs[i].semantic_index = 0; + + if (ctx->type == PIPE_SHADER_VERTEX) { + /* Wait for GS stores to finish. 
*/ + ac_build_s_barrier(&ctx->ac); + + tmp = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); + tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0); + outputs[i].values[0] = LLVMBuildLoad(builder, tmp, ""); + } else { + assert(ctx->type == PIPE_SHADER_TESS_EVAL); + outputs[i].values[0] = si_get_primitive_id(ctx, 0); + } + + outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]); + for (unsigned j = 1; j < 4; j++) + outputs[i].values[j] = LLVMGetUndef(ctx->ac.f32); + + memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream)); + i++; + } + + si_llvm_build_vs_exports(ctx, outputs, i); + } + ac_build_endif(&ctx->ac, 6002); } -static LLVMValueRef -ngg_gs_get_vertex_storage(struct si_shader_context *ctx) +static LLVMValueRef ngg_gs_get_vertex_storage(struct si_shader_context *ctx) { - const struct si_shader_selector *sel = ctx->shader->selector; - const struct si_shader_info *info = &sel->info; - - LLVMTypeRef elements[2] = { - LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs), - LLVMArrayType(ctx->ac.i8, 4), - }; - LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false); - type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS); - return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, ""); + const struct si_shader_selector *sel = ctx->shader->selector; + const struct si_shader_info *info = &sel->info; + + LLVMTypeRef elements[2] = { + LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs), + LLVMArrayType(ctx->ac.i8, 4), + }; + LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false); + type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS); + return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, ""); } /** @@ -1536,452 +1458,424 @@ ngg_gs_get_vertex_storage(struct si_shader_context *ctx) * * \return an LDS pointer to type {[N x i32], [4 x i8]} */ -static LLVMValueRef -ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx) +static LLVMValueRef ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx) { - struct si_shader_selector *sel = ctx->shader->selector; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx); - - /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */ - unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1; - if (write_stride_2exp) { - LLVMValueRef row = - LLVMBuildLShr(builder, vertexidx, - LLVMConstInt(ctx->ac.i32, 5, false), ""); - LLVMValueRef swizzle = - LLVMBuildAnd(builder, row, - LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, - false), ""); - vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, ""); - } - - return ac_build_gep0(&ctx->ac, storage, vertexidx); + struct si_shader_selector *sel = ctx->shader->selector; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx); + + /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */ + unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1; + if (write_stride_2exp) { + LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), ""); + LLVMValueRef swizzle = LLVMBuildAnd( + builder, row, LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, false), ""); + vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, ""); + } + + return ac_build_gep0(&ctx->ac, storage, vertexidx); } -static LLVMValueRef -ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread, - LLVMValueRef emitidx) +static 
LLVMValueRef ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread, + LLVMValueRef emitidx) { - struct si_shader_selector *sel = ctx->shader->selector; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; - - tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false); - tmp = LLVMBuildMul(builder, tmp, gsthread, ""); - const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, ""); - return ngg_gs_vertex_ptr(ctx, vertexidx); + struct si_shader_selector *sel = ctx->shader->selector; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp; + + tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false); + tmp = LLVMBuildMul(builder, tmp, gsthread, ""); + const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, ""); + return ngg_gs_vertex_ptr(ctx, vertexidx); } -static LLVMValueRef -ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx, LLVMValueRef vertexptr, - unsigned out_idx) +static LLVMValueRef ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx, + LLVMValueRef vertexptr, unsigned out_idx) { - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_0, /* first struct entry */ - LLVMConstInt(ctx->ac.i32, out_idx, false), - }; - return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implied C-style array */ + ctx->ac.i32_0, /* first struct entry */ + LLVMConstInt(ctx->ac.i32, out_idx, false), + }; + return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); } -static LLVMValueRef -ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx, LLVMValueRef vertexptr, - unsigned stream) +static LLVMValueRef ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx, + LLVMValueRef vertexptr, unsigned stream) { - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_1, /* second struct entry */ - LLVMConstInt(ctx->ac.i32, stream, false), - }; - return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implied C-style array */ + ctx->ac.i32_1, /* second struct entry */ + LLVMConstInt(ctx->ac.i32, stream, false), + }; + return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); } -void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, - unsigned stream, - LLVMValueRef *addrs) +void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs) { - const struct si_shader_selector *sel = ctx->shader->selector; - const struct si_shader_info *info = &sel->info; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; - const LLVMValueRef vertexidx = - LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); - - /* If this thread has already emitted the declared maximum number of - * vertices, skip the write: excessive vertex emissions are not - * supposed to have any effect. 
- */ - const LLVMValueRef can_emit = - LLVMBuildICmp(builder, LLVMIntULT, vertexidx, - LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), ""); - - tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); - tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, ""); - LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); - - ac_build_ifcc(&ctx->ac, can_emit, 9001); - - const LLVMValueRef vertexptr = - ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx); - unsigned out_idx = 0; - for (unsigned i = 0; i < info->num_outputs; i++) { - for (unsigned chan = 0; chan < 4; chan++, out_idx++) { - if (!(info->output_usagemask[i] & (1 << chan)) || - ((info->output_streams[i] >> (2 * chan)) & 3) != stream) - continue; - - LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], ""); - out_val = ac_to_integer(&ctx->ac, out_val); - LLVMBuildStore(builder, out_val, - ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx)); - } - } - assert(out_idx * 4 == sel->gsvs_vertex_size); - - /* Determine and store whether this vertex completed a primitive. */ - const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], ""); - - tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false); - const LLVMValueRef iscompleteprim = - LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, ""); - - /* Since the geometry shader emits triangle strips, we need to - * track which primitive is odd and swap vertex indices to get - * the correct vertex order. - */ - LLVMValueRef is_odd = ctx->ac.i1false; - if (stream == 0 && u_vertices_per_prim(sel->gs_output_prim) == 3) { - tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, ""); - is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, ""); - } - - tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, ""); - LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]); - - /* The per-vertex primitive flag encoding: - * bit 0: whether this vertex finishes a primitive - * bit 1: whether the primitive is odd (if we are emitting triangle strips) - */ - tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, ""); - tmp = LLVMBuildOr(builder, tmp, - LLVMBuildShl(builder, - LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), - ctx->ac.i8_1, ""), ""); - LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream)); - - tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); - tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), ""); - LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]); - - ac_build_endif(&ctx->ac, 9001); + const struct si_shader_selector *sel = ctx->shader->selector; + const struct si_shader_info *info = &sel->info; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp; + const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); + + /* If this thread has already emitted the declared maximum number of + * vertices, skip the write: excessive vertex emissions are not + * supposed to have any effect. 
+ */ + const LLVMValueRef can_emit = + LLVMBuildICmp(builder, LLVMIntULT, vertexidx, + LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), ""); + + tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); + tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, ""); + LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); + + ac_build_ifcc(&ctx->ac, can_emit, 9001); + + const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx); + unsigned out_idx = 0; + for (unsigned i = 0; i < info->num_outputs; i++) { + for (unsigned chan = 0; chan < 4; chan++, out_idx++) { + if (!(info->output_usagemask[i] & (1 << chan)) || + ((info->output_streams[i] >> (2 * chan)) & 3) != stream) + continue; + + LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], ""); + out_val = ac_to_integer(&ctx->ac, out_val); + LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx)); + } + } + assert(out_idx * 4 == sel->gsvs_vertex_size); + + /* Determine and store whether this vertex completed a primitive. */ + const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], ""); + + tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false); + const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, ""); + + /* Since the geometry shader emits triangle strips, we need to + * track which primitive is odd and swap vertex indices to get + * the correct vertex order. + */ + LLVMValueRef is_odd = ctx->ac.i1false; + if (stream == 0 && u_vertices_per_prim(sel->gs_output_prim) == 3) { + tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, ""); + is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, ""); + } + + tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, ""); + LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]); + + /* The per-vertex primitive flag encoding: + * bit 0: whether this vertex finishes a primitive + * bit 1: whether the primitive is odd (if we are emitting triangle strips) + */ + tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, ""); + tmp = LLVMBuildOr( + builder, tmp, + LLVMBuildShl(builder, LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), ctx->ac.i8_1, ""), ""); + LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream)); + + tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); + tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), ""); + LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]); + + ac_build_endif(&ctx->ac, 9001); } void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx) { - /* Zero out the part of LDS scratch that is used to accumulate the - * per-stream generated primitive count. - */ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef scratchptr = ctx->gs_ngg_scratch; - LLVMValueRef tid = get_thread_id_in_tg(ctx); - LLVMValueRef tmp; - - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), ""); - ac_build_ifcc(&ctx->ac, tmp, 5090); - { - LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid); - LLVMBuildStore(builder, ctx->ac.i32_0, ptr); - } - ac_build_endif(&ctx->ac, 5090); - - ac_build_s_barrier(&ctx->ac); + /* Zero out the part of LDS scratch that is used to accumulate the + * per-stream generated primitive count. 
+ */ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef scratchptr = ctx->gs_ngg_scratch; + LLVMValueRef tid = get_thread_id_in_tg(ctx); + LLVMValueRef tmp; + + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), ""); + ac_build_ifcc(&ctx->ac, tmp, 5090); + { + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid); + LLVMBuildStore(builder, ctx->ac.i32_0, ptr); + } + ac_build_endif(&ctx->ac, 5090); + + ac_build_s_barrier(&ctx->ac); } void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) { - const struct si_shader_selector *sel = ctx->shader->selector; - const struct si_shader_info *info = &sel->info; - const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim); - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false); - LLVMValueRef tmp, tmp2; - - /* Zero out remaining (non-emitted) primitive flags. - * - * Note: Alternatively, we could pass the relevant gs_next_vertex to - * the emit threads via LDS. This is likely worse in the expected - * typical case where each GS thread emits the full set of - * vertices. - */ - for (unsigned stream = 0; stream < 4; ++stream) { - if (!info->num_stream_output_components[stream]) - continue; - - const LLVMValueRef gsthread = get_thread_id_in_tg(ctx); - - ac_build_bgnloop(&ctx->ac, 5100); - - const LLVMValueRef vertexidx = - LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); - tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx, - LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), ""); - ac_build_ifcc(&ctx->ac, tmp, 5101); - ac_build_break(&ctx->ac); - ac_build_endif(&ctx->ac, 5101); - - tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); - LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); - - tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx); - LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream)); - - ac_build_endloop(&ctx->ac, 5100); - } - - /* Accumulate generated primitives counts across the entire threadgroup. 
*/ - for (unsigned stream = 0; stream < 4; ++stream) { - if (!info->num_stream_output_components[stream]) - continue; - - LLVMValueRef numprims = - LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); - numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size); - - tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, ""); - ac_build_ifcc(&ctx->ac, tmp, 5105); - { - LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, - ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, - LLVMConstInt(ctx->ac.i32, stream, false)), - numprims, LLVMAtomicOrderingMonotonic, false); - } - ac_build_endif(&ctx->ac, 5105); - } - - ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); - - ac_build_s_barrier(&ctx->ac); - - const LLVMValueRef tid = get_thread_id_in_tg(ctx); - LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx); - - /* Streamout */ - if (sel->so.num_outputs) { - struct ngg_streamout nggso = {}; - - nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false); - - LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid); - for (unsigned stream = 0; stream < 4; ++stream) { - if (!info->num_stream_output_components[stream]) - continue; - - tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), ""); - tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); - nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, ""); - } - - for (unsigned i = 0; i < verts_per_prim; ++i) { - tmp = LLVMBuildSub(builder, tid, - LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); - tmp = ngg_gs_vertex_ptr(ctx, tmp); - nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0); - } - - build_streamout(ctx, &nggso); - } - - /* Write shader query data. */ - if (ctx->screen->use_ngg_streamout) { - tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1); - tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */ - unsigned num_query_comps = sel->so.num_outputs ? 8 : 4; - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, - LLVMConstInt(ctx->ac.i32, num_query_comps, false), ""); - ac_build_ifcc(&ctx->ac, tmp, 5110); - { - LLVMValueRef offset; - tmp = tid; - if (sel->so.num_outputs) - tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->ac.i32, 3, false), ""); - offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 32, false), ""); - if (sel->so.num_outputs) { - tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->ac.i32, 2, false), ""); - tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 8, false), ""); - offset = LLVMBuildAdd(builder, offset, tmp, ""); - } - - tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), ""); - LLVMValueRef args[] = { - tmp, - ngg_get_query_buf(ctx), - offset, - LLVMConstInt(ctx->ac.i32, 16, false), /* soffset */ - ctx->ac.i32_0, /* cachepolicy */ - }; - ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", - ctx->ac.i32, args, 5, 0); - } - ac_build_endif(&ctx->ac, 5110); - ac_build_endif(&ctx->ac, 5109); - } - - /* Determine vertex liveness. 
*/ - LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive"); - - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); - ac_build_ifcc(&ctx->ac, tmp, 5120); - { - for (unsigned i = 0; i < verts_per_prim; ++i) { - const LLVMValueRef primidx = - LLVMBuildAdd(builder, tid, - LLVMConstInt(ctx->ac.i32, i, false), ""); - - if (i > 0) { - tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, ""); - ac_build_ifcc(&ctx->ac, tmp, 5121 + i); - } - - /* Load primitive liveness */ - tmp = ngg_gs_vertex_ptr(ctx, primidx); - tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); - const LLVMValueRef primlive = - LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - - tmp = LLVMBuildLoad(builder, vertliveptr, ""); - tmp = LLVMBuildOr(builder, tmp, primlive, ""), - LLVMBuildStore(builder, tmp, vertliveptr); - - if (i > 0) - ac_build_endif(&ctx->ac, 5121 + i); - } - } - ac_build_endif(&ctx->ac, 5120); - - /* Inclusive scan addition across the current wave. */ - LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, ""); - struct ac_wg_scan vertlive_scan = {}; - vertlive_scan.op = nir_op_iadd; - vertlive_scan.enable_reduce = true; - vertlive_scan.enable_exclusive = true; - vertlive_scan.src = vertlive; - vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0); - vertlive_scan.waveidx = get_wave_id_in_tg(ctx); - vertlive_scan.numwaves = get_tgsize(ctx); - vertlive_scan.maxwaves = 8; - - ac_build_wg_scan(&ctx->ac, &vertlive_scan); - - /* Skip all exports (including index exports) when possible. At least on - * early gfx10 revisions this is also to avoid hangs. - */ - LLVMValueRef have_exports = - LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, ""); - num_emit_threads = - LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, ""); - - /* Allocate export space. Send this message as early as possible, to - * hide the latency of the SQ <-> SPI roundtrip. - * - * Note: We could consider compacting primitives for export as well. - * PA processes 1 non-null prim / clock, but it fetches 4 DW of - * prim data per clock and skips null primitives at no additional - * cost. So compacting primitives can only be beneficial when - * there are 4 or more contiguous null primitives in the export - * (in the common case of single-dword prim exports). - */ - ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), - vertlive_scan.result_reduce, num_emit_threads); - - /* Setup the reverse vertex compaction permutation. We re-use stream 1 - * of the primitive liveness flags, relying on the fact that each - * threadgroup can have at most 256 threads. 
*/ - ac_build_ifcc(&ctx->ac, vertlive, 5130); - { - tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive); - tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, ""); - LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1)); - } - ac_build_endif(&ctx->ac, 5130); - - ac_build_s_barrier(&ctx->ac); - - /* Export primitive data */ - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); - ac_build_ifcc(&ctx->ac, tmp, 5140); - { - LLVMValueRef flags; - struct ac_ngg_prim prim = {}; - prim.num_vertices = verts_per_prim; - - tmp = ngg_gs_vertex_ptr(ctx, tid); - flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); - prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); - - for (unsigned i = 0; i < verts_per_prim; ++i) { - prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, - LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); - prim.edgeflag[i] = ctx->ac.i1false; - } - - /* Geometry shaders output triangle strips, but NGG expects triangles. */ - if (verts_per_prim == 3) { - LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, ""); - is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, ""); - LLVMValueRef flatshade_first = - LLVMBuildICmp(builder, LLVMIntEQ, - si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), - ctx->ac.i32_0, ""); - - ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, - flatshade_first, - prim.index); - } - - ac_build_export_prim(&ctx->ac, &prim); - } - ac_build_endif(&ctx->ac, 5140); - - /* Export position and parameter data */ - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, ""); - ac_build_ifcc(&ctx->ac, tmp, 5145); - { - struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS]; - - tmp = ngg_gs_vertex_ptr(ctx, tid); - tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), ""); - tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); - const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp); - - unsigned out_idx = 0; - for (unsigned i = 0; i < info->num_outputs; i++) { - outputs[i].semantic_name = info->output_semantic_name[i]; - outputs[i].semantic_index = info->output_semantic_index[i]; - - for (unsigned j = 0; j < 4; j++, out_idx++) { - tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx); - tmp = LLVMBuildLoad(builder, tmp, ""); - outputs[i].values[j] = ac_to_float(&ctx->ac, tmp); - outputs[i].vertex_stream[j] = - (info->output_streams[i] >> (2 * j)) & 3; - } - } - - si_llvm_build_vs_exports(ctx, outputs, info->num_outputs); - } - ac_build_endif(&ctx->ac, 5145); + const struct si_shader_selector *sel = ctx->shader->selector; + const struct si_shader_info *info = &sel->info; + const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false); + LLVMValueRef tmp, tmp2; + + /* Zero out remaining (non-emitted) primitive flags. + * + * Note: Alternatively, we could pass the relevant gs_next_vertex to + * the emit threads via LDS. This is likely worse in the expected + * typical case where each GS thread emits the full set of + * vertices. 
+ */ + for (unsigned stream = 0; stream < 4; ++stream) { + if (!info->num_stream_output_components[stream]) + continue; + + const LLVMValueRef gsthread = get_thread_id_in_tg(ctx); + + ac_build_bgnloop(&ctx->ac, 5100); + + const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); + tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx, + LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), ""); + ac_build_ifcc(&ctx->ac, tmp, 5101); + ac_build_break(&ctx->ac); + ac_build_endif(&ctx->ac, 5101); + + tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); + LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); + + tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx); + LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream)); + + ac_build_endloop(&ctx->ac, 5100); + } + + /* Accumulate generated primitives counts across the entire threadgroup. */ + for (unsigned stream = 0; stream < 4; ++stream) { + if (!info->num_stream_output_components[stream]) + continue; + + LLVMValueRef numprims = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); + numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size); + + tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, tmp, 5105); + { + LLVMBuildAtomicRMW( + builder, LLVMAtomicRMWBinOpAdd, + ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, stream, false)), + numprims, LLVMAtomicOrderingMonotonic, false); + } + ac_build_endif(&ctx->ac, 5105); + } + + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + + ac_build_s_barrier(&ctx->ac); + + const LLVMValueRef tid = get_thread_id_in_tg(ctx); + LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx); + + /* Streamout */ + if (sel->so.num_outputs) { + struct ngg_streamout nggso = {}; + + nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false); + + LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid); + for (unsigned stream = 0; stream < 4; ++stream) { + if (!info->num_stream_output_components[stream]) + continue; + + tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); + nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, ""); + } + + for (unsigned i = 0; i < verts_per_prim; ++i) { + tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), + ""); + tmp = ngg_gs_vertex_ptr(ctx, tmp); + nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0); + } + + build_streamout(ctx, &nggso); + } + + /* Write shader query data. */ + if (ctx->screen->use_ngg_streamout) { + tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */ + unsigned num_query_comps = sel->so.num_outputs ? 
8 : 4; + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, + LLVMConstInt(ctx->ac.i32, num_query_comps, false), ""); + ac_build_ifcc(&ctx->ac, tmp, 5110); + { + LLVMValueRef offset; + tmp = tid; + if (sel->so.num_outputs) + tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->ac.i32, 3, false), ""); + offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 32, false), ""); + if (sel->so.num_outputs) { + tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->ac.i32, 2, false), ""); + tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 8, false), ""); + offset = LLVMBuildAdd(builder, offset, tmp, ""); + } + + tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), ""); + LLVMValueRef args[] = { + tmp, ngg_get_query_buf(ctx), + offset, LLVMConstInt(ctx->ac.i32, 16, false), /* soffset */ + ctx->ac.i32_0, /* cachepolicy */ + }; + ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5, + 0); + } + ac_build_endif(&ctx->ac, 5110); + ac_build_endif(&ctx->ac, 5109); + } + + /* Determine vertex liveness. */ + LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive"); + + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); + ac_build_ifcc(&ctx->ac, tmp, 5120); + { + for (unsigned i = 0; i < verts_per_prim; ++i) { + const LLVMValueRef primidx = + LLVMBuildAdd(builder, tid, LLVMConstInt(ctx->ac.i32, i, false), ""); + + if (i > 0) { + tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, ""); + ac_build_ifcc(&ctx->ac, tmp, 5121 + i); + } + + /* Load primitive liveness */ + tmp = ngg_gs_vertex_ptr(ctx, primidx); + tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); + const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + + tmp = LLVMBuildLoad(builder, vertliveptr, ""); + tmp = LLVMBuildOr(builder, tmp, primlive, ""), LLVMBuildStore(builder, tmp, vertliveptr); + + if (i > 0) + ac_build_endif(&ctx->ac, 5121 + i); + } + } + ac_build_endif(&ctx->ac, 5120); + + /* Inclusive scan addition across the current wave. */ + LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, ""); + struct ac_wg_scan vertlive_scan = {}; + vertlive_scan.op = nir_op_iadd; + vertlive_scan.enable_reduce = true; + vertlive_scan.enable_exclusive = true; + vertlive_scan.src = vertlive; + vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0); + vertlive_scan.waveidx = get_wave_id_in_tg(ctx); + vertlive_scan.numwaves = get_tgsize(ctx); + vertlive_scan.maxwaves = 8; + + ac_build_wg_scan(&ctx->ac, &vertlive_scan); + + /* Skip all exports (including index exports) when possible. At least on + * early gfx10 revisions this is also to avoid hangs. + */ + LLVMValueRef have_exports = + LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, ""); + num_emit_threads = LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, ""); + + /* Allocate export space. Send this message as early as possible, to + * hide the latency of the SQ <-> SPI roundtrip. + * + * Note: We could consider compacting primitives for export as well. + * PA processes 1 non-null prim / clock, but it fetches 4 DW of + * prim data per clock and skips null primitives at no additional + * cost. So compacting primitives can only be beneficial when + * there are 4 or more contiguous null primitives in the export + * (in the common case of single-dword prim exports). 
+ */ + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), vertlive_scan.result_reduce, + num_emit_threads); + + /* Setup the reverse vertex compaction permutation. We re-use stream 1 + * of the primitive liveness flags, relying on the fact that each + * threadgroup can have at most 256 threads. */ + ac_build_ifcc(&ctx->ac, vertlive, 5130); + { + tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive); + tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, ""); + LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1)); + } + ac_build_endif(&ctx->ac, 5130); + + ac_build_s_barrier(&ctx->ac); + + /* Export primitive data */ + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); + ac_build_ifcc(&ctx->ac, tmp, 5140); + { + LLVMValueRef flags; + struct ac_ngg_prim prim = {}; + prim.num_vertices = verts_per_prim; + + tmp = ngg_gs_vertex_ptr(ctx, tid); + flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); + prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); + + for (unsigned i = 0; i < verts_per_prim; ++i) { + prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, + LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); + prim.edgeflag[i] = ctx->ac.i1false; + } + + /* Geometry shaders output triangle strips, but NGG expects triangles. */ + if (verts_per_prim == 3) { + LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, ""); + is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, ""); + LLVMValueRef flatshade_first = LLVMBuildICmp( + builder, LLVMIntEQ, si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), ctx->ac.i32_0, ""); + + ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index); + } + + ac_build_export_prim(&ctx->ac, &prim); + } + ac_build_endif(&ctx->ac, 5140); + + /* Export position and parameter data */ + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, ""); + ac_build_ifcc(&ctx->ac, tmp, 5145); + { + struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS]; + + tmp = ngg_gs_vertex_ptr(ctx, tid); + tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), ""); + tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); + const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp); + + unsigned out_idx = 0; + for (unsigned i = 0; i < info->num_outputs; i++) { + outputs[i].semantic_name = info->output_semantic_name[i]; + outputs[i].semantic_index = info->output_semantic_index[i]; + + for (unsigned j = 0; j < 4; j++, out_idx++) { + tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx); + tmp = LLVMBuildLoad(builder, tmp, ""); + outputs[i].values[j] = ac_to_float(&ctx->ac, tmp); + outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3; + } + } + + si_llvm_build_vs_exports(ctx, outputs, info->num_outputs); + } + ac_build_endif(&ctx->ac, 5145); } static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, - unsigned min_verts_per_prim, bool use_adjacency) + unsigned min_verts_per_prim, bool use_adjacency) { - unsigned max_reuse = max_esverts - min_verts_per_prim; - if (use_adjacency) - max_reuse /= 2; - *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse); + unsigned max_reuse = max_esverts - min_verts_per_prim; + if (use_adjacency) + max_reuse /= 2; + *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse); } /** @@ -1992,172 +1886,165 @@ static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts */ void 
gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) { - const struct si_shader_selector *gs_sel = shader->selector; - const struct si_shader_selector *es_sel = - shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel; - const enum pipe_shader_type gs_type = gs_sel->type; - const unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1); - const unsigned input_prim = si_get_input_prim(gs_sel); - const bool use_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY && - input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; - const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim); - const unsigned min_verts_per_prim = - gs_type == PIPE_SHADER_GEOMETRY ? max_verts_per_prim : 1; - - /* All these are in dwords: */ - /* We can't allow using the whole LDS, because GS waves compete with - * other shader stages for LDS space. - * - * TODO: We should really take the shader's internal LDS use into - * account. The linker will fail if the size is greater than - * 8K dwords. - */ - const unsigned max_lds_size = 8 * 1024 - 768; - const unsigned target_lds_size = max_lds_size; - unsigned esvert_lds_size = 0; - unsigned gsprim_lds_size = 0; - - /* All these are per subgroup: */ - bool max_vert_out_per_gs_instance = false; - unsigned max_gsprims_base = 128; /* default prim group size clamp */ - unsigned max_esverts_base = 128; - - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { - max_gsprims_base = 128 / 3; - max_esverts_base = max_gsprims_base * 3; - } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { - max_gsprims_base = 126; - max_esverts_base = 128; - } - - /* Hardware has the following non-natural restrictions on the value - * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of - * the draw: - * - at most 252 for any line input primitive type - * - at most 251 for any quad input primitive type - * - at most 251 for triangle strips with adjacency (this happens to - * be the natural limit for triangle *lists* with adjacency) - */ - max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1); - - if (gs_type == PIPE_SHADER_GEOMETRY) { - unsigned max_out_verts_per_gsprim = - gs_sel->gs_max_out_vertices * gs_num_invocations; - - if (max_out_verts_per_gsprim <= 256) { - if (max_out_verts_per_gsprim) { - max_gsprims_base = MIN2(max_gsprims_base, - 256 / max_out_verts_per_gsprim); - } - } else { - /* Use special multi-cycling mode in which each GS - * instance gets its own subgroup. Does not work with - * tessellation. */ - max_vert_out_per_gs_instance = true; - max_gsprims_base = 1; - max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices; - } - - esvert_lds_size = es_sel->esgs_itemsize / 4; - gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim; - } else { - /* VS and TES. */ - /* LDS size for passing data from ES to GS. 
*/ - esvert_lds_size = ngg_nogs_vertex_size(shader); - } - - unsigned max_gsprims = max_gsprims_base; - unsigned max_esverts = max_esverts_base; - - if (esvert_lds_size) - max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size); - if (gsprim_lds_size) - max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size); - - max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); - clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency); - assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); - - if (esvert_lds_size || gsprim_lds_size) { - /* Now that we have a rough proportionality between esverts - * and gsprims based on the primitive type, scale both of them - * down simultaneously based on required LDS space. - * - * We could be smarter about this if we knew how much vertex - * reuse to expect. - */ - unsigned lds_total = max_esverts * esvert_lds_size + - max_gsprims * gsprim_lds_size; - if (lds_total > target_lds_size) { - max_esverts = max_esverts * target_lds_size / lds_total; - max_gsprims = max_gsprims * target_lds_size / lds_total; - - max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); - clamp_gsprims_to_esverts(&max_gsprims, max_esverts, - min_verts_per_prim, use_adjacency); - assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); - } - } - - /* Round up towards full wave sizes for better ALU utilization. */ - if (!max_vert_out_per_gs_instance) { - const unsigned wavesize = gs_sel->screen->ge_wave_size; - unsigned orig_max_esverts; - unsigned orig_max_gsprims; - do { - orig_max_esverts = max_esverts; - orig_max_gsprims = max_gsprims; - - max_esverts = align(max_esverts, wavesize); - max_esverts = MIN2(max_esverts, max_esverts_base); - if (esvert_lds_size) - max_esverts = MIN2(max_esverts, - (max_lds_size - max_gsprims * gsprim_lds_size) / - esvert_lds_size); - max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); - - max_gsprims = align(max_gsprims, wavesize); - max_gsprims = MIN2(max_gsprims, max_gsprims_base); - if (gsprim_lds_size) - max_gsprims = MIN2(max_gsprims, - (max_lds_size - max_esverts * esvert_lds_size) / - gsprim_lds_size); - clamp_gsprims_to_esverts(&max_gsprims, max_esverts, - min_verts_per_prim, use_adjacency); - assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); - } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims); - } - - /* Hardware restriction: minimum value of max_esverts */ - max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim); - - unsigned max_out_vertices = - max_vert_out_per_gs_instance ? gs_sel->gs_max_out_vertices : - gs_type == PIPE_SHADER_GEOMETRY ? - max_gsprims * gs_num_invocations * gs_sel->gs_max_out_vertices : - max_esverts; - assert(max_out_vertices <= 256); - - unsigned prim_amp_factor = 1; - if (gs_type == PIPE_SHADER_GEOMETRY) { - /* Number of output primitives per GS input primitive after - * GS instancing. */ - prim_amp_factor = gs_sel->gs_max_out_vertices; - } - - /* The GE only checks against the maximum number of ES verts after - * allocating a full GS primitive. So we need to ensure that whenever - * this check passes, there is enough space for a full primitive without - * vertex reuse. 
- */ - shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1; - shader->ngg.max_gsprims = max_gsprims; - shader->ngg.max_out_verts = max_out_vertices; - shader->ngg.prim_amp_factor = prim_amp_factor; - shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance; - - shader->gs_info.esgs_ring_size = 4 * max_esverts * esvert_lds_size; - shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size; - - assert(shader->ngg.hw_max_esverts >= 24); /* HW limitation */ + const struct si_shader_selector *gs_sel = shader->selector; + const struct si_shader_selector *es_sel = + shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel; + const enum pipe_shader_type gs_type = gs_sel->type; + const unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1); + const unsigned input_prim = si_get_input_prim(gs_sel); + const bool use_adjacency = + input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; + const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim); + const unsigned min_verts_per_prim = gs_type == PIPE_SHADER_GEOMETRY ? max_verts_per_prim : 1; + + /* All these are in dwords: */ + /* We can't allow using the whole LDS, because GS waves compete with + * other shader stages for LDS space. + * + * TODO: We should really take the shader's internal LDS use into + * account. The linker will fail if the size is greater than + * 8K dwords. + */ + const unsigned max_lds_size = 8 * 1024 - 768; + const unsigned target_lds_size = max_lds_size; + unsigned esvert_lds_size = 0; + unsigned gsprim_lds_size = 0; + + /* All these are per subgroup: */ + bool max_vert_out_per_gs_instance = false; + unsigned max_gsprims_base = 128; /* default prim group size clamp */ + unsigned max_esverts_base = 128; + + if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { + max_gsprims_base = 128 / 3; + max_esverts_base = max_gsprims_base * 3; + } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { + max_gsprims_base = 126; + max_esverts_base = 128; + } + + /* Hardware has the following non-natural restrictions on the value + * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of + * the draw: + * - at most 252 for any line input primitive type + * - at most 251 for any quad input primitive type + * - at most 251 for triangle strips with adjacency (this happens to + * be the natural limit for triangle *lists* with adjacency) + */ + max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1); + + if (gs_type == PIPE_SHADER_GEOMETRY) { + unsigned max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices * gs_num_invocations; + + if (max_out_verts_per_gsprim <= 256) { + if (max_out_verts_per_gsprim) { + max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim); + } + } else { + /* Use special multi-cycling mode in which each GS + * instance gets its own subgroup. Does not work with + * tessellation. */ + max_vert_out_per_gs_instance = true; + max_gsprims_base = 1; + max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices; + } + + esvert_lds_size = es_sel->esgs_itemsize / 4; + gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim; + } else { + /* VS and TES. */ + /* LDS size for passing data from ES to GS. 
*/ + esvert_lds_size = ngg_nogs_vertex_size(shader); + } + + unsigned max_gsprims = max_gsprims_base; + unsigned max_esverts = max_esverts_base; + + if (esvert_lds_size) + max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size); + if (gsprim_lds_size) + max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size); + + max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency); + assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); + + if (esvert_lds_size || gsprim_lds_size) { + /* Now that we have a rough proportionality between esverts + * and gsprims based on the primitive type, scale both of them + * down simultaneously based on required LDS space. + * + * We could be smarter about this if we knew how much vertex + * reuse to expect. + */ + unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size; + if (lds_total > target_lds_size) { + max_esverts = max_esverts * target_lds_size / lds_total; + max_gsprims = max_gsprims * target_lds_size / lds_total; + + max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency); + assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); + } + } + + /* Round up towards full wave sizes for better ALU utilization. */ + if (!max_vert_out_per_gs_instance) { + const unsigned wavesize = gs_sel->screen->ge_wave_size; + unsigned orig_max_esverts; + unsigned orig_max_gsprims; + do { + orig_max_esverts = max_esverts; + orig_max_gsprims = max_gsprims; + + max_esverts = align(max_esverts, wavesize); + max_esverts = MIN2(max_esverts, max_esverts_base); + if (esvert_lds_size) + max_esverts = + MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size); + max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim); + + max_gsprims = align(max_gsprims, wavesize); + max_gsprims = MIN2(max_gsprims, max_gsprims_base); + if (gsprim_lds_size) + max_gsprims = + MIN2(max_gsprims, (max_lds_size - max_esverts * esvert_lds_size) / gsprim_lds_size); + clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency); + assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1); + } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims); + } + + /* Hardware restriction: minimum value of max_esverts */ + max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim); + + unsigned max_out_vertices = + max_vert_out_per_gs_instance + ? gs_sel->gs_max_out_vertices + : gs_type == PIPE_SHADER_GEOMETRY + ? max_gsprims * gs_num_invocations * gs_sel->gs_max_out_vertices + : max_esverts; + assert(max_out_vertices <= 256); + + unsigned prim_amp_factor = 1; + if (gs_type == PIPE_SHADER_GEOMETRY) { + /* Number of output primitives per GS input primitive after + * GS instancing. */ + prim_amp_factor = gs_sel->gs_max_out_vertices; + } + + /* The GE only checks against the maximum number of ES verts after + * allocating a full GS primitive. So we need to ensure that whenever + * this check passes, there is enough space for a full primitive without + * vertex reuse. 
+ */ + shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1; + shader->ngg.max_gsprims = max_gsprims; + shader->ngg.max_out_verts = max_out_vertices; + shader->ngg.prim_amp_factor = prim_amp_factor; + shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance; + + shader->gs_info.esgs_ring_size = 4 * max_esverts * esvert_lds_size; + shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size; + + assert(shader->ngg.hw_max_esverts >= 24); /* HW limitation */ } diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index e662de16127..ab69c7e4ddd 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -23,1346 +23,1220 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_pipe.h" #include "si_compute.h" +#include "si_pipe.h" #include "util/format/u_format.h" #include "util/u_log.h" #include "util/u_surface.h" -enum { - SI_COPY = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | - SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND, +enum +{ + SI_COPY = + SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND, - SI_BLIT = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | - SI_SAVE_FRAGMENT_STATE, + SI_BLIT = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | SI_SAVE_FRAGMENT_STATE, - SI_DECOMPRESS = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE | - SI_DISABLE_RENDER_COND, + SI_DECOMPRESS = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND, - SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE + SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE }; void si_blitter_begin(struct si_context *sctx, enum si_blitter_op op) { - util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso); - util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader.cso); - util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso); - util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso); - util_blitter_save_so_targets(sctx->blitter, sctx->streamout.num_targets, - (struct pipe_stream_output_target**)sctx->streamout.targets); - util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer); - - if (op & SI_SAVE_FRAGMENT_STATE) { - util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend); - util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa); - util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state); - util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso); - util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask); - util_blitter_save_scissor(sctx->blitter, &sctx->scissors[0]); - util_blitter_save_window_rectangles(sctx->blitter, - sctx->window_rectangles_include, - sctx->num_window_rectangles, - sctx->window_rectangles); - } - - if (op & SI_SAVE_FRAMEBUFFER) - util_blitter_save_framebuffer(sctx->blitter, &sctx->framebuffer.state); - - if (op & SI_SAVE_TEXTURES) { - util_blitter_save_fragment_sampler_states( - sctx->blitter, 2, - (void**)sctx->samplers[PIPE_SHADER_FRAGMENT].sampler_states); - - util_blitter_save_fragment_sampler_views(sctx->blitter, 2, - sctx->samplers[PIPE_SHADER_FRAGMENT].views); - } - - if (op & SI_DISABLE_RENDER_COND) - sctx->render_cond_force_off = true; - - if (sctx->screen->dpbb_allowed) { - sctx->dpbb_force_off = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - } + util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso); + util_blitter_save_tessctrl_shader(sctx->blitter, 
sctx->tcs_shader.cso); + util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso); + util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso); + util_blitter_save_so_targets(sctx->blitter, sctx->streamout.num_targets, + (struct pipe_stream_output_target **)sctx->streamout.targets); + util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer); + + if (op & SI_SAVE_FRAGMENT_STATE) { + util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend); + util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa); + util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state); + util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso); + util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask); + util_blitter_save_scissor(sctx->blitter, &sctx->scissors[0]); + util_blitter_save_window_rectangles(sctx->blitter, sctx->window_rectangles_include, + sctx->num_window_rectangles, sctx->window_rectangles); + } + + if (op & SI_SAVE_FRAMEBUFFER) + util_blitter_save_framebuffer(sctx->blitter, &sctx->framebuffer.state); + + if (op & SI_SAVE_TEXTURES) { + util_blitter_save_fragment_sampler_states( + sctx->blitter, 2, (void **)sctx->samplers[PIPE_SHADER_FRAGMENT].sampler_states); + + util_blitter_save_fragment_sampler_views(sctx->blitter, 2, + sctx->samplers[PIPE_SHADER_FRAGMENT].views); + } + + if (op & SI_DISABLE_RENDER_COND) + sctx->render_cond_force_off = true; + + if (sctx->screen->dpbb_allowed) { + sctx->dpbb_force_off = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + } } void si_blitter_end(struct si_context *sctx) { - if (sctx->screen->dpbb_allowed) { - sctx->dpbb_force_off = false; - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - } - - sctx->render_cond_force_off = false; - - /* Restore shader pointers because the VS blit shader changed all - * non-global VS user SGPRs. */ - sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX); - sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; - sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0; - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + if (sctx->screen->dpbb_allowed) { + sctx->dpbb_force_off = false; + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + } + + sctx->render_cond_force_off = false; + + /* Restore shader pointers because the VS blit shader changed all + * non-global VS user SGPRs. */ + sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX); + sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; + sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0; + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); } static unsigned u_max_sample(struct pipe_resource *r) { - return r->nr_samples ? r->nr_samples - 1 : 0; + return r->nr_samples ? 
r->nr_samples - 1 : 0; } -static unsigned -si_blit_dbcb_copy(struct si_context *sctx, - struct si_texture *src, - struct si_texture *dst, - unsigned planes, unsigned level_mask, - unsigned first_layer, unsigned last_layer, - unsigned first_sample, unsigned last_sample) +static unsigned si_blit_dbcb_copy(struct si_context *sctx, struct si_texture *src, + struct si_texture *dst, unsigned planes, unsigned level_mask, + unsigned first_layer, unsigned last_layer, unsigned first_sample, + unsigned last_sample) { - struct pipe_surface surf_tmpl = {{0}}; - unsigned layer, sample, checked_last_layer, max_layer; - unsigned fully_copied_levels = 0; + struct pipe_surface surf_tmpl = {{0}}; + unsigned layer, sample, checked_last_layer, max_layer; + unsigned fully_copied_levels = 0; - if (planes & PIPE_MASK_Z) - sctx->dbcb_depth_copy_enabled = true; - if (planes & PIPE_MASK_S) - sctx->dbcb_stencil_copy_enabled = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + if (planes & PIPE_MASK_Z) + sctx->dbcb_depth_copy_enabled = true; + if (planes & PIPE_MASK_S) + sctx->dbcb_stencil_copy_enabled = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - assert(sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled); + assert(sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled); - sctx->decompression_enabled = true; + sctx->decompression_enabled = true; - while (level_mask) { - unsigned level = u_bit_scan(&level_mask); + while (level_mask) { + unsigned level = u_bit_scan(&level_mask); - /* The smaller the mipmap level, the less layers there are - * as far as 3D textures are concerned. */ - max_layer = util_max_layer(&src->buffer.b.b, level); - checked_last_layer = MIN2(last_layer, max_layer); + /* The smaller the mipmap level, the less layers there are + * as far as 3D textures are concerned. 
*/ + max_layer = util_max_layer(&src->buffer.b.b, level); + checked_last_layer = MIN2(last_layer, max_layer); - surf_tmpl.u.tex.level = level; + surf_tmpl.u.tex.level = level; - for (layer = first_layer; layer <= checked_last_layer; layer++) { - struct pipe_surface *zsurf, *cbsurf; + for (layer = first_layer; layer <= checked_last_layer; layer++) { + struct pipe_surface *zsurf, *cbsurf; - surf_tmpl.format = src->buffer.b.b.format; - surf_tmpl.u.tex.first_layer = layer; - surf_tmpl.u.tex.last_layer = layer; + surf_tmpl.format = src->buffer.b.b.format; + surf_tmpl.u.tex.first_layer = layer; + surf_tmpl.u.tex.last_layer = layer; - zsurf = sctx->b.create_surface(&sctx->b, &src->buffer.b.b, &surf_tmpl); + zsurf = sctx->b.create_surface(&sctx->b, &src->buffer.b.b, &surf_tmpl); - surf_tmpl.format = dst->buffer.b.b.format; - cbsurf = sctx->b.create_surface(&sctx->b, &dst->buffer.b.b, &surf_tmpl); + surf_tmpl.format = dst->buffer.b.b.format; + cbsurf = sctx->b.create_surface(&sctx->b, &dst->buffer.b.b, &surf_tmpl); - for (sample = first_sample; sample <= last_sample; sample++) { - if (sample != sctx->dbcb_copy_sample) { - sctx->dbcb_copy_sample = sample; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } + for (sample = first_sample; sample <= last_sample; sample++) { + if (sample != sctx->dbcb_copy_sample) { + sctx->dbcb_copy_sample = sample; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } - si_blitter_begin(sctx, SI_DECOMPRESS); - util_blitter_custom_depth_stencil(sctx->blitter, zsurf, cbsurf, 1 << sample, - sctx->custom_dsa_flush, 1.0f); - si_blitter_end(sctx); - } + si_blitter_begin(sctx, SI_DECOMPRESS); + util_blitter_custom_depth_stencil(sctx->blitter, zsurf, cbsurf, 1 << sample, + sctx->custom_dsa_flush, 1.0f); + si_blitter_end(sctx); + } - pipe_surface_reference(&zsurf, NULL); - pipe_surface_reference(&cbsurf, NULL); - } + pipe_surface_reference(&zsurf, NULL); + pipe_surface_reference(&cbsurf, NULL); + } - if (first_layer == 0 && last_layer >= max_layer && - first_sample == 0 && last_sample >= u_max_sample(&src->buffer.b.b)) - fully_copied_levels |= 1u << level; - } + if (first_layer == 0 && last_layer >= max_layer && first_sample == 0 && + last_sample >= u_max_sample(&src->buffer.b.b)) + fully_copied_levels |= 1u << level; + } - sctx->decompression_enabled = false; - sctx->dbcb_depth_copy_enabled = false; - sctx->dbcb_stencil_copy_enabled = false; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + sctx->decompression_enabled = false; + sctx->dbcb_depth_copy_enabled = false; + sctx->dbcb_stencil_copy_enabled = false; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - return fully_copied_levels; + return fully_copied_levels; } /* Helper function for si_blit_decompress_zs_in_place. 
*/ -static void -si_blit_decompress_zs_planes_in_place(struct si_context *sctx, - struct si_texture *texture, - unsigned planes, unsigned level_mask, - unsigned first_layer, unsigned last_layer) +static void si_blit_decompress_zs_planes_in_place(struct si_context *sctx, + struct si_texture *texture, unsigned planes, + unsigned level_mask, unsigned first_layer, + unsigned last_layer) { - struct pipe_surface *zsurf, surf_tmpl = {{0}}; - unsigned layer, max_layer, checked_last_layer; - unsigned fully_decompressed_mask = 0; + struct pipe_surface *zsurf, surf_tmpl = {{0}}; + unsigned layer, max_layer, checked_last_layer; + unsigned fully_decompressed_mask = 0; - if (!level_mask) - return; + if (!level_mask) + return; - if (planes & PIPE_MASK_S) - sctx->db_flush_stencil_inplace = true; - if (planes & PIPE_MASK_Z) - sctx->db_flush_depth_inplace = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + if (planes & PIPE_MASK_S) + sctx->db_flush_stencil_inplace = true; + if (planes & PIPE_MASK_Z) + sctx->db_flush_depth_inplace = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - surf_tmpl.format = texture->buffer.b.b.format; + surf_tmpl.format = texture->buffer.b.b.format; - sctx->decompression_enabled = true; + sctx->decompression_enabled = true; - while (level_mask) { - unsigned level = u_bit_scan(&level_mask); + while (level_mask) { + unsigned level = u_bit_scan(&level_mask); - surf_tmpl.u.tex.level = level; + surf_tmpl.u.tex.level = level; - /* The smaller the mipmap level, the less layers there are - * as far as 3D textures are concerned. */ - max_layer = util_max_layer(&texture->buffer.b.b, level); - checked_last_layer = MIN2(last_layer, max_layer); + /* The smaller the mipmap level, the less layers there are + * as far as 3D textures are concerned. */ + max_layer = util_max_layer(&texture->buffer.b.b, level); + checked_last_layer = MIN2(last_layer, max_layer); - for (layer = first_layer; layer <= checked_last_layer; layer++) { - surf_tmpl.u.tex.first_layer = layer; - surf_tmpl.u.tex.last_layer = layer; + for (layer = first_layer; layer <= checked_last_layer; layer++) { + surf_tmpl.u.tex.first_layer = layer; + surf_tmpl.u.tex.last_layer = layer; - zsurf = sctx->b.create_surface(&sctx->b, &texture->buffer.b.b, &surf_tmpl); + zsurf = sctx->b.create_surface(&sctx->b, &texture->buffer.b.b, &surf_tmpl); - si_blitter_begin(sctx, SI_DECOMPRESS); - util_blitter_custom_depth_stencil(sctx->blitter, zsurf, NULL, ~0, - sctx->custom_dsa_flush, - 1.0f); - si_blitter_end(sctx); + si_blitter_begin(sctx, SI_DECOMPRESS); + util_blitter_custom_depth_stencil(sctx->blitter, zsurf, NULL, ~0, sctx->custom_dsa_flush, + 1.0f); + si_blitter_end(sctx); - pipe_surface_reference(&zsurf, NULL); - } + pipe_surface_reference(&zsurf, NULL); + } - /* The texture will always be dirty if some layers aren't flushed. - * I don't think this case occurs often though. */ - if (first_layer == 0 && last_layer >= max_layer) { - fully_decompressed_mask |= 1u << level; - } - } + /* The texture will always be dirty if some layers aren't flushed. + * I don't think this case occurs often though. 
*/ + if (first_layer == 0 && last_layer >= max_layer) { + fully_decompressed_mask |= 1u << level; + } + } - if (planes & PIPE_MASK_Z) - texture->dirty_level_mask &= ~fully_decompressed_mask; - if (planes & PIPE_MASK_S) - texture->stencil_dirty_level_mask &= ~fully_decompressed_mask; + if (planes & PIPE_MASK_Z) + texture->dirty_level_mask &= ~fully_decompressed_mask; + if (planes & PIPE_MASK_S) + texture->stencil_dirty_level_mask &= ~fully_decompressed_mask; - sctx->decompression_enabled = false; - sctx->db_flush_depth_inplace = false; - sctx->db_flush_stencil_inplace = false; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + sctx->decompression_enabled = false; + sctx->db_flush_depth_inplace = false; + sctx->db_flush_stencil_inplace = false; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); } /* Helper function of si_flush_depth_texture: decompress the given levels * of Z and/or S planes in place. */ -static void -si_blit_decompress_zs_in_place(struct si_context *sctx, - struct si_texture *texture, - unsigned levels_z, unsigned levels_s, - unsigned first_layer, unsigned last_layer) +static void si_blit_decompress_zs_in_place(struct si_context *sctx, struct si_texture *texture, + unsigned levels_z, unsigned levels_s, + unsigned first_layer, unsigned last_layer) { - unsigned both = levels_z & levels_s; - - /* First, do combined Z & S decompresses for levels that need it. */ - if (both) { - si_blit_decompress_zs_planes_in_place( - sctx, texture, PIPE_MASK_Z | PIPE_MASK_S, - both, - first_layer, last_layer); - levels_z &= ~both; - levels_s &= ~both; - } - - /* Now do separate Z and S decompresses. */ - if (levels_z) { - si_blit_decompress_zs_planes_in_place( - sctx, texture, PIPE_MASK_Z, - levels_z, - first_layer, last_layer); - } - - if (levels_s) { - si_blit_decompress_zs_planes_in_place( - sctx, texture, PIPE_MASK_S, - levels_s, - first_layer, last_layer); - } + unsigned both = levels_z & levels_s; + + /* First, do combined Z & S decompresses for levels that need it. */ + if (both) { + si_blit_decompress_zs_planes_in_place(sctx, texture, PIPE_MASK_Z | PIPE_MASK_S, both, + first_layer, last_layer); + levels_z &= ~both; + levels_s &= ~both; + } + + /* Now do separate Z and S decompresses. 
*/ + if (levels_z) { + si_blit_decompress_zs_planes_in_place(sctx, texture, PIPE_MASK_Z, levels_z, first_layer, + last_layer); + } + + if (levels_s) { + si_blit_decompress_zs_planes_in_place(sctx, texture, PIPE_MASK_S, levels_s, first_layer, + last_layer); + } } -static void -si_decompress_depth(struct si_context *sctx, - struct si_texture *tex, - unsigned required_planes, - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer) +static void si_decompress_depth(struct si_context *sctx, struct si_texture *tex, + unsigned required_planes, unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer) { - unsigned inplace_planes = 0; - unsigned copy_planes = 0; - unsigned level_mask = u_bit_consecutive(first_level, last_level - first_level + 1); - unsigned levels_z = 0; - unsigned levels_s = 0; - - if (required_planes & PIPE_MASK_Z) { - levels_z = level_mask & tex->dirty_level_mask; - - if (levels_z) { - if (si_can_sample_zs(tex, false)) - inplace_planes |= PIPE_MASK_Z; - else - copy_planes |= PIPE_MASK_Z; - } - } - if (required_planes & PIPE_MASK_S) { - levels_s = level_mask & tex->stencil_dirty_level_mask; - - if (levels_s) { - if (si_can_sample_zs(tex, true)) - inplace_planes |= PIPE_MASK_S; - else - copy_planes |= PIPE_MASK_S; - } - } - - if (unlikely(sctx->log)) - u_log_printf(sctx->log, - "\n------------------------------------------------\n" - "Decompress Depth (levels %u - %u, levels Z: 0x%x S: 0x%x)\n\n", - first_level, last_level, levels_z, levels_s); - - /* We may have to allocate the flushed texture here when called from - * si_decompress_subresource. - */ - if (copy_planes && - (tex->flushed_depth_texture || - si_init_flushed_depth_texture(&sctx->b, &tex->buffer.b.b))) { - struct si_texture *dst = tex->flushed_depth_texture; - unsigned fully_copied_levels; - unsigned levels = 0; - - assert(tex->flushed_depth_texture); - - if (util_format_is_depth_and_stencil(dst->buffer.b.b.format)) - copy_planes = PIPE_MASK_Z | PIPE_MASK_S; - - if (copy_planes & PIPE_MASK_Z) { - levels |= levels_z; - levels_z = 0; - } - if (copy_planes & PIPE_MASK_S) { - levels |= levels_s; - levels_s = 0; - } - - fully_copied_levels = si_blit_dbcb_copy( - sctx, tex, dst, copy_planes, levels, - first_layer, last_layer, - 0, u_max_sample(&tex->buffer.b.b)); - - if (copy_planes & PIPE_MASK_Z) - tex->dirty_level_mask &= ~fully_copied_levels; - if (copy_planes & PIPE_MASK_S) - tex->stencil_dirty_level_mask &= ~fully_copied_levels; - } - - if (inplace_planes) { - bool has_htile = si_htile_enabled(tex, first_level, inplace_planes); - bool tc_compat_htile = vi_tc_compat_htile_enabled(tex, first_level, - inplace_planes); - - /* Don't decompress if there is no HTILE or when HTILE is - * TC-compatible. */ - if (has_htile && !tc_compat_htile) { - si_blit_decompress_zs_in_place( - sctx, tex, - levels_z, levels_s, - first_layer, last_layer); - } else { - /* This is only a cache flush. - * - * Only clear the mask that we are flushing, because - * si_make_DB_shader_coherent() treats different levels - * and depth and stencil differently. - */ - if (inplace_planes & PIPE_MASK_Z) - tex->dirty_level_mask &= ~levels_z; - if (inplace_planes & PIPE_MASK_S) - tex->stencil_dirty_level_mask &= ~levels_s; - } - - /* Only in-place decompression needs to flush DB caches, or - * when we don't decompress but TC-compatible planes are dirty. 
- */ - si_make_DB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, - inplace_planes & PIPE_MASK_S, - tc_compat_htile); - } - /* set_framebuffer_state takes care of coherency for single-sample. - * The DB->CB copy uses CB for the final writes. - */ - if (copy_planes && tex->buffer.b.b.nr_samples > 1) - si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, - false, true /* no DCC */); + unsigned inplace_planes = 0; + unsigned copy_planes = 0; + unsigned level_mask = u_bit_consecutive(first_level, last_level - first_level + 1); + unsigned levels_z = 0; + unsigned levels_s = 0; + + if (required_planes & PIPE_MASK_Z) { + levels_z = level_mask & tex->dirty_level_mask; + + if (levels_z) { + if (si_can_sample_zs(tex, false)) + inplace_planes |= PIPE_MASK_Z; + else + copy_planes |= PIPE_MASK_Z; + } + } + if (required_planes & PIPE_MASK_S) { + levels_s = level_mask & tex->stencil_dirty_level_mask; + + if (levels_s) { + if (si_can_sample_zs(tex, true)) + inplace_planes |= PIPE_MASK_S; + else + copy_planes |= PIPE_MASK_S; + } + } + + if (unlikely(sctx->log)) + u_log_printf(sctx->log, + "\n------------------------------------------------\n" + "Decompress Depth (levels %u - %u, levels Z: 0x%x S: 0x%x)\n\n", + first_level, last_level, levels_z, levels_s); + + /* We may have to allocate the flushed texture here when called from + * si_decompress_subresource. + */ + if (copy_planes && + (tex->flushed_depth_texture || si_init_flushed_depth_texture(&sctx->b, &tex->buffer.b.b))) { + struct si_texture *dst = tex->flushed_depth_texture; + unsigned fully_copied_levels; + unsigned levels = 0; + + assert(tex->flushed_depth_texture); + + if (util_format_is_depth_and_stencil(dst->buffer.b.b.format)) + copy_planes = PIPE_MASK_Z | PIPE_MASK_S; + + if (copy_planes & PIPE_MASK_Z) { + levels |= levels_z; + levels_z = 0; + } + if (copy_planes & PIPE_MASK_S) { + levels |= levels_s; + levels_s = 0; + } + + fully_copied_levels = si_blit_dbcb_copy(sctx, tex, dst, copy_planes, levels, first_layer, + last_layer, 0, u_max_sample(&tex->buffer.b.b)); + + if (copy_planes & PIPE_MASK_Z) + tex->dirty_level_mask &= ~fully_copied_levels; + if (copy_planes & PIPE_MASK_S) + tex->stencil_dirty_level_mask &= ~fully_copied_levels; + } + + if (inplace_planes) { + bool has_htile = si_htile_enabled(tex, first_level, inplace_planes); + bool tc_compat_htile = vi_tc_compat_htile_enabled(tex, first_level, inplace_planes); + + /* Don't decompress if there is no HTILE or when HTILE is + * TC-compatible. */ + if (has_htile && !tc_compat_htile) { + si_blit_decompress_zs_in_place(sctx, tex, levels_z, levels_s, first_layer, last_layer); + } else { + /* This is only a cache flush. + * + * Only clear the mask that we are flushing, because + * si_make_DB_shader_coherent() treats different levels + * and depth and stencil differently. + */ + if (inplace_planes & PIPE_MASK_Z) + tex->dirty_level_mask &= ~levels_z; + if (inplace_planes & PIPE_MASK_S) + tex->stencil_dirty_level_mask &= ~levels_s; + } + + /* Only in-place decompression needs to flush DB caches, or + * when we don't decompress but TC-compatible planes are dirty. + */ + si_make_DB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, inplace_planes & PIPE_MASK_S, + tc_compat_htile); + } + /* set_framebuffer_state takes care of coherency for single-sample. + * The DB->CB copy uses CB for the final writes. 
+ */ + if (copy_planes && tex->buffer.b.b.nr_samples > 1) + si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, false, true /* no DCC */); } -static void -si_decompress_sampler_depth_textures(struct si_context *sctx, - struct si_samplers *textures) +static void si_decompress_sampler_depth_textures(struct si_context *sctx, + struct si_samplers *textures) { - unsigned i; - unsigned mask = textures->needs_depth_decompress_mask; + unsigned i; + unsigned mask = textures->needs_depth_decompress_mask; - while (mask) { - struct pipe_sampler_view *view; - struct si_sampler_view *sview; - struct si_texture *tex; + while (mask) { + struct pipe_sampler_view *view; + struct si_sampler_view *sview; + struct si_texture *tex; - i = u_bit_scan(&mask); + i = u_bit_scan(&mask); - view = textures->views[i]; - assert(view); - sview = (struct si_sampler_view*)view; + view = textures->views[i]; + assert(view); + sview = (struct si_sampler_view *)view; - tex = (struct si_texture *)view->texture; - assert(tex->db_compatible); + tex = (struct si_texture *)view->texture; + assert(tex->db_compatible); - si_decompress_depth(sctx, tex, - sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z, - view->u.tex.first_level, view->u.tex.last_level, - 0, util_max_layer(&tex->buffer.b.b, view->u.tex.first_level)); - } + si_decompress_depth(sctx, tex, sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z, + view->u.tex.first_level, view->u.tex.last_level, 0, + util_max_layer(&tex->buffer.b.b, view->u.tex.first_level)); + } } -static void si_blit_decompress_color(struct si_context *sctx, - struct si_texture *tex, - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - bool need_dcc_decompress, - bool need_fmask_expand) +static void si_blit_decompress_color(struct si_context *sctx, struct si_texture *tex, + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + bool need_dcc_decompress, bool need_fmask_expand) { - void* custom_blend; - unsigned layer, checked_last_layer, max_layer; - unsigned level_mask = - u_bit_consecutive(first_level, last_level - first_level + 1); - - if (!need_dcc_decompress) - level_mask &= tex->dirty_level_mask; - if (!level_mask) - goto expand_fmask; - - if (unlikely(sctx->log)) - u_log_printf(sctx->log, - "\n------------------------------------------------\n" - "Decompress Color (levels %u - %u, mask 0x%x)\n\n", - first_level, last_level, level_mask); - - if (need_dcc_decompress) { - custom_blend = sctx->custom_blend_dcc_decompress; - - assert(tex->surface.dcc_offset); - - /* disable levels without DCC */ - for (int i = first_level; i <= last_level; i++) { - if (!vi_dcc_enabled(tex, i)) - level_mask &= ~(1 << i); - } - } else if (tex->surface.fmask_size) { - custom_blend = sctx->custom_blend_fmask_decompress; - } else { - custom_blend = sctx->custom_blend_eliminate_fastclear; - } - - sctx->decompression_enabled = true; - - while (level_mask) { - unsigned level = u_bit_scan(&level_mask); - - /* The smaller the mipmap level, the less layers there are - * as far as 3D textures are concerned. 
*/ - max_layer = util_max_layer(&tex->buffer.b.b, level); - checked_last_layer = MIN2(last_layer, max_layer); - - for (layer = first_layer; layer <= checked_last_layer; layer++) { - struct pipe_surface *cbsurf, surf_tmpl; - - surf_tmpl.format = tex->buffer.b.b.format; - surf_tmpl.u.tex.level = level; - surf_tmpl.u.tex.first_layer = layer; - surf_tmpl.u.tex.last_layer = layer; - cbsurf = sctx->b.create_surface(&sctx->b, &tex->buffer.b.b, &surf_tmpl); - - /* Required before and after FMASK and DCC_DECOMPRESS. */ - if (custom_blend == sctx->custom_blend_fmask_decompress || - custom_blend == sctx->custom_blend_dcc_decompress) - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; - - si_blitter_begin(sctx, SI_DECOMPRESS); - util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend); - si_blitter_end(sctx); - - if (custom_blend == sctx->custom_blend_fmask_decompress || - custom_blend == sctx->custom_blend_dcc_decompress) - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; - - pipe_surface_reference(&cbsurf, NULL); - } - - /* The texture will always be dirty if some layers aren't flushed. - * I don't think this case occurs often though. */ - if (first_layer == 0 && last_layer >= max_layer) { - tex->dirty_level_mask &= ~(1 << level); - } - } - - sctx->decompression_enabled = false; - si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, - vi_dcc_enabled(tex, first_level), - tex->surface.u.gfx9.dcc.pipe_aligned); + void *custom_blend; + unsigned layer, checked_last_layer, max_layer; + unsigned level_mask = u_bit_consecutive(first_level, last_level - first_level + 1); + + if (!need_dcc_decompress) + level_mask &= tex->dirty_level_mask; + if (!level_mask) + goto expand_fmask; + + if (unlikely(sctx->log)) + u_log_printf(sctx->log, + "\n------------------------------------------------\n" + "Decompress Color (levels %u - %u, mask 0x%x)\n\n", + first_level, last_level, level_mask); + + if (need_dcc_decompress) { + custom_blend = sctx->custom_blend_dcc_decompress; + + assert(tex->surface.dcc_offset); + + /* disable levels without DCC */ + for (int i = first_level; i <= last_level; i++) { + if (!vi_dcc_enabled(tex, i)) + level_mask &= ~(1 << i); + } + } else if (tex->surface.fmask_size) { + custom_blend = sctx->custom_blend_fmask_decompress; + } else { + custom_blend = sctx->custom_blend_eliminate_fastclear; + } + + sctx->decompression_enabled = true; + + while (level_mask) { + unsigned level = u_bit_scan(&level_mask); + + /* The smaller the mipmap level, the less layers there are + * as far as 3D textures are concerned. */ + max_layer = util_max_layer(&tex->buffer.b.b, level); + checked_last_layer = MIN2(last_layer, max_layer); + + for (layer = first_layer; layer <= checked_last_layer; layer++) { + struct pipe_surface *cbsurf, surf_tmpl; + + surf_tmpl.format = tex->buffer.b.b.format; + surf_tmpl.u.tex.level = level; + surf_tmpl.u.tex.first_layer = layer; + surf_tmpl.u.tex.last_layer = layer; + cbsurf = sctx->b.create_surface(&sctx->b, &tex->buffer.b.b, &surf_tmpl); + + /* Required before and after FMASK and DCC_DECOMPRESS. 
*/ + if (custom_blend == sctx->custom_blend_fmask_decompress || + custom_blend == sctx->custom_blend_dcc_decompress) + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; + + si_blitter_begin(sctx, SI_DECOMPRESS); + util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend); + si_blitter_end(sctx); + + if (custom_blend == sctx->custom_blend_fmask_decompress || + custom_blend == sctx->custom_blend_dcc_decompress) + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; + + pipe_surface_reference(&cbsurf, NULL); + } + + /* The texture will always be dirty if some layers aren't flushed. + * I don't think this case occurs often though. */ + if (first_layer == 0 && last_layer >= max_layer) { + tex->dirty_level_mask &= ~(1 << level); + } + } + + sctx->decompression_enabled = false; + si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples, vi_dcc_enabled(tex, first_level), + tex->surface.u.gfx9.dcc.pipe_aligned); expand_fmask: - if (need_fmask_expand && tex->surface.fmask_offset && !tex->fmask_is_identity) { - si_compute_expand_fmask(&sctx->b, &tex->buffer.b.b); - tex->fmask_is_identity = true; - } + if (need_fmask_expand && tex->surface.fmask_offset && !tex->fmask_is_identity) { + si_compute_expand_fmask(&sctx->b, &tex->buffer.b.b); + tex->fmask_is_identity = true; + } } -static void -si_decompress_color_texture(struct si_context *sctx, struct si_texture *tex, - unsigned first_level, unsigned last_level, - bool need_fmask_expand) +static void si_decompress_color_texture(struct si_context *sctx, struct si_texture *tex, + unsigned first_level, unsigned last_level, + bool need_fmask_expand) { - /* CMASK or DCC can be discarded and we can still end up here. */ - if (!tex->cmask_buffer && !tex->surface.fmask_size && !tex->surface.dcc_offset) - return; + /* CMASK or DCC can be discarded and we can still end up here. 
*/ + if (!tex->cmask_buffer && !tex->surface.fmask_size && !tex->surface.dcc_offset) + return; - si_blit_decompress_color(sctx, tex, first_level, last_level, 0, - util_max_layer(&tex->buffer.b.b, first_level), - false, need_fmask_expand); + si_blit_decompress_color(sctx, tex, first_level, last_level, 0, + util_max_layer(&tex->buffer.b.b, first_level), false, + need_fmask_expand); } -static void -si_decompress_sampler_color_textures(struct si_context *sctx, - struct si_samplers *textures) +static void si_decompress_sampler_color_textures(struct si_context *sctx, + struct si_samplers *textures) { - unsigned i; - unsigned mask = textures->needs_color_decompress_mask; + unsigned i; + unsigned mask = textures->needs_color_decompress_mask; - while (mask) { - struct pipe_sampler_view *view; - struct si_texture *tex; + while (mask) { + struct pipe_sampler_view *view; + struct si_texture *tex; - i = u_bit_scan(&mask); + i = u_bit_scan(&mask); - view = textures->views[i]; - assert(view); + view = textures->views[i]; + assert(view); - tex = (struct si_texture *)view->texture; + tex = (struct si_texture *)view->texture; - si_decompress_color_texture(sctx, tex, view->u.tex.first_level, - view->u.tex.last_level, false); - } + si_decompress_color_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level, + false); + } } -static void -si_decompress_image_color_textures(struct si_context *sctx, - struct si_images *images) +static void si_decompress_image_color_textures(struct si_context *sctx, struct si_images *images) { - unsigned i; - unsigned mask = images->needs_color_decompress_mask; + unsigned i; + unsigned mask = images->needs_color_decompress_mask; - while (mask) { - const struct pipe_image_view *view; - struct si_texture *tex; + while (mask) { + const struct pipe_image_view *view; + struct si_texture *tex; - i = u_bit_scan(&mask); + i = u_bit_scan(&mask); - view = &images->views[i]; - assert(view->resource->target != PIPE_BUFFER); + view = &images->views[i]; + assert(view->resource->target != PIPE_BUFFER); - tex = (struct si_texture *)view->resource; + tex = (struct si_texture *)view->resource; - si_decompress_color_texture(sctx, tex, view->u.tex.level, - view->u.tex.level, - view->access & PIPE_IMAGE_ACCESS_WRITE); - } + si_decompress_color_texture(sctx, tex, view->u.tex.level, view->u.tex.level, + view->access & PIPE_IMAGE_ACCESS_WRITE); + } } -static void si_check_render_feedback_texture(struct si_context *sctx, - struct si_texture *tex, - unsigned first_level, - unsigned last_level, - unsigned first_layer, - unsigned last_layer) +static void si_check_render_feedback_texture(struct si_context *sctx, struct si_texture *tex, + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer) { - bool render_feedback = false; + bool render_feedback = false; - if (!tex->surface.dcc_offset) - return; + if (!tex->surface.dcc_offset) + return; - for (unsigned j = 0; j < sctx->framebuffer.state.nr_cbufs; ++j) { - struct si_surface * surf; + for (unsigned j = 0; j < sctx->framebuffer.state.nr_cbufs; ++j) { + struct si_surface *surf; - if (!sctx->framebuffer.state.cbufs[j]) - continue; + if (!sctx->framebuffer.state.cbufs[j]) + continue; - surf = (struct si_surface*)sctx->framebuffer.state.cbufs[j]; + surf = (struct si_surface *)sctx->framebuffer.state.cbufs[j]; - if (tex == (struct si_texture *)surf->base.texture && - surf->base.u.tex.level >= first_level && - surf->base.u.tex.level <= last_level && - surf->base.u.tex.first_layer <= last_layer && - 
surf->base.u.tex.last_layer >= first_layer) { - render_feedback = true; - break; - } - } + if (tex == (struct si_texture *)surf->base.texture && surf->base.u.tex.level >= first_level && + surf->base.u.tex.level <= last_level && surf->base.u.tex.first_layer <= last_layer && + surf->base.u.tex.last_layer >= first_layer) { + render_feedback = true; + break; + } + } - if (render_feedback) - si_texture_disable_dcc(sctx, tex); + if (render_feedback) + si_texture_disable_dcc(sctx, tex); } -static void si_check_render_feedback_textures(struct si_context *sctx, - struct si_samplers *textures) +static void si_check_render_feedback_textures(struct si_context *sctx, struct si_samplers *textures) { - uint32_t mask = textures->enabled_mask; + uint32_t mask = textures->enabled_mask; - while (mask) { - const struct pipe_sampler_view *view; - struct si_texture *tex; + while (mask) { + const struct pipe_sampler_view *view; + struct si_texture *tex; - unsigned i = u_bit_scan(&mask); + unsigned i = u_bit_scan(&mask); - view = textures->views[i]; - if(view->texture->target == PIPE_BUFFER) - continue; + view = textures->views[i]; + if (view->texture->target == PIPE_BUFFER) + continue; - tex = (struct si_texture *)view->texture; + tex = (struct si_texture *)view->texture; - si_check_render_feedback_texture(sctx, tex, - view->u.tex.first_level, - view->u.tex.last_level, - view->u.tex.first_layer, - view->u.tex.last_layer); - } + si_check_render_feedback_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level, + view->u.tex.first_layer, view->u.tex.last_layer); + } } -static void si_check_render_feedback_images(struct si_context *sctx, - struct si_images *images) +static void si_check_render_feedback_images(struct si_context *sctx, struct si_images *images) { - uint32_t mask = images->enabled_mask; + uint32_t mask = images->enabled_mask; - while (mask) { - const struct pipe_image_view *view; - struct si_texture *tex; + while (mask) { + const struct pipe_image_view *view; + struct si_texture *tex; - unsigned i = u_bit_scan(&mask); + unsigned i = u_bit_scan(&mask); - view = &images->views[i]; - if (view->resource->target == PIPE_BUFFER) - continue; + view = &images->views[i]; + if (view->resource->target == PIPE_BUFFER) + continue; - tex = (struct si_texture *)view->resource; + tex = (struct si_texture *)view->resource; - si_check_render_feedback_texture(sctx, tex, - view->u.tex.level, - view->u.tex.level, - view->u.tex.first_layer, - view->u.tex.last_layer); - } + si_check_render_feedback_texture(sctx, tex, view->u.tex.level, view->u.tex.level, + view->u.tex.first_layer, view->u.tex.last_layer); + } } static void si_check_render_feedback_resident_textures(struct si_context *sctx) { - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - struct pipe_sampler_view *view; - struct si_texture *tex; - - view = (*tex_handle)->view; - if (view->texture->target == PIPE_BUFFER) - continue; - - tex = (struct si_texture *)view->texture; - - si_check_render_feedback_texture(sctx, tex, - view->u.tex.first_level, - view->u.tex.last_level, - view->u.tex.first_layer, - view->u.tex.last_layer); - } + util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) { + struct pipe_sampler_view *view; + struct si_texture *tex; + + view = (*tex_handle)->view; + if (view->texture->target == PIPE_BUFFER) + continue; + + tex = (struct si_texture *)view->texture; + + si_check_render_feedback_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level, + 
view->u.tex.first_layer, view->u.tex.last_layer); + } } static void si_check_render_feedback_resident_images(struct si_context *sctx) { - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - struct pipe_image_view *view; - struct si_texture *tex; - - view = &(*img_handle)->view; - if (view->resource->target == PIPE_BUFFER) - continue; - - tex = (struct si_texture *)view->resource; - - si_check_render_feedback_texture(sctx, tex, - view->u.tex.level, - view->u.tex.level, - view->u.tex.first_layer, - view->u.tex.last_layer); - } + util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) { + struct pipe_image_view *view; + struct si_texture *tex; + + view = &(*img_handle)->view; + if (view->resource->target == PIPE_BUFFER) + continue; + + tex = (struct si_texture *)view->resource; + + si_check_render_feedback_texture(sctx, tex, view->u.tex.level, view->u.tex.level, + view->u.tex.first_layer, view->u.tex.last_layer); + } } static void si_check_render_feedback(struct si_context *sctx) { - if (!sctx->need_check_render_feedback) - return; + if (!sctx->need_check_render_feedback) + return; - /* There is no render feedback if color writes are disabled. - * (e.g. a pixel shader with image stores) - */ - if (!si_get_total_colormask(sctx)) - return; + /* There is no render feedback if color writes are disabled. + * (e.g. a pixel shader with image stores) + */ + if (!si_get_total_colormask(sctx)) + return; - for (int i = 0; i < SI_NUM_SHADERS; ++i) { - si_check_render_feedback_images(sctx, &sctx->images[i]); - si_check_render_feedback_textures(sctx, &sctx->samplers[i]); - } + for (int i = 0; i < SI_NUM_SHADERS; ++i) { + si_check_render_feedback_images(sctx, &sctx->images[i]); + si_check_render_feedback_textures(sctx, &sctx->samplers[i]); + } - si_check_render_feedback_resident_images(sctx); - si_check_render_feedback_resident_textures(sctx); + si_check_render_feedback_resident_images(sctx); + si_check_render_feedback_resident_textures(sctx); - sctx->need_check_render_feedback = false; + sctx->need_check_render_feedback = false; } static void si_decompress_resident_textures(struct si_context *sctx) { - util_dynarray_foreach(&sctx->resident_tex_needs_color_decompress, - struct si_texture_handle *, tex_handle) { - struct pipe_sampler_view *view = (*tex_handle)->view; - struct si_texture *tex = (struct si_texture *)view->texture; - - si_decompress_color_texture(sctx, tex, view->u.tex.first_level, - view->u.tex.last_level, false); - } - - util_dynarray_foreach(&sctx->resident_tex_needs_depth_decompress, - struct si_texture_handle *, tex_handle) { - struct pipe_sampler_view *view = (*tex_handle)->view; - struct si_sampler_view *sview = (struct si_sampler_view *)view; - struct si_texture *tex = (struct si_texture *)view->texture; - - si_decompress_depth(sctx, tex, - sview->is_stencil_sampler ? 
PIPE_MASK_S : PIPE_MASK_Z, - view->u.tex.first_level, view->u.tex.last_level, - 0, util_max_layer(&tex->buffer.b.b, view->u.tex.first_level)); - } + util_dynarray_foreach (&sctx->resident_tex_needs_color_decompress, struct si_texture_handle *, + tex_handle) { + struct pipe_sampler_view *view = (*tex_handle)->view; + struct si_texture *tex = (struct si_texture *)view->texture; + + si_decompress_color_texture(sctx, tex, view->u.tex.first_level, view->u.tex.last_level, + false); + } + + util_dynarray_foreach (&sctx->resident_tex_needs_depth_decompress, struct si_texture_handle *, + tex_handle) { + struct pipe_sampler_view *view = (*tex_handle)->view; + struct si_sampler_view *sview = (struct si_sampler_view *)view; + struct si_texture *tex = (struct si_texture *)view->texture; + + si_decompress_depth(sctx, tex, sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z, + view->u.tex.first_level, view->u.tex.last_level, 0, + util_max_layer(&tex->buffer.b.b, view->u.tex.first_level)); + } } static void si_decompress_resident_images(struct si_context *sctx) { - util_dynarray_foreach(&sctx->resident_img_needs_color_decompress, - struct si_image_handle *, img_handle) { - struct pipe_image_view *view = &(*img_handle)->view; - struct si_texture *tex = (struct si_texture *)view->resource; - - si_decompress_color_texture(sctx, tex, view->u.tex.level, - view->u.tex.level, - view->access & PIPE_IMAGE_ACCESS_WRITE); - } + util_dynarray_foreach (&sctx->resident_img_needs_color_decompress, struct si_image_handle *, + img_handle) { + struct pipe_image_view *view = &(*img_handle)->view; + struct si_texture *tex = (struct si_texture *)view->resource; + + si_decompress_color_texture(sctx, tex, view->u.tex.level, view->u.tex.level, + view->access & PIPE_IMAGE_ACCESS_WRITE); + } } void si_decompress_textures(struct si_context *sctx, unsigned shader_mask) { - unsigned compressed_colortex_counter, mask; - - if (sctx->blitter->running) - return; - - /* Update the compressed_colortex_mask if necessary. */ - compressed_colortex_counter = p_atomic_read(&sctx->screen->compressed_colortex_counter); - if (compressed_colortex_counter != sctx->last_compressed_colortex_counter) { - sctx->last_compressed_colortex_counter = compressed_colortex_counter; - si_update_needs_color_decompress_masks(sctx); - } - - /* Decompress color & depth textures if needed. 
*/ - mask = sctx->shader_needs_decompress_mask & shader_mask; - while (mask) { - unsigned i = u_bit_scan(&mask); - - if (sctx->samplers[i].needs_depth_decompress_mask) { - si_decompress_sampler_depth_textures(sctx, &sctx->samplers[i]); - } - if (sctx->samplers[i].needs_color_decompress_mask) { - si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]); - } - if (sctx->images[i].needs_color_decompress_mask) { - si_decompress_image_color_textures(sctx, &sctx->images[i]); - } - } - - if (shader_mask & u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)) { - if (sctx->uses_bindless_samplers) - si_decompress_resident_textures(sctx); - if (sctx->uses_bindless_images) - si_decompress_resident_images(sctx); - - if (sctx->ps_uses_fbfetch) { - struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; - si_decompress_color_texture(sctx, - (struct si_texture*)cb0->texture, - cb0->u.tex.first_layer, - cb0->u.tex.last_layer, false); - } - - si_check_render_feedback(sctx); - } else if (shader_mask & (1 << PIPE_SHADER_COMPUTE)) { - if (sctx->cs_shader_state.program->sel.info.uses_bindless_samplers) - si_decompress_resident_textures(sctx); - if (sctx->cs_shader_state.program->sel.info.uses_bindless_images) - si_decompress_resident_images(sctx); - } + unsigned compressed_colortex_counter, mask; + + if (sctx->blitter->running) + return; + + /* Update the compressed_colortex_mask if necessary. */ + compressed_colortex_counter = p_atomic_read(&sctx->screen->compressed_colortex_counter); + if (compressed_colortex_counter != sctx->last_compressed_colortex_counter) { + sctx->last_compressed_colortex_counter = compressed_colortex_counter; + si_update_needs_color_decompress_masks(sctx); + } + + /* Decompress color & depth textures if needed. */ + mask = sctx->shader_needs_decompress_mask & shader_mask; + while (mask) { + unsigned i = u_bit_scan(&mask); + + if (sctx->samplers[i].needs_depth_decompress_mask) { + si_decompress_sampler_depth_textures(sctx, &sctx->samplers[i]); + } + if (sctx->samplers[i].needs_color_decompress_mask) { + si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]); + } + if (sctx->images[i].needs_color_decompress_mask) { + si_decompress_image_color_textures(sctx, &sctx->images[i]); + } + } + + if (shader_mask & u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)) { + if (sctx->uses_bindless_samplers) + si_decompress_resident_textures(sctx); + if (sctx->uses_bindless_images) + si_decompress_resident_images(sctx); + + if (sctx->ps_uses_fbfetch) { + struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; + si_decompress_color_texture(sctx, (struct si_texture *)cb0->texture, + cb0->u.tex.first_layer, cb0->u.tex.last_layer, false); + } + + si_check_render_feedback(sctx); + } else if (shader_mask & (1 << PIPE_SHADER_COMPUTE)) { + if (sctx->cs_shader_state.program->sel.info.uses_bindless_samplers) + si_decompress_resident_textures(sctx); + if (sctx->cs_shader_state.program->sel.info.uses_bindless_images) + si_decompress_resident_images(sctx); + } } /* Helper for decompressing a portion of a color or depth resource before * blitting if any decompression is needed. * The driver doesn't decompress resources automatically while u_blitter is * rendering. 
*/ -void si_decompress_subresource(struct pipe_context *ctx, - struct pipe_resource *tex, - unsigned planes, unsigned level, - unsigned first_layer, unsigned last_layer) +void si_decompress_subresource(struct pipe_context *ctx, struct pipe_resource *tex, unsigned planes, + unsigned level, unsigned first_layer, unsigned last_layer) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture *stex = (struct si_texture*)tex; - - if (stex->db_compatible) { - planes &= PIPE_MASK_Z | PIPE_MASK_S; - - if (!stex->surface.has_stencil) - planes &= ~PIPE_MASK_S; - - /* If we've rendered into the framebuffer and it's a blitting - * source, make sure the decompression pass is invoked - * by dirtying the framebuffer. - */ - if (sctx->framebuffer.state.zsbuf && - sctx->framebuffer.state.zsbuf->u.tex.level == level && - sctx->framebuffer.state.zsbuf->texture == tex) - si_update_fb_dirtiness_after_rendering(sctx); - - si_decompress_depth(sctx, stex, planes, - level, level, - first_layer, last_layer); - } else if (stex->surface.fmask_size || stex->cmask_buffer || stex->surface.dcc_offset) { - /* If we've rendered into the framebuffer and it's a blitting - * source, make sure the decompression pass is invoked - * by dirtying the framebuffer. - */ - for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - if (sctx->framebuffer.state.cbufs[i] && - sctx->framebuffer.state.cbufs[i]->u.tex.level == level && - sctx->framebuffer.state.cbufs[i]->texture == tex) { - si_update_fb_dirtiness_after_rendering(sctx); - break; - } - } - - si_blit_decompress_color(sctx, stex, level, level, - first_layer, last_layer, false, false); - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *stex = (struct si_texture *)tex; + + if (stex->db_compatible) { + planes &= PIPE_MASK_Z | PIPE_MASK_S; + + if (!stex->surface.has_stencil) + planes &= ~PIPE_MASK_S; + + /* If we've rendered into the framebuffer and it's a blitting + * source, make sure the decompression pass is invoked + * by dirtying the framebuffer. + */ + if (sctx->framebuffer.state.zsbuf && sctx->framebuffer.state.zsbuf->u.tex.level == level && + sctx->framebuffer.state.zsbuf->texture == tex) + si_update_fb_dirtiness_after_rendering(sctx); + + si_decompress_depth(sctx, stex, planes, level, level, first_layer, last_layer); + } else if (stex->surface.fmask_size || stex->cmask_buffer || stex->surface.dcc_offset) { + /* If we've rendered into the framebuffer and it's a blitting + * source, make sure the decompression pass is invoked + * by dirtying the framebuffer. 
+ */ + for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + if (sctx->framebuffer.state.cbufs[i] && + sctx->framebuffer.state.cbufs[i]->u.tex.level == level && + sctx->framebuffer.state.cbufs[i]->texture == tex) { + si_update_fb_dirtiness_after_rendering(sctx); + break; + } + } + + si_blit_decompress_color(sctx, stex, level, level, first_layer, last_layer, false, false); + } } struct texture_orig_info { - unsigned format; - unsigned width0; - unsigned height0; - unsigned npix_x; - unsigned npix_y; - unsigned npix0_x; - unsigned npix0_y; + unsigned format; + unsigned width0; + unsigned height0; + unsigned npix_x; + unsigned npix_y; + unsigned npix0_x; + unsigned npix0_y; }; -void si_resource_copy_region(struct pipe_context *ctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box) +void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst, + unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, unsigned src_level, + const struct pipe_box *src_box) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture *ssrc = (struct si_texture*)src; - struct si_texture *sdst = (struct si_texture*)dst; - struct pipe_surface *dst_view, dst_templ; - struct pipe_sampler_view src_templ, *src_view; - unsigned dst_width, dst_height, src_width0, src_height0; - unsigned dst_width0, dst_height0, src_force_level = 0; - struct pipe_box sbox, dstbox; - - /* Handle buffers first. */ - if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { - si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width); - return; - } - - if (!util_format_is_compressed(src->format) && - !util_format_is_compressed(dst->format) && - !util_format_is_depth_or_stencil(src->format) && - src->nr_samples <= 1 && - !sdst->surface.dcc_offset && - !(dst->target != src->target && - (src->target == PIPE_TEXTURE_1D_ARRAY || dst->target == PIPE_TEXTURE_1D_ARRAY))) { - si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz, src_box); - return; - } - - assert(u_max_sample(dst) == u_max_sample(src)); - - /* The driver doesn't decompress resources automatically while - * u_blitter is rendering. 
*/ - si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, - src_box->z, src_box->z + src_box->depth - 1); - - dst_width = u_minify(dst->width0, dst_level); - dst_height = u_minify(dst->height0, dst_level); - dst_width0 = dst->width0; - dst_height0 = dst->height0; - src_width0 = src->width0; - src_height0 = src->height0; - - util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz); - util_blitter_default_src_texture(sctx->blitter, &src_templ, src, src_level); - - if (util_format_is_compressed(src->format) || - util_format_is_compressed(dst->format)) { - unsigned blocksize = ssrc->surface.bpe; - - if (blocksize == 8) - src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */ - else - src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */ - dst_templ.format = src_templ.format; - - dst_width = util_format_get_nblocksx(dst->format, dst_width); - dst_height = util_format_get_nblocksy(dst->format, dst_height); - dst_width0 = util_format_get_nblocksx(dst->format, dst_width0); - dst_height0 = util_format_get_nblocksy(dst->format, dst_height0); - src_width0 = util_format_get_nblocksx(src->format, src_width0); - src_height0 = util_format_get_nblocksy(src->format, src_height0); - - dstx = util_format_get_nblocksx(dst->format, dstx); - dsty = util_format_get_nblocksy(dst->format, dsty); - - sbox.x = util_format_get_nblocksx(src->format, src_box->x); - sbox.y = util_format_get_nblocksy(src->format, src_box->y); - sbox.z = src_box->z; - sbox.width = util_format_get_nblocksx(src->format, src_box->width); - sbox.height = util_format_get_nblocksy(src->format, src_box->height); - sbox.depth = src_box->depth; - src_box = &sbox; - - src_force_level = src_level; - } else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src)) { - if (util_format_is_subsampled_422(src->format)) { - src_templ.format = PIPE_FORMAT_R8G8B8A8_UINT; - dst_templ.format = PIPE_FORMAT_R8G8B8A8_UINT; - - dst_width = util_format_get_nblocksx(dst->format, dst_width); - dst_width0 = util_format_get_nblocksx(dst->format, dst_width0); - src_width0 = util_format_get_nblocksx(src->format, src_width0); - - dstx = util_format_get_nblocksx(dst->format, dstx); - - sbox = *src_box; - sbox.x = util_format_get_nblocksx(src->format, src_box->x); - sbox.width = util_format_get_nblocksx(src->format, src_box->width); - src_box = &sbox; - } else { - unsigned blocksize = ssrc->surface.bpe; - - switch (blocksize) { - case 1: - dst_templ.format = PIPE_FORMAT_R8_UNORM; - src_templ.format = PIPE_FORMAT_R8_UNORM; - break; - case 2: - dst_templ.format = PIPE_FORMAT_R8G8_UNORM; - src_templ.format = PIPE_FORMAT_R8G8_UNORM; - break; - case 4: - dst_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM; - src_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM; - break; - case 8: - dst_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; - src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; - break; - case 16: - dst_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; - src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; - break; - default: - fprintf(stderr, "Unhandled format %s with blocksize %u\n", - util_format_short_name(src->format), blocksize); - assert(0); - } - } - } - - /* SNORM8 blitting has precision issues on some chips. Use the SINT - * equivalent instead, which doesn't force DCC decompression. - * Note that some chips avoid this issue by using SDMA. 
- */ - if (util_format_is_snorm8(dst_templ.format)) { - dst_templ.format = src_templ.format = - util_format_snorm8_to_sint8(dst_templ.format); - } - - vi_disable_dcc_if_incompatible_format(sctx, dst, dst_level, - dst_templ.format); - vi_disable_dcc_if_incompatible_format(sctx, src, src_level, - src_templ.format); - - /* Initialize the surface. */ - dst_view = si_create_surface_custom(ctx, dst, &dst_templ, - dst_width0, dst_height0, - dst_width, dst_height); - - /* Initialize the sampler view. */ - src_view = si_create_sampler_view_custom(ctx, src, &src_templ, - src_width0, src_height0, - src_force_level); - - u_box_3d(dstx, dsty, dstz, abs(src_box->width), abs(src_box->height), - abs(src_box->depth), &dstbox); - - /* Copy. */ - si_blitter_begin(sctx, SI_COPY); - util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox, - src_view, src_box, src_width0, src_height0, - PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, - false); - si_blitter_end(sctx); - - pipe_surface_reference(&dst_view, NULL); - pipe_sampler_view_reference(&src_view, NULL); + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *ssrc = (struct si_texture *)src; + struct si_texture *sdst = (struct si_texture *)dst; + struct pipe_surface *dst_view, dst_templ; + struct pipe_sampler_view src_templ, *src_view; + unsigned dst_width, dst_height, src_width0, src_height0; + unsigned dst_width0, dst_height0, src_force_level = 0; + struct pipe_box sbox, dstbox; + + /* Handle buffers first. */ + if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { + si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width); + return; + } + + if (!util_format_is_compressed(src->format) && !util_format_is_compressed(dst->format) && + !util_format_is_depth_or_stencil(src->format) && src->nr_samples <= 1 && + !sdst->surface.dcc_offset && + !(dst->target != src->target && + (src->target == PIPE_TEXTURE_1D_ARRAY || dst->target == PIPE_TEXTURE_1D_ARRAY))) { + si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz, src_box); + return; + } + + assert(u_max_sample(dst) == u_max_sample(src)); + + /* The driver doesn't decompress resources automatically while + * u_blitter is rendering. 
*/ + si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z, + src_box->z + src_box->depth - 1); + + dst_width = u_minify(dst->width0, dst_level); + dst_height = u_minify(dst->height0, dst_level); + dst_width0 = dst->width0; + dst_height0 = dst->height0; + src_width0 = src->width0; + src_height0 = src->height0; + + util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz); + util_blitter_default_src_texture(sctx->blitter, &src_templ, src, src_level); + + if (util_format_is_compressed(src->format) || util_format_is_compressed(dst->format)) { + unsigned blocksize = ssrc->surface.bpe; + + if (blocksize == 8) + src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */ + else + src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */ + dst_templ.format = src_templ.format; + + dst_width = util_format_get_nblocksx(dst->format, dst_width); + dst_height = util_format_get_nblocksy(dst->format, dst_height); + dst_width0 = util_format_get_nblocksx(dst->format, dst_width0); + dst_height0 = util_format_get_nblocksy(dst->format, dst_height0); + src_width0 = util_format_get_nblocksx(src->format, src_width0); + src_height0 = util_format_get_nblocksy(src->format, src_height0); + + dstx = util_format_get_nblocksx(dst->format, dstx); + dsty = util_format_get_nblocksy(dst->format, dsty); + + sbox.x = util_format_get_nblocksx(src->format, src_box->x); + sbox.y = util_format_get_nblocksy(src->format, src_box->y); + sbox.z = src_box->z; + sbox.width = util_format_get_nblocksx(src->format, src_box->width); + sbox.height = util_format_get_nblocksy(src->format, src_box->height); + sbox.depth = src_box->depth; + src_box = &sbox; + + src_force_level = src_level; + } else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src)) { + if (util_format_is_subsampled_422(src->format)) { + src_templ.format = PIPE_FORMAT_R8G8B8A8_UINT; + dst_templ.format = PIPE_FORMAT_R8G8B8A8_UINT; + + dst_width = util_format_get_nblocksx(dst->format, dst_width); + dst_width0 = util_format_get_nblocksx(dst->format, dst_width0); + src_width0 = util_format_get_nblocksx(src->format, src_width0); + + dstx = util_format_get_nblocksx(dst->format, dstx); + + sbox = *src_box; + sbox.x = util_format_get_nblocksx(src->format, src_box->x); + sbox.width = util_format_get_nblocksx(src->format, src_box->width); + src_box = &sbox; + } else { + unsigned blocksize = ssrc->surface.bpe; + + switch (blocksize) { + case 1: + dst_templ.format = PIPE_FORMAT_R8_UNORM; + src_templ.format = PIPE_FORMAT_R8_UNORM; + break; + case 2: + dst_templ.format = PIPE_FORMAT_R8G8_UNORM; + src_templ.format = PIPE_FORMAT_R8G8_UNORM; + break; + case 4: + dst_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM; + src_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM; + break; + case 8: + dst_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; + src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; + break; + case 16: + dst_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; + src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; + break; + default: + fprintf(stderr, "Unhandled format %s with blocksize %u\n", + util_format_short_name(src->format), blocksize); + assert(0); + } + } + } + + /* SNORM8 blitting has precision issues on some chips. Use the SINT + * equivalent instead, which doesn't force DCC decompression. + * Note that some chips avoid this issue by using SDMA. 
+ */ + if (util_format_is_snorm8(dst_templ.format)) { + dst_templ.format = src_templ.format = util_format_snorm8_to_sint8(dst_templ.format); + } + + vi_disable_dcc_if_incompatible_format(sctx, dst, dst_level, dst_templ.format); + vi_disable_dcc_if_incompatible_format(sctx, src, src_level, src_templ.format); + + /* Initialize the surface. */ + dst_view = si_create_surface_custom(ctx, dst, &dst_templ, dst_width0, dst_height0, dst_width, + dst_height); + + /* Initialize the sampler view. */ + src_view = + si_create_sampler_view_custom(ctx, src, &src_templ, src_width0, src_height0, src_force_level); + + u_box_3d(dstx, dsty, dstz, abs(src_box->width), abs(src_box->height), abs(src_box->depth), + &dstbox); + + /* Copy. */ + si_blitter_begin(sctx, SI_COPY); + util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox, src_view, src_box, src_width0, + src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, false); + si_blitter_end(sctx); + + pipe_surface_reference(&dst_view, NULL); + pipe_sampler_view_reference(&src_view, NULL); } -static void si_do_CB_resolve(struct si_context *sctx, - const struct pipe_blit_info *info, - struct pipe_resource *dst, - unsigned dst_level, unsigned dst_z, - enum pipe_format format) +static void si_do_CB_resolve(struct si_context *sctx, const struct pipe_blit_info *info, + struct pipe_resource *dst, unsigned dst_level, unsigned dst_z, + enum pipe_format format) { - /* Required before and after CB_RESOLVE. */ - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; - - si_blitter_begin(sctx, SI_COLOR_RESOLVE | - (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); - util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level, dst_z, - info->src.resource, info->src.box.z, - ~0, sctx->custom_blend_resolve, - format); - si_blitter_end(sctx); - - /* Flush caches for possible texturing. */ - si_make_CB_shader_coherent(sctx, 1, false, true /* no DCC */); + /* Required before and after CB_RESOLVE. */ + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; + + si_blitter_begin( + sctx, SI_COLOR_RESOLVE | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); + util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level, dst_z, info->src.resource, + info->src.box.z, ~0, sctx->custom_blend_resolve, format); + si_blitter_end(sctx); + + /* Flush caches for possible texturing. */ + si_make_CB_shader_coherent(sctx, 1, false, true /* no DCC */); } -static bool do_hardware_msaa_resolve(struct pipe_context *ctx, - const struct pipe_blit_info *info) +static bool do_hardware_msaa_resolve(struct pipe_context *ctx, const struct pipe_blit_info *info) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_texture *src = (struct si_texture*)info->src.resource; - struct si_texture *dst = (struct si_texture*)info->dst.resource; - ASSERTED struct si_texture *stmp; - unsigned dst_width = u_minify(info->dst.resource->width0, info->dst.level); - unsigned dst_height = u_minify(info->dst.resource->height0, info->dst.level); - enum pipe_format format = info->src.format; - struct pipe_resource *tmp, templ; - struct pipe_blit_info blit; - - /* Check basic requirements for hw resolve. */ - if (!(info->src.resource->nr_samples > 1 && - info->dst.resource->nr_samples <= 1 && - !util_format_is_pure_integer(format) && - !util_format_is_depth_or_stencil(format) && - util_max_layer(info->src.resource, 0) == 0)) - return false; - - /* Hardware MSAA resolve doesn't work if SPI format = NORM16_ABGR and - * the format is R16G16. Use R16A16, which does work. 
- */ - if (format == PIPE_FORMAT_R16G16_UNORM) - format = PIPE_FORMAT_R16A16_UNORM; - if (format == PIPE_FORMAT_R16G16_SNORM) - format = PIPE_FORMAT_R16A16_SNORM; - - /* Check the remaining requirements for hw resolve. */ - if (util_max_layer(info->dst.resource, info->dst.level) == 0 && - !info->scissor_enable && - (info->mask & PIPE_MASK_RGBA) == PIPE_MASK_RGBA && - util_is_format_compatible(util_format_description(info->src.format), - util_format_description(info->dst.format)) && - dst_width == info->src.resource->width0 && - dst_height == info->src.resource->height0 && - info->dst.box.x == 0 && - info->dst.box.y == 0 && - info->dst.box.width == dst_width && - info->dst.box.height == dst_height && - info->dst.box.depth == 1 && - info->src.box.x == 0 && - info->src.box.y == 0 && - info->src.box.width == dst_width && - info->src.box.height == dst_height && - info->src.box.depth == 1 && - !dst->surface.is_linear && - (!dst->cmask_buffer || !dst->dirty_level_mask)) { /* dst cannot be fast-cleared */ - /* Check the last constraint. */ - if (src->surface.micro_tile_mode != dst->surface.micro_tile_mode) { - /* The next fast clear will switch to this mode to - * get direct hw resolve next time if the mode is - * different now. - * - * TODO-GFX10: This does not work in GFX10 because MSAA - * is restricted to 64KB_R_X and 64KB_Z_X swizzle modes. - * In some cases we could change the swizzle of the - * destination texture instead, but the more general - * solution is to implement compute shader resolve. - */ - src->last_msaa_resolve_target_micro_mode = - dst->surface.micro_tile_mode; - goto resolve_to_temp; - } - - /* Resolving into a surface with DCC is unsupported. Since - * it's being overwritten anyway, clear it to uncompressed. - * This is still the fastest codepath even with this clear. - */ - if (vi_dcc_enabled(dst, info->dst.level)) { - if (!vi_dcc_clear_level(sctx, dst, info->dst.level, - DCC_UNCOMPRESSED)) - goto resolve_to_temp; - - dst->dirty_level_mask &= ~(1 << info->dst.level); - } - - /* Resolve directly from src to dst. */ - si_do_CB_resolve(sctx, info, info->dst.resource, - info->dst.level, info->dst.box.z, format); - return true; - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *src = (struct si_texture *)info->src.resource; + struct si_texture *dst = (struct si_texture *)info->dst.resource; + ASSERTED struct si_texture *stmp; + unsigned dst_width = u_minify(info->dst.resource->width0, info->dst.level); + unsigned dst_height = u_minify(info->dst.resource->height0, info->dst.level); + enum pipe_format format = info->src.format; + struct pipe_resource *tmp, templ; + struct pipe_blit_info blit; + + /* Check basic requirements for hw resolve. */ + if (!(info->src.resource->nr_samples > 1 && info->dst.resource->nr_samples <= 1 && + !util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format) && + util_max_layer(info->src.resource, 0) == 0)) + return false; + + /* Hardware MSAA resolve doesn't work if SPI format = NORM16_ABGR and + * the format is R16G16. Use R16A16, which does work. + */ + if (format == PIPE_FORMAT_R16G16_UNORM) + format = PIPE_FORMAT_R16A16_UNORM; + if (format == PIPE_FORMAT_R16G16_SNORM) + format = PIPE_FORMAT_R16A16_SNORM; + + /* Check the remaining requirements for hw resolve. 
*/ + if (util_max_layer(info->dst.resource, info->dst.level) == 0 && !info->scissor_enable && + (info->mask & PIPE_MASK_RGBA) == PIPE_MASK_RGBA && + util_is_format_compatible(util_format_description(info->src.format), + util_format_description(info->dst.format)) && + dst_width == info->src.resource->width0 && dst_height == info->src.resource->height0 && + info->dst.box.x == 0 && info->dst.box.y == 0 && info->dst.box.width == dst_width && + info->dst.box.height == dst_height && info->dst.box.depth == 1 && info->src.box.x == 0 && + info->src.box.y == 0 && info->src.box.width == dst_width && + info->src.box.height == dst_height && info->src.box.depth == 1 && !dst->surface.is_linear && + (!dst->cmask_buffer || !dst->dirty_level_mask)) { /* dst cannot be fast-cleared */ + /* Check the last constraint. */ + if (src->surface.micro_tile_mode != dst->surface.micro_tile_mode) { + /* The next fast clear will switch to this mode to + * get direct hw resolve next time if the mode is + * different now. + * + * TODO-GFX10: This does not work in GFX10 because MSAA + * is restricted to 64KB_R_X and 64KB_Z_X swizzle modes. + * In some cases we could change the swizzle of the + * destination texture instead, but the more general + * solution is to implement compute shader resolve. + */ + src->last_msaa_resolve_target_micro_mode = dst->surface.micro_tile_mode; + goto resolve_to_temp; + } + + /* Resolving into a surface with DCC is unsupported. Since + * it's being overwritten anyway, clear it to uncompressed. + * This is still the fastest codepath even with this clear. + */ + if (vi_dcc_enabled(dst, info->dst.level)) { + if (!vi_dcc_clear_level(sctx, dst, info->dst.level, DCC_UNCOMPRESSED)) + goto resolve_to_temp; + + dst->dirty_level_mask &= ~(1 << info->dst.level); + } + + /* Resolve directly from src to dst. */ + si_do_CB_resolve(sctx, info, info->dst.resource, info->dst.level, info->dst.box.z, format); + return true; + } resolve_to_temp: - /* Shader-based resolve is VERY SLOW. Instead, resolve into - * a temporary texture and blit. - */ - memset(&templ, 0, sizeof(templ)); - templ.target = PIPE_TEXTURE_2D; - templ.format = info->src.resource->format; - templ.width0 = info->src.resource->width0; - templ.height0 = info->src.resource->height0; - templ.depth0 = 1; - templ.array_size = 1; - templ.usage = PIPE_USAGE_DEFAULT; - templ.flags = SI_RESOURCE_FLAG_FORCE_MSAA_TILING | - SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE | - SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(src->surface.micro_tile_mode) | - SI_RESOURCE_FLAG_DISABLE_DCC; - - /* The src and dst microtile modes must be the same. */ - if (sctx->chip_class <= GFX8 && - src->surface.micro_tile_mode == RADEON_MICRO_MODE_DISPLAY) - templ.bind = PIPE_BIND_SCANOUT; - else - templ.bind = 0; - - tmp = ctx->screen->resource_create(ctx->screen, &templ); - if (!tmp) - return false; - stmp = (struct si_texture*)tmp; - - assert(!stmp->surface.is_linear); - assert(src->surface.micro_tile_mode == stmp->surface.micro_tile_mode); - - /* resolve */ - si_do_CB_resolve(sctx, info, tmp, 0, 0, format); - - /* blit */ - blit = *info; - blit.src.resource = tmp; - blit.src.box.z = 0; - - si_blitter_begin(sctx, SI_BLIT | - (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); - util_blitter_blit(sctx->blitter, &blit); - si_blitter_end(sctx); - - pipe_resource_reference(&tmp, NULL); - return true; + /* Shader-based resolve is VERY SLOW. Instead, resolve into + * a temporary texture and blit. 
+ */ + memset(&templ, 0, sizeof(templ)); + templ.target = PIPE_TEXTURE_2D; + templ.format = info->src.resource->format; + templ.width0 = info->src.resource->width0; + templ.height0 = info->src.resource->height0; + templ.depth0 = 1; + templ.array_size = 1; + templ.usage = PIPE_USAGE_DEFAULT; + templ.flags = SI_RESOURCE_FLAG_FORCE_MSAA_TILING | SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE | + SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(src->surface.micro_tile_mode) | + SI_RESOURCE_FLAG_DISABLE_DCC; + + /* The src and dst microtile modes must be the same. */ + if (sctx->chip_class <= GFX8 && src->surface.micro_tile_mode == RADEON_MICRO_MODE_DISPLAY) + templ.bind = PIPE_BIND_SCANOUT; + else + templ.bind = 0; + + tmp = ctx->screen->resource_create(ctx->screen, &templ); + if (!tmp) + return false; + stmp = (struct si_texture *)tmp; + + assert(!stmp->surface.is_linear); + assert(src->surface.micro_tile_mode == stmp->surface.micro_tile_mode); + + /* resolve */ + si_do_CB_resolve(sctx, info, tmp, 0, 0, format); + + /* blit */ + blit = *info; + blit.src.resource = tmp; + blit.src.box.z = 0; + + si_blitter_begin(sctx, SI_BLIT | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); + util_blitter_blit(sctx->blitter, &blit); + si_blitter_end(sctx); + + pipe_resource_reference(&tmp, NULL); + return true; } -static void si_blit(struct pipe_context *ctx, - const struct pipe_blit_info *info) +static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_texture *dst = (struct si_texture *)info->dst.resource; - - if (do_hardware_msaa_resolve(ctx, info)) { - return; - } - - /* Using SDMA for copying to a linear texture in GTT is much faster. - * This improves DRI PRIME performance. - * - * resource_copy_region can't do this yet, because dma_copy calls it - * on failure (recursion). - */ - if (dst->surface.is_linear && - util_can_blit_via_copy_region(info, false)) { - sctx->dma_copy(ctx, info->dst.resource, info->dst.level, - info->dst.box.x, info->dst.box.y, - info->dst.box.z, - info->src.resource, info->src.level, - &info->src.box); - return; - } - - assert(util_blitter_is_blit_supported(sctx->blitter, info)); - - /* The driver doesn't decompress resources automatically while - * u_blitter is rendering. */ - vi_disable_dcc_if_incompatible_format(sctx, info->src.resource, - info->src.level, - info->src.format); - vi_disable_dcc_if_incompatible_format(sctx, info->dst.resource, - info->dst.level, - info->dst.format); - si_decompress_subresource(ctx, info->src.resource, PIPE_MASK_RGBAZS, - info->src.level, - info->src.box.z, - info->src.box.z + info->src.box.depth - 1); - - if (sctx->screen->debug_flags & DBG(FORCE_SDMA) && - util_try_blit_via_copy_region(ctx, info)) - return; - - si_blitter_begin(sctx, SI_BLIT | - (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); - util_blitter_blit(sctx->blitter, info); - si_blitter_end(sctx); + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *dst = (struct si_texture *)info->dst.resource; + + if (do_hardware_msaa_resolve(ctx, info)) { + return; + } + + /* Using SDMA for copying to a linear texture in GTT is much faster. + * This improves DRI PRIME performance. + * + * resource_copy_region can't do this yet, because dma_copy calls it + * on failure (recursion). 
+ */ + if (dst->surface.is_linear && util_can_blit_via_copy_region(info, false)) { + sctx->dma_copy(ctx, info->dst.resource, info->dst.level, info->dst.box.x, info->dst.box.y, + info->dst.box.z, info->src.resource, info->src.level, &info->src.box); + return; + } + + assert(util_blitter_is_blit_supported(sctx->blitter, info)); + + /* The driver doesn't decompress resources automatically while + * u_blitter is rendering. */ + vi_disable_dcc_if_incompatible_format(sctx, info->src.resource, info->src.level, + info->src.format); + vi_disable_dcc_if_incompatible_format(sctx, info->dst.resource, info->dst.level, + info->dst.format); + si_decompress_subresource(ctx, info->src.resource, PIPE_MASK_RGBAZS, info->src.level, + info->src.box.z, info->src.box.z + info->src.box.depth - 1); + + if (sctx->screen->debug_flags & DBG(FORCE_SDMA) && util_try_blit_via_copy_region(ctx, info)) + return; + + si_blitter_begin(sctx, SI_BLIT | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); + util_blitter_blit(sctx->blitter, info); + si_blitter_end(sctx); } -static bool si_generate_mipmap(struct pipe_context *ctx, - struct pipe_resource *tex, - enum pipe_format format, - unsigned base_level, unsigned last_level, - unsigned first_layer, unsigned last_layer) +static bool si_generate_mipmap(struct pipe_context *ctx, struct pipe_resource *tex, + enum pipe_format format, unsigned base_level, unsigned last_level, + unsigned first_layer, unsigned last_layer) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_texture *stex = (struct si_texture *)tex; - - if (!util_blitter_is_copy_supported(sctx->blitter, tex, tex)) - return false; - - /* The driver doesn't decompress resources automatically while - * u_blitter is rendering. */ - vi_disable_dcc_if_incompatible_format(sctx, tex, base_level, - format); - si_decompress_subresource(ctx, tex, PIPE_MASK_RGBAZS, - base_level, first_layer, last_layer); - - /* Clear dirty_level_mask for the levels that will be overwritten. */ - assert(base_level < last_level); - stex->dirty_level_mask &= ~u_bit_consecutive(base_level + 1, - last_level - base_level); - - sctx->generate_mipmap_for_depth = stex->is_depth; - - si_blitter_begin(sctx, SI_BLIT | SI_DISABLE_RENDER_COND); - util_blitter_generate_mipmap(sctx->blitter, tex, format, - base_level, last_level, - first_layer, last_layer); - si_blitter_end(sctx); - - sctx->generate_mipmap_for_depth = false; - return true; + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *stex = (struct si_texture *)tex; + + if (!util_blitter_is_copy_supported(sctx->blitter, tex, tex)) + return false; + + /* The driver doesn't decompress resources automatically while + * u_blitter is rendering. */ + vi_disable_dcc_if_incompatible_format(sctx, tex, base_level, format); + si_decompress_subresource(ctx, tex, PIPE_MASK_RGBAZS, base_level, first_layer, last_layer); + + /* Clear dirty_level_mask for the levels that will be overwritten. 
*/ + assert(base_level < last_level); + stex->dirty_level_mask &= ~u_bit_consecutive(base_level + 1, last_level - base_level); + + sctx->generate_mipmap_for_depth = stex->is_depth; + + si_blitter_begin(sctx, SI_BLIT | SI_DISABLE_RENDER_COND); + util_blitter_generate_mipmap(sctx->blitter, tex, format, base_level, last_level, first_layer, + last_layer); + si_blitter_end(sctx); + + sctx->generate_mipmap_for_depth = false; + return true; } -static void si_flush_resource(struct pipe_context *ctx, - struct pipe_resource *res) +static void si_flush_resource(struct pipe_context *ctx, struct pipe_resource *res) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_texture *tex = (struct si_texture*)res; - - assert(res->target != PIPE_BUFFER); - assert(!tex->dcc_separate_buffer || tex->dcc_gather_statistics); - - /* st/dri calls flush twice per frame (not a bug), this prevents double - * decompression. */ - if (tex->dcc_separate_buffer && !tex->separate_dcc_dirty) - return; - - if (!tex->is_depth && (tex->cmask_buffer || tex->surface.dcc_offset)) { - si_blit_decompress_color(sctx, tex, 0, res->last_level, - 0, util_max_layer(res, 0), - tex->dcc_separate_buffer != NULL, false); - - if (tex->surface.display_dcc_offset && tex->displayable_dcc_dirty) { - si_retile_dcc(sctx, tex); - tex->displayable_dcc_dirty = false; - } - } - - /* Always do the analysis even if DCC is disabled at the moment. */ - if (tex->dcc_gather_statistics) { - bool separate_dcc_dirty = tex->separate_dcc_dirty; - - /* If the color buffer hasn't been unbound and fast clear hasn't - * been used, separate_dcc_dirty is false, but there may have been - * new rendering. Check if the color buffer is bound and assume - * it's dirty. - * - * Note that DRI2 never unbinds window colorbuffers, which means - * the DCC pipeline statistics query would never be re-set and would - * keep adding new results until all free memory is exhausted if we - * didn't do this. - */ - if (!separate_dcc_dirty) { - for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - if (sctx->framebuffer.state.cbufs[i] && - sctx->framebuffer.state.cbufs[i]->texture == res) { - separate_dcc_dirty = true; - break; - } - } - } - - if (separate_dcc_dirty) { - tex->separate_dcc_dirty = false; - vi_separate_dcc_process_and_reset_stats(ctx, tex); - } - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *tex = (struct si_texture *)res; + + assert(res->target != PIPE_BUFFER); + assert(!tex->dcc_separate_buffer || tex->dcc_gather_statistics); + + /* st/dri calls flush twice per frame (not a bug), this prevents double + * decompression. */ + if (tex->dcc_separate_buffer && !tex->separate_dcc_dirty) + return; + + if (!tex->is_depth && (tex->cmask_buffer || tex->surface.dcc_offset)) { + si_blit_decompress_color(sctx, tex, 0, res->last_level, 0, util_max_layer(res, 0), + tex->dcc_separate_buffer != NULL, false); + + if (tex->surface.display_dcc_offset && tex->displayable_dcc_dirty) { + si_retile_dcc(sctx, tex); + tex->displayable_dcc_dirty = false; + } + } + + /* Always do the analysis even if DCC is disabled at the moment. */ + if (tex->dcc_gather_statistics) { + bool separate_dcc_dirty = tex->separate_dcc_dirty; + + /* If the color buffer hasn't been unbound and fast clear hasn't + * been used, separate_dcc_dirty is false, but there may have been + * new rendering. Check if the color buffer is bound and assume + * it's dirty. 
+ * + * Note that DRI2 never unbinds window colorbuffers, which means + * the DCC pipeline statistics query would never be re-set and would + * keep adding new results until all free memory is exhausted if we + * didn't do this. + */ + if (!separate_dcc_dirty) { + for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + if (sctx->framebuffer.state.cbufs[i] && + sctx->framebuffer.state.cbufs[i]->texture == res) { + separate_dcc_dirty = true; + break; + } + } + } + + if (separate_dcc_dirty) { + tex->separate_dcc_dirty = false; + vi_separate_dcc_process_and_reset_stats(ctx, tex); + } + } } void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex) { - /* If graphics is disabled, we can't decompress DCC, but it shouldn't - * be compressed either. The caller should simply discard it. - */ - if (!tex->surface.dcc_offset || !sctx->has_graphics) - return; - - si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level, - 0, util_max_layer(&tex->buffer.b.b, 0), - true, false); + /* If graphics is disabled, we can't decompress DCC, but it shouldn't + * be compressed either. The caller should simply discard it. + */ + if (!tex->surface.dcc_offset || !sctx->has_graphics) + return; + + si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level, 0, + util_max_layer(&tex->buffer.b.b, 0), true, false); } void si_init_blit_functions(struct si_context *sctx) { - sctx->b.resource_copy_region = si_resource_copy_region; + sctx->b.resource_copy_region = si_resource_copy_region; - if (sctx->has_graphics) { - sctx->b.blit = si_blit; - sctx->b.flush_resource = si_flush_resource; - sctx->b.generate_mipmap = si_generate_mipmap; - } + if (sctx->has_graphics) { + sctx->b.blit = si_blit; + sctx->b.flush_resource = si_flush_resource; + sctx->b.generate_mipmap = si_generate_mipmap; + } } diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c index 38d8e9456c2..eb71636d346 100644 --- a/src/gallium/drivers/radeonsi/si_buffer.c +++ b/src/gallium/drivers/radeonsi/si_buffer.c @@ -24,240 +24,227 @@ #include "radeonsi/si_pipe.h" #include "util/u_memory.h" -#include "util/u_upload_mgr.h" #include "util/u_transfer.h" +#include "util/u_upload_mgr.h" + #include #include -bool si_rings_is_buffer_referenced(struct si_context *sctx, - struct pb_buffer *buf, - enum radeon_bo_usage usage) +bool si_rings_is_buffer_referenced(struct si_context *sctx, struct pb_buffer *buf, + enum radeon_bo_usage usage) { - if (sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, buf, usage)) { - return true; - } - if (radeon_emitted(sctx->sdma_cs, 0) && - sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, buf, usage)) { - return true; - } - return false; + if (sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, buf, usage)) { + return true; + } + if (radeon_emitted(sctx->sdma_cs, 0) && + sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, buf, usage)) { + return true; + } + return false; } -void *si_buffer_map_sync_with_rings(struct si_context *sctx, - struct si_resource *resource, - unsigned usage) +void *si_buffer_map_sync_with_rings(struct si_context *sctx, struct si_resource *resource, + unsigned usage) { - enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE; - bool busy = false; - - assert(!(resource->flags & RADEON_FLAG_SPARSE)); - - if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) { - return sctx->ws->buffer_map(resource->buf, NULL, usage); - } - - if (!(usage & PIPE_TRANSFER_WRITE)) { - /* have to wait for the last write */ - rusage = RADEON_USAGE_WRITE; - } - - if 
(radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size) && - sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, - resource->buf, rusage)) { - if (usage & PIPE_TRANSFER_DONTBLOCK) { - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - return NULL; - } else { - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - busy = true; - } - } - if (radeon_emitted(sctx->sdma_cs, 0) && - sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, - resource->buf, rusage)) { - if (usage & PIPE_TRANSFER_DONTBLOCK) { - si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL); - return NULL; - } else { - si_flush_dma_cs(sctx, 0, NULL); - busy = true; - } - } - - if (busy || !sctx->ws->buffer_wait(resource->buf, 0, rusage)) { - if (usage & PIPE_TRANSFER_DONTBLOCK) { - return NULL; - } else { - /* We will be wait for the GPU. Wait for any offloaded - * CS flush to complete to avoid busy-waiting in the winsys. */ - sctx->ws->cs_sync_flush(sctx->gfx_cs); - if (sctx->sdma_cs) - sctx->ws->cs_sync_flush(sctx->sdma_cs); - } - } - - /* Setting the CS to NULL will prevent doing checks we have done already. */ - return sctx->ws->buffer_map(resource->buf, NULL, usage); + enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE; + bool busy = false; + + assert(!(resource->flags & RADEON_FLAG_SPARSE)); + + if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) { + return sctx->ws->buffer_map(resource->buf, NULL, usage); + } + + if (!(usage & PIPE_TRANSFER_WRITE)) { + /* have to wait for the last write */ + rusage = RADEON_USAGE_WRITE; + } + + if (radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size) && + sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs, resource->buf, rusage)) { + if (usage & PIPE_TRANSFER_DONTBLOCK) { + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + return NULL; + } else { + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + busy = true; + } + } + if (radeon_emitted(sctx->sdma_cs, 0) && + sctx->ws->cs_is_buffer_referenced(sctx->sdma_cs, resource->buf, rusage)) { + if (usage & PIPE_TRANSFER_DONTBLOCK) { + si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL); + return NULL; + } else { + si_flush_dma_cs(sctx, 0, NULL); + busy = true; + } + } + + if (busy || !sctx->ws->buffer_wait(resource->buf, 0, rusage)) { + if (usage & PIPE_TRANSFER_DONTBLOCK) { + return NULL; + } else { + /* We will be wait for the GPU. Wait for any offloaded + * CS flush to complete to avoid busy-waiting in the winsys. */ + sctx->ws->cs_sync_flush(sctx->gfx_cs); + if (sctx->sdma_cs) + sctx->ws->cs_sync_flush(sctx->sdma_cs); + } + } + + /* Setting the CS to NULL will prevent doing checks we have done already. */ + return sctx->ws->buffer_map(resource->buf, NULL, usage); } -void si_init_resource_fields(struct si_screen *sscreen, - struct si_resource *res, - uint64_t size, unsigned alignment) +void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res, uint64_t size, + unsigned alignment) { - struct si_texture *tex = (struct si_texture*)res; - - res->bo_size = size; - res->bo_alignment = alignment; - res->flags = 0; - res->texture_handle_allocated = false; - res->image_handle_allocated = false; - - switch (res->b.b.usage) { - case PIPE_USAGE_STREAM: - res->flags = RADEON_FLAG_GTT_WC; - /* fall through */ - case PIPE_USAGE_STAGING: - /* Transfers are likely to occur more often with these - * resources. 
*/ - res->domains = RADEON_DOMAIN_GTT; - break; - case PIPE_USAGE_DYNAMIC: - /* Older kernels didn't always flush the HDP cache before - * CS execution - */ - if (!sscreen->info.kernel_flushes_hdp_before_ib) { - res->domains = RADEON_DOMAIN_GTT; - res->flags |= RADEON_FLAG_GTT_WC; - break; - } - /* fall through */ - case PIPE_USAGE_DEFAULT: - case PIPE_USAGE_IMMUTABLE: - default: - /* Not listing GTT here improves performance in some - * apps. */ - res->domains = RADEON_DOMAIN_VRAM; - res->flags |= RADEON_FLAG_GTT_WC; - break; - } - - if (res->b.b.target == PIPE_BUFFER && - res->b.b.flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) { - /* Use GTT for all persistent mappings with older - * kernels, because they didn't always flush the HDP - * cache before CS execution. - * - * Write-combined CPU mappings are fine, the kernel - * ensures all CPU writes finish before the GPU - * executes a command stream. - * - * radeon doesn't have good BO move throttling, so put all - * persistent buffers into GTT to prevent VRAM CPU page faults. - */ - if (!sscreen->info.kernel_flushes_hdp_before_ib || - !sscreen->info.is_amdgpu) - res->domains = RADEON_DOMAIN_GTT; - } - - /* Tiled textures are unmappable. Always put them in VRAM. */ - if ((res->b.b.target != PIPE_BUFFER && !tex->surface.is_linear) || - res->b.b.flags & SI_RESOURCE_FLAG_UNMAPPABLE) { - res->domains = RADEON_DOMAIN_VRAM; - res->flags |= RADEON_FLAG_NO_CPU_ACCESS | - RADEON_FLAG_GTT_WC; - } - - /* Displayable and shareable surfaces are not suballocated. */ - if (res->b.b.bind & (PIPE_BIND_SHARED | PIPE_BIND_SCANOUT)) - res->flags |= RADEON_FLAG_NO_SUBALLOC; /* shareable */ - else - res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING; - - if (sscreen->debug_flags & DBG(NO_WC)) - res->flags &= ~RADEON_FLAG_GTT_WC; - - if (res->b.b.flags & SI_RESOURCE_FLAG_READ_ONLY) - res->flags |= RADEON_FLAG_READ_ONLY; - - if (res->b.b.flags & SI_RESOURCE_FLAG_32BIT) - res->flags |= RADEON_FLAG_32BIT; - - /* Set expected VRAM and GART usage for the buffer. */ - res->vram_usage = 0; - res->gart_usage = 0; - res->max_forced_staging_uploads = 0; - res->b.max_forced_staging_uploads = 0; - - if (res->domains & RADEON_DOMAIN_VRAM) { - res->vram_usage = size; - - res->max_forced_staging_uploads = - res->b.max_forced_staging_uploads = - sscreen->info.has_dedicated_vram && - size >= sscreen->info.vram_vis_size / 4 ? 1 : 0; - } else if (res->domains & RADEON_DOMAIN_GTT) { - res->gart_usage = size; - } + struct si_texture *tex = (struct si_texture *)res; + + res->bo_size = size; + res->bo_alignment = alignment; + res->flags = 0; + res->texture_handle_allocated = false; + res->image_handle_allocated = false; + + switch (res->b.b.usage) { + case PIPE_USAGE_STREAM: + res->flags = RADEON_FLAG_GTT_WC; + /* fall through */ + case PIPE_USAGE_STAGING: + /* Transfers are likely to occur more often with these + * resources. */ + res->domains = RADEON_DOMAIN_GTT; + break; + case PIPE_USAGE_DYNAMIC: + /* Older kernels didn't always flush the HDP cache before + * CS execution + */ + if (!sscreen->info.kernel_flushes_hdp_before_ib) { + res->domains = RADEON_DOMAIN_GTT; + res->flags |= RADEON_FLAG_GTT_WC; + break; + } + /* fall through */ + case PIPE_USAGE_DEFAULT: + case PIPE_USAGE_IMMUTABLE: + default: + /* Not listing GTT here improves performance in some + * apps. 
*/ + res->domains = RADEON_DOMAIN_VRAM; + res->flags |= RADEON_FLAG_GTT_WC; + break; + } + + if (res->b.b.target == PIPE_BUFFER && res->b.b.flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) { + /* Use GTT for all persistent mappings with older + * kernels, because they didn't always flush the HDP + * cache before CS execution. + * + * Write-combined CPU mappings are fine, the kernel + * ensures all CPU writes finish before the GPU + * executes a command stream. + * + * radeon doesn't have good BO move throttling, so put all + * persistent buffers into GTT to prevent VRAM CPU page faults. + */ + if (!sscreen->info.kernel_flushes_hdp_before_ib || !sscreen->info.is_amdgpu) + res->domains = RADEON_DOMAIN_GTT; + } + + /* Tiled textures are unmappable. Always put them in VRAM. */ + if ((res->b.b.target != PIPE_BUFFER && !tex->surface.is_linear) || + res->b.b.flags & SI_RESOURCE_FLAG_UNMAPPABLE) { + res->domains = RADEON_DOMAIN_VRAM; + res->flags |= RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_GTT_WC; + } + + /* Displayable and shareable surfaces are not suballocated. */ + if (res->b.b.bind & (PIPE_BIND_SHARED | PIPE_BIND_SCANOUT)) + res->flags |= RADEON_FLAG_NO_SUBALLOC; /* shareable */ + else + res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING; + + if (sscreen->debug_flags & DBG(NO_WC)) + res->flags &= ~RADEON_FLAG_GTT_WC; + + if (res->b.b.flags & SI_RESOURCE_FLAG_READ_ONLY) + res->flags |= RADEON_FLAG_READ_ONLY; + + if (res->b.b.flags & SI_RESOURCE_FLAG_32BIT) + res->flags |= RADEON_FLAG_32BIT; + + /* Set expected VRAM and GART usage for the buffer. */ + res->vram_usage = 0; + res->gart_usage = 0; + res->max_forced_staging_uploads = 0; + res->b.max_forced_staging_uploads = 0; + + if (res->domains & RADEON_DOMAIN_VRAM) { + res->vram_usage = size; + + res->max_forced_staging_uploads = res->b.max_forced_staging_uploads = + sscreen->info.has_dedicated_vram && size >= sscreen->info.vram_vis_size / 4 ? 1 : 0; + } else if (res->domains & RADEON_DOMAIN_GTT) { + res->gart_usage = size; + } } -bool si_alloc_resource(struct si_screen *sscreen, - struct si_resource *res) +bool si_alloc_resource(struct si_screen *sscreen, struct si_resource *res) { - struct pb_buffer *old_buf, *new_buf; - - /* Allocate a new resource. */ - new_buf = sscreen->ws->buffer_create(sscreen->ws, res->bo_size, - res->bo_alignment, - res->domains, res->flags); - if (!new_buf) { - return false; - } - - /* Replace the pointer such that if res->buf wasn't NULL, it won't be - * NULL. This should prevent crashes with multiple contexts using - * the same buffer where one of the contexts invalidates it while - * the others are using it. */ - old_buf = res->buf; - res->buf = new_buf; /* should be atomic */ - res->gpu_address = sscreen->ws->buffer_get_virtual_address(res->buf); - - if (res->flags & RADEON_FLAG_32BIT) { - uint64_t start = res->gpu_address; - uint64_t last = start + res->bo_size - 1; - (void)start; - (void)last; - - assert((start >> 32) == sscreen->info.address32_hi); - assert((last >> 32) == sscreen->info.address32_hi); - } - - pb_reference(&old_buf, NULL); - - util_range_set_empty(&res->valid_buffer_range); - res->TC_L2_dirty = false; - - /* Print debug information. 
*/ - if (sscreen->debug_flags & DBG(VM) && res->b.b.target == PIPE_BUFFER) { - fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %"PRIu64" bytes\n", - res->gpu_address, res->gpu_address + res->buf->size, - res->buf->size); - } - - if (res->b.b.flags & SI_RESOURCE_FLAG_CLEAR) - si_screen_clear_buffer(sscreen, &res->b.b, 0, res->bo_size, 0); - - return true; + struct pb_buffer *old_buf, *new_buf; + + /* Allocate a new resource. */ + new_buf = sscreen->ws->buffer_create(sscreen->ws, res->bo_size, res->bo_alignment, res->domains, + res->flags); + if (!new_buf) { + return false; + } + + /* Replace the pointer such that if res->buf wasn't NULL, it won't be + * NULL. This should prevent crashes with multiple contexts using + * the same buffer where one of the contexts invalidates it while + * the others are using it. */ + old_buf = res->buf; + res->buf = new_buf; /* should be atomic */ + res->gpu_address = sscreen->ws->buffer_get_virtual_address(res->buf); + + if (res->flags & RADEON_FLAG_32BIT) { + uint64_t start = res->gpu_address; + uint64_t last = start + res->bo_size - 1; + (void)start; + (void)last; + + assert((start >> 32) == sscreen->info.address32_hi); + assert((last >> 32) == sscreen->info.address32_hi); + } + + pb_reference(&old_buf, NULL); + + util_range_set_empty(&res->valid_buffer_range); + res->TC_L2_dirty = false; + + /* Print debug information. */ + if (sscreen->debug_flags & DBG(VM) && res->b.b.target == PIPE_BUFFER) { + fprintf(stderr, "VM start=0x%" PRIX64 " end=0x%" PRIX64 " | Buffer %" PRIu64 " bytes\n", + res->gpu_address, res->gpu_address + res->buf->size, res->buf->size); + } + + if (res->b.b.flags & SI_RESOURCE_FLAG_CLEAR) + si_screen_clear_buffer(sscreen, &res->b.b, 0, res->bo_size, 0); + + return true; } -static void si_buffer_destroy(struct pipe_screen *screen, - struct pipe_resource *buf) +static void si_buffer_destroy(struct pipe_screen *screen, struct pipe_resource *buf) { - struct si_resource *buffer = si_resource(buf); + struct si_resource *buffer = si_resource(buf); - threaded_resource_deinit(buf); - util_range_destroy(&buffer->valid_buffer_range); - pb_reference(&buffer->buf, NULL); - FREE(buffer); + threaded_resource_deinit(buf); + util_range_destroy(&buffer->valid_buffer_range); + pb_reference(&buffer->buf, NULL); + FREE(buffer); } /* Reallocate the buffer a update all resource bindings where the buffer is @@ -266,560 +253,511 @@ static void si_buffer_destroy(struct pipe_screen *screen, * This is used to avoid CPU-GPU synchronizations, because it makes the buffer * idle by discarding its contents. */ -static bool -si_invalidate_buffer(struct si_context *sctx, - struct si_resource *buf) +static bool si_invalidate_buffer(struct si_context *sctx, struct si_resource *buf) { - /* Shared buffers can't be reallocated. */ - if (buf->b.is_shared) - return false; - - /* Sparse buffers can't be reallocated. */ - if (buf->flags & RADEON_FLAG_SPARSE) - return false; - - /* In AMD_pinned_memory, the user pointer association only gets - * broken when the buffer is explicitly re-allocated. - */ - if (buf->b.is_user_ptr) - return false; - - /* Check if mapping this buffer would cause waiting for the GPU. */ - if (si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) || - !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) { - /* Reallocate the buffer in the same pipe_resource. 
*/ - si_alloc_resource(sctx->screen, buf); - si_rebind_buffer(sctx, &buf->b.b); - } else { - util_range_set_empty(&buf->valid_buffer_range); - } - - return true; + /* Shared buffers can't be reallocated. */ + if (buf->b.is_shared) + return false; + + /* Sparse buffers can't be reallocated. */ + if (buf->flags & RADEON_FLAG_SPARSE) + return false; + + /* In AMD_pinned_memory, the user pointer association only gets + * broken when the buffer is explicitly re-allocated. + */ + if (buf->b.is_user_ptr) + return false; + + /* Check if mapping this buffer would cause waiting for the GPU. */ + if (si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) || + !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) { + /* Reallocate the buffer in the same pipe_resource. */ + si_alloc_resource(sctx->screen, buf); + si_rebind_buffer(sctx, &buf->b.b); + } else { + util_range_set_empty(&buf->valid_buffer_range); + } + + return true; } /* Replace the storage of dst with src. */ -void si_replace_buffer_storage(struct pipe_context *ctx, - struct pipe_resource *dst, - struct pipe_resource *src) +void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst, + struct pipe_resource *src) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_resource *sdst = si_resource(dst); - struct si_resource *ssrc = si_resource(src); - - pb_reference(&sdst->buf, ssrc->buf); - sdst->gpu_address = ssrc->gpu_address; - sdst->b.b.bind = ssrc->b.b.bind; - sdst->b.max_forced_staging_uploads = ssrc->b.max_forced_staging_uploads; - sdst->max_forced_staging_uploads = ssrc->max_forced_staging_uploads; - sdst->flags = ssrc->flags; - - assert(sdst->vram_usage == ssrc->vram_usage); - assert(sdst->gart_usage == ssrc->gart_usage); - assert(sdst->bo_size == ssrc->bo_size); - assert(sdst->bo_alignment == ssrc->bo_alignment); - assert(sdst->domains == ssrc->domains); - - si_rebind_buffer(sctx, dst); + struct si_context *sctx = (struct si_context *)ctx; + struct si_resource *sdst = si_resource(dst); + struct si_resource *ssrc = si_resource(src); + + pb_reference(&sdst->buf, ssrc->buf); + sdst->gpu_address = ssrc->gpu_address; + sdst->b.b.bind = ssrc->b.b.bind; + sdst->b.max_forced_staging_uploads = ssrc->b.max_forced_staging_uploads; + sdst->max_forced_staging_uploads = ssrc->max_forced_staging_uploads; + sdst->flags = ssrc->flags; + + assert(sdst->vram_usage == ssrc->vram_usage); + assert(sdst->gart_usage == ssrc->gart_usage); + assert(sdst->bo_size == ssrc->bo_size); + assert(sdst->bo_alignment == ssrc->bo_alignment); + assert(sdst->domains == ssrc->domains); + + si_rebind_buffer(sctx, dst); } -static void si_invalidate_resource(struct pipe_context *ctx, - struct pipe_resource *resource) +static void si_invalidate_resource(struct pipe_context *ctx, struct pipe_resource *resource) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_resource *buf = si_resource(resource); + struct si_context *sctx = (struct si_context *)ctx; + struct si_resource *buf = si_resource(resource); - /* We currently only do anyting here for buffers */ - if (resource->target == PIPE_BUFFER) - (void)si_invalidate_buffer(sctx, buf); + /* We currently only do anyting here for buffers */ + if (resource->target == PIPE_BUFFER) + (void)si_invalidate_buffer(sctx, buf); } -static void *si_buffer_get_transfer(struct pipe_context *ctx, - struct pipe_resource *resource, - unsigned usage, - const struct pipe_box *box, - struct pipe_transfer **ptransfer, - void *data, struct si_resource *staging, - unsigned 
offset) +static void *si_buffer_get_transfer(struct pipe_context *ctx, struct pipe_resource *resource, + unsigned usage, const struct pipe_box *box, + struct pipe_transfer **ptransfer, void *data, + struct si_resource *staging, unsigned offset) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_transfer *transfer; - - if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) - transfer = slab_alloc(&sctx->pool_transfers_unsync); - else - transfer = slab_alloc(&sctx->pool_transfers); - - transfer->b.b.resource = NULL; - pipe_resource_reference(&transfer->b.b.resource, resource); - transfer->b.b.level = 0; - transfer->b.b.usage = usage; - transfer->b.b.box = *box; - transfer->b.b.stride = 0; - transfer->b.b.layer_stride = 0; - transfer->b.staging = NULL; - transfer->offset = offset; - transfer->staging = staging; - *ptransfer = &transfer->b.b; - return data; + struct si_context *sctx = (struct si_context *)ctx; + struct si_transfer *transfer; + + if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) + transfer = slab_alloc(&sctx->pool_transfers_unsync); + else + transfer = slab_alloc(&sctx->pool_transfers); + + transfer->b.b.resource = NULL; + pipe_resource_reference(&transfer->b.b.resource, resource); + transfer->b.b.level = 0; + transfer->b.b.usage = usage; + transfer->b.b.box = *box; + transfer->b.b.stride = 0; + transfer->b.b.layer_stride = 0; + transfer->b.staging = NULL; + transfer->offset = offset; + transfer->staging = staging; + *ptransfer = &transfer->b.b; + return data; } -static void *si_buffer_transfer_map(struct pipe_context *ctx, - struct pipe_resource *resource, - unsigned level, - unsigned usage, - const struct pipe_box *box, - struct pipe_transfer **ptransfer) +static void *si_buffer_transfer_map(struct pipe_context *ctx, struct pipe_resource *resource, + unsigned level, unsigned usage, const struct pipe_box *box, + struct pipe_transfer **ptransfer) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_resource *buf = si_resource(resource); - uint8_t *data; - - assert(box->x + box->width <= resource->width0); - - /* From GL_AMD_pinned_memory issues: - * - * 4) Is glMapBuffer on a shared buffer guaranteed to return the - * same system address which was specified at creation time? - * - * RESOLVED: NO. The GL implementation might return a different - * virtual mapping of that memory, although the same physical - * page will be used. - * - * So don't ever use staging buffers. - */ - if (buf->b.is_user_ptr) - usage |= PIPE_TRANSFER_PERSISTENT; - - /* See if the buffer range being mapped has never been initialized, - * in which case it can be mapped unsynchronized. */ - if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | - TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)) && - usage & PIPE_TRANSFER_WRITE && - !buf->b.is_shared && - !util_ranges_intersect(&buf->valid_buffer_range, box->x, box->x + box->width)) { - usage |= PIPE_TRANSFER_UNSYNCHRONIZED; - } - - /* If discarding the entire range, discard the whole resource instead. */ - if (usage & PIPE_TRANSFER_DISCARD_RANGE && - box->x == 0 && box->width == resource->width0) { - usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE; - } - - /* If a buffer in VRAM is too large and the range is discarded, don't - * map it directly. This makes sure that the buffer stays in VRAM. - */ - bool force_discard_range = false; - if (usage & (PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE | - PIPE_TRANSFER_DISCARD_RANGE) && - !(usage & PIPE_TRANSFER_PERSISTENT) && - /* Try not to decrement the counter if it's not positive. 
Still racy, - * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */ - buf->max_forced_staging_uploads > 0 && - p_atomic_dec_return(&buf->max_forced_staging_uploads) >= 0) { - usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE | - PIPE_TRANSFER_UNSYNCHRONIZED); - usage |= PIPE_TRANSFER_DISCARD_RANGE; - force_discard_range = true; - } - - if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE && - !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | - TC_TRANSFER_MAP_NO_INVALIDATE))) { - assert(usage & PIPE_TRANSFER_WRITE); - - if (si_invalidate_buffer(sctx, buf)) { - /* At this point, the buffer is always idle. */ - usage |= PIPE_TRANSFER_UNSYNCHRONIZED; - } else { - /* Fall back to a temporary buffer. */ - usage |= PIPE_TRANSFER_DISCARD_RANGE; - } - } - - if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT && - buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) { - usage &= ~(PIPE_TRANSFER_UNSYNCHRONIZED | - PIPE_TRANSFER_PERSISTENT); - usage |= PIPE_TRANSFER_DISCARD_RANGE; - force_discard_range = true; - } - - if (usage & PIPE_TRANSFER_DISCARD_RANGE && - ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | - PIPE_TRANSFER_PERSISTENT))) || - (buf->flags & RADEON_FLAG_SPARSE))) { - assert(usage & PIPE_TRANSFER_WRITE); - - /* Check if mapping this buffer would cause waiting for the GPU. - */ - if (buf->flags & RADEON_FLAG_SPARSE || - force_discard_range || - si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) || - !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) { - /* Do a wait-free write-only transfer using a temporary buffer. */ - struct u_upload_mgr *uploader; - struct si_resource *staging = NULL; - unsigned offset; - - /* If we are not called from the driver thread, we have - * to use the uploader from u_threaded_context, which is - * local to the calling thread. - */ - if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) - uploader = sctx->tc->base.stream_uploader; - else - uploader = sctx->b.stream_uploader; - - u_upload_alloc(uploader, 0, - box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT), - sctx->screen->info.tcc_cache_line_size, - &offset, (struct pipe_resource**)&staging, - (void**)&data); - - if (staging) { - data += box->x % SI_MAP_BUFFER_ALIGNMENT; - return si_buffer_get_transfer(ctx, resource, usage, box, - ptransfer, data, staging, offset); - } else if (buf->flags & RADEON_FLAG_SPARSE) { - return NULL; - } - } else { - /* At this point, the buffer is always idle (we checked it above). */ - usage |= PIPE_TRANSFER_UNSYNCHRONIZED; - } - } - /* Use a staging buffer in cached GTT for reads. */ - else if (((usage & PIPE_TRANSFER_READ) && - !(usage & PIPE_TRANSFER_PERSISTENT) && - (buf->domains & RADEON_DOMAIN_VRAM || - buf->flags & RADEON_FLAG_GTT_WC)) || - (buf->flags & RADEON_FLAG_SPARSE)) { - struct si_resource *staging; - - assert(!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC)); - staging = si_resource(pipe_buffer_create( - ctx->screen, 0, PIPE_USAGE_STAGING, - box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT))); - if (staging) { - /* Copy the VRAM buffer to the staging buffer. 
*/ - si_sdma_copy_buffer(sctx, &staging->b.b, resource, - box->x % SI_MAP_BUFFER_ALIGNMENT, - box->x, box->width); - - data = si_buffer_map_sync_with_rings(sctx, staging, - usage & ~PIPE_TRANSFER_UNSYNCHRONIZED); - if (!data) { - si_resource_reference(&staging, NULL); - return NULL; - } - data += box->x % SI_MAP_BUFFER_ALIGNMENT; - - return si_buffer_get_transfer(ctx, resource, usage, box, - ptransfer, data, staging, 0); - } else if (buf->flags & RADEON_FLAG_SPARSE) { - return NULL; - } - } - - data = si_buffer_map_sync_with_rings(sctx, buf, usage); - if (!data) { - return NULL; - } - data += box->x; - - return si_buffer_get_transfer(ctx, resource, usage, box, - ptransfer, data, NULL, 0); + struct si_context *sctx = (struct si_context *)ctx; + struct si_resource *buf = si_resource(resource); + uint8_t *data; + + assert(box->x + box->width <= resource->width0); + + /* From GL_AMD_pinned_memory issues: + * + * 4) Is glMapBuffer on a shared buffer guaranteed to return the + * same system address which was specified at creation time? + * + * RESOLVED: NO. The GL implementation might return a different + * virtual mapping of that memory, although the same physical + * page will be used. + * + * So don't ever use staging buffers. + */ + if (buf->b.is_user_ptr) + usage |= PIPE_TRANSFER_PERSISTENT; + + /* See if the buffer range being mapped has never been initialized, + * in which case it can be mapped unsynchronized. */ + if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)) && + usage & PIPE_TRANSFER_WRITE && !buf->b.is_shared && + !util_ranges_intersect(&buf->valid_buffer_range, box->x, box->x + box->width)) { + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + } + + /* If discarding the entire range, discard the whole resource instead. */ + if (usage & PIPE_TRANSFER_DISCARD_RANGE && box->x == 0 && box->width == resource->width0) { + usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE; + } + + /* If a buffer in VRAM is too large and the range is discarded, don't + * map it directly. This makes sure that the buffer stays in VRAM. + */ + bool force_discard_range = false; + if (usage & (PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE | PIPE_TRANSFER_DISCARD_RANGE) && + !(usage & PIPE_TRANSFER_PERSISTENT) && + /* Try not to decrement the counter if it's not positive. Still racy, + * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */ + buf->max_forced_staging_uploads > 0 && + p_atomic_dec_return(&buf->max_forced_staging_uploads) >= 0) { + usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE | PIPE_TRANSFER_UNSYNCHRONIZED); + usage |= PIPE_TRANSFER_DISCARD_RANGE; + force_discard_range = true; + } + + if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE && + !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | TC_TRANSFER_MAP_NO_INVALIDATE))) { + assert(usage & PIPE_TRANSFER_WRITE); + + if (si_invalidate_buffer(sctx, buf)) { + /* At this point, the buffer is always idle. */ + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + } else { + /* Fall back to a temporary buffer. 
*/ + usage |= PIPE_TRANSFER_DISCARD_RANGE; + } + } + + if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT && + buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) { + usage &= ~(PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_PERSISTENT); + usage |= PIPE_TRANSFER_DISCARD_RANGE; + force_discard_range = true; + } + + if (usage & PIPE_TRANSFER_DISCARD_RANGE && + ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_PERSISTENT))) || + (buf->flags & RADEON_FLAG_SPARSE))) { + assert(usage & PIPE_TRANSFER_WRITE); + + /* Check if mapping this buffer would cause waiting for the GPU. + */ + if (buf->flags & RADEON_FLAG_SPARSE || force_discard_range || + si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) || + !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) { + /* Do a wait-free write-only transfer using a temporary buffer. */ + struct u_upload_mgr *uploader; + struct si_resource *staging = NULL; + unsigned offset; + + /* If we are not called from the driver thread, we have + * to use the uploader from u_threaded_context, which is + * local to the calling thread. + */ + if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) + uploader = sctx->tc->base.stream_uploader; + else + uploader = sctx->b.stream_uploader; + + u_upload_alloc(uploader, 0, box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT), + sctx->screen->info.tcc_cache_line_size, &offset, + (struct pipe_resource **)&staging, (void **)&data); + + if (staging) { + data += box->x % SI_MAP_BUFFER_ALIGNMENT; + return si_buffer_get_transfer(ctx, resource, usage, box, ptransfer, data, staging, + offset); + } else if (buf->flags & RADEON_FLAG_SPARSE) { + return NULL; + } + } else { + /* At this point, the buffer is always idle (we checked it above). */ + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + } + } + /* Use a staging buffer in cached GTT for reads. */ + else if (((usage & PIPE_TRANSFER_READ) && !(usage & PIPE_TRANSFER_PERSISTENT) && + (buf->domains & RADEON_DOMAIN_VRAM || buf->flags & RADEON_FLAG_GTT_WC)) || + (buf->flags & RADEON_FLAG_SPARSE)) { + struct si_resource *staging; + + assert(!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC)); + staging = si_resource(pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_STAGING, + box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT))); + if (staging) { + /* Copy the VRAM buffer to the staging buffer. 
*/ + si_sdma_copy_buffer(sctx, &staging->b.b, resource, box->x % SI_MAP_BUFFER_ALIGNMENT, + box->x, box->width); + + data = si_buffer_map_sync_with_rings(sctx, staging, usage & ~PIPE_TRANSFER_UNSYNCHRONIZED); + if (!data) { + si_resource_reference(&staging, NULL); + return NULL; + } + data += box->x % SI_MAP_BUFFER_ALIGNMENT; + + return si_buffer_get_transfer(ctx, resource, usage, box, ptransfer, data, staging, 0); + } else if (buf->flags & RADEON_FLAG_SPARSE) { + return NULL; + } + } + + data = si_buffer_map_sync_with_rings(sctx, buf, usage); + if (!data) { + return NULL; + } + data += box->x; + + return si_buffer_get_transfer(ctx, resource, usage, box, ptransfer, data, NULL, 0); } -static void si_buffer_do_flush_region(struct pipe_context *ctx, - struct pipe_transfer *transfer, - const struct pipe_box *box) +static void si_buffer_do_flush_region(struct pipe_context *ctx, struct pipe_transfer *transfer, + const struct pipe_box *box) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_transfer *stransfer = (struct si_transfer*)transfer; - struct si_resource *buf = si_resource(transfer->resource); - - if (stransfer->staging) { - unsigned src_offset = stransfer->offset + - transfer->box.x % SI_MAP_BUFFER_ALIGNMENT + - (box->x - transfer->box.x); - - if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) { - /* This should be true for all uploaders. */ - assert(transfer->box.x == 0); - - /* Find a previous upload and extend its range. The last - * upload is likely to be at the end of the list. - */ - for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) { - struct si_sdma_upload *up = &sctx->sdma_uploads[i]; - - if (up->dst != buf) - continue; - - assert(up->src == stransfer->staging); - assert(box->x > up->dst_offset); - up->size = box->x + box->width - up->dst_offset; - return; - } - - /* Enlarge the array if it's full. */ - if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) { - unsigned size; - - sctx->max_sdma_uploads += 4; - size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]); - sctx->sdma_uploads = realloc(sctx->sdma_uploads, size); - } - - /* Add a new upload. */ - struct si_sdma_upload *up = - &sctx->sdma_uploads[sctx->num_sdma_uploads++]; - up->dst = up->src = NULL; - si_resource_reference(&up->dst, buf); - si_resource_reference(&up->src, stransfer->staging); - up->dst_offset = box->x; - up->src_offset = src_offset; - up->size = box->width; - return; - } - - /* Copy the staging buffer into the original one. */ - si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b, - box->x, src_offset, box->width); - } - - util_range_add(&buf->b.b, &buf->valid_buffer_range, box->x, - box->x + box->width); + struct si_context *sctx = (struct si_context *)ctx; + struct si_transfer *stransfer = (struct si_transfer *)transfer; + struct si_resource *buf = si_resource(transfer->resource); + + if (stransfer->staging) { + unsigned src_offset = + stransfer->offset + transfer->box.x % SI_MAP_BUFFER_ALIGNMENT + (box->x - transfer->box.x); + + if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) { + /* This should be true for all uploaders. */ + assert(transfer->box.x == 0); + + /* Find a previous upload and extend its range. The last + * upload is likely to be at the end of the list. 
+ */ + for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) { + struct si_sdma_upload *up = &sctx->sdma_uploads[i]; + + if (up->dst != buf) + continue; + + assert(up->src == stransfer->staging); + assert(box->x > up->dst_offset); + up->size = box->x + box->width - up->dst_offset; + return; + } + + /* Enlarge the array if it's full. */ + if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) { + unsigned size; + + sctx->max_sdma_uploads += 4; + size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]); + sctx->sdma_uploads = realloc(sctx->sdma_uploads, size); + } + + /* Add a new upload. */ + struct si_sdma_upload *up = &sctx->sdma_uploads[sctx->num_sdma_uploads++]; + up->dst = up->src = NULL; + si_resource_reference(&up->dst, buf); + si_resource_reference(&up->src, stransfer->staging); + up->dst_offset = box->x; + up->src_offset = src_offset; + up->size = box->width; + return; + } + + /* Copy the staging buffer into the original one. */ + si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b, box->x, src_offset, + box->width); + } + + util_range_add(&buf->b.b, &buf->valid_buffer_range, box->x, box->x + box->width); } -static void si_buffer_flush_region(struct pipe_context *ctx, - struct pipe_transfer *transfer, - const struct pipe_box *rel_box) +static void si_buffer_flush_region(struct pipe_context *ctx, struct pipe_transfer *transfer, + const struct pipe_box *rel_box) { - unsigned required_usage = PIPE_TRANSFER_WRITE | - PIPE_TRANSFER_FLUSH_EXPLICIT; + unsigned required_usage = PIPE_TRANSFER_WRITE | PIPE_TRANSFER_FLUSH_EXPLICIT; - if ((transfer->usage & required_usage) == required_usage) { - struct pipe_box box; + if ((transfer->usage & required_usage) == required_usage) { + struct pipe_box box; - u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box); - si_buffer_do_flush_region(ctx, transfer, &box); - } + u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box); + si_buffer_do_flush_region(ctx, transfer, &box); + } } -static void si_buffer_transfer_unmap(struct pipe_context *ctx, - struct pipe_transfer *transfer) +static void si_buffer_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer *transfer) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_transfer *stransfer = (struct si_transfer*)transfer; + struct si_context *sctx = (struct si_context *)ctx; + struct si_transfer *stransfer = (struct si_transfer *)transfer; - if (transfer->usage & PIPE_TRANSFER_WRITE && - !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) - si_buffer_do_flush_region(ctx, transfer, &transfer->box); + if (transfer->usage & PIPE_TRANSFER_WRITE && !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) + si_buffer_do_flush_region(ctx, transfer, &transfer->box); - si_resource_reference(&stransfer->staging, NULL); - assert(stransfer->b.staging == NULL); /* for threaded context only */ - pipe_resource_reference(&transfer->resource, NULL); + si_resource_reference(&stransfer->staging, NULL); + assert(stransfer->b.staging == NULL); /* for threaded context only */ + pipe_resource_reference(&transfer->resource, NULL); - /* Don't use pool_transfers_unsync. We are always in the driver - * thread. */ - slab_free(&sctx->pool_transfers, transfer); + /* Don't use pool_transfers_unsync. We are always in the driver + * thread. 
*/ + slab_free(&sctx->pool_transfers, transfer); } -static void si_buffer_subdata(struct pipe_context *ctx, - struct pipe_resource *buffer, - unsigned usage, unsigned offset, - unsigned size, const void *data) +static void si_buffer_subdata(struct pipe_context *ctx, struct pipe_resource *buffer, + unsigned usage, unsigned offset, unsigned size, const void *data) { - struct pipe_transfer *transfer = NULL; - struct pipe_box box; - uint8_t *map = NULL; + struct pipe_transfer *transfer = NULL; + struct pipe_box box; + uint8_t *map = NULL; - usage |= PIPE_TRANSFER_WRITE; + usage |= PIPE_TRANSFER_WRITE; - if (!(usage & PIPE_TRANSFER_MAP_DIRECTLY)) - usage |= PIPE_TRANSFER_DISCARD_RANGE; + if (!(usage & PIPE_TRANSFER_MAP_DIRECTLY)) + usage |= PIPE_TRANSFER_DISCARD_RANGE; - u_box_1d(offset, size, &box); - map = si_buffer_transfer_map(ctx, buffer, 0, usage, &box, &transfer); - if (!map) - return; + u_box_1d(offset, size, &box); + map = si_buffer_transfer_map(ctx, buffer, 0, usage, &box, &transfer); + if (!map) + return; - memcpy(map, data, size); - si_buffer_transfer_unmap(ctx, transfer); + memcpy(map, data, size); + si_buffer_transfer_unmap(ctx, transfer); } -static const struct u_resource_vtbl si_buffer_vtbl = -{ - NULL, /* get_handle */ - si_buffer_destroy, /* resource_destroy */ - si_buffer_transfer_map, /* transfer_map */ - si_buffer_flush_region, /* transfer_flush_region */ - si_buffer_transfer_unmap, /* transfer_unmap */ +static const struct u_resource_vtbl si_buffer_vtbl = { + NULL, /* get_handle */ + si_buffer_destroy, /* resource_destroy */ + si_buffer_transfer_map, /* transfer_map */ + si_buffer_flush_region, /* transfer_flush_region */ + si_buffer_transfer_unmap, /* transfer_unmap */ }; -static struct si_resource * -si_alloc_buffer_struct(struct pipe_screen *screen, - const struct pipe_resource *templ) +static struct si_resource *si_alloc_buffer_struct(struct pipe_screen *screen, + const struct pipe_resource *templ) { - struct si_resource *buf; + struct si_resource *buf; - buf = MALLOC_STRUCT(si_resource); + buf = MALLOC_STRUCT(si_resource); - buf->b.b = *templ; - buf->b.b.next = NULL; - pipe_reference_init(&buf->b.b.reference, 1); - buf->b.b.screen = screen; + buf->b.b = *templ; + buf->b.b.next = NULL; + pipe_reference_init(&buf->b.b.reference, 1); + buf->b.b.screen = screen; - buf->b.vtbl = &si_buffer_vtbl; - threaded_resource_init(&buf->b.b); + buf->b.vtbl = &si_buffer_vtbl; + threaded_resource_init(&buf->b.b); - buf->buf = NULL; - buf->bind_history = 0; - buf->TC_L2_dirty = false; - util_range_init(&buf->valid_buffer_range); - return buf; + buf->buf = NULL; + buf->bind_history = 0; + buf->TC_L2_dirty = false; + util_range_init(&buf->valid_buffer_range); + return buf; } static struct pipe_resource *si_buffer_create(struct pipe_screen *screen, - const struct pipe_resource *templ, - unsigned alignment) + const struct pipe_resource *templ, unsigned alignment) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_resource *buf = si_alloc_buffer_struct(screen, templ); + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_resource *buf = si_alloc_buffer_struct(screen, templ); - if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE) - buf->b.b.flags |= SI_RESOURCE_FLAG_UNMAPPABLE; + if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE) + buf->b.b.flags |= SI_RESOURCE_FLAG_UNMAPPABLE; - si_init_resource_fields(sscreen, buf, templ->width0, alignment); + si_init_resource_fields(sscreen, buf, templ->width0, alignment); - if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE) - buf->flags 
|= RADEON_FLAG_SPARSE; + if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE) + buf->flags |= RADEON_FLAG_SPARSE; - if (!si_alloc_resource(sscreen, buf)) { - FREE(buf); - return NULL; - } - return &buf->b.b; + if (!si_alloc_resource(sscreen, buf)) { + FREE(buf); + return NULL; + } + return &buf->b.b; } -struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, - unsigned flags, unsigned usage, - unsigned size, unsigned alignment) +struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, unsigned flags, + unsigned usage, unsigned size, unsigned alignment) { - struct pipe_resource buffer; - - memset(&buffer, 0, sizeof buffer); - buffer.target = PIPE_BUFFER; - buffer.format = PIPE_FORMAT_R8_UNORM; - buffer.bind = 0; - buffer.usage = usage; - buffer.flags = flags; - buffer.width0 = size; - buffer.height0 = 1; - buffer.depth0 = 1; - buffer.array_size = 1; - return si_buffer_create(screen, &buffer, alignment); + struct pipe_resource buffer; + + memset(&buffer, 0, sizeof buffer); + buffer.target = PIPE_BUFFER; + buffer.format = PIPE_FORMAT_R8_UNORM; + buffer.bind = 0; + buffer.usage = usage; + buffer.flags = flags; + buffer.width0 = size; + buffer.height0 = 1; + buffer.depth0 = 1; + buffer.array_size = 1; + return si_buffer_create(screen, &buffer, alignment); } -struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, - unsigned flags, unsigned usage, - unsigned size, unsigned alignment) +struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, unsigned flags, + unsigned usage, unsigned size, unsigned alignment) { - return si_resource(pipe_aligned_buffer_create(screen, flags, usage, - size, alignment)); + return si_resource(pipe_aligned_buffer_create(screen, flags, usage, size, alignment)); } -static struct pipe_resource * -si_buffer_from_user_memory(struct pipe_screen *screen, - const struct pipe_resource *templ, - void *user_memory) +static struct pipe_resource *si_buffer_from_user_memory(struct pipe_screen *screen, + const struct pipe_resource *templ, + void *user_memory) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct radeon_winsys *ws = sscreen->ws; - struct si_resource *buf = si_alloc_buffer_struct(screen, templ); - - buf->domains = RADEON_DOMAIN_GTT; - buf->flags = 0; - buf->b.is_user_ptr = true; - util_range_add(&buf->b.b, &buf->valid_buffer_range, 0, templ->width0); - util_range_add(&buf->b.b, &buf->b.valid_buffer_range, 0, templ->width0); - - /* Convert a user pointer to a buffer. */ - buf->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0); - if (!buf->buf) { - FREE(buf); - return NULL; - } - - buf->gpu_address = ws->buffer_get_virtual_address(buf->buf); - buf->vram_usage = 0; - buf->gart_usage = templ->width0; - - return &buf->b.b; + struct si_screen *sscreen = (struct si_screen *)screen; + struct radeon_winsys *ws = sscreen->ws; + struct si_resource *buf = si_alloc_buffer_struct(screen, templ); + + buf->domains = RADEON_DOMAIN_GTT; + buf->flags = 0; + buf->b.is_user_ptr = true; + util_range_add(&buf->b.b, &buf->valid_buffer_range, 0, templ->width0); + util_range_add(&buf->b.b, &buf->b.valid_buffer_range, 0, templ->width0); + + /* Convert a user pointer to a buffer. 
*/ + buf->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0); + if (!buf->buf) { + FREE(buf); + return NULL; + } + + buf->gpu_address = ws->buffer_get_virtual_address(buf->buf); + buf->vram_usage = 0; + buf->gart_usage = templ->width0; + + return &buf->b.b; } static struct pipe_resource *si_resource_create(struct pipe_screen *screen, - const struct pipe_resource *templ) + const struct pipe_resource *templ) { - if (templ->target == PIPE_BUFFER) { - return si_buffer_create(screen, templ, 256); - } else { - return si_texture_create(screen, templ); - } + if (templ->target == PIPE_BUFFER) { + return si_buffer_create(screen, templ, 256); + } else { + return si_texture_create(screen, templ); + } } -static bool si_resource_commit(struct pipe_context *pctx, - struct pipe_resource *resource, - unsigned level, struct pipe_box *box, - bool commit) +static bool si_resource_commit(struct pipe_context *pctx, struct pipe_resource *resource, + unsigned level, struct pipe_box *box, bool commit) { - struct si_context *ctx = (struct si_context *)pctx; - struct si_resource *res = si_resource(resource); - - /* - * Since buffer commitment changes cannot be pipelined, we need to - * (a) flush any pending commands that refer to the buffer we're about - * to change, and - * (b) wait for threaded submit to finish, including those that were - * triggered by some other, earlier operation. - */ - if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) && - ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, - res->buf, RADEON_USAGE_READWRITE)) { - si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - } - if (radeon_emitted(ctx->sdma_cs, 0) && - ctx->ws->cs_is_buffer_referenced(ctx->sdma_cs, - res->buf, RADEON_USAGE_READWRITE)) { - si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL); - } - - if (ctx->sdma_cs) - ctx->ws->cs_sync_flush(ctx->sdma_cs); - ctx->ws->cs_sync_flush(ctx->gfx_cs); - - assert(resource->target == PIPE_BUFFER); - - return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit); + struct si_context *ctx = (struct si_context *)pctx; + struct si_resource *res = si_resource(resource); + + /* + * Since buffer commitment changes cannot be pipelined, we need to + * (a) flush any pending commands that refer to the buffer we're about + * to change, and + * (b) wait for threaded submit to finish, including those that were + * triggered by some other, earlier operation. 
+ */ + if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) && + ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, res->buf, RADEON_USAGE_READWRITE)) { + si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + } + if (radeon_emitted(ctx->sdma_cs, 0) && + ctx->ws->cs_is_buffer_referenced(ctx->sdma_cs, res->buf, RADEON_USAGE_READWRITE)) { + si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL); + } + + if (ctx->sdma_cs) + ctx->ws->cs_sync_flush(ctx->sdma_cs); + ctx->ws->cs_sync_flush(ctx->gfx_cs); + + assert(resource->target == PIPE_BUFFER); + + return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit); } void si_init_screen_buffer_functions(struct si_screen *sscreen) { - sscreen->b.resource_create = si_resource_create; - sscreen->b.resource_destroy = u_resource_destroy_vtbl; - sscreen->b.resource_from_user_memory = si_buffer_from_user_memory; + sscreen->b.resource_create = si_resource_create; + sscreen->b.resource_destroy = u_resource_destroy_vtbl; + sscreen->b.resource_from_user_memory = si_buffer_from_user_memory; } void si_init_buffer_functions(struct si_context *sctx) { - sctx->b.invalidate_resource = si_invalidate_resource; - sctx->b.transfer_map = u_transfer_map_vtbl; - sctx->b.transfer_flush_region = u_transfer_flush_region_vtbl; - sctx->b.transfer_unmap = u_transfer_unmap_vtbl; - sctx->b.texture_subdata = u_default_texture_subdata; - sctx->b.buffer_subdata = si_buffer_subdata; - sctx->b.resource_commit = si_resource_commit; + sctx->b.invalidate_resource = si_invalidate_resource; + sctx->b.transfer_map = u_transfer_map_vtbl; + sctx->b.transfer_flush_region = u_transfer_flush_region_vtbl; + sctx->b.transfer_unmap = u_transfer_unmap_vtbl; + sctx->b.texture_subdata = u_default_texture_subdata; + sctx->b.buffer_subdata = si_buffer_subdata; + sctx->b.resource_commit = si_resource_commit; } diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index 0b0b64ca13c..8a9b6ea5e34 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -34,131 +34,128 @@ static inline void radeon_set_config_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num) { - assert(reg < SI_CONTEXT_REG_OFFSET); - assert(cs->current.cdw + 2 + num <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0)); - radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2); + assert(reg < SI_CONTEXT_REG_OFFSET); + assert(cs->current.cdw + 2 + num <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0)); + radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2); } static inline void radeon_set_config_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value) { - radeon_set_config_reg_seq(cs, reg, 1); - radeon_emit(cs, value); + radeon_set_config_reg_seq(cs, reg, 1); + radeon_emit(cs, value); } static inline void radeon_set_context_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num) { - assert(reg >= SI_CONTEXT_REG_OFFSET); - assert(cs->current.cdw + 2 + num <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0)); - radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); + assert(reg >= SI_CONTEXT_REG_OFFSET); + assert(cs->current.cdw + 2 + num <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0)); + radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); } static inline void radeon_set_context_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value) { - radeon_set_context_reg_seq(cs, reg, 1); - radeon_emit(cs, value); + 
radeon_set_context_reg_seq(cs, reg, 1); + radeon_emit(cs, value); } -static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs, - unsigned reg, unsigned idx, - unsigned value) +static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs, unsigned reg, unsigned idx, + unsigned value) { - assert(reg >= SI_CONTEXT_REG_OFFSET); - assert(cs->current.cdw + 3 <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0)); - radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2 | (idx << 28)); - radeon_emit(cs, value); + assert(reg >= SI_CONTEXT_REG_OFFSET); + assert(cs->current.cdw + 3 <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0)); + radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2 | (idx << 28)); + radeon_emit(cs, value); } static inline void radeon_set_sh_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num) { - assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END); - assert(cs->current.cdw + 2 + num <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0)); - radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2); + assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END); + assert(cs->current.cdw + 2 + num <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0)); + radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2); } static inline void radeon_set_sh_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value) { - radeon_set_sh_reg_seq(cs, reg, 1); - radeon_emit(cs, value); + radeon_set_sh_reg_seq(cs, reg, 1); + radeon_emit(cs, value); } static inline void radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num) { - assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); - assert(cs->current.cdw + 2 + num <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0)); - radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2); + assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); + assert(cs->current.cdw + 2 + num <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0)); + radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2); } static inline void radeon_set_uconfig_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value) { - radeon_set_uconfig_reg_seq(cs, reg, 1); - radeon_emit(cs, value); + radeon_set_uconfig_reg_seq(cs, reg, 1); + radeon_emit(cs, value); } -static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs, - struct si_screen *screen, - unsigned reg, unsigned idx, - unsigned value) +static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs, struct si_screen *screen, + unsigned reg, unsigned idx, unsigned value) { - assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); - assert(cs->current.cdw + 3 <= cs->current.max_dw); - assert(idx != 0); - unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX; - if (screen->info.chip_class < GFX9 || - (screen->info.chip_class == GFX9 && screen->info.me_fw_version < 26)) - opcode = PKT3_SET_UCONFIG_REG; - radeon_emit(cs, PKT3(opcode, 1, 0)); - radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28)); - radeon_emit(cs, value); + assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); + assert(cs->current.cdw + 3 <= cs->current.max_dw); + assert(idx != 0); + unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX; + if (screen->info.chip_class < GFX9 || + (screen->info.chip_class == GFX9 && screen->info.me_fw_version < 26)) + opcode = PKT3_SET_UCONFIG_REG; + radeon_emit(cs, PKT3(opcode, 1, 0)); + radeon_emit(cs, (reg - 
CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28)); + radeon_emit(cs, value); } static inline void radeon_set_context_reg_rmw(struct radeon_cmdbuf *cs, unsigned reg, - unsigned value, unsigned mask) + unsigned value, unsigned mask) { - assert(reg >= SI_CONTEXT_REG_OFFSET); - assert(cs->current.cdw + 4 <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_CONTEXT_REG_RMW, 2, 0)); - radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); - radeon_emit(cs, mask); - radeon_emit(cs, value); + assert(reg >= SI_CONTEXT_REG_OFFSET); + assert(cs->current.cdw + 4 <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_CONTEXT_REG_RMW, 2, 0)); + radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); + radeon_emit(cs, mask); + radeon_emit(cs, value); } /* Emit PKT3_CONTEXT_REG_RMW if the register value is different. */ static inline void radeon_opt_set_context_reg_rmw(struct si_context *sctx, unsigned offset, - enum si_tracked_reg reg, unsigned value, - unsigned mask) + enum si_tracked_reg reg, unsigned value, + unsigned mask) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - assert((value & ~mask) == 0); - value &= mask; + assert((value & ~mask) == 0); + value &= mask; - if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || - sctx->tracked_regs.reg_value[reg] != value) { - radeon_set_context_reg_rmw(cs, offset, value, mask); + if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || + sctx->tracked_regs.reg_value[reg] != value) { + radeon_set_context_reg_rmw(cs, offset, value, mask); - sctx->tracked_regs.reg_saved |= 0x1ull << reg; - sctx->tracked_regs.reg_value[reg] = value; - } + sctx->tracked_regs.reg_saved |= 0x1ull << reg; + sctx->tracked_regs.reg_value[reg] = value; + } } /* Emit PKT3_SET_CONTEXT_REG if the register value is different. 
*/ static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned offset, - enum si_tracked_reg reg, unsigned value) + enum si_tracked_reg reg, unsigned value) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || - sctx->tracked_regs.reg_value[reg] != value) { - radeon_set_context_reg(cs, offset, value); + if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || + sctx->tracked_regs.reg_value[reg] != value) { + radeon_set_context_reg(cs, offset, value); - sctx->tracked_regs.reg_saved |= 0x1ull << reg; - sctx->tracked_regs.reg_value[reg] = value; - } + sctx->tracked_regs.reg_saved |= 0x1ull << reg; + sctx->tracked_regs.reg_value[reg] = value; + } } /** @@ -168,98 +165,96 @@ static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned * @param value2 is written to second register */ static inline void radeon_opt_set_context_reg2(struct si_context *sctx, unsigned offset, - enum si_tracked_reg reg, unsigned value1, - unsigned value2) + enum si_tracked_reg reg, unsigned value1, + unsigned value2) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - if (((sctx->tracked_regs.reg_saved >> reg) & 0x3) != 0x3 || - sctx->tracked_regs.reg_value[reg] != value1 || - sctx->tracked_regs.reg_value[reg+1] != value2) { - radeon_set_context_reg_seq(cs, offset, 2); - radeon_emit(cs, value1); - radeon_emit(cs, value2); - - sctx->tracked_regs.reg_value[reg] = value1; - sctx->tracked_regs.reg_value[reg+1] = value2; - sctx->tracked_regs.reg_saved |= 0x3ull << reg; - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + if (((sctx->tracked_regs.reg_saved >> reg) & 0x3) != 0x3 || + sctx->tracked_regs.reg_value[reg] != value1 || + sctx->tracked_regs.reg_value[reg + 1] != value2) { + radeon_set_context_reg_seq(cs, offset, 2); + radeon_emit(cs, value1); + radeon_emit(cs, value2); + + sctx->tracked_regs.reg_value[reg] = value1; + sctx->tracked_regs.reg_value[reg + 1] = value2; + sctx->tracked_regs.reg_saved |= 0x3ull << reg; + } } /** * Set 3 consecutive registers if any registers value is different. 
*/ static inline void radeon_opt_set_context_reg3(struct si_context *sctx, unsigned offset, - enum si_tracked_reg reg, unsigned value1, - unsigned value2, unsigned value3) + enum si_tracked_reg reg, unsigned value1, + unsigned value2, unsigned value3) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - if (((sctx->tracked_regs.reg_saved >> reg) & 0x7) != 0x7 || - sctx->tracked_regs.reg_value[reg] != value1 || - sctx->tracked_regs.reg_value[reg+1] != value2 || - sctx->tracked_regs.reg_value[reg+2] != value3) { - radeon_set_context_reg_seq(cs, offset, 3); - radeon_emit(cs, value1); - radeon_emit(cs, value2); - radeon_emit(cs, value3); - - sctx->tracked_regs.reg_value[reg] = value1; - sctx->tracked_regs.reg_value[reg+1] = value2; - sctx->tracked_regs.reg_value[reg+2] = value3; - sctx->tracked_regs.reg_saved |= 0x7ull << reg; - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + if (((sctx->tracked_regs.reg_saved >> reg) & 0x7) != 0x7 || + sctx->tracked_regs.reg_value[reg] != value1 || + sctx->tracked_regs.reg_value[reg + 1] != value2 || + sctx->tracked_regs.reg_value[reg + 2] != value3) { + radeon_set_context_reg_seq(cs, offset, 3); + radeon_emit(cs, value1); + radeon_emit(cs, value2); + radeon_emit(cs, value3); + + sctx->tracked_regs.reg_value[reg] = value1; + sctx->tracked_regs.reg_value[reg + 1] = value2; + sctx->tracked_regs.reg_value[reg + 2] = value3; + sctx->tracked_regs.reg_saved |= 0x7ull << reg; + } } /** * Set 4 consecutive registers if any registers value is different. */ static inline void radeon_opt_set_context_reg4(struct si_context *sctx, unsigned offset, - enum si_tracked_reg reg, unsigned value1, - unsigned value2, unsigned value3, - unsigned value4) + enum si_tracked_reg reg, unsigned value1, + unsigned value2, unsigned value3, unsigned value4) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - if (((sctx->tracked_regs.reg_saved >> reg) & 0xf) != 0xf || - sctx->tracked_regs.reg_value[reg] != value1 || - sctx->tracked_regs.reg_value[reg+1] != value2 || - sctx->tracked_regs.reg_value[reg+2] != value3 || - sctx->tracked_regs.reg_value[reg+3] != value4) { - radeon_set_context_reg_seq(cs, offset, 4); - radeon_emit(cs, value1); - radeon_emit(cs, value2); - radeon_emit(cs, value3); - radeon_emit(cs, value4); - - sctx->tracked_regs.reg_value[reg] = value1; - sctx->tracked_regs.reg_value[reg+1] = value2; - sctx->tracked_regs.reg_value[reg+2] = value3; - sctx->tracked_regs.reg_value[reg+3] = value4; - sctx->tracked_regs.reg_saved |= 0xfull << reg; - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + if (((sctx->tracked_regs.reg_saved >> reg) & 0xf) != 0xf || + sctx->tracked_regs.reg_value[reg] != value1 || + sctx->tracked_regs.reg_value[reg + 1] != value2 || + sctx->tracked_regs.reg_value[reg + 2] != value3 || + sctx->tracked_regs.reg_value[reg + 3] != value4) { + radeon_set_context_reg_seq(cs, offset, 4); + radeon_emit(cs, value1); + radeon_emit(cs, value2); + radeon_emit(cs, value3); + radeon_emit(cs, value4); + + sctx->tracked_regs.reg_value[reg] = value1; + sctx->tracked_regs.reg_value[reg + 1] = value2; + sctx->tracked_regs.reg_value[reg + 2] = value3; + sctx->tracked_regs.reg_value[reg + 3] = value4; + sctx->tracked_regs.reg_saved |= 0xfull << reg; + } } /** * Set consecutive registers if any registers value is different. 
*/ static inline void radeon_opt_set_context_regn(struct si_context *sctx, unsigned offset, - unsigned *value, unsigned *saved_val, - unsigned num) + unsigned *value, unsigned *saved_val, unsigned num) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - int i, j; - - for (i = 0; i < num; i++) { - if (saved_val[i] != value[i]) { - radeon_set_context_reg_seq(cs, offset, num); - for (j = 0; j < num; j++) - radeon_emit(cs, value[j]); - - memcpy(saved_val, value, sizeof(uint32_t) * num); - break; - } - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + int i, j; + + for (i = 0; i < num; i++) { + if (saved_val[i] != value[i]) { + radeon_set_context_reg_seq(cs, offset, num); + for (j = 0; j < num; j++) + radeon_emit(cs, value[j]); + + memcpy(saved_val, value, sizeof(uint32_t) * num); + break; + } + } } #endif diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c index 2af778b41ad..1e7aa443222 100644 --- a/src/gallium/drivers/radeonsi/si_clear.c +++ b/src/gallium/drivers/radeonsi/si_clear.c @@ -24,761 +24,710 @@ #include "si_pipe.h" #include "sid.h" - #include "util/format/u_format.h" #include "util/u_pack_color.h" #include "util/u_surface.h" -enum { - SI_CLEAR = SI_SAVE_FRAGMENT_STATE, - SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE, +enum +{ + SI_CLEAR = SI_SAVE_FRAGMENT_STATE, + SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE, }; -static void si_alloc_separate_cmask(struct si_screen *sscreen, - struct si_texture *tex) +static void si_alloc_separate_cmask(struct si_screen *sscreen, struct si_texture *tex) { - /* CMASK for MSAA is allocated in advance or always disabled - * by "nofmask" option. - */ - if (tex->cmask_buffer || !tex->surface.cmask_size || - tex->buffer.b.b.nr_samples >= 2) - return; - - tex->cmask_buffer = - si_aligned_buffer_create(&sscreen->b, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - tex->surface.cmask_size, - tex->surface.cmask_alignment); - if (tex->cmask_buffer == NULL) - return; - - tex->cmask_base_address_reg = tex->cmask_buffer->gpu_address >> 8; - tex->cb_color_info |= S_028C70_FAST_CLEAR(1); - - p_atomic_inc(&sscreen->compressed_colortex_counter); + /* CMASK for MSAA is allocated in advance or always disabled + * by "nofmask" option. 
+ */ + if (tex->cmask_buffer || !tex->surface.cmask_size || tex->buffer.b.b.nr_samples >= 2) + return; + + tex->cmask_buffer = + si_aligned_buffer_create(&sscreen->b, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, + tex->surface.cmask_size, tex->surface.cmask_alignment); + if (tex->cmask_buffer == NULL) + return; + + tex->cmask_base_address_reg = tex->cmask_buffer->gpu_address >> 8; + tex->cb_color_info |= S_028C70_FAST_CLEAR(1); + + p_atomic_inc(&sscreen->compressed_colortex_counter); } -static bool si_set_clear_color(struct si_texture *tex, - enum pipe_format surface_format, - const union pipe_color_union *color) +static bool si_set_clear_color(struct si_texture *tex, enum pipe_format surface_format, + const union pipe_color_union *color) { - union util_color uc; - - memset(&uc, 0, sizeof(uc)); - - if (tex->surface.bpe == 16) { - /* DCC fast clear only: - * CLEAR_WORD0 = R = G = B - * CLEAR_WORD1 = A - */ - assert(color->ui[0] == color->ui[1] && - color->ui[0] == color->ui[2]); - uc.ui[0] = color->ui[0]; - uc.ui[1] = color->ui[3]; - } else { - util_pack_color_union(surface_format, &uc, color); - } - - if (memcmp(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)) == 0) - return false; - - memcpy(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)); - return true; + union util_color uc; + + memset(&uc, 0, sizeof(uc)); + + if (tex->surface.bpe == 16) { + /* DCC fast clear only: + * CLEAR_WORD0 = R = G = B + * CLEAR_WORD1 = A + */ + assert(color->ui[0] == color->ui[1] && color->ui[0] == color->ui[2]); + uc.ui[0] = color->ui[0]; + uc.ui[1] = color->ui[3]; + } else { + util_pack_color_union(surface_format, &uc, color); + } + + if (memcmp(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)) == 0) + return false; + + memcpy(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)); + return true; } /** Linearize and convert luminace/intensity to red. */ enum pipe_format si_simplify_cb_format(enum pipe_format format) { - format = util_format_linear(format); - format = util_format_luminance_to_red(format); - return util_format_intensity_to_red(format); + format = util_format_linear(format); + format = util_format_luminance_to_red(format); + return util_format_intensity_to_red(format); } bool vi_alpha_is_on_msb(struct si_screen *sscreen, enum pipe_format format) { - format = si_simplify_cb_format(format); - const struct util_format_description *desc = util_format_description(format); + format = si_simplify_cb_format(format); + const struct util_format_description *desc = util_format_description(format); - /* Formats with 3 channels can't have alpha. */ - if (desc->nr_channels == 3) - return true; /* same as xxxA; is any value OK here? */ + /* Formats with 3 channels can't have alpha. */ + if (desc->nr_channels == 3) + return true; /* same as xxxA; is any value OK here? 
*/ - if (sscreen->info.chip_class >= GFX10 && desc->nr_channels == 1) - return desc->swizzle[3] == PIPE_SWIZZLE_X; + if (sscreen->info.chip_class >= GFX10 && desc->nr_channels == 1) + return desc->swizzle[3] == PIPE_SWIZZLE_X; - return si_translate_colorswap(format, false) <= 1; + return si_translate_colorswap(format, false) <= 1; } -static bool vi_get_fast_clear_parameters(struct si_screen *sscreen, - enum pipe_format base_format, - enum pipe_format surface_format, - const union pipe_color_union *color, - uint32_t* clear_value, - bool *eliminate_needed) +static bool vi_get_fast_clear_parameters(struct si_screen *sscreen, enum pipe_format base_format, + enum pipe_format surface_format, + const union pipe_color_union *color, uint32_t *clear_value, + bool *eliminate_needed) { - /* If we want to clear without needing a fast clear eliminate step, we - * can set color and alpha independently to 0 or 1 (or 0/max for integer - * formats). - */ - bool values[4] = {}; /* whether to clear to 0 or 1 */ - bool color_value = false; /* clear color to 0 or 1 */ - bool alpha_value = false; /* clear alpha to 0 or 1 */ - int alpha_channel; /* index of the alpha component */ - bool has_color = false; - bool has_alpha = false; - - const struct util_format_description *desc = - util_format_description(si_simplify_cb_format(surface_format)); - - /* 128-bit fast clear with different R,G,B values is unsupported. */ - if (desc->block.bits == 128 && - (color->ui[0] != color->ui[1] || - color->ui[0] != color->ui[2])) - return false; - - *eliminate_needed = true; - *clear_value = DCC_CLEAR_COLOR_REG; - - if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) - return true; /* need ELIMINATE_FAST_CLEAR */ - - bool base_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, base_format); - bool surf_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, surface_format); - - /* Formats with 3 channels can't have alpha. */ - if (desc->nr_channels == 3) - alpha_channel = -1; - else if (surf_alpha_is_on_msb) - alpha_channel = desc->nr_channels - 1; - else - alpha_channel = 0; - - for (int i = 0; i < 4; ++i) { - if (desc->swizzle[i] >= PIPE_SWIZZLE_0) - continue; - - if (desc->channel[i].pure_integer && - desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) { - /* Use the maximum value for clamping the clear color. */ - int max = u_bit_consecutive(0, desc->channel[i].size - 1); - - values[i] = color->i[i] != 0; - if (color->i[i] != 0 && MIN2(color->i[i], max) != max) - return true; /* need ELIMINATE_FAST_CLEAR */ - } else if (desc->channel[i].pure_integer && - desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) { - /* Use the maximum value for clamping the clear color. */ - unsigned max = u_bit_consecutive(0, desc->channel[i].size); - - values[i] = color->ui[i] != 0U; - if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max) - return true; /* need ELIMINATE_FAST_CLEAR */ - } else { - values[i] = color->f[i] != 0.0F; - if (color->f[i] != 0.0F && color->f[i] != 1.0F) - return true; /* need ELIMINATE_FAST_CLEAR */ - } - - if (desc->swizzle[i] == alpha_channel) { - alpha_value = values[i]; - has_alpha = true; - } else { - color_value = values[i]; - has_color = true; - } - } - - /* If alpha isn't present, make it the same as color, and vice versa. */ - if (!has_alpha) - alpha_value = color_value; - else if (!has_color) - color_value = alpha_value; - - if (color_value != alpha_value && - base_alpha_is_on_msb != surf_alpha_is_on_msb) - return true; /* require ELIMINATE_FAST_CLEAR */ - - /* Check if all color values are equal if they are present. 
*/ - for (int i = 0; i < 4; ++i) { - if (desc->swizzle[i] <= PIPE_SWIZZLE_W && - desc->swizzle[i] != alpha_channel && - values[i] != color_value) - return true; /* require ELIMINATE_FAST_CLEAR */ - } - - /* This doesn't need ELIMINATE_FAST_CLEAR. - * On chips predating Raven2, the DCC clear codes and the CB clear - * color registers must match. - */ - *eliminate_needed = false; - - if (color_value) { - if (alpha_value) - *clear_value = DCC_CLEAR_COLOR_1111; - else - *clear_value = DCC_CLEAR_COLOR_1110; - } else { - if (alpha_value) - *clear_value = DCC_CLEAR_COLOR_0001; - else - *clear_value = DCC_CLEAR_COLOR_0000; - } - return true; + /* If we want to clear without needing a fast clear eliminate step, we + * can set color and alpha independently to 0 or 1 (or 0/max for integer + * formats). + */ + bool values[4] = {}; /* whether to clear to 0 or 1 */ + bool color_value = false; /* clear color to 0 or 1 */ + bool alpha_value = false; /* clear alpha to 0 or 1 */ + int alpha_channel; /* index of the alpha component */ + bool has_color = false; + bool has_alpha = false; + + const struct util_format_description *desc = + util_format_description(si_simplify_cb_format(surface_format)); + + /* 128-bit fast clear with different R,G,B values is unsupported. */ + if (desc->block.bits == 128 && (color->ui[0] != color->ui[1] || color->ui[0] != color->ui[2])) + return false; + + *eliminate_needed = true; + *clear_value = DCC_CLEAR_COLOR_REG; + + if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) + return true; /* need ELIMINATE_FAST_CLEAR */ + + bool base_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, base_format); + bool surf_alpha_is_on_msb = vi_alpha_is_on_msb(sscreen, surface_format); + + /* Formats with 3 channels can't have alpha. */ + if (desc->nr_channels == 3) + alpha_channel = -1; + else if (surf_alpha_is_on_msb) + alpha_channel = desc->nr_channels - 1; + else + alpha_channel = 0; + + for (int i = 0; i < 4; ++i) { + if (desc->swizzle[i] >= PIPE_SWIZZLE_0) + continue; + + if (desc->channel[i].pure_integer && desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) { + /* Use the maximum value for clamping the clear color. */ + int max = u_bit_consecutive(0, desc->channel[i].size - 1); + + values[i] = color->i[i] != 0; + if (color->i[i] != 0 && MIN2(color->i[i], max) != max) + return true; /* need ELIMINATE_FAST_CLEAR */ + } else if (desc->channel[i].pure_integer && + desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) { + /* Use the maximum value for clamping the clear color. */ + unsigned max = u_bit_consecutive(0, desc->channel[i].size); + + values[i] = color->ui[i] != 0U; + if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max) + return true; /* need ELIMINATE_FAST_CLEAR */ + } else { + values[i] = color->f[i] != 0.0F; + if (color->f[i] != 0.0F && color->f[i] != 1.0F) + return true; /* need ELIMINATE_FAST_CLEAR */ + } + + if (desc->swizzle[i] == alpha_channel) { + alpha_value = values[i]; + has_alpha = true; + } else { + color_value = values[i]; + has_color = true; + } + } + + /* If alpha isn't present, make it the same as color, and vice versa. */ + if (!has_alpha) + alpha_value = color_value; + else if (!has_color) + color_value = alpha_value; + + if (color_value != alpha_value && base_alpha_is_on_msb != surf_alpha_is_on_msb) + return true; /* require ELIMINATE_FAST_CLEAR */ + + /* Check if all color values are equal if they are present. 
*/ + for (int i = 0; i < 4; ++i) { + if (desc->swizzle[i] <= PIPE_SWIZZLE_W && desc->swizzle[i] != alpha_channel && + values[i] != color_value) + return true; /* require ELIMINATE_FAST_CLEAR */ + } + + /* This doesn't need ELIMINATE_FAST_CLEAR. + * On chips predating Raven2, the DCC clear codes and the CB clear + * color registers must match. + */ + *eliminate_needed = false; + + if (color_value) { + if (alpha_value) + *clear_value = DCC_CLEAR_COLOR_1111; + else + *clear_value = DCC_CLEAR_COLOR_1110; + } else { + if (alpha_value) + *clear_value = DCC_CLEAR_COLOR_0001; + else + *clear_value = DCC_CLEAR_COLOR_0000; + } + return true; } -bool vi_dcc_clear_level(struct si_context *sctx, - struct si_texture *tex, - unsigned level, unsigned clear_value) +bool vi_dcc_clear_level(struct si_context *sctx, struct si_texture *tex, unsigned level, + unsigned clear_value) { - struct pipe_resource *dcc_buffer; - uint64_t dcc_offset, clear_size; - - assert(vi_dcc_enabled(tex, level)); - - if (tex->dcc_separate_buffer) { - dcc_buffer = &tex->dcc_separate_buffer->b.b; - dcc_offset = 0; - } else { - dcc_buffer = &tex->buffer.b.b; - dcc_offset = tex->surface.dcc_offset; - } - - if (sctx->chip_class >= GFX9) { - /* Mipmap level clears aren't implemented. */ - if (tex->buffer.b.b.last_level > 0) - return false; - - /* 4x and 8x MSAA needs a sophisticated compute shader for - * the clear. See AMDVLK. */ - if (tex->buffer.b.b.nr_storage_samples >= 4) - return false; - - clear_size = tex->surface.dcc_size; - } else { - unsigned num_layers = util_num_layers(&tex->buffer.b.b, level); - - /* If this is 0, fast clear isn't possible. (can occur with MSAA) */ - if (!tex->surface.u.legacy.level[level].dcc_fast_clear_size) - return false; - - /* Layered 4x and 8x MSAA DCC fast clears need to clear - * dcc_fast_clear_size bytes for each layer. A compute shader - * would be more efficient than separate per-layer clear operations. - */ - if (tex->buffer.b.b.nr_storage_samples >= 4 && num_layers > 1) - return false; - - dcc_offset += tex->surface.u.legacy.level[level].dcc_offset; - clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size * - num_layers; - } - - si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size, - &clear_value, 4, SI_COHERENCY_CB_META, false); - return true; + struct pipe_resource *dcc_buffer; + uint64_t dcc_offset, clear_size; + + assert(vi_dcc_enabled(tex, level)); + + if (tex->dcc_separate_buffer) { + dcc_buffer = &tex->dcc_separate_buffer->b.b; + dcc_offset = 0; + } else { + dcc_buffer = &tex->buffer.b.b; + dcc_offset = tex->surface.dcc_offset; + } + + if (sctx->chip_class >= GFX9) { + /* Mipmap level clears aren't implemented. */ + if (tex->buffer.b.b.last_level > 0) + return false; + + /* 4x and 8x MSAA needs a sophisticated compute shader for + * the clear. See AMDVLK. */ + if (tex->buffer.b.b.nr_storage_samples >= 4) + return false; + + clear_size = tex->surface.dcc_size; + } else { + unsigned num_layers = util_num_layers(&tex->buffer.b.b, level); + + /* If this is 0, fast clear isn't possible. (can occur with MSAA) */ + if (!tex->surface.u.legacy.level[level].dcc_fast_clear_size) + return false; + + /* Layered 4x and 8x MSAA DCC fast clears need to clear + * dcc_fast_clear_size bytes for each layer. A compute shader + * would be more efficient than separate per-layer clear operations. 
+ */ + if (tex->buffer.b.b.nr_storage_samples >= 4 && num_layers > 1) + return false; + + dcc_offset += tex->surface.u.legacy.level[level].dcc_offset; + clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size * num_layers; + } + + si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size, &clear_value, 4, SI_COHERENCY_CB_META, + false); + return true; } /* Set the same micro tile mode as the destination of the last MSAA resolve. * This allows hitting the MSAA resolve fast path, which requires that both * src and dst micro tile modes match. */ -static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen, - struct si_texture *tex) +static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen, struct si_texture *tex) { - if (sscreen->info.chip_class >= GFX10 || - tex->buffer.b.is_shared || - tex->buffer.b.b.nr_samples <= 1 || - tex->surface.micro_tile_mode == tex->last_msaa_resolve_target_micro_mode) - return; - - assert(sscreen->info.chip_class >= GFX9 || - tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D); - assert(tex->buffer.b.b.last_level == 0); - - if (sscreen->info.chip_class >= GFX9) { - /* 4K or larger tiles only. 0 is linear. 1-3 are 256B tiles. */ - assert(tex->surface.u.gfx9.surf.swizzle_mode >= 4); - - /* If you do swizzle_mode % 4, you'll get: - * 0 = Depth - * 1 = Standard, - * 2 = Displayable - * 3 = Rotated - * - * Depth-sample order isn't allowed: - */ - assert(tex->surface.u.gfx9.surf.swizzle_mode % 4 != 0); - - switch (tex->last_msaa_resolve_target_micro_mode) { - case RADEON_MICRO_MODE_DISPLAY: - tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3; - tex->surface.u.gfx9.surf.swizzle_mode += 2; /* D */ - break; - case RADEON_MICRO_MODE_THIN: - tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3; - tex->surface.u.gfx9.surf.swizzle_mode += 1; /* S */ - break; - case RADEON_MICRO_MODE_ROTATED: - tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3; - tex->surface.u.gfx9.surf.swizzle_mode += 3; /* R */ - break; - default: /* depth */ - assert(!"unexpected micro mode"); - return; - } - } else if (sscreen->info.chip_class >= GFX7) { - /* These magic numbers were copied from addrlib. It doesn't use - * any definitions for them either. They are all 2D_TILED_THIN1 - * modes with different bpp and micro tile mode. 
- */ - switch (tex->last_msaa_resolve_target_micro_mode) { - case RADEON_MICRO_MODE_DISPLAY: - tex->surface.u.legacy.tiling_index[0] = 10; - break; - case RADEON_MICRO_MODE_THIN: - tex->surface.u.legacy.tiling_index[0] = 14; - break; - case RADEON_MICRO_MODE_ROTATED: - tex->surface.u.legacy.tiling_index[0] = 28; - break; - default: /* depth, thick */ - assert(!"unexpected micro mode"); - return; - } - } else { /* GFX6 */ - switch (tex->last_msaa_resolve_target_micro_mode) { - case RADEON_MICRO_MODE_DISPLAY: - switch (tex->surface.bpe) { - case 1: - tex->surface.u.legacy.tiling_index[0] = 10; - break; - case 2: - tex->surface.u.legacy.tiling_index[0] = 11; - break; - default: /* 4, 8 */ - tex->surface.u.legacy.tiling_index[0] = 12; - break; - } - break; - case RADEON_MICRO_MODE_THIN: - switch (tex->surface.bpe) { - case 1: - tex->surface.u.legacy.tiling_index[0] = 14; - break; - case 2: - tex->surface.u.legacy.tiling_index[0] = 15; - break; - case 4: - tex->surface.u.legacy.tiling_index[0] = 16; - break; - default: /* 8, 16 */ - tex->surface.u.legacy.tiling_index[0] = 17; - break; - } - break; - default: /* depth, thick */ - assert(!"unexpected micro mode"); - return; - } - } - - tex->surface.micro_tile_mode = tex->last_msaa_resolve_target_micro_mode; - - p_atomic_inc(&sscreen->dirty_tex_counter); + if (sscreen->info.chip_class >= GFX10 || tex->buffer.b.is_shared || + tex->buffer.b.b.nr_samples <= 1 || + tex->surface.micro_tile_mode == tex->last_msaa_resolve_target_micro_mode) + return; + + assert(sscreen->info.chip_class >= GFX9 || + tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D); + assert(tex->buffer.b.b.last_level == 0); + + if (sscreen->info.chip_class >= GFX9) { + /* 4K or larger tiles only. 0 is linear. 1-3 are 256B tiles. */ + assert(tex->surface.u.gfx9.surf.swizzle_mode >= 4); + + /* If you do swizzle_mode % 4, you'll get: + * 0 = Depth + * 1 = Standard, + * 2 = Displayable + * 3 = Rotated + * + * Depth-sample order isn't allowed: + */ + assert(tex->surface.u.gfx9.surf.swizzle_mode % 4 != 0); + + switch (tex->last_msaa_resolve_target_micro_mode) { + case RADEON_MICRO_MODE_DISPLAY: + tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3; + tex->surface.u.gfx9.surf.swizzle_mode += 2; /* D */ + break; + case RADEON_MICRO_MODE_THIN: + tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3; + tex->surface.u.gfx9.surf.swizzle_mode += 1; /* S */ + break; + case RADEON_MICRO_MODE_ROTATED: + tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3; + tex->surface.u.gfx9.surf.swizzle_mode += 3; /* R */ + break; + default: /* depth */ + assert(!"unexpected micro mode"); + return; + } + } else if (sscreen->info.chip_class >= GFX7) { + /* These magic numbers were copied from addrlib. It doesn't use + * any definitions for them either. They are all 2D_TILED_THIN1 + * modes with different bpp and micro tile mode. 
+ */ + switch (tex->last_msaa_resolve_target_micro_mode) { + case RADEON_MICRO_MODE_DISPLAY: + tex->surface.u.legacy.tiling_index[0] = 10; + break; + case RADEON_MICRO_MODE_THIN: + tex->surface.u.legacy.tiling_index[0] = 14; + break; + case RADEON_MICRO_MODE_ROTATED: + tex->surface.u.legacy.tiling_index[0] = 28; + break; + default: /* depth, thick */ + assert(!"unexpected micro mode"); + return; + } + } else { /* GFX6 */ + switch (tex->last_msaa_resolve_target_micro_mode) { + case RADEON_MICRO_MODE_DISPLAY: + switch (tex->surface.bpe) { + case 1: + tex->surface.u.legacy.tiling_index[0] = 10; + break; + case 2: + tex->surface.u.legacy.tiling_index[0] = 11; + break; + default: /* 4, 8 */ + tex->surface.u.legacy.tiling_index[0] = 12; + break; + } + break; + case RADEON_MICRO_MODE_THIN: + switch (tex->surface.bpe) { + case 1: + tex->surface.u.legacy.tiling_index[0] = 14; + break; + case 2: + tex->surface.u.legacy.tiling_index[0] = 15; + break; + case 4: + tex->surface.u.legacy.tiling_index[0] = 16; + break; + default: /* 8, 16 */ + tex->surface.u.legacy.tiling_index[0] = 17; + break; + } + break; + default: /* depth, thick */ + assert(!"unexpected micro mode"); + return; + } + } + + tex->surface.micro_tile_mode = tex->last_msaa_resolve_target_micro_mode; + + p_atomic_inc(&sscreen->dirty_tex_counter); } -static void si_do_fast_color_clear(struct si_context *sctx, - unsigned *buffers, - const union pipe_color_union *color) +static void si_do_fast_color_clear(struct si_context *sctx, unsigned *buffers, + const union pipe_color_union *color) { - struct pipe_framebuffer_state *fb = &sctx->framebuffer.state; - int i; + struct pipe_framebuffer_state *fb = &sctx->framebuffer.state; + int i; - /* This function is broken in BE, so just disable this path for now */ + /* This function is broken in BE, so just disable this path for now */ #if UTIL_ARCH_BIG_ENDIAN - return; + return; #endif - if (sctx->render_cond) - return; - - for (i = 0; i < fb->nr_cbufs; i++) { - struct si_texture *tex; - unsigned clear_bit = PIPE_CLEAR_COLOR0 << i; - - if (!fb->cbufs[i]) - continue; - - /* if this colorbuffer is not being cleared */ - if (!(*buffers & clear_bit)) - continue; - - unsigned level = fb->cbufs[i]->u.tex.level; - if (level > 0) - continue; - - tex = (struct si_texture *)fb->cbufs[i]->texture; - - /* TODO: GFX9: Implement DCC fast clear for level 0 of - * mipmapped textures. Mipmapped DCC has to clear a rectangular - * area of DCC for level 0 (because the whole miptree is - * organized in a 2D plane). - */ - if (sctx->chip_class >= GFX9 && - tex->buffer.b.b.last_level > 0) - continue; - - /* the clear is allowed if all layers are bound */ - if (fb->cbufs[i]->u.tex.first_layer != 0 || - fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->buffer.b.b, 0)) { - continue; - } - - /* only supported on tiled surfaces */ - if (tex->surface.is_linear) { - continue; - } - - /* shared textures can't use fast clear without an explicit flush, - * because there is no way to communicate the clear color among - * all clients - */ - if (tex->buffer.b.is_shared && - !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) - continue; - - if (sctx->chip_class <= GFX8 && - tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D && - !sctx->screen->info.htile_cmask_support_1d_tiling) - continue; - - /* Use a slow clear for small surfaces where the cost of - * the eliminate pass can be higher than the benefit of fast - * clear. The closed driver does this, but the numbers may differ. 
- * - * This helps on both dGPUs and APUs, even small APUs like Mullins. - */ - bool too_small = tex->buffer.b.b.nr_samples <= 1 && - tex->buffer.b.b.width0 * - tex->buffer.b.b.height0 <= 512 * 512; - bool eliminate_needed = false; - bool fmask_decompress_needed = false; - - /* Fast clear is the most appropriate place to enable DCC for - * displayable surfaces. - */ - if (sctx->family == CHIP_STONEY && !too_small) { - vi_separate_dcc_try_enable(sctx, tex); - - /* RB+ isn't supported with a CMASK clear only on Stoney, - * so all clears are considered to be hypothetically slow - * clears, which is weighed when determining whether to - * enable separate DCC. - */ - if (tex->dcc_gather_statistics) /* only for Stoney */ - tex->num_slow_clears++; - } - - /* Try to clear DCC first, otherwise try CMASK. */ - if (vi_dcc_enabled(tex, 0)) { - uint32_t reset_value; - - if (sctx->screen->debug_flags & DBG(NO_DCC_CLEAR)) - continue; - - if (!vi_get_fast_clear_parameters(sctx->screen, - tex->buffer.b.b.format, - fb->cbufs[i]->format, - color, &reset_value, - &eliminate_needed)) - continue; - - if (eliminate_needed && too_small) - continue; - - /* TODO: This DCC+CMASK clear doesn't work with MSAA. */ - if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer && - eliminate_needed) - continue; - - if (!vi_dcc_clear_level(sctx, tex, 0, reset_value)) - continue; - - tex->separate_dcc_dirty = true; - tex->displayable_dcc_dirty = true; - - /* DCC fast clear with MSAA should clear CMASK to 0xC. */ - if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) { - uint32_t clear_value = 0xCCCCCCCC; - si_clear_buffer(sctx, &tex->cmask_buffer->b.b, - tex->surface.cmask_offset, tex->surface.cmask_size, - &clear_value, 4, SI_COHERENCY_CB_META, false); - fmask_decompress_needed = true; - } - } else { - if (too_small) - continue; - - /* 128-bit formats are unusupported */ - if (tex->surface.bpe > 8) { - continue; - } - - /* RB+ doesn't work with CMASK fast clear on Stoney. */ - if (sctx->family == CHIP_STONEY) - continue; - - /* ensure CMASK is enabled */ - si_alloc_separate_cmask(sctx->screen, tex); - if (!tex->cmask_buffer) - continue; - - /* Do the fast clear. */ - uint32_t clear_value = 0; - si_clear_buffer(sctx, &tex->cmask_buffer->b.b, - tex->surface.cmask_offset, tex->surface.cmask_size, - &clear_value, 4, SI_COHERENCY_CB_META, false); - eliminate_needed = true; - } - - if ((eliminate_needed || fmask_decompress_needed) && - !(tex->dirty_level_mask & (1 << level))) { - tex->dirty_level_mask |= 1 << level; - p_atomic_inc(&sctx->screen->compressed_colortex_counter); - } - - /* We can change the micro tile mode before a full clear. */ - si_set_optimal_micro_tile_mode(sctx->screen, tex); - - *buffers &= ~clear_bit; - - /* Chips with DCC constant encoding don't need to set the clear - * color registers for DCC clear values 0 and 1. 
- */ - if (sctx->screen->info.has_dcc_constant_encode && !eliminate_needed) - continue; - - if (si_set_clear_color(tex, fb->cbufs[i]->format, color)) { - sctx->framebuffer.dirty_cbufs |= 1 << i; - si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); - } - } + if (sctx->render_cond) + return; + + for (i = 0; i < fb->nr_cbufs; i++) { + struct si_texture *tex; + unsigned clear_bit = PIPE_CLEAR_COLOR0 << i; + + if (!fb->cbufs[i]) + continue; + + /* if this colorbuffer is not being cleared */ + if (!(*buffers & clear_bit)) + continue; + + unsigned level = fb->cbufs[i]->u.tex.level; + if (level > 0) + continue; + + tex = (struct si_texture *)fb->cbufs[i]->texture; + + /* TODO: GFX9: Implement DCC fast clear for level 0 of + * mipmapped textures. Mipmapped DCC has to clear a rectangular + * area of DCC for level 0 (because the whole miptree is + * organized in a 2D plane). + */ + if (sctx->chip_class >= GFX9 && tex->buffer.b.b.last_level > 0) + continue; + + /* the clear is allowed if all layers are bound */ + if (fb->cbufs[i]->u.tex.first_layer != 0 || + fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->buffer.b.b, 0)) { + continue; + } + + /* only supported on tiled surfaces */ + if (tex->surface.is_linear) { + continue; + } + + /* shared textures can't use fast clear without an explicit flush, + * because there is no way to communicate the clear color among + * all clients + */ + if (tex->buffer.b.is_shared && + !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) + continue; + + if (sctx->chip_class <= GFX8 && tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D && + !sctx->screen->info.htile_cmask_support_1d_tiling) + continue; + + /* Use a slow clear for small surfaces where the cost of + * the eliminate pass can be higher than the benefit of fast + * clear. The closed driver does this, but the numbers may differ. + * + * This helps on both dGPUs and APUs, even small APUs like Mullins. + */ + bool too_small = tex->buffer.b.b.nr_samples <= 1 && + tex->buffer.b.b.width0 * tex->buffer.b.b.height0 <= 512 * 512; + bool eliminate_needed = false; + bool fmask_decompress_needed = false; + + /* Fast clear is the most appropriate place to enable DCC for + * displayable surfaces. + */ + if (sctx->family == CHIP_STONEY && !too_small) { + vi_separate_dcc_try_enable(sctx, tex); + + /* RB+ isn't supported with a CMASK clear only on Stoney, + * so all clears are considered to be hypothetically slow + * clears, which is weighed when determining whether to + * enable separate DCC. + */ + if (tex->dcc_gather_statistics) /* only for Stoney */ + tex->num_slow_clears++; + } + + /* Try to clear DCC first, otherwise try CMASK. */ + if (vi_dcc_enabled(tex, 0)) { + uint32_t reset_value; + + if (sctx->screen->debug_flags & DBG(NO_DCC_CLEAR)) + continue; + + if (!vi_get_fast_clear_parameters(sctx->screen, tex->buffer.b.b.format, + fb->cbufs[i]->format, color, &reset_value, + &eliminate_needed)) + continue; + + if (eliminate_needed && too_small) + continue; + + /* TODO: This DCC+CMASK clear doesn't work with MSAA. */ + if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer && eliminate_needed) + continue; + + if (!vi_dcc_clear_level(sctx, tex, 0, reset_value)) + continue; + + tex->separate_dcc_dirty = true; + tex->displayable_dcc_dirty = true; + + /* DCC fast clear with MSAA should clear CMASK to 0xC. 
*/ + if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) { + uint32_t clear_value = 0xCCCCCCCC; + si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->surface.cmask_offset, + tex->surface.cmask_size, &clear_value, 4, SI_COHERENCY_CB_META, false); + fmask_decompress_needed = true; + } + } else { + if (too_small) + continue; + + /* 128-bit formats are unusupported */ + if (tex->surface.bpe > 8) { + continue; + } + + /* RB+ doesn't work with CMASK fast clear on Stoney. */ + if (sctx->family == CHIP_STONEY) + continue; + + /* ensure CMASK is enabled */ + si_alloc_separate_cmask(sctx->screen, tex); + if (!tex->cmask_buffer) + continue; + + /* Do the fast clear. */ + uint32_t clear_value = 0; + si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->surface.cmask_offset, + tex->surface.cmask_size, &clear_value, 4, SI_COHERENCY_CB_META, false); + eliminate_needed = true; + } + + if ((eliminate_needed || fmask_decompress_needed) && + !(tex->dirty_level_mask & (1 << level))) { + tex->dirty_level_mask |= 1 << level; + p_atomic_inc(&sctx->screen->compressed_colortex_counter); + } + + /* We can change the micro tile mode before a full clear. */ + si_set_optimal_micro_tile_mode(sctx->screen, tex); + + *buffers &= ~clear_bit; + + /* Chips with DCC constant encoding don't need to set the clear + * color registers for DCC clear values 0 and 1. + */ + if (sctx->screen->info.has_dcc_constant_encode && !eliminate_needed) + continue; + + if (si_set_clear_color(tex, fb->cbufs[i]->format, color)) { + sctx->framebuffer.dirty_cbufs |= 1 << i; + si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); + } + } } static void si_clear(struct pipe_context *ctx, unsigned buffers, - const union pipe_color_union *color, - double depth, unsigned stencil) + const union pipe_color_union *color, double depth, unsigned stencil) { - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_framebuffer_state *fb = &sctx->framebuffer.state; - struct pipe_surface *zsbuf = fb->zsbuf; - struct si_texture *zstex = - zsbuf ? (struct si_texture*)zsbuf->texture : NULL; - bool needs_db_flush = false; - - if (buffers & PIPE_CLEAR_COLOR) { - si_do_fast_color_clear(sctx, &buffers, color); - if (!buffers) - return; /* all buffers have been fast cleared */ - - /* These buffers cannot use fast clear, make sure to disable expansion. */ - for (unsigned i = 0; i < fb->nr_cbufs; i++) { - struct si_texture *tex; - - /* If not clearing this buffer, skip. */ - if (!(buffers & (PIPE_CLEAR_COLOR0 << i)) || !fb->cbufs[i]) - continue; - - tex = (struct si_texture *)fb->cbufs[i]->texture; - if (tex->surface.fmask_size == 0) - tex->dirty_level_mask &= ~(1 << fb->cbufs[i]->u.tex.level); - } - } - - if (zstex && - zsbuf->u.tex.first_layer == 0 && - zsbuf->u.tex.last_layer == util_max_layer(&zstex->buffer.b.b, 0)) { - /* TC-compatible HTILE only supports depth clears to 0 or 1. */ - if (buffers & PIPE_CLEAR_DEPTH && - si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_Z) && - (!zstex->tc_compatible_htile || - depth == 0 || depth == 1)) { - /* Need to disable EXPCLEAR temporarily if clearing - * to a new value. */ - if (!zstex->depth_cleared || zstex->depth_clear_value != depth) { - sctx->db_depth_disable_expclear = true; - } - - if (zstex->depth_clear_value != (float)depth) { - if ((zstex->depth_clear_value != 0) != (depth != 0)) { - /* ZRANGE_PRECISION register of a bound surface will change so we - * must flush the DB caches. */ - needs_db_flush = true; - } - /* Update DB_DEPTH_CLEAR. 
*/ - zstex->depth_clear_value = depth; - sctx->framebuffer.dirty_zsbuf = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); - } - sctx->db_depth_clear = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } - - /* TC-compatible HTILE only supports stencil clears to 0. */ - if (buffers & PIPE_CLEAR_STENCIL && - si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_S) && - (!zstex->tc_compatible_htile || stencil == 0)) { - stencil &= 0xff; - - /* Need to disable EXPCLEAR temporarily if clearing - * to a new value. */ - if (!zstex->stencil_cleared || zstex->stencil_clear_value != stencil) { - sctx->db_stencil_disable_expclear = true; - } - - if (zstex->stencil_clear_value != (uint8_t)stencil) { - /* Update DB_STENCIL_CLEAR. */ - zstex->stencil_clear_value = stencil; - sctx->framebuffer.dirty_zsbuf = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); - } - sctx->db_stencil_clear = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } - - if (needs_db_flush) - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB; - } - - si_blitter_begin(sctx, SI_CLEAR); - util_blitter_clear(sctx->blitter, fb->width, fb->height, - util_framebuffer_get_num_layers(fb), - buffers, color, depth, stencil, - sctx->framebuffer.nr_samples > 1); - si_blitter_end(sctx); - - if (sctx->db_depth_clear) { - sctx->db_depth_clear = false; - sctx->db_depth_disable_expclear = false; - zstex->depth_cleared = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } - - if (sctx->db_stencil_clear) { - sctx->db_stencil_clear = false; - sctx->db_stencil_disable_expclear = false; - zstex->stencil_cleared = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_framebuffer_state *fb = &sctx->framebuffer.state; + struct pipe_surface *zsbuf = fb->zsbuf; + struct si_texture *zstex = zsbuf ? (struct si_texture *)zsbuf->texture : NULL; + bool needs_db_flush = false; + + if (buffers & PIPE_CLEAR_COLOR) { + si_do_fast_color_clear(sctx, &buffers, color); + if (!buffers) + return; /* all buffers have been fast cleared */ + + /* These buffers cannot use fast clear, make sure to disable expansion. */ + for (unsigned i = 0; i < fb->nr_cbufs; i++) { + struct si_texture *tex; + + /* If not clearing this buffer, skip. */ + if (!(buffers & (PIPE_CLEAR_COLOR0 << i)) || !fb->cbufs[i]) + continue; + + tex = (struct si_texture *)fb->cbufs[i]->texture; + if (tex->surface.fmask_size == 0) + tex->dirty_level_mask &= ~(1 << fb->cbufs[i]->u.tex.level); + } + } + + if (zstex && zsbuf->u.tex.first_layer == 0 && + zsbuf->u.tex.last_layer == util_max_layer(&zstex->buffer.b.b, 0)) { + /* TC-compatible HTILE only supports depth clears to 0 or 1. */ + if (buffers & PIPE_CLEAR_DEPTH && si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_Z) && + (!zstex->tc_compatible_htile || depth == 0 || depth == 1)) { + /* Need to disable EXPCLEAR temporarily if clearing + * to a new value. */ + if (!zstex->depth_cleared || zstex->depth_clear_value != depth) { + sctx->db_depth_disable_expclear = true; + } + + if (zstex->depth_clear_value != (float)depth) { + if ((zstex->depth_clear_value != 0) != (depth != 0)) { + /* ZRANGE_PRECISION register of a bound surface will change so we + * must flush the DB caches. */ + needs_db_flush = true; + } + /* Update DB_DEPTH_CLEAR. 
*/ + zstex->depth_clear_value = depth; + sctx->framebuffer.dirty_zsbuf = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); + } + sctx->db_depth_clear = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } + + /* TC-compatible HTILE only supports stencil clears to 0. */ + if (buffers & PIPE_CLEAR_STENCIL && + si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_S) && + (!zstex->tc_compatible_htile || stencil == 0)) { + stencil &= 0xff; + + /* Need to disable EXPCLEAR temporarily if clearing + * to a new value. */ + if (!zstex->stencil_cleared || zstex->stencil_clear_value != stencil) { + sctx->db_stencil_disable_expclear = true; + } + + if (zstex->stencil_clear_value != (uint8_t)stencil) { + /* Update DB_STENCIL_CLEAR. */ + zstex->stencil_clear_value = stencil; + sctx->framebuffer.dirty_zsbuf = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); + } + sctx->db_stencil_clear = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } + + if (needs_db_flush) + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB; + } + + si_blitter_begin(sctx, SI_CLEAR); + util_blitter_clear(sctx->blitter, fb->width, fb->height, util_framebuffer_get_num_layers(fb), + buffers, color, depth, stencil, sctx->framebuffer.nr_samples > 1); + si_blitter_end(sctx); + + if (sctx->db_depth_clear) { + sctx->db_depth_clear = false; + sctx->db_depth_disable_expclear = false; + zstex->depth_cleared = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } + + if (sctx->db_stencil_clear) { + sctx->db_stencil_clear = false; + sctx->db_stencil_disable_expclear = false; + zstex->stencil_cleared = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } } -static void si_clear_render_target(struct pipe_context *ctx, - struct pipe_surface *dst, - const union pipe_color_union *color, - unsigned dstx, unsigned dsty, - unsigned width, unsigned height, - bool render_condition_enabled) +static void si_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dst, + const union pipe_color_union *color, unsigned dstx, + unsigned dsty, unsigned width, unsigned height, + bool render_condition_enabled) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture *sdst = (struct si_texture*)dst->texture; - - if (dst->texture->nr_samples <= 1 && !sdst->surface.dcc_offset) { - si_compute_clear_render_target(ctx, dst, color, dstx, dsty, width, - height, render_condition_enabled); - return; - } - - si_blitter_begin(sctx, SI_CLEAR_SURFACE | - (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND)); - util_blitter_clear_render_target(sctx->blitter, dst, color, - dstx, dsty, width, height); - si_blitter_end(sctx); + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *sdst = (struct si_texture *)dst->texture; + + if (dst->texture->nr_samples <= 1 && !sdst->surface.dcc_offset) { + si_compute_clear_render_target(ctx, dst, color, dstx, dsty, width, height, + render_condition_enabled); + return; + } + + si_blitter_begin(sctx, + SI_CLEAR_SURFACE | (render_condition_enabled ? 
0 : SI_DISABLE_RENDER_COND)); + util_blitter_clear_render_target(sctx->blitter, dst, color, dstx, dsty, width, height); + si_blitter_end(sctx); } -static void si_clear_depth_stencil(struct pipe_context *ctx, - struct pipe_surface *dst, - unsigned clear_flags, - double depth, - unsigned stencil, - unsigned dstx, unsigned dsty, - unsigned width, unsigned height, - bool render_condition_enabled) +static void si_clear_depth_stencil(struct pipe_context *ctx, struct pipe_surface *dst, + unsigned clear_flags, double depth, unsigned stencil, + unsigned dstx, unsigned dsty, unsigned width, unsigned height, + bool render_condition_enabled) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - si_blitter_begin(sctx, SI_CLEAR_SURFACE | - (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND)); - util_blitter_clear_depth_stencil(sctx->blitter, dst, clear_flags, depth, stencil, - dstx, dsty, width, height); - si_blitter_end(sctx); + si_blitter_begin(sctx, + SI_CLEAR_SURFACE | (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND)); + util_blitter_clear_depth_stencil(sctx->blitter, dst, clear_flags, depth, stencil, dstx, dsty, + width, height); + si_blitter_end(sctx); } -static void si_clear_texture(struct pipe_context *pipe, - struct pipe_resource *tex, - unsigned level, - const struct pipe_box *box, - const void *data) +static void si_clear_texture(struct pipe_context *pipe, struct pipe_resource *tex, unsigned level, + const struct pipe_box *box, const void *data) { - struct pipe_screen *screen = pipe->screen; - struct si_texture *stex = (struct si_texture*)tex; - struct pipe_surface tmpl = {{0}}; - struct pipe_surface *sf; - - tmpl.format = tex->format; - tmpl.u.tex.first_layer = box->z; - tmpl.u.tex.last_layer = box->z + box->depth - 1; - tmpl.u.tex.level = level; - sf = pipe->create_surface(pipe, tex, &tmpl); - if (!sf) - return; - - if (stex->is_depth) { - unsigned clear; - float depth; - uint8_t stencil = 0; - - /* Depth is always present. */ - clear = PIPE_CLEAR_DEPTH; - util_format_unpack_z_float(tex->format, &depth, data, 1); - - if (stex->surface.has_stencil) { - clear |= PIPE_CLEAR_STENCIL; - util_format_unpack_s_8uint(tex->format, - &stencil, data, 1); - } - - si_clear_depth_stencil(pipe, sf, clear, depth, stencil, - box->x, box->y, - box->width, box->height, false); - } else { - union pipe_color_union color; - - util_format_unpack_rgba(tex->format, color.ui, data, 1); - - if (screen->is_format_supported(screen, tex->format, - tex->target, 0, 0, - PIPE_BIND_RENDER_TARGET)) { - si_clear_render_target(pipe, sf, &color, - box->x, box->y, - box->width, box->height, false); - } else { - /* Software fallback - just for R9G9B9E5_FLOAT */ - util_clear_render_target(pipe, sf, &color, - box->x, box->y, - box->width, box->height); - } - } - pipe_surface_reference(&sf, NULL); + struct pipe_screen *screen = pipe->screen; + struct si_texture *stex = (struct si_texture *)tex; + struct pipe_surface tmpl = {{0}}; + struct pipe_surface *sf; + + tmpl.format = tex->format; + tmpl.u.tex.first_layer = box->z; + tmpl.u.tex.last_layer = box->z + box->depth - 1; + tmpl.u.tex.level = level; + sf = pipe->create_surface(pipe, tex, &tmpl); + if (!sf) + return; + + if (stex->is_depth) { + unsigned clear; + float depth; + uint8_t stencil = 0; + + /* Depth is always present. 
*/ + clear = PIPE_CLEAR_DEPTH; + util_format_unpack_z_float(tex->format, &depth, data, 1); + + if (stex->surface.has_stencil) { + clear |= PIPE_CLEAR_STENCIL; + util_format_unpack_s_8uint(tex->format, &stencil, data, 1); + } + + si_clear_depth_stencil(pipe, sf, clear, depth, stencil, box->x, box->y, box->width, + box->height, false); + } else { + union pipe_color_union color; + + util_format_unpack_rgba(tex->format, color.ui, data, 1); + + if (screen->is_format_supported(screen, tex->format, tex->target, 0, 0, + PIPE_BIND_RENDER_TARGET)) { + si_clear_render_target(pipe, sf, &color, box->x, box->y, box->width, box->height, false); + } else { + /* Software fallback - just for R9G9B9E5_FLOAT */ + util_clear_render_target(pipe, sf, &color, box->x, box->y, box->width, box->height); + } + } + pipe_surface_reference(&sf, NULL); } void si_init_clear_functions(struct si_context *sctx) { - sctx->b.clear_render_target = si_clear_render_target; - sctx->b.clear_texture = si_clear_texture; + sctx->b.clear_render_target = si_clear_render_target; + sctx->b.clear_texture = si_clear_texture; - if (sctx->has_graphics) { - sctx->b.clear = si_clear; - sctx->b.clear_depth_stencil = si_clear_depth_stencil; - } + if (sctx->has_graphics) { + sctx->b.clear = si_clear; + sctx->b.clear_depth_stencil = si_clear_depth_stencil; + } } diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 610c1333597..5dca5730a58 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -23,972 +23,892 @@ * */ -#include "nir/tgsi_to_nir.h" -#include "util/u_async_debug.h" -#include "util/u_memory.h" -#include "util/u_upload_mgr.h" +#include "si_compute.h" #include "ac_rtld.h" #include "amd_kernel_code_t.h" +#include "nir/tgsi_to_nir.h" #include "si_build_pm4.h" -#include "si_compute.h" +#include "util/u_async_debug.h" +#include "util/u_memory.h" +#include "util/u_upload_mgr.h" -#define COMPUTE_DBG(sscreen, fmt, args...) \ - do { \ - if ((sscreen->debug_flags & DBG(COMPUTE))) fprintf(stderr, fmt, ##args); \ - } while (0); +#define COMPUTE_DBG(sscreen, fmt, args...) 
\ + do { \ + if ((sscreen->debug_flags & DBG(COMPUTE))) \ + fprintf(stderr, fmt, ##args); \ + } while (0); struct dispatch_packet { - uint16_t header; - uint16_t setup; - uint16_t workgroup_size_x; - uint16_t workgroup_size_y; - uint16_t workgroup_size_z; - uint16_t reserved0; - uint32_t grid_size_x; - uint32_t grid_size_y; - uint32_t grid_size_z; - uint32_t private_segment_size; - uint32_t group_segment_size; - uint64_t kernel_object; - uint64_t kernarg_address; - uint64_t reserved2; + uint16_t header; + uint16_t setup; + uint16_t workgroup_size_x; + uint16_t workgroup_size_y; + uint16_t workgroup_size_z; + uint16_t reserved0; + uint32_t grid_size_x; + uint32_t grid_size_y; + uint32_t grid_size_z; + uint32_t private_segment_size; + uint32_t group_segment_size; + uint64_t kernel_object; + uint64_t kernarg_address; + uint64_t reserved2; }; -static const amd_kernel_code_t *si_compute_get_code_object( - const struct si_compute *program, - uint64_t symbol_offset) +static const amd_kernel_code_t *si_compute_get_code_object(const struct si_compute *program, + uint64_t symbol_offset) { - const struct si_shader_selector *sel = &program->sel; + const struct si_shader_selector *sel = &program->sel; - if (program->ir_type != PIPE_SHADER_IR_NATIVE) - return NULL; + if (program->ir_type != PIPE_SHADER_IR_NATIVE) + return NULL; - struct ac_rtld_binary rtld; - if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){ - .info = &sel->screen->info, - .shader_type = MESA_SHADER_COMPUTE, - .wave_size = sel->screen->compute_wave_size, - .num_parts = 1, - .elf_ptrs = &program->shader.binary.elf_buffer, - .elf_sizes = &program->shader.binary.elf_size })) - return NULL; + struct ac_rtld_binary rtld; + if (!ac_rtld_open(&rtld, + (struct ac_rtld_open_info){.info = &sel->screen->info, + .shader_type = MESA_SHADER_COMPUTE, + .wave_size = sel->screen->compute_wave_size, + .num_parts = 1, + .elf_ptrs = &program->shader.binary.elf_buffer, + .elf_sizes = &program->shader.binary.elf_size})) + return NULL; - const amd_kernel_code_t *result = NULL; - const char *text; - size_t size; - if (!ac_rtld_get_section_by_name(&rtld, ".text", &text, &size)) - goto out; + const amd_kernel_code_t *result = NULL; + const char *text; + size_t size; + if (!ac_rtld_get_section_by_name(&rtld, ".text", &text, &size)) + goto out; - if (symbol_offset + sizeof(amd_kernel_code_t) > size) - goto out; + if (symbol_offset + sizeof(amd_kernel_code_t) > size) + goto out; - result = (const amd_kernel_code_t*)(text + symbol_offset); + result = (const amd_kernel_code_t *)(text + symbol_offset); out: - ac_rtld_close(&rtld); - return result; + ac_rtld_close(&rtld); + return result; } static void code_object_to_config(const amd_kernel_code_t *code_object, - struct ac_shader_config *out_config) { - - uint32_t rsrc1 = code_object->compute_pgm_resource_registers; - uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32; - out_config->num_sgprs = code_object->wavefront_sgpr_count; - out_config->num_vgprs = code_object->workitem_vgpr_count; - out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1); - out_config->rsrc1 = rsrc1; - out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2)); - out_config->rsrc2 = rsrc2; - out_config->scratch_bytes_per_wave = - align(code_object->workitem_private_segment_byte_size * 64, 1024); + struct ac_shader_config *out_config) +{ + + uint32_t rsrc1 = code_object->compute_pgm_resource_registers; + uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32; + out_config->num_sgprs = 
code_object->wavefront_sgpr_count; + out_config->num_vgprs = code_object->workitem_vgpr_count; + out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1); + out_config->rsrc1 = rsrc1; + out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2)); + out_config->rsrc2 = rsrc2; + out_config->scratch_bytes_per_wave = + align(code_object->workitem_private_segment_byte_size * 64, 1024); } /* Asynchronous compute shader compilation. */ static void si_create_compute_state_async(void *job, int thread_index) { - struct si_compute *program = (struct si_compute *)job; - struct si_shader_selector *sel = &program->sel; - struct si_shader *shader = &program->shader; - struct ac_llvm_compiler *compiler; - struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug; - struct si_screen *sscreen = sel->screen; - - assert(!debug->debug_message || debug->async); - assert(thread_index >= 0); - assert(thread_index < ARRAY_SIZE(sscreen->compiler)); - compiler = &sscreen->compiler[thread_index]; - - if (!compiler->passes) - si_init_compiler(sscreen, compiler); - - assert(program->ir_type == PIPE_SHADER_IR_NIR); - si_nir_scan_shader(sel->nir, &sel->info); - - /* Store the declared LDS size into si_shader_info for the shader - * cache to include it. - */ - sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE] = program->local_size; - - si_get_active_slot_masks(&sel->info, - &sel->active_const_and_shader_buffers, - &sel->active_samplers_and_images); - - program->shader.is_monolithic = true; - program->reads_variable_block_size = - sel->info.uses_block_size && - sel->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0; - program->num_cs_user_data_dwords = - sel->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD]; - - unsigned char ir_sha1_cache_key[20]; - si_get_ir_cache_key(sel, false, false, ir_sha1_cache_key); - - /* Try to load the shader from the shader cache. */ - simple_mtx_lock(&sscreen->shader_cache_mutex); - - if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) { - simple_mtx_unlock(&sscreen->shader_cache_mutex); - - si_shader_dump_stats_for_shader_db(sscreen, shader, debug); - si_shader_dump(sscreen, shader, debug, stderr, true); - - if (!si_shader_binary_upload(sscreen, shader, 0)) - program->shader.compilation_failed = true; - } else { - simple_mtx_unlock(&sscreen->shader_cache_mutex); - - if (!si_create_shader_variant(sscreen, compiler, &program->shader, debug)) { - program->shader.compilation_failed = true; - return; - } - - bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0; - unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS + - (sel->info.uses_grid_size ? 3 : 0) + - (program->reads_variable_block_size ? 3 : 0) + - program->num_cs_user_data_dwords; - - shader->config.rsrc1 = - S_00B848_VGPRS((shader->config.num_vgprs - 1) / - (sscreen->compute_wave_size == 32 ? 8 : 4)) | - S_00B848_DX10_CLAMP(1) | - S_00B848_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | - S_00B848_WGP_MODE(sscreen->info.chip_class >= GFX10) | - S_00B848_FLOAT_MODE(shader->config.float_mode); - - if (sscreen->info.chip_class < GFX10) { - shader->config.rsrc1 |= - S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8); - } - - shader->config.rsrc2 = - S_00B84C_USER_SGPR(user_sgprs) | - S_00B84C_SCRATCH_EN(scratch_enabled) | - S_00B84C_TGID_X_EN(sel->info.uses_block_id[0]) | - S_00B84C_TGID_Y_EN(sel->info.uses_block_id[1]) | - S_00B84C_TGID_Z_EN(sel->info.uses_block_id[2]) | - S_00B84C_TG_SIZE_EN(sel->info.uses_subgroup_info) | - S_00B84C_TIDIG_COMP_CNT(sel->info.uses_thread_id[2] ? 
2 : - sel->info.uses_thread_id[1] ? 1 : 0) | - S_00B84C_LDS_SIZE(shader->config.lds_size); - - simple_mtx_lock(&sscreen->shader_cache_mutex); - si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, - shader, true); - simple_mtx_unlock(&sscreen->shader_cache_mutex); - } - - ralloc_free(sel->nir); - sel->nir = NULL; + struct si_compute *program = (struct si_compute *)job; + struct si_shader_selector *sel = &program->sel; + struct si_shader *shader = &program->shader; + struct ac_llvm_compiler *compiler; + struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug; + struct si_screen *sscreen = sel->screen; + + assert(!debug->debug_message || debug->async); + assert(thread_index >= 0); + assert(thread_index < ARRAY_SIZE(sscreen->compiler)); + compiler = &sscreen->compiler[thread_index]; + + if (!compiler->passes) + si_init_compiler(sscreen, compiler); + + assert(program->ir_type == PIPE_SHADER_IR_NIR); + si_nir_scan_shader(sel->nir, &sel->info); + + /* Store the declared LDS size into si_shader_info for the shader + * cache to include it. + */ + sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE] = program->local_size; + + si_get_active_slot_masks(&sel->info, &sel->active_const_and_shader_buffers, + &sel->active_samplers_and_images); + + program->shader.is_monolithic = true; + program->reads_variable_block_size = + sel->info.uses_block_size && sel->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0; + program->num_cs_user_data_dwords = + sel->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD]; + + unsigned char ir_sha1_cache_key[20]; + si_get_ir_cache_key(sel, false, false, ir_sha1_cache_key); + + /* Try to load the shader from the shader cache. */ + simple_mtx_lock(&sscreen->shader_cache_mutex); + + if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) { + simple_mtx_unlock(&sscreen->shader_cache_mutex); + + si_shader_dump_stats_for_shader_db(sscreen, shader, debug); + si_shader_dump(sscreen, shader, debug, stderr, true); + + if (!si_shader_binary_upload(sscreen, shader, 0)) + program->shader.compilation_failed = true; + } else { + simple_mtx_unlock(&sscreen->shader_cache_mutex); + + if (!si_create_shader_variant(sscreen, compiler, &program->shader, debug)) { + program->shader.compilation_failed = true; + return; + } + + bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0; + unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS + (sel->info.uses_grid_size ? 3 : 0) + + (program->reads_variable_block_size ? 3 : 0) + + program->num_cs_user_data_dwords; + + shader->config.rsrc1 = S_00B848_VGPRS((shader->config.num_vgprs - 1) / + (sscreen->compute_wave_size == 32 ? 8 : 4)) | + S_00B848_DX10_CLAMP(1) | + S_00B848_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | + S_00B848_WGP_MODE(sscreen->info.chip_class >= GFX10) | + S_00B848_FLOAT_MODE(shader->config.float_mode); + + if (sscreen->info.chip_class < GFX10) { + shader->config.rsrc1 |= S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8); + } + + shader->config.rsrc2 = S_00B84C_USER_SGPR(user_sgprs) | S_00B84C_SCRATCH_EN(scratch_enabled) | + S_00B84C_TGID_X_EN(sel->info.uses_block_id[0]) | + S_00B84C_TGID_Y_EN(sel->info.uses_block_id[1]) | + S_00B84C_TGID_Z_EN(sel->info.uses_block_id[2]) | + S_00B84C_TG_SIZE_EN(sel->info.uses_subgroup_info) | + S_00B84C_TIDIG_COMP_CNT(sel->info.uses_thread_id[2] + ? 2 + : sel->info.uses_thread_id[1] ? 
1 : 0) | + S_00B84C_LDS_SIZE(shader->config.lds_size); + + simple_mtx_lock(&sscreen->shader_cache_mutex); + si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, true); + simple_mtx_unlock(&sscreen->shader_cache_mutex); + } + + ralloc_free(sel->nir); + sel->nir = NULL; } -static void *si_create_compute_state( - struct pipe_context *ctx, - const struct pipe_compute_state *cso) +static void *si_create_compute_state(struct pipe_context *ctx, const struct pipe_compute_state *cso) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_screen *sscreen = (struct si_screen *)ctx->screen; - struct si_compute *program = CALLOC_STRUCT(si_compute); - struct si_shader_selector *sel = &program->sel; - - pipe_reference_init(&sel->base.reference, 1); - sel->type = PIPE_SHADER_COMPUTE; - sel->screen = sscreen; - program->shader.selector = &program->sel; - program->ir_type = cso->ir_type; - program->local_size = cso->req_local_mem; - program->private_size = cso->req_private_mem; - program->input_size = cso->req_input_mem; - - if (cso->ir_type != PIPE_SHADER_IR_NATIVE) { - if (cso->ir_type == PIPE_SHADER_IR_TGSI) { - program->ir_type = PIPE_SHADER_IR_NIR; - sel->nir = tgsi_to_nir(cso->prog, ctx->screen); - } else { - assert(cso->ir_type == PIPE_SHADER_IR_NIR); - sel->nir = (struct nir_shader *) cso->prog; - } - - sel->compiler_ctx_state.debug = sctx->debug; - sel->compiler_ctx_state.is_debug_context = sctx->is_debug; - p_atomic_inc(&sscreen->num_shaders_created); - - si_schedule_initial_compile(sctx, PIPE_SHADER_COMPUTE, - &sel->ready, - &sel->compiler_ctx_state, - program, si_create_compute_state_async); - } else { - const struct pipe_binary_program_header *header; - header = cso->prog; - - program->shader.binary.elf_size = header->num_bytes; - program->shader.binary.elf_buffer = malloc(header->num_bytes); - if (!program->shader.binary.elf_buffer) { - FREE(program); - return NULL; - } - memcpy((void *)program->shader.binary.elf_buffer, header->blob, header->num_bytes); - - const amd_kernel_code_t *code_object = - si_compute_get_code_object(program, 0); - code_object_to_config(code_object, &program->shader.config); - - si_shader_dump(sctx->screen, &program->shader, &sctx->debug, stderr, true); - if (!si_shader_binary_upload(sctx->screen, &program->shader, 0)) { - fprintf(stderr, "LLVM failed to upload shader\n"); - free((void *)program->shader.binary.elf_buffer); - FREE(program); - return NULL; - } - } - - return program; + struct si_context *sctx = (struct si_context *)ctx; + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + struct si_compute *program = CALLOC_STRUCT(si_compute); + struct si_shader_selector *sel = &program->sel; + + pipe_reference_init(&sel->base.reference, 1); + sel->type = PIPE_SHADER_COMPUTE; + sel->screen = sscreen; + program->shader.selector = &program->sel; + program->ir_type = cso->ir_type; + program->local_size = cso->req_local_mem; + program->private_size = cso->req_private_mem; + program->input_size = cso->req_input_mem; + + if (cso->ir_type != PIPE_SHADER_IR_NATIVE) { + if (cso->ir_type == PIPE_SHADER_IR_TGSI) { + program->ir_type = PIPE_SHADER_IR_NIR; + sel->nir = tgsi_to_nir(cso->prog, ctx->screen); + } else { + assert(cso->ir_type == PIPE_SHADER_IR_NIR); + sel->nir = (struct nir_shader *)cso->prog; + } + + sel->compiler_ctx_state.debug = sctx->debug; + sel->compiler_ctx_state.is_debug_context = sctx->is_debug; + p_atomic_inc(&sscreen->num_shaders_created); + + si_schedule_initial_compile(sctx, PIPE_SHADER_COMPUTE, &sel->ready, 
&sel->compiler_ctx_state, + program, si_create_compute_state_async); + } else { + const struct pipe_binary_program_header *header; + header = cso->prog; + + program->shader.binary.elf_size = header->num_bytes; + program->shader.binary.elf_buffer = malloc(header->num_bytes); + if (!program->shader.binary.elf_buffer) { + FREE(program); + return NULL; + } + memcpy((void *)program->shader.binary.elf_buffer, header->blob, header->num_bytes); + + const amd_kernel_code_t *code_object = si_compute_get_code_object(program, 0); + code_object_to_config(code_object, &program->shader.config); + + si_shader_dump(sctx->screen, &program->shader, &sctx->debug, stderr, true); + if (!si_shader_binary_upload(sctx->screen, &program->shader, 0)) { + fprintf(stderr, "LLVM failed to upload shader\n"); + free((void *)program->shader.binary.elf_buffer); + FREE(program); + return NULL; + } + } + + return program; } static void si_bind_compute_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_compute *program = (struct si_compute*)state; - struct si_shader_selector *sel = &program->sel; - - sctx->cs_shader_state.program = program; - if (!program) - return; - - /* Wait because we need active slot usage masks. */ - if (program->ir_type != PIPE_SHADER_IR_NATIVE) - util_queue_fence_wait(&sel->ready); - - si_set_active_descriptors(sctx, - SI_DESCS_FIRST_COMPUTE + - SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, - sel->active_const_and_shader_buffers); - si_set_active_descriptors(sctx, - SI_DESCS_FIRST_COMPUTE + - SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, - sel->active_samplers_and_images); + struct si_context *sctx = (struct si_context *)ctx; + struct si_compute *program = (struct si_compute *)state; + struct si_shader_selector *sel = &program->sel; + + sctx->cs_shader_state.program = program; + if (!program) + return; + + /* Wait because we need active slot usage masks. 
*/ + if (program->ir_type != PIPE_SHADER_IR_NATIVE) + util_queue_fence_wait(&sel->ready); + + si_set_active_descriptors(sctx, + SI_DESCS_FIRST_COMPUTE + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, + sel->active_const_and_shader_buffers); + si_set_active_descriptors(sctx, SI_DESCS_FIRST_COMPUTE + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, + sel->active_samplers_and_images); } -static void si_set_global_binding( - struct pipe_context *ctx, unsigned first, unsigned n, - struct pipe_resource **resources, - uint32_t **handles) +static void si_set_global_binding(struct pipe_context *ctx, unsigned first, unsigned n, + struct pipe_resource **resources, uint32_t **handles) { - unsigned i; - struct si_context *sctx = (struct si_context*)ctx; - struct si_compute *program = sctx->cs_shader_state.program; - - if (first + n > program->max_global_buffers) { - unsigned old_max = program->max_global_buffers; - program->max_global_buffers = first + n; - program->global_buffers = - realloc(program->global_buffers, - program->max_global_buffers * - sizeof(program->global_buffers[0])); - if (!program->global_buffers) { - fprintf(stderr, "radeonsi: failed to allocate compute global_buffers\n"); - return; - } - - memset(&program->global_buffers[old_max], 0, - (program->max_global_buffers - old_max) * - sizeof(program->global_buffers[0])); - } - - if (!resources) { - for (i = 0; i < n; i++) { - pipe_resource_reference(&program->global_buffers[first + i], NULL); - } - return; - } - - for (i = 0; i < n; i++) { - uint64_t va; - uint32_t offset; - pipe_resource_reference(&program->global_buffers[first + i], resources[i]); - va = si_resource(resources[i])->gpu_address; - offset = util_le32_to_cpu(*handles[i]); - va += offset; - va = util_cpu_to_le64(va); - memcpy(handles[i], &va, sizeof(va)); - } + unsigned i; + struct si_context *sctx = (struct si_context *)ctx; + struct si_compute *program = sctx->cs_shader_state.program; + + if (first + n > program->max_global_buffers) { + unsigned old_max = program->max_global_buffers; + program->max_global_buffers = first + n; + program->global_buffers = realloc( + program->global_buffers, program->max_global_buffers * sizeof(program->global_buffers[0])); + if (!program->global_buffers) { + fprintf(stderr, "radeonsi: failed to allocate compute global_buffers\n"); + return; + } + + memset(&program->global_buffers[old_max], 0, + (program->max_global_buffers - old_max) * sizeof(program->global_buffers[0])); + } + + if (!resources) { + for (i = 0; i < n; i++) { + pipe_resource_reference(&program->global_buffers[first + i], NULL); + } + return; + } + + for (i = 0; i < n; i++) { + uint64_t va; + uint32_t offset; + pipe_resource_reference(&program->global_buffers[first + i], resources[i]); + va = si_resource(resources[i])->gpu_address; + offset = util_le32_to_cpu(*handles[i]); + va += offset; + va = util_cpu_to_le64(va); + memcpy(handles[i], &va, sizeof(va)); + } } void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs) { - uint64_t bc_va; - - radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); - /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1, - * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. 
*/ - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); - - if (sctx->chip_class >= GFX7) { - /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */ - radeon_set_sh_reg_seq(cs, - R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | - S_00B858_SH1_CU_EN(0xffff)); - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | - S_00B858_SH1_CU_EN(0xffff)); - } - - if (sctx->chip_class >= GFX10) - radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0); - - /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID - * and is now per pipe, so it should be handled in the - * kernel if we want to use something other than the default value, - * which is now 0x22f. - */ - if (sctx->chip_class <= GFX6) { - /* XXX: This should be: - * (number of compute units) * 4 * (waves per simd) - 1 */ - - radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, - 0x190 /* Default value */); - } - - /* Set the pointer to border colors. */ - bc_va = sctx->border_color_buffer->gpu_address; - - if (sctx->chip_class >= GFX7) { - radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2); - radeon_emit(cs, bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */ - radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */ - } else { - if (sctx->screen->info.si_TA_CS_BC_BASE_ADDR_allowed) { - radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR, - bc_va >> 8); - } - } + uint64_t bc_va; + + radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); + /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1, + * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */ + radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + + if (sctx->chip_class >= GFX7) { + /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */ + radeon_set_sh_reg_seq(cs, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); + radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + } + + if (sctx->chip_class >= GFX10) + radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0); + + /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID + * and is now per pipe, so it should be handled in the + * kernel if we want to use something other than the default value, + * which is now 0x22f. + */ + if (sctx->chip_class <= GFX6) { + /* XXX: This should be: + * (number of compute units) * 4 * (waves per simd) - 1 */ + + radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */); + } + + /* Set the pointer to border colors. 
*/ + bc_va = sctx->border_color_buffer->gpu_address; + + if (sctx->chip_class >= GFX7) { + radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2); + radeon_emit(cs, bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */ + radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */ + } else { + if (sctx->screen->info.si_TA_CS_BC_BASE_ADDR_allowed) { + radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8); + } + } } -static bool si_setup_compute_scratch_buffer(struct si_context *sctx, - struct si_shader *shader, +static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader, struct ac_shader_config *config) { - uint64_t scratch_bo_size, scratch_needed; - scratch_bo_size = 0; - scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves; - if (sctx->compute_scratch_buffer) - scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0; + uint64_t scratch_bo_size, scratch_needed; + scratch_bo_size = 0; + scratch_needed = config->scratch_bytes_per_wave * sctx->scratch_waves; + if (sctx->compute_scratch_buffer) + scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0; - if (scratch_bo_size < scratch_needed) { - si_resource_reference(&sctx->compute_scratch_buffer, NULL); + if (scratch_bo_size < scratch_needed) { + si_resource_reference(&sctx->compute_scratch_buffer, NULL); - sctx->compute_scratch_buffer = - si_aligned_buffer_create(&sctx->screen->b, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - scratch_needed, - sctx->screen->info.pte_fragment_size); + sctx->compute_scratch_buffer = + si_aligned_buffer_create(&sctx->screen->b, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, + scratch_needed, sctx->screen->info.pte_fragment_size); - if (!sctx->compute_scratch_buffer) - return false; - } + if (!sctx->compute_scratch_buffer) + return false; + } - if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) { - uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address; + if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) { + uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address; - if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) - return false; + if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) + return false; - si_resource_reference(&shader->scratch_bo, - sctx->compute_scratch_buffer); - } + si_resource_reference(&shader->scratch_bo, sctx->compute_scratch_buffer); + } - return true; + return true; } -static bool si_switch_compute_shader(struct si_context *sctx, - struct si_compute *program, - struct si_shader *shader, - const amd_kernel_code_t *code_object, - unsigned offset) +static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute *program, + struct si_shader *shader, const amd_kernel_code_t *code_object, + unsigned offset) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct ac_shader_config inline_config = {0}; - struct ac_shader_config *config; - uint64_t shader_va; - - if (sctx->cs_shader_state.emitted_program == program && - sctx->cs_shader_state.offset == offset) - return true; - - if (program->ir_type != PIPE_SHADER_IR_NATIVE) { - config = &shader->config; - } else { - unsigned lds_blocks; - - config = &inline_config; - code_object_to_config(code_object, config); - - lds_blocks = config->lds_size; - /* XXX: We are over allocating LDS. 
For GFX6, the shader reports - * LDS in blocks of 256 bytes, so if there are 4 bytes lds - * allocated in the shader and 4 bytes allocated by the state - * tracker, then we will set LDS_SIZE to 512 bytes rather than 256. - */ - if (sctx->chip_class <= GFX6) { - lds_blocks += align(program->local_size, 256) >> 8; - } else { - lds_blocks += align(program->local_size, 512) >> 9; - } - - /* TODO: use si_multiwave_lds_size_workaround */ - assert(lds_blocks <= 0xFF); - - config->rsrc2 &= C_00B84C_LDS_SIZE; - config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks); - } - - if (!si_setup_compute_scratch_buffer(sctx, shader, config)) - return false; - - if (shader->scratch_bo) { - COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; " - "Total Scratch: %u bytes\n", sctx->scratch_waves, - config->scratch_bytes_per_wave, - config->scratch_bytes_per_wave * - sctx->scratch_waves); - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - shader->scratch_bo, RADEON_USAGE_READWRITE, - RADEON_PRIO_SCRATCH_BUFFER); - } - - /* Prefetch the compute shader to TC L2. - * - * We should also prefetch graphics shaders if a compute dispatch was - * the last command, and the compute shader if a draw call was the last - * command. However, that would add more complexity and we're likely - * to get a shader state change in that case anyway. - */ - if (sctx->chip_class >= GFX7) { - cik_prefetch_TC_L2_async(sctx, &program->shader.bo->b.b, - 0, program->shader.bo->b.b.width0); - } - - shader_va = shader->bo->gpu_address + offset; - if (program->ir_type == PIPE_SHADER_IR_NATIVE) { - /* Shader code is placed after the amd_kernel_code_t - * struct. */ - shader_va += sizeof(amd_kernel_code_t); - } - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, shader->bo, - RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - - radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cs, shader_va >> 8); - radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); - - radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); - radeon_emit(cs, config->rsrc1); - radeon_emit(cs, config->rsrc2); - - COMPUTE_DBG(sctx->screen, "COMPUTE_PGM_RSRC1: 0x%08x " - "COMPUTE_PGM_RSRC2: 0x%08x\n", config->rsrc1, config->rsrc2); - - sctx->max_seen_compute_scratch_bytes_per_wave = - MAX2(sctx->max_seen_compute_scratch_bytes_per_wave, - config->scratch_bytes_per_wave); - - radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, - S_00B860_WAVES(sctx->scratch_waves) - | S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10)); - - sctx->cs_shader_state.emitted_program = program; - sctx->cs_shader_state.offset = offset; - sctx->cs_shader_state.uses_scratch = - config->scratch_bytes_per_wave != 0; - - return true; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct ac_shader_config inline_config = {0}; + struct ac_shader_config *config; + uint64_t shader_va; + + if (sctx->cs_shader_state.emitted_program == program && sctx->cs_shader_state.offset == offset) + return true; + + if (program->ir_type != PIPE_SHADER_IR_NATIVE) { + config = &shader->config; + } else { + unsigned lds_blocks; + + config = &inline_config; + code_object_to_config(code_object, config); + + lds_blocks = config->lds_size; + /* XXX: We are over allocating LDS. For GFX6, the shader reports + * LDS in blocks of 256 bytes, so if there are 4 bytes lds + * allocated in the shader and 4 bytes allocated by the state + * tracker, then we will set LDS_SIZE to 512 bytes rather than 256. 
+ */ + if (sctx->chip_class <= GFX6) { + lds_blocks += align(program->local_size, 256) >> 8; + } else { + lds_blocks += align(program->local_size, 512) >> 9; + } + + /* TODO: use si_multiwave_lds_size_workaround */ + assert(lds_blocks <= 0xFF); + + config->rsrc2 &= C_00B84C_LDS_SIZE; + config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks); + } + + if (!si_setup_compute_scratch_buffer(sctx, shader, config)) + return false; + + if (shader->scratch_bo) { + COMPUTE_DBG(sctx->screen, + "Waves: %u; Scratch per wave: %u bytes; " + "Total Scratch: %u bytes\n", + sctx->scratch_waves, config->scratch_bytes_per_wave, + config->scratch_bytes_per_wave * sctx->scratch_waves); + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, shader->scratch_bo, RADEON_USAGE_READWRITE, + RADEON_PRIO_SCRATCH_BUFFER); + } + + /* Prefetch the compute shader to TC L2. + * + * We should also prefetch graphics shaders if a compute dispatch was + * the last command, and the compute shader if a draw call was the last + * command. However, that would add more complexity and we're likely + * to get a shader state change in that case anyway. + */ + if (sctx->chip_class >= GFX7) { + cik_prefetch_TC_L2_async(sctx, &program->shader.bo->b.b, 0, program->shader.bo->b.b.width0); + } + + shader_va = shader->bo->gpu_address + offset; + if (program->ir_type == PIPE_SHADER_IR_NATIVE) { + /* Shader code is placed after the amd_kernel_code_t + * struct. */ + shader_va += sizeof(amd_kernel_code_t); + } + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, shader->bo, RADEON_USAGE_READ, + RADEON_PRIO_SHADER_BINARY); + + radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); + radeon_emit(cs, shader_va >> 8); + radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); + + radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); + radeon_emit(cs, config->rsrc1); + radeon_emit(cs, config->rsrc2); + + COMPUTE_DBG(sctx->screen, + "COMPUTE_PGM_RSRC1: 0x%08x " + "COMPUTE_PGM_RSRC2: 0x%08x\n", + config->rsrc1, config->rsrc2); + + sctx->max_seen_compute_scratch_bytes_per_wave = + MAX2(sctx->max_seen_compute_scratch_bytes_per_wave, config->scratch_bytes_per_wave); + + radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, + S_00B860_WAVES(sctx->scratch_waves) | + S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10)); + + sctx->cs_shader_state.emitted_program = program; + sctx->cs_shader_state.offset = offset; + sctx->cs_shader_state.uses_scratch = config->scratch_bytes_per_wave != 0; + + return true; } static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx, - const amd_kernel_code_t *code_object, - unsigned user_sgpr) + const amd_kernel_code_t *code_object, unsigned user_sgpr) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address; - - unsigned max_private_element_size = AMD_HSA_BITS_GET( - code_object->code_properties, - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE); - - uint32_t scratch_dword0 = scratch_va & 0xffffffff; - uint32_t scratch_dword1 = - S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | - S_008F04_SWIZZLE_ENABLE(1); - - /* Disable address clamping */ - uint32_t scratch_dword2 = 0xffffffff; - uint32_t scratch_dword3 = - S_008F0C_INDEX_STRIDE(3) | - S_008F0C_ADD_TID_ENABLE(1); - - if (sctx->chip_class >= GFX9) { - assert(max_private_element_size == 1); /* always 4 bytes on GFX9 */ - } else { - scratch_dword3 |= S_008F0C_ELEMENT_SIZE(max_private_element_size); - - if (sctx->chip_class < GFX8) { - /* BUF_DATA_FORMAT is ignored, but it cannot be - * BUF_DATA_FORMAT_INVALID. 
*/ - scratch_dword3 |= - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8); - } - } - - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + - (user_sgpr * 4), 4); - radeon_emit(cs, scratch_dword0); - radeon_emit(cs, scratch_dword1); - radeon_emit(cs, scratch_dword2); - radeon_emit(cs, scratch_dword3); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address; + + unsigned max_private_element_size = + AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE); + + uint32_t scratch_dword0 = scratch_va & 0xffffffff; + uint32_t scratch_dword1 = + S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1); + + /* Disable address clamping */ + uint32_t scratch_dword2 = 0xffffffff; + uint32_t scratch_dword3 = S_008F0C_INDEX_STRIDE(3) | S_008F0C_ADD_TID_ENABLE(1); + + if (sctx->chip_class >= GFX9) { + assert(max_private_element_size == 1); /* always 4 bytes on GFX9 */ + } else { + scratch_dword3 |= S_008F0C_ELEMENT_SIZE(max_private_element_size); + + if (sctx->chip_class < GFX8) { + /* BUF_DATA_FORMAT is ignored, but it cannot be + * BUF_DATA_FORMAT_INVALID. */ + scratch_dword3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8); + } + } + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4); + radeon_emit(cs, scratch_dword0); + radeon_emit(cs, scratch_dword1); + radeon_emit(cs, scratch_dword2); + radeon_emit(cs, scratch_dword3); } -static void si_setup_user_sgprs_co_v2(struct si_context *sctx, - const amd_kernel_code_t *code_object, - const struct pipe_grid_info *info, - uint64_t kernel_args_va) +static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_code_t *code_object, + const struct pipe_grid_info *info, uint64_t kernel_args_va) { - struct si_compute *program = sctx->cs_shader_state.program; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - static const enum amd_code_property_mask_t workgroup_count_masks [] = { - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z - }; - - unsigned i, user_sgpr = 0; - if (AMD_HSA_BITS_GET(code_object->code_properties, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) { - if (code_object->workitem_private_segment_byte_size > 0) { - setup_scratch_rsrc_user_sgprs(sctx, code_object, - user_sgpr); - } - user_sgpr += 4; - } - - if (AMD_HSA_BITS_GET(code_object->code_properties, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) { - struct dispatch_packet dispatch; - unsigned dispatch_offset; - struct si_resource *dispatch_buf = NULL; - uint64_t dispatch_va; - - /* Upload dispatch ptr */ - memset(&dispatch, 0, sizeof(dispatch)); - - dispatch.workgroup_size_x = util_cpu_to_le16(info->block[0]); - dispatch.workgroup_size_y = util_cpu_to_le16(info->block[1]); - dispatch.workgroup_size_z = util_cpu_to_le16(info->block[2]); - - dispatch.grid_size_x = util_cpu_to_le32(info->grid[0] * info->block[0]); - dispatch.grid_size_y = util_cpu_to_le32(info->grid[1] * info->block[1]); - dispatch.grid_size_z = util_cpu_to_le32(info->grid[2] * info->block[2]); - - dispatch.private_segment_size = util_cpu_to_le32(program->private_size); - dispatch.group_segment_size = util_cpu_to_le32(program->local_size); - - dispatch.kernarg_address = util_cpu_to_le64(kernel_args_va); - - u_upload_data(sctx->b.const_uploader, 0, sizeof(dispatch), - 256, &dispatch, &dispatch_offset, - (struct pipe_resource**)&dispatch_buf); - - if (!dispatch_buf) 
{ - fprintf(stderr, "Error: Failed to allocate dispatch " - "packet."); - } - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dispatch_buf, - RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); - - dispatch_va = dispatch_buf->gpu_address + dispatch_offset; - - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + - (user_sgpr * 4), 2); - radeon_emit(cs, dispatch_va); - radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | - S_008F04_STRIDE(0)); - - si_resource_reference(&dispatch_buf, NULL); - user_sgpr += 2; - } - - if (AMD_HSA_BITS_GET(code_object->code_properties, - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) { - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + - (user_sgpr * 4), 2); - radeon_emit(cs, kernel_args_va); - radeon_emit(cs, S_008F04_BASE_ADDRESS_HI (kernel_args_va >> 32) | - S_008F04_STRIDE(0)); - user_sgpr += 2; - } - - for (i = 0; i < 3 && user_sgpr < 16; i++) { - if (code_object->code_properties & workgroup_count_masks[i]) { - radeon_set_sh_reg_seq(cs, - R_00B900_COMPUTE_USER_DATA_0 + - (user_sgpr * 4), 1); - radeon_emit(cs, info->grid[i]); - user_sgpr += 1; - } - } + struct si_compute *program = sctx->cs_shader_state.program; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + static const enum amd_code_property_mask_t workgroup_count_masks[] = { + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z}; + + unsigned i, user_sgpr = 0; + if (AMD_HSA_BITS_GET(code_object->code_properties, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) { + if (code_object->workitem_private_segment_byte_size > 0) { + setup_scratch_rsrc_user_sgprs(sctx, code_object, user_sgpr); + } + user_sgpr += 4; + } + + if (AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) { + struct dispatch_packet dispatch; + unsigned dispatch_offset; + struct si_resource *dispatch_buf = NULL; + uint64_t dispatch_va; + + /* Upload dispatch ptr */ + memset(&dispatch, 0, sizeof(dispatch)); + + dispatch.workgroup_size_x = util_cpu_to_le16(info->block[0]); + dispatch.workgroup_size_y = util_cpu_to_le16(info->block[1]); + dispatch.workgroup_size_z = util_cpu_to_le16(info->block[2]); + + dispatch.grid_size_x = util_cpu_to_le32(info->grid[0] * info->block[0]); + dispatch.grid_size_y = util_cpu_to_le32(info->grid[1] * info->block[1]); + dispatch.grid_size_z = util_cpu_to_le32(info->grid[2] * info->block[2]); + + dispatch.private_segment_size = util_cpu_to_le32(program->private_size); + dispatch.group_segment_size = util_cpu_to_le32(program->local_size); + + dispatch.kernarg_address = util_cpu_to_le64(kernel_args_va); + + u_upload_data(sctx->b.const_uploader, 0, sizeof(dispatch), 256, &dispatch, &dispatch_offset, + (struct pipe_resource **)&dispatch_buf); + + if (!dispatch_buf) { + fprintf(stderr, "Error: Failed to allocate dispatch " + "packet."); + } + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dispatch_buf, RADEON_USAGE_READ, + RADEON_PRIO_CONST_BUFFER); + + dispatch_va = dispatch_buf->gpu_address + dispatch_offset; + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); + radeon_emit(cs, dispatch_va); + radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | S_008F04_STRIDE(0)); + + si_resource_reference(&dispatch_buf, NULL); + user_sgpr += 2; + } + + if (AMD_HSA_BITS_GET(code_object->code_properties, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) { + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + 
(user_sgpr * 4), 2); + radeon_emit(cs, kernel_args_va); + radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0)); + user_sgpr += 2; + } + + for (i = 0; i < 3 && user_sgpr < 16; i++) { + if (code_object->code_properties & workgroup_count_masks[i]) { + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 1); + radeon_emit(cs, info->grid[i]); + user_sgpr += 1; + } + } } -static bool si_upload_compute_input(struct si_context *sctx, - const amd_kernel_code_t *code_object, - const struct pipe_grid_info *info) +static bool si_upload_compute_input(struct si_context *sctx, const amd_kernel_code_t *code_object, + const struct pipe_grid_info *info) { - struct si_compute *program = sctx->cs_shader_state.program; - struct si_resource *input_buffer = NULL; - uint32_t kernel_args_offset = 0; - uint32_t *kernel_args; - void *kernel_args_ptr; - uint64_t kernel_args_va; + struct si_compute *program = sctx->cs_shader_state.program; + struct si_resource *input_buffer = NULL; + uint32_t kernel_args_offset = 0; + uint32_t *kernel_args; + void *kernel_args_ptr; + uint64_t kernel_args_va; - u_upload_alloc(sctx->b.const_uploader, 0, program->input_size, - sctx->screen->info.tcc_cache_line_size, - &kernel_args_offset, - (struct pipe_resource**)&input_buffer, &kernel_args_ptr); + u_upload_alloc(sctx->b.const_uploader, 0, program->input_size, + sctx->screen->info.tcc_cache_line_size, &kernel_args_offset, + (struct pipe_resource **)&input_buffer, &kernel_args_ptr); - if (unlikely(!kernel_args_ptr)) - return false; + if (unlikely(!kernel_args_ptr)) + return false; - kernel_args = (uint32_t*)kernel_args_ptr; - kernel_args_va = input_buffer->gpu_address + kernel_args_offset; + kernel_args = (uint32_t *)kernel_args_ptr; + kernel_args_va = input_buffer->gpu_address + kernel_args_offset; - memcpy(kernel_args, info->input, program->input_size); + memcpy(kernel_args, info->input, program->input_size); - for (unsigned i = 0; i < program->input_size / 4; i++) { - COMPUTE_DBG(sctx->screen, "input %u : %u\n", i, - kernel_args[i]); - } + for (unsigned i = 0; i < program->input_size / 4; i++) { + COMPUTE_DBG(sctx->screen, "input %u : %u\n", i, kernel_args[i]); + } - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, input_buffer, - RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, input_buffer, RADEON_USAGE_READ, + RADEON_PRIO_CONST_BUFFER); - si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va); - si_resource_reference(&input_buffer, NULL); - return true; + si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va); + si_resource_reference(&input_buffer, NULL); + return true; } -static void si_setup_nir_user_data(struct si_context *sctx, - const struct pipe_grid_info *info) +static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_grid_info *info) { - struct si_compute *program = sctx->cs_shader_state.program; - struct si_shader_selector *sel = &program->sel; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 + - 4 * SI_NUM_RESOURCE_SGPRS; - unsigned block_size_reg = grid_size_reg + - /* 12 bytes = 3 dwords. 
*/ - 12 * sel->info.uses_grid_size; - unsigned cs_user_data_reg = block_size_reg + - 12 * program->reads_variable_block_size; - - if (info->indirect) { - if (sel->info.uses_grid_size) { - for (unsigned i = 0; i < 3; ++i) { - si_cp_copy_data(sctx, sctx->gfx_cs, - COPY_DATA_REG, NULL, (grid_size_reg >> 2) + i, - COPY_DATA_SRC_MEM, si_resource(info->indirect), - info->indirect_offset + 4 * i); - } - } - } else { - if (sel->info.uses_grid_size) { - radeon_set_sh_reg_seq(cs, grid_size_reg, 3); - radeon_emit(cs, info->grid[0]); - radeon_emit(cs, info->grid[1]); - radeon_emit(cs, info->grid[2]); - } - if (program->reads_variable_block_size) { - radeon_set_sh_reg_seq(cs, block_size_reg, 3); - radeon_emit(cs, info->block[0]); - radeon_emit(cs, info->block[1]); - radeon_emit(cs, info->block[2]); - } - } - - if (program->num_cs_user_data_dwords) { - radeon_set_sh_reg_seq(cs, cs_user_data_reg, program->num_cs_user_data_dwords); - radeon_emit_array(cs, sctx->cs_user_data, program->num_cs_user_data_dwords); - } + struct si_compute *program = sctx->cs_shader_state.program; + struct si_shader_selector *sel = &program->sel; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 + 4 * SI_NUM_RESOURCE_SGPRS; + unsigned block_size_reg = grid_size_reg + + /* 12 bytes = 3 dwords. */ + 12 * sel->info.uses_grid_size; + unsigned cs_user_data_reg = block_size_reg + 12 * program->reads_variable_block_size; + + if (info->indirect) { + if (sel->info.uses_grid_size) { + for (unsigned i = 0; i < 3; ++i) { + si_cp_copy_data(sctx, sctx->gfx_cs, COPY_DATA_REG, NULL, (grid_size_reg >> 2) + i, + COPY_DATA_SRC_MEM, si_resource(info->indirect), + info->indirect_offset + 4 * i); + } + } + } else { + if (sel->info.uses_grid_size) { + radeon_set_sh_reg_seq(cs, grid_size_reg, 3); + radeon_emit(cs, info->grid[0]); + radeon_emit(cs, info->grid[1]); + radeon_emit(cs, info->grid[2]); + } + if (program->reads_variable_block_size) { + radeon_set_sh_reg_seq(cs, block_size_reg, 3); + radeon_emit(cs, info->block[0]); + radeon_emit(cs, info->block[1]); + radeon_emit(cs, info->block[2]); + } + } + + if (program->num_cs_user_data_dwords) { + radeon_set_sh_reg_seq(cs, cs_user_data_reg, program->num_cs_user_data_dwords); + radeon_emit_array(cs, sctx->cs_user_data, program->num_cs_user_data_dwords); + } } -static void si_emit_dispatch_packets(struct si_context *sctx, - const struct pipe_grid_info *info) +static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_grid_info *info) { - struct si_screen *sscreen = sctx->screen; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off; - unsigned threads_per_threadgroup = - info->block[0] * info->block[1] * info->block[2]; - unsigned waves_per_threadgroup = - DIV_ROUND_UP(threads_per_threadgroup, sscreen->compute_wave_size); - unsigned threadgroups_per_cu = 1; - - if (sctx->chip_class >= GFX10 && waves_per_threadgroup == 1) - threadgroups_per_cu = 2; - - radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, - ac_get_compute_resource_limits(&sscreen->info, - waves_per_threadgroup, - sctx->cs_max_waves_per_sh, - threadgroups_per_cu)); - - unsigned dispatch_initiator = - S_00B800_COMPUTE_SHADER_EN(1) | - S_00B800_FORCE_START_AT_000(1) | - /* If the KMD allows it (there is a KMD hw register for it), - * allow launching waves out-of-order. 
(same as Vulkan) */ - S_00B800_ORDER_MODE(sctx->chip_class >= GFX7) | - S_00B800_CS_W32_EN(sscreen->compute_wave_size == 32); - - const uint *last_block = info->last_block; - bool partial_block_en = last_block[0] || last_block[1] || last_block[2]; - - radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); - - if (partial_block_en) { - unsigned partial[3]; - - /* If no partial_block, these should be an entire block size, not 0. */ - partial[0] = last_block[0] ? last_block[0] : info->block[0]; - partial[1] = last_block[1] ? last_block[1] : info->block[1]; - partial[2] = last_block[2] ? last_block[2] : info->block[2]; - - radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) | - S_00B81C_NUM_THREAD_PARTIAL(partial[0])); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]) | - S_00B820_NUM_THREAD_PARTIAL(partial[1])); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]) | - S_00B824_NUM_THREAD_PARTIAL(partial[2])); - - dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1); - } else { - radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0])); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1])); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2])); - } - - if (info->indirect) { - uint64_t base_va = si_resource(info->indirect)->gpu_address; - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(info->indirect), - RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); - - radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, 1); - radeon_emit(cs, base_va); - radeon_emit(cs, base_va >> 32); - - radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, info->indirect_offset); - radeon_emit(cs, dispatch_initiator); - } else { - radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, info->grid[0]); - radeon_emit(cs, info->grid[1]); - radeon_emit(cs, info->grid[2]); - radeon_emit(cs, dispatch_initiator); - } + struct si_screen *sscreen = sctx->screen; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off; + unsigned threads_per_threadgroup = info->block[0] * info->block[1] * info->block[2]; + unsigned waves_per_threadgroup = + DIV_ROUND_UP(threads_per_threadgroup, sscreen->compute_wave_size); + unsigned threadgroups_per_cu = 1; + + if (sctx->chip_class >= GFX10 && waves_per_threadgroup == 1) + threadgroups_per_cu = 2; + + radeon_set_sh_reg( + cs, R_00B854_COMPUTE_RESOURCE_LIMITS, + ac_get_compute_resource_limits(&sscreen->info, waves_per_threadgroup, + sctx->cs_max_waves_per_sh, threadgroups_per_cu)); + + unsigned dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_FORCE_START_AT_000(1) | + /* If the KMD allows it (there is a KMD hw register for it), + * allow launching waves out-of-order. (same as Vulkan) */ + S_00B800_ORDER_MODE(sctx->chip_class >= GFX7) | + S_00B800_CS_W32_EN(sscreen->compute_wave_size == 32); + + const uint *last_block = info->last_block; + bool partial_block_en = last_block[0] || last_block[1] || last_block[2]; + + radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); + + if (partial_block_en) { + unsigned partial[3]; + + /* If no partial_block, these should be an entire block size, not 0. */ + partial[0] = last_block[0] ? last_block[0] : info->block[0]; + partial[1] = last_block[1] ? last_block[1] : info->block[1]; + partial[2] = last_block[2] ? 
last_block[2] : info->block[2]; + + radeon_emit( + cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) | S_00B81C_NUM_THREAD_PARTIAL(partial[0])); + radeon_emit( + cs, S_00B820_NUM_THREAD_FULL(info->block[1]) | S_00B820_NUM_THREAD_PARTIAL(partial[1])); + radeon_emit( + cs, S_00B824_NUM_THREAD_FULL(info->block[2]) | S_00B824_NUM_THREAD_PARTIAL(partial[2])); + + dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1); + } else { + radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0])); + radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1])); + radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2])); + } + + if (info->indirect) { + uint64_t base_va = si_resource(info->indirect)->gpu_address; + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(info->indirect), RADEON_USAGE_READ, + RADEON_PRIO_DRAW_INDIRECT); + + radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, 1); + radeon_emit(cs, base_va); + radeon_emit(cs, base_va >> 32); + + radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, info->indirect_offset); + radeon_emit(cs, dispatch_initiator); + } else { + radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, info->grid[0]); + radeon_emit(cs, info->grid[1]); + radeon_emit(cs, info->grid[2]); + radeon_emit(cs, dispatch_initiator); + } } - -static void si_launch_grid( - struct pipe_context *ctx, const struct pipe_grid_info *info) +static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *info) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_compute *program = sctx->cs_shader_state.program; - const amd_kernel_code_t *code_object = - si_compute_get_code_object(program, info->pc); - int i; - /* HW bug workaround when CS threadgroups > 256 threads and async - * compute isn't used, i.e. only one compute job can run at a time. - * If async compute is possible, the threadgroup size must be limited - * to 256 threads on all queues to avoid the bug. - * Only GFX6 and certain GFX7 chips are affected. - */ - bool cs_regalloc_hang = - (sctx->chip_class == GFX6 || - sctx->family == CHIP_BONAIRE || - sctx->family == CHIP_KABINI) && - info->block[0] * info->block[1] * info->block[2] > 256; - - if (cs_regalloc_hang) - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH; - - if (program->ir_type != PIPE_SHADER_IR_NATIVE && - program->shader.compilation_failed) - return; - - if (sctx->has_graphics) { - if (sctx->last_num_draw_calls != sctx->num_draw_calls) { - si_update_fb_dirtiness_after_rendering(sctx); - sctx->last_num_draw_calls = sctx->num_draw_calls; - } - - si_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE); - } - - /* Add buffer sizes for memory checking in need_cs_space. */ - si_context_add_resource_size(sctx, &program->shader.bo->b.b); - /* TODO: add the scratch buffer */ - - if (info->indirect) { - si_context_add_resource_size(sctx, info->indirect); - - /* Indirect buffers use TC L2 on GFX9, but not older hw. 
*/ - if (sctx->chip_class <= GFX8 && - si_resource(info->indirect)->TC_L2_dirty) { - sctx->flags |= SI_CONTEXT_WB_L2; - si_resource(info->indirect)->TC_L2_dirty = false; - } - } - - si_need_gfx_cs_space(sctx); - - if (sctx->bo_list_add_all_compute_resources) - si_compute_resources_add_all_to_bo_list(sctx); - - if (!sctx->cs_shader_state.initialized) { - si_emit_initial_compute_regs(sctx, sctx->gfx_cs); - - sctx->cs_shader_state.emitted_program = NULL; - sctx->cs_shader_state.initialized = true; - } - - if (sctx->flags) - sctx->emit_cache_flush(sctx); - - if (!si_switch_compute_shader(sctx, program, &program->shader, - code_object, info->pc)) - return; - - si_upload_compute_shader_descriptors(sctx); - si_emit_compute_shader_pointers(sctx); - - if (sctx->has_graphics && - si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) { - sctx->atoms.s.render_cond.emit(sctx); - si_set_atom_dirty(sctx, &sctx->atoms.s.render_cond, false); - } - - if (program->ir_type == PIPE_SHADER_IR_NATIVE && - unlikely(!si_upload_compute_input(sctx, code_object, info))) - return; - - /* Global buffers */ - for (i = 0; i < program->max_global_buffers; i++) { - struct si_resource *buffer = - si_resource(program->global_buffers[i]); - if (!buffer) { - continue; - } - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, buffer, - RADEON_USAGE_READWRITE, - RADEON_PRIO_COMPUTE_GLOBAL); - } - - if (program->ir_type != PIPE_SHADER_IR_NATIVE) - si_setup_nir_user_data(sctx, info); - - si_emit_dispatch_packets(sctx, info); - - if (unlikely(sctx->current_saved_cs)) { - si_trace_emit(sctx); - si_log_compute_state(sctx, sctx->log); - } - - sctx->compute_is_busy = true; - sctx->num_compute_calls++; - if (sctx->cs_shader_state.uses_scratch) - sctx->num_spill_compute_calls++; - - if (cs_regalloc_hang) - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + struct si_context *sctx = (struct si_context *)ctx; + struct si_compute *program = sctx->cs_shader_state.program; + const amd_kernel_code_t *code_object = si_compute_get_code_object(program, info->pc); + int i; + /* HW bug workaround when CS threadgroups > 256 threads and async + * compute isn't used, i.e. only one compute job can run at a time. + * If async compute is possible, the threadgroup size must be limited + * to 256 threads on all queues to avoid the bug. + * Only GFX6 and certain GFX7 chips are affected. + */ + bool cs_regalloc_hang = + (sctx->chip_class == GFX6 || sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KABINI) && + info->block[0] * info->block[1] * info->block[2] > 256; + + if (cs_regalloc_hang) + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; + + if (program->ir_type != PIPE_SHADER_IR_NATIVE && program->shader.compilation_failed) + return; + + if (sctx->has_graphics) { + if (sctx->last_num_draw_calls != sctx->num_draw_calls) { + si_update_fb_dirtiness_after_rendering(sctx); + sctx->last_num_draw_calls = sctx->num_draw_calls; + } + + si_decompress_textures(sctx, 1 << PIPE_SHADER_COMPUTE); + } + + /* Add buffer sizes for memory checking in need_cs_space. */ + si_context_add_resource_size(sctx, &program->shader.bo->b.b); + /* TODO: add the scratch buffer */ + + if (info->indirect) { + si_context_add_resource_size(sctx, info->indirect); + + /* Indirect buffers use TC L2 on GFX9, but not older hw. 
*/ + if (sctx->chip_class <= GFX8 && si_resource(info->indirect)->TC_L2_dirty) { + sctx->flags |= SI_CONTEXT_WB_L2; + si_resource(info->indirect)->TC_L2_dirty = false; + } + } + + si_need_gfx_cs_space(sctx); + + if (sctx->bo_list_add_all_compute_resources) + si_compute_resources_add_all_to_bo_list(sctx); + + if (!sctx->cs_shader_state.initialized) { + si_emit_initial_compute_regs(sctx, sctx->gfx_cs); + + sctx->cs_shader_state.emitted_program = NULL; + sctx->cs_shader_state.initialized = true; + } + + if (sctx->flags) + sctx->emit_cache_flush(sctx); + + if (!si_switch_compute_shader(sctx, program, &program->shader, code_object, info->pc)) + return; + + si_upload_compute_shader_descriptors(sctx); + si_emit_compute_shader_pointers(sctx); + + if (sctx->has_graphics && si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) { + sctx->atoms.s.render_cond.emit(sctx); + si_set_atom_dirty(sctx, &sctx->atoms.s.render_cond, false); + } + + if (program->ir_type == PIPE_SHADER_IR_NATIVE && + unlikely(!si_upload_compute_input(sctx, code_object, info))) + return; + + /* Global buffers */ + for (i = 0; i < program->max_global_buffers; i++) { + struct si_resource *buffer = si_resource(program->global_buffers[i]); + if (!buffer) { + continue; + } + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, buffer, RADEON_USAGE_READWRITE, + RADEON_PRIO_COMPUTE_GLOBAL); + } + + if (program->ir_type != PIPE_SHADER_IR_NATIVE) + si_setup_nir_user_data(sctx, info); + + si_emit_dispatch_packets(sctx, info); + + if (unlikely(sctx->current_saved_cs)) { + si_trace_emit(sctx); + si_log_compute_state(sctx, sctx->log); + } + + sctx->compute_is_busy = true; + sctx->num_compute_calls++; + if (sctx->cs_shader_state.uses_scratch) + sctx->num_spill_compute_calls++; + + if (cs_regalloc_hang) + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; } void si_destroy_compute(struct si_compute *program) { - struct si_shader_selector *sel = &program->sel; + struct si_shader_selector *sel = &program->sel; - if (program->ir_type != PIPE_SHADER_IR_NATIVE) { - util_queue_drop_job(&sel->screen->shader_compiler_queue, - &sel->ready); - util_queue_fence_destroy(&sel->ready); - } + if (program->ir_type != PIPE_SHADER_IR_NATIVE) { + util_queue_drop_job(&sel->screen->shader_compiler_queue, &sel->ready); + util_queue_fence_destroy(&sel->ready); + } - for (unsigned i = 0; i < program->max_global_buffers; i++) - pipe_resource_reference(&program->global_buffers[i], NULL); - FREE(program->global_buffers); + for (unsigned i = 0; i < program->max_global_buffers; i++) + pipe_resource_reference(&program->global_buffers[i], NULL); + FREE(program->global_buffers); - si_shader_destroy(&program->shader); - ralloc_free(program->sel.nir); - FREE(program); + si_shader_destroy(&program->shader); + ralloc_free(program->sel.nir); + FREE(program); } -static void si_delete_compute_state(struct pipe_context *ctx, void* state){ - struct si_compute *program = (struct si_compute *)state; - struct si_context *sctx = (struct si_context*)ctx; +static void si_delete_compute_state(struct pipe_context *ctx, void *state) +{ + struct si_compute *program = (struct si_compute *)state; + struct si_context *sctx = (struct si_context *)ctx; - if (!state) - return; + if (!state) + return; - if (program == sctx->cs_shader_state.program) - sctx->cs_shader_state.program = NULL; + if (program == sctx->cs_shader_state.program) + sctx->cs_shader_state.program = NULL; - if (program == sctx->cs_shader_state.emitted_program) - sctx->cs_shader_state.emitted_program = NULL; + if (program == 
sctx->cs_shader_state.emitted_program) + sctx->cs_shader_state.emitted_program = NULL; - si_compute_reference(&program, NULL); + si_compute_reference(&program, NULL); } -static void si_set_compute_resources(struct pipe_context * ctx_, - unsigned start, unsigned count, - struct pipe_surface ** surfaces) { } +static void si_set_compute_resources(struct pipe_context *ctx_, unsigned start, unsigned count, + struct pipe_surface **surfaces) +{ +} void si_init_compute_functions(struct si_context *sctx) { - sctx->b.create_compute_state = si_create_compute_state; - sctx->b.delete_compute_state = si_delete_compute_state; - sctx->b.bind_compute_state = si_bind_compute_state; - sctx->b.set_compute_resources = si_set_compute_resources; - sctx->b.set_global_binding = si_set_global_binding; - sctx->b.launch_grid = si_launch_grid; + sctx->b.create_compute_state = si_create_compute_state; + sctx->b.delete_compute_state = si_delete_compute_state; + sctx->b.bind_compute_state = si_bind_compute_state; + sctx->b.set_compute_resources = si_set_compute_resources; + sctx->b.set_global_binding = si_set_global_binding; + sctx->b.launch_grid = si_launch_grid; } diff --git a/src/gallium/drivers/radeonsi/si_compute.h b/src/gallium/drivers/radeonsi/si_compute.h index 14c3c8cb789..7cf06271853 100644 --- a/src/gallium/drivers/radeonsi/si_compute.h +++ b/src/gallium/drivers/radeonsi/si_compute.h @@ -25,35 +25,33 @@ #ifndef SI_COMPUTE_H #define SI_COMPUTE_H -#include "util/u_inlines.h" - #include "si_shader.h" +#include "util/u_inlines.h" struct si_compute { - struct si_shader_selector sel; - struct si_shader shader; + struct si_shader_selector sel; + struct si_shader shader; - unsigned ir_type; - unsigned local_size; - unsigned private_size; - unsigned input_size; + unsigned ir_type; + unsigned local_size; + unsigned private_size; + unsigned input_size; - int max_global_buffers; - struct pipe_resource **global_buffers; + int max_global_buffers; + struct pipe_resource **global_buffers; - bool reads_variable_block_size; - unsigned num_cs_user_data_dwords; + bool reads_variable_block_size; + unsigned num_cs_user_data_dwords; }; void si_destroy_compute(struct si_compute *program); -static inline void -si_compute_reference(struct si_compute **dst, struct si_compute *src) +static inline void si_compute_reference(struct si_compute **dst, struct si_compute *src) { - if (pipe_reference(&(*dst)->sel.base.reference, &src->sel.base.reference)) - si_destroy_compute(*dst); + if (pipe_reference(&(*dst)->sel.base.reference, &src->sel.base.reference)) + si_destroy_compute(*dst); - *dst = src; + *dst = src; } #endif /* SI_COMPUTE_H */ diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index de020bfaf8c..6e3b07cb7c8 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -30,758 +30,705 @@ /* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst * and L2_STREAM for src. */ -static enum si_cache_policy get_cache_policy(struct si_context *sctx, - enum si_coherency coher, - uint64_t size) +static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher, + uint64_t size) { - if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META || - coher == SI_COHERENCY_CP)) || - (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER)) - return size <= 256 * 1024 ? 
L2_LRU : L2_STREAM; + if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META || coher == SI_COHERENCY_CP)) || + (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER)) + return size <= 256 * 1024 ? L2_LRU : L2_STREAM; - return L2_BYPASS; + return L2_BYPASS; } unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher, - enum si_cache_policy cache_policy) + enum si_cache_policy cache_policy) { - switch (coher) { - default: - case SI_COHERENCY_NONE: - case SI_COHERENCY_CP: - return 0; - case SI_COHERENCY_SHADER: - return SI_CONTEXT_INV_SCACHE | - SI_CONTEXT_INV_VCACHE | - (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0); - case SI_COHERENCY_CB_META: - return SI_CONTEXT_FLUSH_AND_INV_CB; - } + switch (coher) { + default: + case SI_COHERENCY_NONE: + case SI_COHERENCY_CP: + return 0; + case SI_COHERENCY_SHADER: + return SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | + (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0); + case SI_COHERENCY_CB_META: + return SI_CONTEXT_FLUSH_AND_INV_CB; + } } -static void si_launch_grid_internal(struct si_context *sctx, - struct pipe_grid_info *info) +static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info) { - /* Set settings for driver-internal compute dispatches. */ - sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; - sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; - sctx->render_cond_force_off = true; - /* Skip decompression to prevent infinite recursion. */ - sctx->blitter->running = true; - - /* Dispatch compute. */ - sctx->b.launch_grid(&sctx->b, info); - - /* Restore default settings. */ - sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; - sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; - sctx->render_cond_force_off = false; - sctx->blitter->running = false; + /* Set settings for driver-internal compute dispatches. */ + sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; + sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; + sctx->render_cond_force_off = true; + /* Skip decompression to prevent infinite recursion. */ + sctx->blitter->running = true; + + /* Dispatch compute. */ + sctx->b.launch_grid(&sctx->b, info); + + /* Restore default settings. 
*/ + sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; + sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; + sctx->render_cond_force_off = false; + sctx->blitter->running = false; } -static void si_compute_clear_12bytes_buffer(struct si_context *sctx, - struct pipe_resource *dst, - unsigned dst_offset, - unsigned size, - const uint32_t *clear_value, - enum si_coherency coher) +static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst, + unsigned dst_offset, unsigned size, + const uint32_t *clear_value, enum si_coherency coher) { - struct pipe_context *ctx = &sctx->b; + struct pipe_context *ctx = &sctx->b; - assert(dst_offset % 4 == 0); - assert(size % 4 == 0); - unsigned size_12 = DIV_ROUND_UP(size, 12); + assert(dst_offset % 4 == 0); + assert(size % 4 == 0); + unsigned size_12 = DIV_ROUND_UP(size, 12); - unsigned data[4] = {0}; - memcpy(data, clear_value, 12); + unsigned data[4] = {0}; + memcpy(data, clear_value, 12); - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | + si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); - struct pipe_shader_buffer saved_sb = {0}; - si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb); + struct pipe_shader_buffer saved_sb = {0}; + si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb); - unsigned saved_writable_mask = 0; - if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & - (1u << si_get_shaderbuf_slot(0))) - saved_writable_mask = 1; + unsigned saved_writable_mask = 0; + if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & + (1u << si_get_shaderbuf_slot(0))) + saved_writable_mask = 1; - struct pipe_constant_buffer saved_cb = {}; - si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + struct pipe_constant_buffer saved_cb = {}; + si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); - void *saved_cs = sctx->cs_shader_state.program; + void *saved_cs = sctx->cs_shader_state.program; - struct pipe_constant_buffer cb = {}; - cb.buffer_size = sizeof(data); - cb.user_buffer = data; - ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb); + struct pipe_constant_buffer cb = {}; + cb.buffer_size = sizeof(data); + cb.user_buffer = data; + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb); - struct pipe_shader_buffer sb = {0}; - sb.buffer = dst; - sb.buffer_offset = dst_offset; - sb.buffer_size = size; + struct pipe_shader_buffer sb = {0}; + sb.buffer = dst; + sb.buffer_offset = dst_offset; + sb.buffer_size = size; - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1); + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1); - struct pipe_grid_info info = {0}; + struct pipe_grid_info info = {0}; - if (!sctx->cs_clear_12bytes_buffer) - sctx->cs_clear_12bytes_buffer = - si_clear_12bytes_buffer_shader(ctx); - ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer); - info.block[0] = 64; - info.last_block[0] = size_12 % 64; - info.block[1] = 1; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(size_12, 64); - info.grid[1] = 1; - info.grid[2] = 1; + if (!sctx->cs_clear_12bytes_buffer) + sctx->cs_clear_12bytes_buffer = si_clear_12bytes_buffer_shader(ctx); + ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer); + info.block[0] = 64; + info.last_block[0] = size_12 % 64; + info.block[1] = 1; + info.block[2] = 1; + 
info.grid[0] = DIV_ROUND_UP(size_12, 64); + info.grid[1] = 1; + info.grid[2] = 1; - si_launch_grid_internal(sctx, &info); + si_launch_grid_internal(sctx, &info); - ctx->bind_compute_state(ctx, saved_cs); - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask); - ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + ctx->bind_compute_state(ctx, saved_cs); + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask); + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); - pipe_resource_reference(&saved_sb.buffer, NULL); - pipe_resource_reference(&saved_cb.buffer, NULL); + pipe_resource_reference(&saved_sb.buffer, NULL); + pipe_resource_reference(&saved_cb.buffer, NULL); } -static void si_compute_do_clear_or_copy(struct si_context *sctx, - struct pipe_resource *dst, - unsigned dst_offset, - struct pipe_resource *src, - unsigned src_offset, - unsigned size, - const uint32_t *clear_value, - unsigned clear_value_size, - enum si_coherency coher) +static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst, + unsigned dst_offset, struct pipe_resource *src, + unsigned src_offset, unsigned size, + const uint32_t *clear_value, unsigned clear_value_size, + enum si_coherency coher) { - struct pipe_context *ctx = &sctx->b; - - assert(src_offset % 4 == 0); - assert(dst_offset % 4 == 0); - assert(size % 4 == 0); - - assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0); - assert(!src || src_offset + size <= src->width0); - - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); - - /* Save states. */ - void *saved_cs = sctx->cs_shader_state.program; - struct pipe_shader_buffer saved_sb[2] = {}; - si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb); - - unsigned saved_writable_mask = 0; - for (unsigned i = 0; i < (src ? 2 : 1); i++) { - if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & - (1u << si_get_shaderbuf_slot(i))) - saved_writable_mask |= 1 << i; - } - - /* The memory accesses are coalesced, meaning that the 1st instruction writes - * the 1st contiguous block of data for the whole wave, the 2nd instruction - * writes the 2nd contiguous block of data, etc. - */ - unsigned dwords_per_thread = src ? 
SI_COMPUTE_COPY_DW_PER_THREAD : - SI_COMPUTE_CLEAR_DW_PER_THREAD; - unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4); - unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread; - unsigned wave_size = sctx->screen->compute_wave_size; - unsigned dwords_per_wave = dwords_per_thread * wave_size; - - unsigned num_dwords = size / 4; - unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); - - struct pipe_grid_info info = {}; - info.block[0] = MIN2(wave_size, num_instructions); - info.block[1] = 1; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave); - info.grid[1] = 1; - info.grid[2] = 1; - - struct pipe_shader_buffer sb[2] = {}; - sb[0].buffer = dst; - sb[0].buffer_offset = dst_offset; - sb[0].buffer_size = size; - - bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU; - - if (src) { - sb[1].buffer = src; - sb[1].buffer_offset = src_offset; - sb[1].buffer_size = size; - - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1); - - if (!sctx->cs_copy_buffer) { - sctx->cs_copy_buffer = si_create_dma_compute_shader(&sctx->b, - SI_COMPUTE_COPY_DW_PER_THREAD, - shader_dst_stream_policy, true); - } - ctx->bind_compute_state(ctx, sctx->cs_copy_buffer); - } else { - assert(clear_value_size >= 4 && - clear_value_size <= 16 && - util_is_power_of_two_or_zero(clear_value_size)); - - for (unsigned i = 0; i < 4; i++) - sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)]; - - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1); - - if (!sctx->cs_clear_buffer) { - sctx->cs_clear_buffer = si_create_dma_compute_shader(&sctx->b, - SI_COMPUTE_CLEAR_DW_PER_THREAD, - shader_dst_stream_policy, false); - } - ctx->bind_compute_state(ctx, sctx->cs_clear_buffer); - } - - si_launch_grid_internal(sctx, &info); - - enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0); - - if (cache_policy != L2_BYPASS) - si_resource(dst)->TC_L2_dirty = true; - - /* Restore states. */ - ctx->bind_compute_state(ctx, saved_cs); - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb, - saved_writable_mask); - for (int i = 0; i < 2; i++) - pipe_resource_reference(&saved_sb[i].buffer, NULL); + struct pipe_context *ctx = &sctx->b; + + assert(src_offset % 4 == 0); + assert(dst_offset % 4 == 0); + assert(size % 4 == 0); + + assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0); + assert(!src || src_offset + size <= src->width0); + + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | + si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); + + /* Save states. */ + void *saved_cs = sctx->cs_shader_state.program; + struct pipe_shader_buffer saved_sb[2] = {}; + si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb); + + unsigned saved_writable_mask = 0; + for (unsigned i = 0; i < (src ? 2 : 1); i++) { + if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & + (1u << si_get_shaderbuf_slot(i))) + saved_writable_mask |= 1 << i; + } + + /* The memory accesses are coalesced, meaning that the 1st instruction writes + * the 1st contiguous block of data for the whole wave, the 2nd instruction + * writes the 2nd contiguous block of data, etc. + */ + unsigned dwords_per_thread = + src ? 
SI_COMPUTE_COPY_DW_PER_THREAD : SI_COMPUTE_CLEAR_DW_PER_THREAD; + unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4); + unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread; + unsigned wave_size = sctx->screen->compute_wave_size; + unsigned dwords_per_wave = dwords_per_thread * wave_size; + + unsigned num_dwords = size / 4; + unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); + + struct pipe_grid_info info = {}; + info.block[0] = MIN2(wave_size, num_instructions); + info.block[1] = 1; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave); + info.grid[1] = 1; + info.grid[2] = 1; + + struct pipe_shader_buffer sb[2] = {}; + sb[0].buffer = dst; + sb[0].buffer_offset = dst_offset; + sb[0].buffer_size = size; + + bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU; + + if (src) { + sb[1].buffer = src; + sb[1].buffer_offset = src_offset; + sb[1].buffer_size = size; + + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1); + + if (!sctx->cs_copy_buffer) { + sctx->cs_copy_buffer = si_create_dma_compute_shader( + &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true); + } + ctx->bind_compute_state(ctx, sctx->cs_copy_buffer); + } else { + assert(clear_value_size >= 4 && clear_value_size <= 16 && + util_is_power_of_two_or_zero(clear_value_size)); + + for (unsigned i = 0; i < 4; i++) + sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)]; + + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1); + + if (!sctx->cs_clear_buffer) { + sctx->cs_clear_buffer = si_create_dma_compute_shader( + &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false); + } + ctx->bind_compute_state(ctx, sctx->cs_clear_buffer); + } + + si_launch_grid_internal(sctx, &info); + + enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0); + + if (cache_policy != L2_BYPASS) + si_resource(dst)->TC_L2_dirty = true; + + /* Restore states. */ + ctx->bind_compute_state(ctx, saved_cs); + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb, saved_writable_mask); + for (int i = 0; i < 2; i++) + pipe_resource_reference(&saved_sb[i].buffer, NULL); } -void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, - uint64_t offset, uint64_t size, uint32_t *clear_value, - uint32_t clear_value_size, enum si_coherency coher, - bool force_cpdma) +void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, + uint64_t size, uint32_t *clear_value, uint32_t clear_value_size, + enum si_coherency coher, bool force_cpdma) { - if (!size) - return; - - ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4); - - assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */ - assert(offset % clear_alignment == 0); - assert(size % clear_alignment == 0); - assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */ - - /* Reduce a large clear value size if possible. */ - if (clear_value_size > 4) { - bool clear_dword_duplicated = true; - - /* See if we can lower large fills to dword fills. */ - for (unsigned i = 1; i < clear_value_size / 4; i++) { - if (clear_value[0] != clear_value[i]) { - clear_dword_duplicated = false; - break; - } - } - if (clear_dword_duplicated) - clear_value_size = 4; - } - - /* Expand a small clear value size. 
*/ - uint32_t tmp_clear_value; - if (clear_value_size <= 2) { - if (clear_value_size == 1) { - tmp_clear_value = *(uint8_t*)clear_value; - tmp_clear_value |= (tmp_clear_value << 8) | - (tmp_clear_value << 16) | - (tmp_clear_value << 24); - } else { - tmp_clear_value = *(uint16_t*)clear_value; - tmp_clear_value |= tmp_clear_value << 16; - } - clear_value = &tmp_clear_value; - clear_value_size = 4; - } - - if (clear_value_size == 12) { - si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher); - return; - } - - uint64_t aligned_size = size & ~3ull; - if (aligned_size >= 4) { - /* Before GFX9, CP DMA was very slow when clearing GTT, so never - * use CP DMA clears on those chips, because we can't be certain - * about buffer placements. - */ - if (clear_value_size > 4 || - (!force_cpdma && - clear_value_size == 4 && - offset % 4 == 0 && - (size > 32*1024 || sctx->chip_class <= GFX9))) { - si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, - aligned_size, clear_value, - clear_value_size, coher); - } else { - assert(clear_value_size == 4); - si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset, - aligned_size, *clear_value, 0, coher, - get_cache_policy(sctx, coher, size)); - } - - offset += aligned_size; - size -= aligned_size; - } - - /* Handle non-dword alignment. */ - if (size) { - assert(dst); - assert(dst->target == PIPE_BUFFER); - assert(size < 4); - - pipe_buffer_write(&sctx->b, dst, offset, size, clear_value); - } + if (!size) + return; + + ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4); + + assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */ + assert(offset % clear_alignment == 0); + assert(size % clear_alignment == 0); + assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */ + + /* Reduce a large clear value size if possible. */ + if (clear_value_size > 4) { + bool clear_dword_duplicated = true; + + /* See if we can lower large fills to dword fills. */ + for (unsigned i = 1; i < clear_value_size / 4; i++) { + if (clear_value[0] != clear_value[i]) { + clear_dword_duplicated = false; + break; + } + } + if (clear_dword_duplicated) + clear_value_size = 4; + } + + /* Expand a small clear value size. */ + uint32_t tmp_clear_value; + if (clear_value_size <= 2) { + if (clear_value_size == 1) { + tmp_clear_value = *(uint8_t *)clear_value; + tmp_clear_value |= + (tmp_clear_value << 8) | (tmp_clear_value << 16) | (tmp_clear_value << 24); + } else { + tmp_clear_value = *(uint16_t *)clear_value; + tmp_clear_value |= tmp_clear_value << 16; + } + clear_value = &tmp_clear_value; + clear_value_size = 4; + } + + if (clear_value_size == 12) { + si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher); + return; + } + + uint64_t aligned_size = size & ~3ull; + if (aligned_size >= 4) { + /* Before GFX9, CP DMA was very slow when clearing GTT, so never + * use CP DMA clears on those chips, because we can't be certain + * about buffer placements. + */ + if (clear_value_size > 4 || (!force_cpdma && clear_value_size == 4 && offset % 4 == 0 && + (size > 32 * 1024 || sctx->chip_class <= GFX9))) { + si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value, + clear_value_size, coher); + } else { + assert(clear_value_size == 4); + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset, aligned_size, *clear_value, 0, + coher, get_cache_policy(sctx, coher, size)); + } + + offset += aligned_size; + size -= aligned_size; + } + + /* Handle non-dword alignment. 
*/ + if (size) { + assert(dst); + assert(dst->target == PIPE_BUFFER); + assert(size < 4); + + pipe_buffer_write(&sctx->b, dst, offset, size, clear_value); + } } -static void si_pipe_clear_buffer(struct pipe_context *ctx, - struct pipe_resource *dst, - unsigned offset, unsigned size, - const void *clear_value, - int clear_value_size) +static void si_pipe_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, + unsigned offset, unsigned size, const void *clear_value, + int clear_value_size) { - si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value, - clear_value_size, SI_COHERENCY_SHADER, false); + si_clear_buffer((struct si_context *)ctx, dst, offset, size, (uint32_t *)clear_value, + clear_value_size, SI_COHERENCY_SHADER, false); } -void si_copy_buffer(struct si_context *sctx, - struct pipe_resource *dst, struct pipe_resource *src, - uint64_t dst_offset, uint64_t src_offset, unsigned size) +void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, + uint64_t dst_offset, uint64_t src_offset, unsigned size) { - if (!size) - return; - - enum si_coherency coher = SI_COHERENCY_SHADER; - enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); - - /* Only use compute for VRAM copies on dGPUs. */ - if (sctx->screen->info.has_dedicated_vram && - si_resource(dst)->domains & RADEON_DOMAIN_VRAM && - si_resource(src)->domains & RADEON_DOMAIN_VRAM && - size > 32 * 1024 && - dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) { - si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, - size, NULL, 0, coher); - } else { - si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, - 0, coher, cache_policy); - } + if (!size) + return; + + enum si_coherency coher = SI_COHERENCY_SHADER; + enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); + + /* Only use compute for VRAM copies on dGPUs. */ + if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM && + si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > 32 * 1024 && dst_offset % 4 == 0 && + src_offset % 4 == 0 && size % 4 == 0) { + si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0, coher); + } else { + si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, 0, coher, cache_policy); + } } -void si_compute_copy_image(struct si_context *sctx, - struct pipe_resource *dst, - unsigned dst_level, - struct pipe_resource *src, - unsigned src_level, - unsigned dstx, unsigned dsty, unsigned dstz, - const struct pipe_box *src_box) +void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level, + struct pipe_resource *src, unsigned src_level, unsigned dstx, + unsigned dsty, unsigned dstz, const struct pipe_box *src_box) { - struct pipe_context *ctx = &sctx->b; - unsigned width = src_box->width; - unsigned height = src_box->height; - unsigned depth = src_box->depth; - enum pipe_format src_format = util_format_linear(src->format); - enum pipe_format dst_format = util_format_linear(dst->format); - - assert(util_format_is_subsampled_422(src_format) == - util_format_is_subsampled_422(dst_format)); - - if (util_format_is_subsampled_422(src_format)) { - src_format = dst_format = PIPE_FORMAT_R32_UINT; - /* Interpreting 422 subsampled format (16 bpp) as 32 bpp - * should force us to divide src_box->x, dstx and width by 2. 
- * But given that ac_surface allocates this format as 32 bpp - * and that surf_size is then modified to pack the values - * we must keep the original values to get the correct results. - */ - } - unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, - dstx, dsty, dstz, 0}; - - if (width == 0 || height == 0) - return; - - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); - - /* The driver doesn't decompress resources automatically here. */ - si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, - dstz, dstz + src_box->depth - 1); - si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, - src_box->z, src_box->z + src_box->depth - 1); - - /* src and dst have the same number of samples. */ - si_make_CB_shader_coherent(sctx, src->nr_samples, true, - /* Only src can have DCC.*/ - ((struct si_texture*)src)->surface.u.gfx9.dcc.pipe_aligned); - - struct pipe_constant_buffer saved_cb = {}; - si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); - - struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE]; - struct pipe_image_view saved_image[2] = {0}; - util_copy_image_view(&saved_image[0], &images->views[0]); - util_copy_image_view(&saved_image[1], &images->views[1]); - - void *saved_cs = sctx->cs_shader_state.program; - - struct pipe_constant_buffer cb = {}; - cb.buffer_size = sizeof(data); - cb.user_buffer = data; - ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb); - - struct pipe_image_view image[2] = {0}; - image[0].resource = src; - image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ; - image[0].format = src_format; - image[0].u.tex.level = src_level; - image[0].u.tex.first_layer = 0; - image[0].u.tex.last_layer = - src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1 - : (unsigned)(src->array_size - 1); - image[1].resource = dst; - image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE; - image[1].format = dst_format; - image[1].u.tex.level = dst_level; - image[1].u.tex.first_layer = 0; - image[1].u.tex.last_layer = - dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1 - : (unsigned)(dst->array_size - 1); - - if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT) - image[0].format = image[1].format = PIPE_FORMAT_R32_UINT; - - /* SNORM8 blitting has precision issues on some chips. Use the SINT - * equivalent instead, which doesn't force DCC decompression. - * Note that some chips avoid this issue by using SDMA. 
- */ - if (util_format_is_snorm8(dst->format)) { - image[0].format = image[1].format = - util_format_snorm8_to_sint8(dst->format); - } - - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image); - - struct pipe_grid_info info = {0}; - - if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) { - if (!sctx->cs_copy_image_1d_array) - sctx->cs_copy_image_1d_array = - si_create_copy_image_compute_shader_1d_array(ctx); - ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array); - info.block[0] = 64; - info.last_block[0] = width % 64; - info.block[1] = 1; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(width, 64); - info.grid[1] = depth; - info.grid[2] = 1; - } else { - if (!sctx->cs_copy_image) - sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx); - ctx->bind_compute_state(ctx, sctx->cs_copy_image); - info.block[0] = 8; - info.last_block[0] = width % 8; - info.block[1] = 8; - info.last_block[1] = height % 8; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(width, 8); - info.grid[1] = DIV_ROUND_UP(height, 8); - info.grid[2] = depth; - } - - si_launch_grid_internal(sctx, &info); - - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); - ctx->bind_compute_state(ctx, saved_cs); - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image); - ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); - for (int i = 0; i < 2; i++) - pipe_resource_reference(&saved_image[i].resource, NULL); - pipe_resource_reference(&saved_cb.buffer, NULL); + struct pipe_context *ctx = &sctx->b; + unsigned width = src_box->width; + unsigned height = src_box->height; + unsigned depth = src_box->depth; + enum pipe_format src_format = util_format_linear(src->format); + enum pipe_format dst_format = util_format_linear(dst->format); + + assert(util_format_is_subsampled_422(src_format) == util_format_is_subsampled_422(dst_format)); + + if (util_format_is_subsampled_422(src_format)) { + src_format = dst_format = PIPE_FORMAT_R32_UINT; + /* Interpreting 422 subsampled format (16 bpp) as 32 bpp + * should force us to divide src_box->x, dstx and width by 2. + * But given that ac_surface allocates this format as 32 bpp + * and that surf_size is then modified to pack the values + * we must keep the original values to get the correct results. + */ + } + unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0}; + + if (width == 0 || height == 0) + return; + + sctx->flags |= + SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + + /* The driver doesn't decompress resources automatically here. */ + si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, dstz, + dstz + src_box->depth - 1); + si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level, src_box->z, + src_box->z + src_box->depth - 1); + + /* src and dst have the same number of samples. 
*/ + si_make_CB_shader_coherent(sctx, src->nr_samples, true, + /* Only src can have DCC.*/ + ((struct si_texture *)src)->surface.u.gfx9.dcc.pipe_aligned); + + struct pipe_constant_buffer saved_cb = {}; + si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + + struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE]; + struct pipe_image_view saved_image[2] = {0}; + util_copy_image_view(&saved_image[0], &images->views[0]); + util_copy_image_view(&saved_image[1], &images->views[1]); + + void *saved_cs = sctx->cs_shader_state.program; + + struct pipe_constant_buffer cb = {}; + cb.buffer_size = sizeof(data); + cb.user_buffer = data; + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb); + + struct pipe_image_view image[2] = {0}; + image[0].resource = src; + image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ; + image[0].format = src_format; + image[0].u.tex.level = src_level; + image[0].u.tex.first_layer = 0; + image[0].u.tex.last_layer = src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1 + : (unsigned)(src->array_size - 1); + image[1].resource = dst; + image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE; + image[1].format = dst_format; + image[1].u.tex.level = dst_level; + image[1].u.tex.first_layer = 0; + image[1].u.tex.last_layer = dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1 + : (unsigned)(dst->array_size - 1); + + if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT) + image[0].format = image[1].format = PIPE_FORMAT_R32_UINT; + + /* SNORM8 blitting has precision issues on some chips. Use the SINT + * equivalent instead, which doesn't force DCC decompression. + * Note that some chips avoid this issue by using SDMA. + */ + if (util_format_is_snorm8(dst->format)) { + image[0].format = image[1].format = util_format_snorm8_to_sint8(dst->format); + } + + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image); + + struct pipe_grid_info info = {0}; + + if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) { + if (!sctx->cs_copy_image_1d_array) + sctx->cs_copy_image_1d_array = si_create_copy_image_compute_shader_1d_array(ctx); + ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array); + info.block[0] = 64; + info.last_block[0] = width % 64; + info.block[1] = 1; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(width, 64); + info.grid[1] = depth; + info.grid[2] = 1; + } else { + if (!sctx->cs_copy_image) + sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx); + ctx->bind_compute_state(ctx, sctx->cs_copy_image); + info.block[0] = 8; + info.last_block[0] = width % 8; + info.block[1] = 8; + info.last_block[1] = height % 8; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(width, 8); + info.grid[1] = DIV_ROUND_UP(height, 8); + info.grid[2] = depth; + } + + si_launch_grid_internal(sctx, &info); + + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? 
SI_CONTEXT_WB_L2 : 0) | + si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + ctx->bind_compute_state(ctx, saved_cs); + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image); + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + for (int i = 0; i < 2; i++) + pipe_resource_reference(&saved_image[i].resource, NULL); + pipe_resource_reference(&saved_cb.buffer, NULL); } void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) { - struct pipe_context *ctx = &sctx->b; - - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU); - sctx->emit_cache_flush(sctx); - - /* Save states. */ - void *saved_cs = sctx->cs_shader_state.program; - struct pipe_image_view saved_img[3] = {}; - - for (unsigned i = 0; i < 3; i++) { - util_copy_image_view(&saved_img[i], - &sctx->images[PIPE_SHADER_COMPUTE].views[i]); - } - - /* Set images. */ - bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16; - unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements; - struct pipe_image_view img[3]; - - assert(tex->surface.dcc_retile_map_offset && tex->surface.dcc_retile_map_offset <= UINT_MAX); - assert(tex->surface.dcc_offset && tex->surface.dcc_offset <= UINT_MAX); - assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX); - - for (unsigned i = 0; i < 3; i++) { - img[i].resource = &tex->buffer.b.b; - img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ; - img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER; - } - - img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT : - PIPE_FORMAT_R32G32B32A32_UINT; - img[0].u.buf.offset = tex->surface.dcc_retile_map_offset; - img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4); - - img[1].format = PIPE_FORMAT_R8_UINT; - img[1].u.buf.offset = tex->surface.dcc_offset; - img[1].u.buf.size = tex->surface.dcc_size; - - img[2].format = PIPE_FORMAT_R8_UINT; - img[2].u.buf.offset = tex->surface.display_dcc_offset; - img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size; - - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img); - - /* Bind the compute shader. */ - if (!sctx->cs_dcc_retile) - sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx); - ctx->bind_compute_state(ctx, sctx->cs_dcc_retile); - - /* Dispatch compute. */ - /* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */ - unsigned num_threads = num_elements / 4; - - struct pipe_grid_info info = {}; - info.block[0] = 64; - info.block[1] = 1; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */ - info.grid[1] = 1; - info.grid[2] = 1; - info.last_block[0] = num_threads % 64; - - si_launch_grid_internal(sctx, &info); - - /* Don't flush caches or wait. The driver will wait at the end of this IB, - * and L2 will be flushed by the kernel fence. - */ - - /* Restore states. */ - ctx->bind_compute_state(ctx, saved_cs); - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img); - - for (unsigned i = 0; i < 3; i++) { - pipe_resource_reference(&saved_img[i].resource, NULL); - } + struct pipe_context *ctx = &sctx->b; + + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | + si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) | + si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU); + sctx->emit_cache_flush(sctx); + + /* Save states. 
*/ + void *saved_cs = sctx->cs_shader_state.program; + struct pipe_image_view saved_img[3] = {}; + + for (unsigned i = 0; i < 3; i++) { + util_copy_image_view(&saved_img[i], &sctx->images[PIPE_SHADER_COMPUTE].views[i]); + } + + /* Set images. */ + bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16; + unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements; + struct pipe_image_view img[3]; + + assert(tex->surface.dcc_retile_map_offset && tex->surface.dcc_retile_map_offset <= UINT_MAX); + assert(tex->surface.dcc_offset && tex->surface.dcc_offset <= UINT_MAX); + assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX); + + for (unsigned i = 0; i < 3; i++) { + img[i].resource = &tex->buffer.b.b; + img[i].access = i == 2 ? PIPE_IMAGE_ACCESS_WRITE : PIPE_IMAGE_ACCESS_READ; + img[i].shader_access = SI_IMAGE_ACCESS_AS_BUFFER; + } + + img[0].format = use_uint16 ? PIPE_FORMAT_R16G16B16A16_UINT : PIPE_FORMAT_R32G32B32A32_UINT; + img[0].u.buf.offset = tex->surface.dcc_retile_map_offset; + img[0].u.buf.size = num_elements * (use_uint16 ? 2 : 4); + + img[1].format = PIPE_FORMAT_R8_UINT; + img[1].u.buf.offset = tex->surface.dcc_offset; + img[1].u.buf.size = tex->surface.dcc_size; + + img[2].format = PIPE_FORMAT_R8_UINT; + img[2].u.buf.offset = tex->surface.display_dcc_offset; + img[2].u.buf.size = tex->surface.u.gfx9.display_dcc_size; + + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, img); + + /* Bind the compute shader. */ + if (!sctx->cs_dcc_retile) + sctx->cs_dcc_retile = si_create_dcc_retile_cs(ctx); + ctx->bind_compute_state(ctx, sctx->cs_dcc_retile); + + /* Dispatch compute. */ + /* img[0] has 4 channels per element containing 2 pairs of DCC offsets. */ + unsigned num_threads = num_elements / 4; + + struct pipe_grid_info info = {}; + info.block[0] = 64; + info.block[1] = 1; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(num_threads, 64); /* includes the partial block */ + info.grid[1] = 1; + info.grid[2] = 1; + info.last_block[0] = num_threads % 64; + + si_launch_grid_internal(sctx, &info); + + /* Don't flush caches or wait. The driver will wait at the end of this IB, + * and L2 will be flushed by the kernel fence. + */ + + /* Restore states. */ + ctx->bind_compute_state(ctx, saved_cs); + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img); + + for (unsigned i = 0; i < 3; i++) { + pipe_resource_reference(&saved_img[i].resource, NULL); + } } /* Expand FMASK to make it identity, so that image stores can ignore it. */ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex) { - struct si_context *sctx = (struct si_context *)ctx; - bool is_array = tex->target == PIPE_TEXTURE_2D_ARRAY; - unsigned log_fragments = util_logbase2(tex->nr_storage_samples); - unsigned log_samples = util_logbase2(tex->nr_samples); - assert(tex->nr_samples >= 2); - - /* EQAA FMASK expansion is unimplemented. */ - if (tex->nr_samples != tex->nr_storage_samples) - return; - - /* Flush caches and sync engines. */ - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); - si_make_CB_shader_coherent(sctx, tex->nr_samples, true, - true /* DCC is not possible with image stores */); - - /* Save states. */ - void *saved_cs = sctx->cs_shader_state.program; - struct pipe_image_view saved_image = {0}; - util_copy_image_view(&saved_image, &sctx->images[PIPE_SHADER_COMPUTE].views[0]); - - /* Bind the image. 
*/ - struct pipe_image_view image = {0}; - image.resource = tex; - /* Don't set WRITE so as not to trigger FMASK expansion, causing - * an infinite loop. */ - image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ; - image.format = util_format_linear(tex->format); - if (is_array) - image.u.tex.last_layer = tex->array_size - 1; - - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image); - - /* Bind the shader. */ - void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array]; - if (!*shader) - *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array); - ctx->bind_compute_state(ctx, *shader); - - /* Dispatch compute. */ - struct pipe_grid_info info = {0}; - info.block[0] = 8; - info.last_block[0] = tex->width0 % 8; - info.block[1] = 8; - info.last_block[1] = tex->height0 % 8; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(tex->width0, 8); - info.grid[1] = DIV_ROUND_UP(tex->height0, 8); - info.grid[2] = is_array ? tex->array_size : 1; - - si_launch_grid_internal(sctx, &info); - - /* Flush caches and sync engines. */ - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); - - /* Restore previous states. */ - ctx->bind_compute_state(ctx, saved_cs); - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image); - pipe_resource_reference(&saved_image.resource, NULL); - - /* Array of fully expanded FMASK values, arranged by [log2(fragments)][log2(samples)-1]. */ + struct si_context *sctx = (struct si_context *)ctx; + bool is_array = tex->target == PIPE_TEXTURE_2D_ARRAY; + unsigned log_fragments = util_logbase2(tex->nr_storage_samples); + unsigned log_samples = util_logbase2(tex->nr_samples); + assert(tex->nr_samples >= 2); + + /* EQAA FMASK expansion is unimplemented. */ + if (tex->nr_samples != tex->nr_storage_samples) + return; + + /* Flush caches and sync engines. */ + sctx->flags |= + SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + si_make_CB_shader_coherent(sctx, tex->nr_samples, true, + true /* DCC is not possible with image stores */); + + /* Save states. */ + void *saved_cs = sctx->cs_shader_state.program; + struct pipe_image_view saved_image = {0}; + util_copy_image_view(&saved_image, &sctx->images[PIPE_SHADER_COMPUTE].views[0]); + + /* Bind the image. */ + struct pipe_image_view image = {0}; + image.resource = tex; + /* Don't set WRITE so as not to trigger FMASK expansion, causing + * an infinite loop. */ + image.shader_access = image.access = PIPE_IMAGE_ACCESS_READ; + image.format = util_format_linear(tex->format); + if (is_array) + image.u.tex.last_layer = tex->array_size - 1; + + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image); + + /* Bind the shader. */ + void **shader = &sctx->cs_fmask_expand[log_samples - 1][is_array]; + if (!*shader) + *shader = si_create_fmask_expand_cs(ctx, tex->nr_samples, is_array); + ctx->bind_compute_state(ctx, *shader); + + /* Dispatch compute. */ + struct pipe_grid_info info = {0}; + info.block[0] = 8; + info.last_block[0] = tex->width0 % 8; + info.block[1] = 8; + info.last_block[1] = tex->height0 % 8; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(tex->width0, 8); + info.grid[1] = DIV_ROUND_UP(tex->height0, 8); + info.grid[2] = is_array ? tex->array_size : 1; + + si_launch_grid_internal(sctx, &info); + + /* Flush caches and sync engines. */ + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? 
SI_CONTEXT_WB_L2 : 0) | + si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + + /* Restore previous states. */ + ctx->bind_compute_state(ctx, saved_cs); + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image); + pipe_resource_reference(&saved_image.resource, NULL); + + /* Array of fully expanded FMASK values, arranged by [log2(fragments)][log2(samples)-1]. */ #define INVALID 0 /* never used */ - static const uint64_t fmask_expand_values[][4] = { - /* samples */ - /* 2 (8 bpp) 4 (8 bpp) 8 (8-32bpp) 16 (16-64bpp) fragments */ - {0x02020202, 0x0E0E0E0E, 0xFEFEFEFE, 0xFFFEFFFE}, /* 1 */ - {0x02020202, 0xA4A4A4A4, 0xAAA4AAA4, 0xAAAAAAA4}, /* 2 */ - {INVALID, 0xE4E4E4E4, 0x44443210, 0x4444444444443210}, /* 4 */ - {INVALID, INVALID, 0x76543210, 0x8888888876543210}, /* 8 */ - }; - - /* Clear FMASK to identity. */ - struct si_texture *stex = (struct si_texture*)tex; - si_clear_buffer(sctx, tex, stex->surface.fmask_offset, stex->surface.fmask_size, - (uint32_t*)&fmask_expand_values[log_fragments][log_samples - 1], - 4, SI_COHERENCY_SHADER, false); + static const uint64_t fmask_expand_values[][4] = { + /* samples */ + /* 2 (8 bpp) 4 (8 bpp) 8 (8-32bpp) 16 (16-64bpp) fragments */ + {0x02020202, 0x0E0E0E0E, 0xFEFEFEFE, 0xFFFEFFFE}, /* 1 */ + {0x02020202, 0xA4A4A4A4, 0xAAA4AAA4, 0xAAAAAAA4}, /* 2 */ + {INVALID, 0xE4E4E4E4, 0x44443210, 0x4444444444443210}, /* 4 */ + {INVALID, INVALID, 0x76543210, 0x8888888876543210}, /* 8 */ + }; + + /* Clear FMASK to identity. */ + struct si_texture *stex = (struct si_texture *)tex; + si_clear_buffer(sctx, tex, stex->surface.fmask_offset, stex->surface.fmask_size, + (uint32_t *)&fmask_expand_values[log_fragments][log_samples - 1], 4, + SI_COHERENCY_SHADER, false); } void si_init_compute_blit_functions(struct si_context *sctx) { - sctx->b.clear_buffer = si_pipe_clear_buffer; + sctx->b.clear_buffer = si_pipe_clear_buffer; } /* Clear a region of a color surface to a constant value. */ -void si_compute_clear_render_target(struct pipe_context *ctx, - struct pipe_surface *dstsurf, - const union pipe_color_union *color, - unsigned dstx, unsigned dsty, - unsigned width, unsigned height, - bool render_condition_enabled) +void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf, + const union pipe_color_union *color, unsigned dstx, + unsigned dsty, unsigned width, unsigned height, + bool render_condition_enabled) { - struct si_context *sctx = (struct si_context *)ctx; - unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1; - unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0}; - - if (width == 0 || height == 0) - return; - - /* The driver doesn't decompress resources automatically here. 
*/ - si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA, - dstsurf->u.tex.level, dstsurf->u.tex.first_layer, - dstsurf->u.tex.last_layer); - - if (util_format_is_srgb(dstsurf->format)) { - union pipe_color_union color_srgb; - for (int i = 0; i < 3; i++) - color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]); - color_srgb.f[3] = color->f[3]; - memcpy(data + 4, color_srgb.ui, sizeof(color->ui)); - } else { - memcpy(data + 4, color->ui, sizeof(color->ui)); - } - - sctx->render_cond_force_off = !render_condition_enabled; - - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); - si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true, - true /* DCC is not possible with image stores */); - - struct pipe_constant_buffer saved_cb = {}; - si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); - - struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE]; - struct pipe_image_view saved_image = {0}; - util_copy_image_view(&saved_image, &images->views[0]); - - void *saved_cs = sctx->cs_shader_state.program; - - struct pipe_constant_buffer cb = {}; - cb.buffer_size = sizeof(data); - cb.user_buffer = data; - ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb); - - struct pipe_image_view image = {0}; - image.resource = dstsurf->texture; - image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE; - image.format = util_format_linear(dstsurf->format); - image.u.tex.level = dstsurf->u.tex.level; - image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */ - image.u.tex.last_layer = dstsurf->u.tex.last_layer; - - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image); - - struct pipe_grid_info info = {0}; - - if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) { - if (!sctx->cs_clear_render_target) - sctx->cs_clear_render_target = si_clear_render_target_shader(ctx); - ctx->bind_compute_state(ctx, sctx->cs_clear_render_target); - info.block[0] = 8; - info.last_block[0] = width % 8; - info.block[1] = 8; - info.last_block[1] = height % 8; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(width, 8); - info.grid[1] = DIV_ROUND_UP(height, 8); - info.grid[2] = num_layers; - } else { - if (!sctx->cs_clear_render_target_1d_array) - sctx->cs_clear_render_target_1d_array = - si_clear_render_target_shader_1d_array(ctx); - ctx->bind_compute_state(ctx, sctx->cs_clear_render_target_1d_array); - info.block[0] = 64; - info.last_block[0] = width % 64; - info.block[1] = 1; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(width, 64); - info.grid[1] = num_layers; - info.grid[2] = 1; - } - - si_launch_grid_internal(sctx, &info); - - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); - ctx->bind_compute_state(ctx, saved_cs); - ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image); - ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); - pipe_resource_reference(&saved_image.resource, NULL); - pipe_resource_reference(&saved_cb.buffer, NULL); + struct si_context *sctx = (struct si_context *)ctx; + unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1; + unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0}; + + if (width == 0 || height == 0) + return; + + /* The driver doesn't decompress resources automatically here. 
*/ + si_decompress_subresource(ctx, dstsurf->texture, PIPE_MASK_RGBA, dstsurf->u.tex.level, + dstsurf->u.tex.first_layer, dstsurf->u.tex.last_layer); + + if (util_format_is_srgb(dstsurf->format)) { + union pipe_color_union color_srgb; + for (int i = 0; i < 3; i++) + color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]); + color_srgb.f[3] = color->f[3]; + memcpy(data + 4, color_srgb.ui, sizeof(color->ui)); + } else { + memcpy(data + 4, color->ui, sizeof(color->ui)); + } + + sctx->render_cond_force_off = !render_condition_enabled; + + sctx->flags |= + SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true, + true /* DCC is not possible with image stores */); + + struct pipe_constant_buffer saved_cb = {}; + si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + + struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE]; + struct pipe_image_view saved_image = {0}; + util_copy_image_view(&saved_image, &images->views[0]); + + void *saved_cs = sctx->cs_shader_state.program; + + struct pipe_constant_buffer cb = {}; + cb.buffer_size = sizeof(data); + cb.user_buffer = data; + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb); + + struct pipe_image_view image = {0}; + image.resource = dstsurf->texture; + image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE; + image.format = util_format_linear(dstsurf->format); + image.u.tex.level = dstsurf->u.tex.level; + image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */ + image.u.tex.last_layer = dstsurf->u.tex.last_layer; + + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &image); + + struct pipe_grid_info info = {0}; + + if (dstsurf->texture->target != PIPE_TEXTURE_1D_ARRAY) { + if (!sctx->cs_clear_render_target) + sctx->cs_clear_render_target = si_clear_render_target_shader(ctx); + ctx->bind_compute_state(ctx, sctx->cs_clear_render_target); + info.block[0] = 8; + info.last_block[0] = width % 8; + info.block[1] = 8; + info.last_block[1] = height % 8; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(width, 8); + info.grid[1] = DIV_ROUND_UP(height, 8); + info.grid[2] = num_layers; + } else { + if (!sctx->cs_clear_render_target_1d_array) + sctx->cs_clear_render_target_1d_array = si_clear_render_target_shader_1d_array(ctx); + ctx->bind_compute_state(ctx, sctx->cs_clear_render_target_1d_array); + info.block[0] = 64; + info.last_block[0] = width % 64; + info.block[1] = 1; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(width, 64); + info.grid[1] = num_layers; + info.grid[2] = 1; + } + + si_launch_grid_internal(sctx, &info); + + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? 
SI_CONTEXT_WB_L2 : 0) | + si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + ctx->bind_compute_state(ctx, saved_cs); + ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image); + ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); + pipe_resource_reference(&saved_image.resource, NULL); + pipe_resource_reference(&saved_cb.buffer, NULL); } diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c index 7f985ad3c62..389233835eb 100644 --- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c +++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c @@ -23,16 +23,15 @@ * */ +#include "ac_llvm_cull.h" +#include "si_build_pm4.h" #include "si_pipe.h" #include "si_shader_internal.h" #include "sid.h" -#include "si_build_pm4.h" -#include "ac_llvm_cull.h" - +#include "util/fast_idiv_by_const.h" #include "util/u_prim.h" #include "util/u_suballoc.h" #include "util/u_upload_mgr.h" -#include "util/fast_idiv_by_const.h" /* Based on: * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf @@ -154,1453 +153,1354 @@ /* At least 256 is needed for the fastest wave launch rate from compute queues * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */ -#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */ -#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */ -#define MAX_WAVES_PER_SH 0 /* no limit */ -#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */ +#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */ +#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */ +#define MAX_WAVES_PER_SH 0 /* no limit */ +#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */ /* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */ -#define CULL_Z 0 +#define CULL_Z 0 /* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */ -#define VERTEX_COUNTER_GDS_MODE 2 -#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */ +#define VERTEX_COUNTER_GDS_MODE 2 +#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */ /* Grouping compute dispatches for small draw calls: How many primitives from multiple * draw calls to process by compute before signaling the gfx IB. This reduces the number * of EOP events + REWIND packets, because they decrease performance. */ -#define PRIMS_PER_BATCH (512 * 1024) +#define PRIMS_PER_BATCH (512 * 1024) /* Draw call splitting at the packet level. This allows signaling the gfx IB * for big draw calls sooner, but doesn't allow context flushes between packets. * Primitive restart is supported. Only implemented for ordered append. */ -#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH +#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH /* If there is not enough ring buffer space for the current IB, split draw calls into * this number of primitives, so that we can flush the context and get free ring space. */ -#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH +#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH /* Derived values. */ -#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64) -#define SPLIT_PRIMS_PACKET_LEVEL (VERTEX_COUNTER_GDS_MODE == 2 ? 
\ - SPLIT_PRIMS_PACKET_LEVEL_VALUE : \ - UINT_MAX & ~(THREADGROUP_SIZE - 1)) +#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64) +#define SPLIT_PRIMS_PACKET_LEVEL \ + (VERTEX_COUNTER_GDS_MODE == 2 ? SPLIT_PRIMS_PACKET_LEVEL_VALUE \ + : UINT_MAX & ~(THREADGROUP_SIZE - 1)) -#define REWIND_SIGNAL_BIT 0x80000000 +#define REWIND_SIGNAL_BIT 0x80000000 /* For emulating the rewind packet on CI. */ -#define FORCE_REWIND_EMULATION 0 +#define FORCE_REWIND_EMULATION 0 -void si_initialize_prim_discard_tunables(struct si_screen *sscreen, - bool is_aux_context, - unsigned *prim_discard_vertex_count_threshold, - unsigned *index_ring_size_per_ib) +void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context, + unsigned *prim_discard_vertex_count_threshold, + unsigned *index_ring_size_per_ib) { - *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ - - if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */ - !sscreen->info.has_gds_ordered_append || - sscreen->debug_flags & DBG(NO_PD) || - is_aux_context) - return; - - /* TODO: enable this after the GDS kernel memory management is fixed */ - bool enable_on_pro_graphics_by_default = false; - - if (sscreen->debug_flags & DBG(ALWAYS_PD) || - sscreen->debug_flags & DBG(PD) || - (enable_on_pro_graphics_by_default && - sscreen->info.is_pro_graphics && - (sscreen->info.family == CHIP_BONAIRE || - sscreen->info.family == CHIP_HAWAII || - sscreen->info.family == CHIP_TONGA || - sscreen->info.family == CHIP_FIJI || - sscreen->info.family == CHIP_POLARIS10 || - sscreen->info.family == CHIP_POLARIS11 || - sscreen->info.family == CHIP_VEGA10 || - sscreen->info.family == CHIP_VEGA20))) { - *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ - - if (sscreen->debug_flags & DBG(ALWAYS_PD)) - *prim_discard_vertex_count_threshold = 0; /* always enable */ - - const uint32_t MB = 1024 * 1024; - const uint64_t GB = 1024 * 1024 * 1024; - - /* The total size is double this per context. - * Greater numbers allow bigger gfx IBs. - */ - if (sscreen->info.vram_size <= 2 * GB) - *index_ring_size_per_ib = 64 * MB; - else if (sscreen->info.vram_size <= 4 * GB) - *index_ring_size_per_ib = 128 * MB; - else - *index_ring_size_per_ib = 256 * MB; - } + *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ + + if (sscreen->info.chip_class == GFX6 || /* SI support is not implemented */ + !sscreen->info.has_gds_ordered_append || sscreen->debug_flags & DBG(NO_PD) || is_aux_context) + return; + + /* TODO: enable this after the GDS kernel memory management is fixed */ + bool enable_on_pro_graphics_by_default = false; + + if (sscreen->debug_flags & DBG(ALWAYS_PD) || sscreen->debug_flags & DBG(PD) || + (enable_on_pro_graphics_by_default && sscreen->info.is_pro_graphics && + (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_HAWAII || + sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI || + sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 || + sscreen->info.family == CHIP_VEGA10 || sscreen->info.family == CHIP_VEGA20))) { + *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ + + if (sscreen->debug_flags & DBG(ALWAYS_PD)) + *prim_discard_vertex_count_threshold = 0; /* always enable */ + + const uint32_t MB = 1024 * 1024; + const uint64_t GB = 1024 * 1024 * 1024; + + /* The total size is double this per context. + * Greater numbers allow bigger gfx IBs. 
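 * For example, with the sizes chosen below a board with 4 GB of VRAM gets
 * 128 MB of index ring per IB, i.e. 256 MB per context in total.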
+ */ + if (sscreen->info.vram_size <= 2 * GB) + *index_ring_size_per_ib = 64 * MB; + else if (sscreen->info.vram_size <= 4 * GB) + *index_ring_size_per_ib = 128 * MB; + else + *index_ring_size_per_ib = 256 * MB; + } } /* Opcode can be "add" or "swap". */ -static LLVMValueRef -si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode, - LLVMValueRef m0, LLVMValueRef value, unsigned ordered_count_index, - bool release, bool done) +static LLVMValueRef si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode, + LLVMValueRef m0, LLVMValueRef value, + unsigned ordered_count_index, bool release, bool done) { - if (ctx->screen->info.chip_class >= GFX10) - ordered_count_index |= 1 << 24; /* number of dwords == 1 */ - - LLVMValueRef args[] = { - LLVMBuildIntToPtr(ctx->ac.builder, m0, - LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""), - value, - LLVMConstInt(ctx->ac.i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */ - ctx->ac.i32_0, /* scope */ - ctx->ac.i1false, /* volatile */ - LLVMConstInt(ctx->ac.i32, ordered_count_index, 0), - LLVMConstInt(ctx->ac.i1, release, 0), - LLVMConstInt(ctx->ac.i1, done, 0), - }; - - char intrinsic[64]; - snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode); - return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->ac.i32, args, ARRAY_SIZE(args), 0); + if (ctx->screen->info.chip_class >= GFX10) + ordered_count_index |= 1 << 24; /* number of dwords == 1 */ + + LLVMValueRef args[] = { + LLVMBuildIntToPtr(ctx->ac.builder, m0, LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""), + value, + LLVMConstInt(ctx->ac.i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */ + ctx->ac.i32_0, /* scope */ + ctx->ac.i1false, /* volatile */ + LLVMConstInt(ctx->ac.i32, ordered_count_index, 0), + LLVMConstInt(ctx->ac.i1, release, 0), + LLVMConstInt(ctx->ac.i1, done, 0), + }; + + char intrinsic[64]; + snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode); + return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->ac.i32, args, ARRAY_SIZE(args), 0); } static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr) { - uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; - ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, ""); - ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), ""); - return LLVMBuildIntToPtr(ctx->ac.builder, ptr, - LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), ""); + uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; + ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, ""); + ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), ""); + return LLVMBuildIntToPtr(ctx->ac.builder, ptr, + LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), ""); } struct si_thread0_section { - struct si_shader_context *ctx; - LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */ - LLVMValueRef saved_exec; + struct si_shader_context *ctx; + LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */ + LLVMValueRef saved_exec; }; /* Enter a section that only executes on thread 0. 
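 *
 * The callers below use the pair of helpers roughly like this:
 *
 *    struct si_thread0_section section;
 *    si_enter_thread0_section(ctx, &section, thread_id);
 *    {
 *       value = ...;   // only lane 0 executes this
 *    }
 *    si_exit_thread0_section(&section, &value);   // readlane-broadcast to all lanes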
*/ static void si_enter_thread0_section(struct si_shader_context *ctx, - struct si_thread0_section *section, - LLVMValueRef thread_id) + struct si_thread0_section *section, LLVMValueRef thread_id) { - section->ctx = ctx; - section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0"); - - /* This IF has 4 instructions: - * v_and_b32_e32 v, 63, v ; get the thread ID - * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0 - * s_and_saveexec_b64 s, vcc - * s_cbranch_execz BB0_4 - * - * It could just be s_and_saveexec_b64 s, 1. - */ - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, - ctx->ac.i32_0, ""), 12601); + section->ctx = ctx; + section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0"); + + /* This IF has 4 instructions: + * v_and_b32_e32 v, 63, v ; get the thread ID + * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0 + * s_and_saveexec_b64 s, vcc + * s_cbranch_execz BB0_4 + * + * It could just be s_and_saveexec_b64 s, 1. + */ + ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, ctx->ac.i32_0, ""), + 12601); } /* Exit a section that only executes on thread 0 and broadcast the result * to all threads. */ -static void si_exit_thread0_section(struct si_thread0_section *section, - LLVMValueRef *result) +static void si_exit_thread0_section(struct si_thread0_section *section, LLVMValueRef *result) { - struct si_shader_context *ctx = section->ctx; + struct si_shader_context *ctx = section->ctx; - LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result); + LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result); - ac_build_endif(&ctx->ac, 12601); + ac_build_endif(&ctx->ac, 12601); - /* Broadcast the result from thread 0 to all threads. */ - *result = ac_build_readlane(&ctx->ac, - LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL); + /* Broadcast the result from thread 0 to all threads. */ + *result = + ac_build_readlane(&ctx->ac, LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL); } void si_build_prim_discard_compute_shader(struct si_shader_context *ctx) { - struct si_shader_key *key = &ctx->shader->key; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef vs = ctx->main_fn; - - /* Always inline the VS function. 
*/ - ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE); - LLVMSetLinkage(vs, LLVMPrivateLinkage); - - enum ac_arg_type const_desc_type; - if (ctx->shader->selector->info.const_buffers_declared == 1 && - ctx->shader->selector->info.shader_buffers_declared == 0) - const_desc_type = AC_ARG_CONST_FLOAT_PTR; - else - const_desc_type = AC_ARG_CONST_DESC_PTR; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - struct ac_arg param_index_buffers_and_constants, param_vertex_counter; - struct ac_arg param_vb_desc, param_const_desc; - struct ac_arg param_base_vertex, param_start_instance; - struct ac_arg param_block_id, param_local_id, param_ordered_wave_id; - struct ac_arg param_restart_index, param_smallprim_precision; - struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms; - struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr; - - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, - ¶m_index_buffers_and_constants); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_counter); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_last_wave_prim_id); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_count_addr); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, - ¶m_vb_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, - ¶m_const_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, - ¶m_sampler_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_base_vertex); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_instance); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_multiplier); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_terms); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_restart_index); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, ¶m_smallprim_precision); - - /* Block ID and thread ID inputs. */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_block_id); - if (VERTEX_COUNTER_GDS_MODE == 2) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_ordered_wave_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, ¶m_local_id); - - /* Create the compute shader function. */ - unsigned old_type = ctx->type; - ctx->type = PIPE_SHADER_COMPUTE; - si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE); - ctx->type = old_type; - - if (VERTEX_COUNTER_GDS_MODE == 2) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, - "amdgpu-gds-size", 256); - } else if (VERTEX_COUNTER_GDS_MODE == 1) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", - GDS_SIZE_UNORDERED); - } - - /* Assemble parameters for VS. 
*/ - LLVMValueRef vs_params[16]; - unsigned num_vs_params = 0; - unsigned param_vertex_id, param_instance_id; - - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */ - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */ - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc); - vs_params[num_vs_params++] = LLVMConstInt(ctx->ac.i32, - S_VS_STATE_INDEXED(key->opt.cs_indexed), 0); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance); - vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */ - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc); - - vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */ - vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */ - vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */ - vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */ - - assert(num_vs_params <= ARRAY_SIZE(vs_params)); - assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs)))); - - /* Load descriptors. (load 8 dwords at once) */ - LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8]; - - LLVMValueRef index_buffers_and_constants = ac_get_arg(&ctx->ac, param_index_buffers_and_constants); - tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants, - ac_array_in_const32_addr_space(ctx->ac.v8i32), ""); - tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0); - - for (unsigned i = 0; i < 8; i++) - desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i); - - input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4); - output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4); - - /* Compute PrimID and InstanceID. */ - LLVMValueRef global_thread_id = - ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id), - LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0), - ac_get_arg(&ctx->ac, param_local_id)); - LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */ - LLVMValueRef instance_id = ctx->ac.i32_0; - - if (key->opt.cs_instancing) { - LLVMValueRef num_prims_udiv_terms = - ac_get_arg(&ctx->ac, param_num_prims_udiv_terms); - LLVMValueRef num_prims_udiv_multiplier = - ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier); - /* Unpack num_prims_udiv_terms. */ - LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms, - LLVMConstInt(ctx->ac.i32, 0x1f, 0), ""); - LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms, - LLVMConstInt(ctx->ac.i32, 5, 0), ""); - /* Divide the total prim_id by the number of prims per instance. */ - instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, - num_prims_udiv_multiplier, - post_shift); - /* Compute the remainder. */ - prim_id = LLVMBuildSub(builder, prim_id, - LLVMBuildMul(builder, instance_id, - prims_per_instance, ""), ""); - } - - /* Generate indices (like a non-indexed draw call). 
*/ - LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)}; - unsigned vertices_per_prim = 3; - - switch (key->opt.cs_prim_type) { - case PIPE_PRIM_TRIANGLES: - for (unsigned i = 0; i < 3; i++) { - index[i] = ac_build_imad(&ctx->ac, prim_id, - LLVMConstInt(ctx->ac.i32, 3, 0), - LLVMConstInt(ctx->ac.i32, i, 0)); - } - break; - case PIPE_PRIM_TRIANGLE_STRIP: - for (unsigned i = 0; i < 3; i++) { - index[i] = LLVMBuildAdd(builder, prim_id, - LLVMConstInt(ctx->ac.i32, i, 0), ""); - } - break; - case PIPE_PRIM_TRIANGLE_FAN: - /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper - * and rasterizer as a normal triangle, so we need to put the provoking - * vertex into the correct index variable and preserve orientation at the same time. - * gl_VertexID is preserved, because it's equal to the index. - */ - if (key->opt.cs_provoking_vertex_first) { - index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), ""); - index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), ""); - index[2] = ctx->ac.i32_0; - } else { - index[0] = ctx->ac.i32_0; - index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), ""); - index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), ""); - } - break; - default: - unreachable("unexpected primitive type"); - } - - /* Fetch indices. */ - if (key->opt.cs_indexed) { - for (unsigned i = 0; i < 3; i++) { - index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, - index[i], ctx->ac.i32_0, 1, - 0, true); - index[i] = ac_to_integer(&ctx->ac, index[i]); - } - } - - LLVMValueRef ordered_wave_id = NULL; - - /* Extract the ordered wave ID. */ - if (VERTEX_COUNTER_GDS_MODE == 2) { - ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id); - ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id, - LLVMConstInt(ctx->ac.i32, 6, 0), ""); - ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id, - LLVMConstInt(ctx->ac.i32, 0xfff, 0), ""); - } - LLVMValueRef thread_id = - LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id), - LLVMConstInt(ctx->ac.i32, 63, 0), ""); - - /* Every other triangle in a strip has a reversed vertex order, so we - * need to swap vertices of odd primitives to get the correct primitive - * orientation when converting triangle strips to triangles. Primitive - * restart complicates it, because a strip can start anywhere. - */ - LLVMValueRef prim_restart_accepted = ctx->ac.i1true; - LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter); - - if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) { - /* Without primitive restart, odd primitives have reversed orientation. - * Only primitive restart can flip it with respect to the first vertex - * of the draw call. - */ - LLVMValueRef first_is_odd = ctx->ac.i1false; - - /* Handle primitive restart. */ - if (key->opt.cs_primitive_restart) { - /* Get the GDS primitive restart continue flag and clear - * the flag in vertex_counter. This flag is used when the draw - * call was split and we need to load the primitive orientation - * flag from GDS for the first wave too. 
- */ - LLVMValueRef gds_prim_restart_continue = - LLVMBuildLShr(builder, vertex_counter, - LLVMConstInt(ctx->ac.i32, 31, 0), ""); - gds_prim_restart_continue = - LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, ""); - vertex_counter = LLVMBuildAnd(builder, vertex_counter, - LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), ""); - - LLVMValueRef index0_is_reset; - - for (unsigned i = 0; i < 3; i++) { - LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i], - ac_get_arg(&ctx->ac, param_restart_index), - ""); - if (i == 0) - index0_is_reset = LLVMBuildNot(builder, not_reset, ""); - prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, - not_reset, ""); - } - - /* If the previous waves flip the primitive orientation - * of the current triangle strip, it will be stored in GDS. - * - * Sometimes the correct orientation is not needed, in which case - * we don't need to execute this. - */ - if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) { - /* If there are reset indices in this wave, get the thread index - * where the most recent strip starts relative to each thread. - */ - LLVMValueRef preceding_threads_mask = - LLVMBuildSub(builder, - LLVMBuildShl(builder, ctx->ac.i64_1, - LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""), - ctx->ac.i64_1, ""); - - LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset); - LLVMValueRef preceding_reset_threadmask = - LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, ""); - LLVMValueRef strip_start = - ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL); - strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, ""); - - /* This flips the orientatino based on reset indices within this wave only. */ - first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, ""); - - LLVMValueRef last_strip_start, prev_wave_state, ret, tmp; - LLVMValueRef is_first_wave, current_wave_resets_index; - - /* Get the thread index where the last strip starts in this wave. - * - * If the last strip doesn't start in this wave, the thread index - * will be 0. - * - * If the last strip starts in the next wave, the thread index will - * be 64. - */ - last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL); - last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, ""); - - struct si_thread0_section section; - si_enter_thread0_section(ctx, §ion, thread_id); - - /* This must be done in the thread 0 section, because - * we expect PrimID to be 0 for the whole first wave - * in this expression. - * - * NOTE: This will need to be different if we wanna support - * instancing with primitive restart. - */ - is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, ""); - is_first_wave = LLVMBuildAnd(builder, is_first_wave, - LLVMBuildNot(builder, - gds_prim_restart_continue, ""), ""); - current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE, - last_strip_start, ctx->ac.i32_0, ""); - - ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state"); - - /* Save the last strip start primitive index in GDS and read - * the value that previous waves stored. - * - * if (is_first_wave || current_wave_resets_strip) - * // Read the value that previous waves stored and store a new one. - * first_is_odd = ds.ordered.swap(last_strip_start); - * else - * // Just read the value that previous waves stored. 
- * first_is_odd = ds.ordered.add(0); - */ - ac_build_ifcc(&ctx->ac, - LLVMBuildOr(builder, is_first_wave, - current_wave_resets_index, ""), 12602); - { - /* The GDS address is always 0 with ordered append. */ - tmp = si_build_ds_ordered_op(ctx, "swap", - ordered_wave_id, last_strip_start, - 1, true, false); - LLVMBuildStore(builder, tmp, ret); - } - ac_build_else(&ctx->ac, 12603); - { - /* Just read the value from GDS. */ - tmp = si_build_ds_ordered_op(ctx, "add", - ordered_wave_id, ctx->ac.i32_0, - 1, true, false); - LLVMBuildStore(builder, tmp, ret); - } - ac_build_endif(&ctx->ac, 12602); - - prev_wave_state = LLVMBuildLoad(builder, ret, ""); - /* Ignore the return value if this is the first wave. */ - prev_wave_state = LLVMBuildSelect(builder, is_first_wave, - ctx->ac.i32_0, prev_wave_state, ""); - si_exit_thread0_section(§ion, &prev_wave_state); - prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, ""); - - /* If the strip start appears to be on thread 0 for the current primitive - * (meaning the reset index is not present in this wave and might have - * appeared in previous waves), use the value from GDS to determine - * primitive orientation. - * - * If the strip start is in this wave for the current primitive, use - * the value from the current wave to determine primitive orientation. - */ - LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ, - strip_start, ctx->ac.i32_0, ""); - first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, - first_is_odd, ""); - } - } - /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */ - LLVMValueRef prim_is_odd = - LLVMBuildXor(builder, first_is_odd, - LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), ""); - - /* Convert triangle strip indices to triangle indices. */ - ac_build_triangle_strip_indices_to_triangle(&ctx->ac, prim_is_odd, - LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0), - index); - } - - /* Execute the vertex shader for each vertex to get vertex positions. */ - LLVMValueRef pos[3][4]; - for (unsigned i = 0; i < vertices_per_prim; i++) { - vs_params[param_vertex_id] = index[i]; - vs_params[param_instance_id] = instance_id; - - LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params); - for (unsigned chan = 0; chan < 4; chan++) - pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, ""); - } - - /* Divide XYZ by W. */ - for (unsigned i = 0; i < vertices_per_prim; i++) { - for (unsigned chan = 0; chan < 3; chan++) - pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]); - } - - /* Load the viewport state. */ - LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants, - LLVMConstInt(ctx->ac.i32, 2, 0)); - vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, ""); - LLVMValueRef vp_scale[2], vp_translate[2]; - vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); - vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); - vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); - vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); - - /* Do culling. 
*/ - struct ac_cull_options options = {}; - options.cull_front = key->opt.cs_cull_front; - options.cull_back = key->opt.cs_cull_back; - options.cull_view_xy = true; - options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z; - options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z; - options.cull_small_prims = true; - options.cull_zero_area = true; - options.cull_w = true; - options.use_halfz_clip_space = key->opt.cs_halfz_clip_space; - - LLVMValueRef accepted = - ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, - vp_scale, vp_translate, - ac_get_arg(&ctx->ac, param_smallprim_precision), - &options); - - ac_build_optimization_barrier(&ctx->ac, &accepted); - LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); - - /* Count the number of active threads by doing bitcount(accepted). */ - LLVMValueRef num_prims_accepted = - ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->ac.i64, - &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE); - num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, ""); - - LLVMValueRef start; - - /* Execute atomic_add on the vertex count. */ - struct si_thread0_section section; - si_enter_thread0_section(ctx, §ion, thread_id); - { - if (VERTEX_COUNTER_GDS_MODE == 0) { - LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted, - LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter); - start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, - vertex_counter, num_indices, - LLVMAtomicOrderingMonotonic, false); - } else if (VERTEX_COUNTER_GDS_MODE == 1) { - LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted, - LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter, - LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""); - start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, - vertex_counter, num_indices, - LLVMAtomicOrderingMonotonic, false); - } else if (VERTEX_COUNTER_GDS_MODE == 2) { - LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - - /* If the draw call was split into multiple subdraws, each using - * a separate draw packet, we need to start counting from 0 for - * the first compute wave of the subdraw. - * - * vertex_counter contains the primitive ID of the first thread - * in the first wave. - * - * This is only correct with VERTEX_COUNTER_GDS_MODE == 2: - */ - LLVMValueRef is_first_wave = - LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, - vertex_counter, ""); - - /* Store the primitive count for ordered append, not vertex count. - * The idea is to avoid GDS initialization via CP DMA. The shader - * effectively stores the first count using "swap". - * - * if (first_wave) { - * ds.ordered.swap(num_prims_accepted); // store the first primitive count - * previous = 0; - * } else { - * previous = ds.ordered.add(num_prims_accepted) // add the primitive count - * } - */ - ac_build_ifcc(&ctx->ac, is_first_wave, 12604); - { - /* The GDS address is always 0 with ordered append. 
*/ - si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, - num_prims_accepted, 0, true, true); - LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store); - } - ac_build_else(&ctx->ac, 12605); - { - LLVMBuildStore(builder, - si_build_ds_ordered_op(ctx, "add", ordered_wave_id, - num_prims_accepted, 0, - true, true), - tmp_store); - } - ac_build_endif(&ctx->ac, 12604); - - start = LLVMBuildLoad(builder, tmp_store, ""); - } - } - si_exit_thread0_section(§ion, &start); - - /* Write the final vertex count to memory. An EOS/EOP event could do this, - * but those events are super slow and should be avoided if performance - * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE - * event like this. - */ - if (VERTEX_COUNTER_GDS_MODE == 2) { - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, - ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""), - 12606); - LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, ""); - count = LLVMBuildMul(builder, count, - LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - - /* GFX8 needs to disable caching, so that the CP can see the stored value. - * MTYPE=3 bypasses TC L2. - */ - if (ctx->screen->info.chip_class <= GFX8) { - LLVMValueRef desc[] = { - ac_get_arg(&ctx->ac, param_vertex_count_addr), - LLVMConstInt(ctx->ac.i32, - S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0), - LLVMConstInt(ctx->ac.i32, 4, 0), - LLVMConstInt(ctx->ac.i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | - S_008F0C_MTYPE(3 /* uncached */), 0), - }; - LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4); - ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0, - ctx->ac.i32_0, 0, ac_glc | ac_slc); - } else { - LLVMBuildStore(builder, count, - si_expand_32bit_pointer(ctx, - ac_get_arg(&ctx->ac, - param_vertex_count_addr))); - } - ac_build_endif(&ctx->ac, 12606); - } else { - /* For unordered modes that increment a vertex count instead of - * primitive count, convert it into the primitive index. - */ - start = LLVMBuildUDiv(builder, start, - LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - } - - /* Now we need to store the indices of accepted primitives into - * the output index buffer. - */ - ac_build_ifcc(&ctx->ac, accepted, 16607); - { - /* Get the number of bits set before the index of this thread. */ - LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); - - /* We have lowered instancing. Pack the instance ID into vertex ID. */ - if (key->opt.cs_instancing) { - instance_id = LLVMBuildShl(builder, instance_id, - LLVMConstInt(ctx->ac.i32, 16, 0), ""); - - for (unsigned i = 0; i < vertices_per_prim; i++) - index[i] = LLVMBuildOr(builder, index[i], instance_id, ""); - } - - if (VERTEX_COUNTER_GDS_MODE == 2) { - /* vertex_counter contains the first primitive ID - * for this dispatch. If the draw call was split into - * multiple subdraws, the first primitive ID is > 0 - * for subsequent subdraws. Each subdraw uses a different - * portion of the output index buffer. Offset the store - * vindex by the first primitive ID to get the correct - * store address for the subdraw. - */ - start = LLVMBuildAdd(builder, start, vertex_counter, ""); - } - - /* Write indices for accepted primitives. 
*/ - LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, ""); - LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3); - - if (!ac_has_vec3_support(ctx->ac.chip_class, true)) - vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3); - - ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, - vindex, ctx->ac.i32_0, 3, - ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0)); - } - ac_build_endif(&ctx->ac, 16607); - - LLVMBuildRetVoid(builder); + struct si_shader_key *key = &ctx->shader->key; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef vs = ctx->main_fn; + + /* Always inline the VS function. */ + ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE); + LLVMSetLinkage(vs, LLVMPrivateLinkage); + + enum ac_arg_type const_desc_type; + if (ctx->shader->selector->info.const_buffers_declared == 1 && + ctx->shader->selector->info.shader_buffers_declared == 0) + const_desc_type = AC_ARG_CONST_FLOAT_PTR; + else + const_desc_type = AC_ARG_CONST_DESC_PTR; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + struct ac_arg param_index_buffers_and_constants, param_vertex_counter; + struct ac_arg param_vb_desc, param_const_desc; + struct ac_arg param_base_vertex, param_start_instance; + struct ac_arg param_block_id, param_local_id, param_ordered_wave_id; + struct ac_arg param_restart_index, param_smallprim_precision; + struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms; + struct ac_arg param_sampler_desc, param_last_wave_prim_id, param_vertex_count_addr; + + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, + ¶m_index_buffers_and_constants); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_counter); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_last_wave_prim_id); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_count_addr); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, ¶m_vb_desc); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, ¶m_const_desc); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, ¶m_sampler_desc); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_base_vertex); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_instance); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_multiplier); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_terms); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_restart_index); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, ¶m_smallprim_precision); + + /* Block ID and thread ID inputs. */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_block_id); + if (VERTEX_COUNTER_GDS_MODE == 2) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_ordered_wave_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, ¶m_local_id); + + /* Create the compute shader function. */ + unsigned old_type = ctx->type; + ctx->type = PIPE_SHADER_COMPUTE; + si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE); + ctx->type = old_type; + + if (VERTEX_COUNTER_GDS_MODE == 2) { + ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256); + } else if (VERTEX_COUNTER_GDS_MODE == 1) { + ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", GDS_SIZE_UNORDERED); + } + + /* Assemble parameters for VS. 
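 * The VertexID and InstanceID slots are left NULL here and filled in just
 * before the per-vertex call further down:
 *
 *    vs_params[param_vertex_id] = index[i];
 *    vs_params[param_instance_id] = instance_id;
 *    ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params);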
*/ + LLVMValueRef vs_params[16]; + unsigned num_vs_params = 0; + unsigned param_vertex_id, param_instance_id; + + vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */ + vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */ + vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc); + vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc); + vs_params[num_vs_params++] = + LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0); + vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex); + vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance); + vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */ + vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc); + + vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */ + vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */ + vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */ + vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */ + + assert(num_vs_params <= ARRAY_SIZE(vs_params)); + assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs)))); + + /* Load descriptors. (load 8 dwords at once) */ + LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8]; + + LLVMValueRef index_buffers_and_constants = + ac_get_arg(&ctx->ac, param_index_buffers_and_constants); + tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants, + ac_array_in_const32_addr_space(ctx->ac.v8i32), ""); + tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0); + + for (unsigned i = 0; i < 8; i++) + desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i); + + input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4); + output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4); + + /* Compute PrimID and InstanceID. */ + LLVMValueRef global_thread_id = ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id), + LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0), + ac_get_arg(&ctx->ac, param_local_id)); + LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */ + LLVMValueRef instance_id = ctx->ac.i32_0; + + if (key->opt.cs_instancing) { + LLVMValueRef num_prims_udiv_terms = ac_get_arg(&ctx->ac, param_num_prims_udiv_terms); + LLVMValueRef num_prims_udiv_multiplier = + ac_get_arg(&ctx->ac, param_num_prims_udiv_multiplier); + /* Unpack num_prims_udiv_terms. */ + LLVMValueRef post_shift = + LLVMBuildAnd(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 0x1f, 0), ""); + LLVMValueRef prims_per_instance = + LLVMBuildLShr(builder, num_prims_udiv_terms, LLVMConstInt(ctx->ac.i32, 5, 0), ""); + /* Divide the total prim_id by the number of prims per instance. */ + instance_id = + ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, num_prims_udiv_multiplier, post_shift); + /* Compute the remainder. */ + prim_id = LLVMBuildSub(builder, prim_id, + LLVMBuildMul(builder, instance_id, prims_per_instance, ""), ""); + } + + /* Generate indices (like a non-indexed draw call). 
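 * For PIPE_PRIM_TRIANGLES, prim_id N produces {3N, 3N+1, 3N+2}; for
 * TRIANGLE_STRIP it produces {N, N+1, N+2}; for TRIANGLE_FAN it produces
 * {0, N+1, N+2}, or {N+1, N+2, 0} when the provoking vertex is first.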
*/ + LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)}; + unsigned vertices_per_prim = 3; + + switch (key->opt.cs_prim_type) { + case PIPE_PRIM_TRIANGLES: + for (unsigned i = 0; i < 3; i++) { + index[i] = ac_build_imad(&ctx->ac, prim_id, LLVMConstInt(ctx->ac.i32, 3, 0), + LLVMConstInt(ctx->ac.i32, i, 0)); + } + break; + case PIPE_PRIM_TRIANGLE_STRIP: + for (unsigned i = 0; i < 3; i++) { + index[i] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, i, 0), ""); + } + break; + case PIPE_PRIM_TRIANGLE_FAN: + /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper + * and rasterizer as a normal triangle, so we need to put the provoking + * vertex into the correct index variable and preserve orientation at the same time. + * gl_VertexID is preserved, because it's equal to the index. + */ + if (key->opt.cs_provoking_vertex_first) { + index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), ""); + index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), ""); + index[2] = ctx->ac.i32_0; + } else { + index[0] = ctx->ac.i32_0; + index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 1, 0), ""); + index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, 2, 0), ""); + } + break; + default: + unreachable("unexpected primitive type"); + } + + /* Fetch indices. */ + if (key->opt.cs_indexed) { + for (unsigned i = 0; i < 3; i++) { + index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0, + 1, 0, true); + index[i] = ac_to_integer(&ctx->ac, index[i]); + } + } + + LLVMValueRef ordered_wave_id = NULL; + + /* Extract the ordered wave ID. */ + if (VERTEX_COUNTER_GDS_MODE == 2) { + ordered_wave_id = ac_get_arg(&ctx->ac, param_ordered_wave_id); + ordered_wave_id = + LLVMBuildLShr(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 6, 0), ""); + ordered_wave_id = + LLVMBuildAnd(builder, ordered_wave_id, LLVMConstInt(ctx->ac.i32, 0xfff, 0), ""); + } + LLVMValueRef thread_id = LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id), + LLVMConstInt(ctx->ac.i32, 63, 0), ""); + + /* Every other triangle in a strip has a reversed vertex order, so we + * need to swap vertices of odd primitives to get the correct primitive + * orientation when converting triangle strips to triangles. Primitive + * restart complicates it, because a strip can start anywhere. + */ + LLVMValueRef prim_restart_accepted = ctx->ac.i1true; + LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter); + + if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) { + /* Without primitive restart, odd primitives have reversed orientation. + * Only primitive restart can flip it with respect to the first vertex + * of the draw call. + */ + LLVMValueRef first_is_odd = ctx->ac.i1false; + + /* Handle primitive restart. */ + if (key->opt.cs_primitive_restart) { + /* Get the GDS primitive restart continue flag and clear + * the flag in vertex_counter. This flag is used when the draw + * call was split and we need to load the primitive orientation + * flag from GDS for the first wave too. 
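 * The continue flag is packed into bit 31 of vertex_counter and the counter
 * itself into the low 31 bits, hence the shift by 31 and the mask with
 * 0x7fffffff below.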
+ */ + LLVMValueRef gds_prim_restart_continue = + LLVMBuildLShr(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 31, 0), ""); + gds_prim_restart_continue = + LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->ac.i1, ""); + vertex_counter = + LLVMBuildAnd(builder, vertex_counter, LLVMConstInt(ctx->ac.i32, 0x7fffffff, 0), ""); + + LLVMValueRef index0_is_reset; + + for (unsigned i = 0; i < 3; i++) { + LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i], + ac_get_arg(&ctx->ac, param_restart_index), ""); + if (i == 0) + index0_is_reset = LLVMBuildNot(builder, not_reset, ""); + prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, not_reset, ""); + } + + /* If the previous waves flip the primitive orientation + * of the current triangle strip, it will be stored in GDS. + * + * Sometimes the correct orientation is not needed, in which case + * we don't need to execute this. + */ + if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) { + /* If there are reset indices in this wave, get the thread index + * where the most recent strip starts relative to each thread. + */ + LLVMValueRef preceding_threads_mask = + LLVMBuildSub(builder, + LLVMBuildShl(builder, ctx->ac.i64_1, + LLVMBuildZExt(builder, thread_id, ctx->ac.i64, ""), ""), + ctx->ac.i64_1, ""); + + LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset); + LLVMValueRef preceding_reset_threadmask = + LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, ""); + LLVMValueRef strip_start = ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL); + strip_start = LLVMBuildAdd(builder, strip_start, ctx->ac.i32_1, ""); + + /* This flips the orientatino based on reset indices within this wave only. */ + first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->ac.i1, ""); + + LLVMValueRef last_strip_start, prev_wave_state, ret, tmp; + LLVMValueRef is_first_wave, current_wave_resets_index; + + /* Get the thread index where the last strip starts in this wave. + * + * If the last strip doesn't start in this wave, the thread index + * will be 0. + * + * If the last strip starts in the next wave, the thread index will + * be 64. + */ + last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL); + last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->ac.i32_1, ""); + + struct si_thread0_section section; + si_enter_thread0_section(ctx, §ion, thread_id); + + /* This must be done in the thread 0 section, because + * we expect PrimID to be 0 for the whole first wave + * in this expression. + * + * NOTE: This will need to be different if we wanna support + * instancing with primitive restart. + */ + is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->ac.i32_0, ""); + is_first_wave = LLVMBuildAnd(builder, is_first_wave, + LLVMBuildNot(builder, gds_prim_restart_continue, ""), ""); + current_wave_resets_index = + LLVMBuildICmp(builder, LLVMIntNE, last_strip_start, ctx->ac.i32_0, ""); + + ret = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "prev_state"); + + /* Save the last strip start primitive index in GDS and read + * the value that previous waves stored. + * + * if (is_first_wave || current_wave_resets_strip) + * // Read the value that previous waves stored and store a new one. + * first_is_odd = ds.ordered.swap(last_strip_start); + * else + * // Just read the value that previous waves stored. 
+ * first_is_odd = ds.ordered.add(0); + */ + ac_build_ifcc( + &ctx->ac, LLVMBuildOr(builder, is_first_wave, current_wave_resets_index, ""), 12602); + { + /* The GDS address is always 0 with ordered append. */ + tmp = si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, last_strip_start, 1, true, + false); + LLVMBuildStore(builder, tmp, ret); + } + ac_build_else(&ctx->ac, 12603); + { + /* Just read the value from GDS. */ + tmp = si_build_ds_ordered_op(ctx, "add", ordered_wave_id, ctx->ac.i32_0, 1, true, + false); + LLVMBuildStore(builder, tmp, ret); + } + ac_build_endif(&ctx->ac, 12602); + + prev_wave_state = LLVMBuildLoad(builder, ret, ""); + /* Ignore the return value if this is the first wave. */ + prev_wave_state = + LLVMBuildSelect(builder, is_first_wave, ctx->ac.i32_0, prev_wave_state, ""); + si_exit_thread0_section(§ion, &prev_wave_state); + prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->ac.i1, ""); + + /* If the strip start appears to be on thread 0 for the current primitive + * (meaning the reset index is not present in this wave and might have + * appeared in previous waves), use the value from GDS to determine + * primitive orientation. + * + * If the strip start is in this wave for the current primitive, use + * the value from the current wave to determine primitive orientation. + */ + LLVMValueRef strip_start_is0 = + LLVMBuildICmp(builder, LLVMIntEQ, strip_start, ctx->ac.i32_0, ""); + first_is_odd = + LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, first_is_odd, ""); + } + } + /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */ + LLVMValueRef prim_is_odd = LLVMBuildXor( + builder, first_is_odd, LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), ""); + + /* Convert triangle strip indices to triangle indices. */ + ac_build_triangle_strip_indices_to_triangle( + &ctx->ac, prim_is_odd, LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0), + index); + } + + /* Execute the vertex shader for each vertex to get vertex positions. */ + LLVMValueRef pos[3][4]; + for (unsigned i = 0; i < vertices_per_prim; i++) { + vs_params[param_vertex_id] = index[i]; + vs_params[param_instance_id] = instance_id; + + LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params); + for (unsigned chan = 0; chan < 4; chan++) + pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, ""); + } + + /* Divide XYZ by W. */ + for (unsigned i = 0; i < vertices_per_prim; i++) { + for (unsigned chan = 0; chan < 3; chan++) + pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]); + } + + /* Load the viewport state. */ + LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants, + LLVMConstInt(ctx->ac.i32, 2, 0)); + vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, ""); + LLVMValueRef vp_scale[2], vp_translate[2]; + vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); + vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); + vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); + vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); + + /* Do culling. 
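 * Per the options set below, ac_cull_triangle() rejects primitives that are
 * front/back-facing (as selected by the shader key), entirely outside the
 * viewport in XY, smaller than the small-primitive precision, zero-area, or
 * behind the viewer (W < 0); near/far Z culling is only enabled when CULL_Z
 * is set and the key requests it.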
*/ + struct ac_cull_options options = {}; + options.cull_front = key->opt.cs_cull_front; + options.cull_back = key->opt.cs_cull_back; + options.cull_view_xy = true; + options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z; + options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z; + options.cull_small_prims = true; + options.cull_zero_area = true; + options.cull_w = true; + options.use_halfz_clip_space = key->opt.cs_halfz_clip_space; + + LLVMValueRef accepted = + ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate, + ac_get_arg(&ctx->ac, param_smallprim_precision), &options); + + ac_build_optimization_barrier(&ctx->ac, &accepted); + LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); + + /* Count the number of active threads by doing bitcount(accepted). */ + LLVMValueRef num_prims_accepted = ac_build_intrinsic( + &ctx->ac, "llvm.ctpop.i64", ctx->ac.i64, &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE); + num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, ""); + + LLVMValueRef start; + + /* Execute atomic_add on the vertex count. */ + struct si_thread0_section section; + si_enter_thread0_section(ctx, §ion, thread_id); + { + if (VERTEX_COUNTER_GDS_MODE == 0) { + LLVMValueRef num_indices = LLVMBuildMul( + builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); + vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter); + start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices, + LLVMAtomicOrderingMonotonic, false); + } else if (VERTEX_COUNTER_GDS_MODE == 1) { + LLVMValueRef num_indices = LLVMBuildMul( + builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); + vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter, + LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS), ""); + start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices, + LLVMAtomicOrderingMonotonic, false); + } else if (VERTEX_COUNTER_GDS_MODE == 2) { + LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + /* If the draw call was split into multiple subdraws, each using + * a separate draw packet, we need to start counting from 0 for + * the first compute wave of the subdraw. + * + * vertex_counter contains the primitive ID of the first thread + * in the first wave. + * + * This is only correct with VERTEX_COUNTER_GDS_MODE == 2: + */ + LLVMValueRef is_first_wave = + LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, vertex_counter, ""); + + /* Store the primitive count for ordered append, not vertex count. + * The idea is to avoid GDS initialization via CP DMA. The shader + * effectively stores the first count using "swap". + * + * if (first_wave) { + * ds.ordered.swap(num_prims_accepted); // store the first primitive count + * previous = 0; + * } else { + * previous = ds.ordered.add(num_prims_accepted) // add the primitive count + * } + */ + ac_build_ifcc(&ctx->ac, is_first_wave, 12604); + { + /* The GDS address is always 0 with ordered append. 
*/ + si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, num_prims_accepted, 0, true, true); + LLVMBuildStore(builder, ctx->ac.i32_0, tmp_store); + } + ac_build_else(&ctx->ac, 12605); + { + LLVMBuildStore(builder, + si_build_ds_ordered_op(ctx, "add", ordered_wave_id, num_prims_accepted, + 0, true, true), + tmp_store); + } + ac_build_endif(&ctx->ac, 12604); + + start = LLVMBuildLoad(builder, tmp_store, ""); + } + } + si_exit_thread0_section(§ion, &start); + + /* Write the final vertex count to memory. An EOS/EOP event could do this, + * but those events are super slow and should be avoided if performance + * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE + * event like this. + */ + if (VERTEX_COUNTER_GDS_MODE == 2) { + ac_build_ifcc(&ctx->ac, + LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, + ac_get_arg(&ctx->ac, param_last_wave_prim_id), ""), + 12606); + LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, ""); + count = LLVMBuildMul(builder, count, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); + + /* GFX8 needs to disable caching, so that the CP can see the stored value. + * MTYPE=3 bypasses TC L2. + */ + if (ctx->screen->info.chip_class <= GFX8) { + LLVMValueRef desc[] = { + ac_get_arg(&ctx->ac, param_vertex_count_addr), + LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0), + LLVMConstInt(ctx->ac.i32, 4, 0), + LLVMConstInt( + ctx->ac.i32, + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | S_008F0C_MTYPE(3 /* uncached */), + 0), + }; + LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4); + ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->ac.i32_0, ctx->ac.i32_0, 0, + ac_glc | ac_slc); + } else { + LLVMBuildStore( + builder, count, + si_expand_32bit_pointer(ctx, ac_get_arg(&ctx->ac, param_vertex_count_addr))); + } + ac_build_endif(&ctx->ac, 12606); + } else { + /* For unordered modes that increment a vertex count instead of + * primitive count, convert it into the primitive index. + */ + start = LLVMBuildUDiv(builder, start, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); + } + + /* Now we need to store the indices of accepted primitives into + * the output index buffer. + */ + ac_build_ifcc(&ctx->ac, accepted, 16607); + { + /* Get the number of bits set before the index of this thread. */ + LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); + + /* We have lowered instancing. Pack the instance ID into vertex ID. */ + if (key->opt.cs_instancing) { + instance_id = LLVMBuildShl(builder, instance_id, LLVMConstInt(ctx->ac.i32, 16, 0), ""); + + for (unsigned i = 0; i < vertices_per_prim; i++) + index[i] = LLVMBuildOr(builder, index[i], instance_id, ""); + } + + if (VERTEX_COUNTER_GDS_MODE == 2) { + /* vertex_counter contains the first primitive ID + * for this dispatch. If the draw call was split into + * multiple subdraws, the first primitive ID is > 0 + * for subsequent subdraws. Each subdraw uses a different + * portion of the output index buffer. Offset the store + * vindex by the first primitive ID to get the correct + * store address for the subdraw. + */ + start = LLVMBuildAdd(builder, start, vertex_counter, ""); + } + + /* Write indices for accepted primitives. 
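 * Each surviving lane stores the 3 vertex indices of its triangle at
 * vindex = start + prim_index, where prim_index is the number of accepted
 * lanes below this one (mbcnt of the accepted mask computed above).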
*/ + LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, ""); + LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3); + + if (!ac_has_vec3_support(ctx->ac.chip_class, true)) + vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3); + + ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0, 3, + ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0)); + } + ac_build_endif(&ctx->ac, 16607); + + LLVMBuildRetVoid(builder); } /* Return false if the shader isn't ready. */ static bool si_shader_select_prim_discard_cs(struct si_context *sctx, - const struct pipe_draw_info *info, - bool primitive_restart) + const struct pipe_draw_info *info, + bool primitive_restart) { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_shader_key key; - - /* Primitive restart needs ordered counters. */ - assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2); - assert(!primitive_restart || info->instance_count == 1); - - memset(&key, 0, sizeof(key)); - si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog); - assert(!key.part.vs.prolog.instance_divisor_is_fetched); - - key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0; - key.opt.vs_as_prim_discard_cs = 1; - key.opt.cs_prim_type = info->mode; - key.opt.cs_indexed = info->index_size != 0; - key.opt.cs_instancing = info->instance_count > 1; - key.opt.cs_primitive_restart = primitive_restart; - key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first; - - /* Primitive restart with triangle strips needs to preserve primitive - * orientation for cases where front and back primitive orientation matters. - */ - if (primitive_restart) { - struct si_shader_selector *ps = sctx->ps_shader.cso; - - key.opt.cs_need_correct_orientation = - rs->cull_front != rs->cull_back || - ps->info.uses_frontface || - (rs->two_side && ps->info.colors_read); - } - - if (rs->rasterizer_discard) { - /* Just for performance testing and analysis of trivial bottlenecks. - * This should result in a very short compute shader. */ - key.opt.cs_cull_front = 1; - key.opt.cs_cull_back = 1; - } else { - key.opt.cs_cull_front = - sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front; - key.opt.cs_cull_back = - sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back; - } - - if (!rs->depth_clamp_any && CULL_Z) { - key.opt.cs_cull_z = 1; - key.opt.cs_halfz_clip_space = rs->clip_halfz; - } - - sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso; - sctx->cs_prim_discard_state.current = NULL; - - if (!sctx->compiler.passes) - si_init_compiler(sctx->screen, &sctx->compiler); - - struct si_compiler_ctx_state compiler_state; - compiler_state.compiler = &sctx->compiler; - compiler_state.debug = sctx->debug; - compiler_state.is_debug_context = sctx->is_debug; - - return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, - &compiler_state, &key, -1, true) == 0 && - /* Disallow compute shaders using the scratch buffer. */ - sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_shader_key key; + + /* Primitive restart needs ordered counters. 
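 * The strip orientation and continue state exchanged through GDS above is
 * only meaningful when waves run in draw order, which only the ordered
 * append path (VERTEX_COUNTER_GDS_MODE == 2) guarantees.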
*/ + assert(!primitive_restart || VERTEX_COUNTER_GDS_MODE == 2); + assert(!primitive_restart || info->instance_count == 1); + + memset(&key, 0, sizeof(key)); + si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog); + assert(!key.part.vs.prolog.instance_divisor_is_fetched); + + key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0; + key.opt.vs_as_prim_discard_cs = 1; + key.opt.cs_prim_type = info->mode; + key.opt.cs_indexed = info->index_size != 0; + key.opt.cs_instancing = info->instance_count > 1; + key.opt.cs_primitive_restart = primitive_restart; + key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first; + + /* Primitive restart with triangle strips needs to preserve primitive + * orientation for cases where front and back primitive orientation matters. + */ + if (primitive_restart) { + struct si_shader_selector *ps = sctx->ps_shader.cso; + + key.opt.cs_need_correct_orientation = rs->cull_front != rs->cull_back || + ps->info.uses_frontface || + (rs->two_side && ps->info.colors_read); + } + + if (rs->rasterizer_discard) { + /* Just for performance testing and analysis of trivial bottlenecks. + * This should result in a very short compute shader. */ + key.opt.cs_cull_front = 1; + key.opt.cs_cull_back = 1; + } else { + key.opt.cs_cull_front = sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front; + key.opt.cs_cull_back = sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back; + } + + if (!rs->depth_clamp_any && CULL_Z) { + key.opt.cs_cull_z = 1; + key.opt.cs_halfz_clip_space = rs->clip_halfz; + } + + sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso; + sctx->cs_prim_discard_state.current = NULL; + + if (!sctx->compiler.passes) + si_init_compiler(sctx->screen, &sctx->compiler); + + struct si_compiler_ctx_state compiler_state; + compiler_state.compiler = &sctx->compiler; + compiler_state.debug = sctx->debug; + compiler_state.is_debug_context = sctx->is_debug; + + return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, &compiler_state, + &key, -1, true) == 0 && + /* Disallow compute shaders using the scratch buffer. */ + sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0; } static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx) { - if (sctx->index_ring) - return true; - - if (!sctx->prim_discard_compute_cs) { - struct radeon_winsys *ws = sctx->ws; - unsigned gds_size = VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : - VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0; - unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 
2 : 0; - - if (gds_size) { - sctx->gds = ws->buffer_create(ws, gds_size, 4, - RADEON_DOMAIN_GDS, 0); - if (!sctx->gds) - return false; - - ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, - RADEON_USAGE_READWRITE, 0, 0); - } - if (num_oa_counters) { - assert(gds_size); - sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, - 1, RADEON_DOMAIN_OA, 0); - if (!sctx->gds_oa) - return false; - - ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, - RADEON_USAGE_READWRITE, 0, 0); - } - - sctx->prim_discard_compute_cs = - ws->cs_add_parallel_compute_ib(sctx->gfx_cs, - num_oa_counters > 0); - if (!sctx->prim_discard_compute_cs) - return false; - } - - if (!sctx->index_ring) { - sctx->index_ring = - si_aligned_buffer_create(sctx->b.screen, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - sctx->index_ring_size_per_ib * 2, - sctx->screen->info.pte_fragment_size); - if (!sctx->index_ring) - return false; - } - return true; + if (sctx->index_ring) + return true; + + if (!sctx->prim_discard_compute_cs) { + struct radeon_winsys *ws = sctx->ws; + unsigned gds_size = + VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0; + unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 2 : 0; + + if (gds_size) { + sctx->gds = ws->buffer_create(ws, gds_size, 4, RADEON_DOMAIN_GDS, 0); + if (!sctx->gds) + return false; + + ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0); + } + if (num_oa_counters) { + assert(gds_size); + sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, 1, RADEON_DOMAIN_OA, 0); + if (!sctx->gds_oa) + return false; + + ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0); + } + + sctx->prim_discard_compute_cs = + ws->cs_add_parallel_compute_ib(sctx->gfx_cs, num_oa_counters > 0); + if (!sctx->prim_discard_compute_cs) + return false; + } + + if (!sctx->index_ring) { + sctx->index_ring = si_aligned_buffer_create( + sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, + sctx->index_ring_size_per_ib * 2, sctx->screen->info.pte_fragment_size); + if (!sctx->index_ring) + return false; + } + return true; } static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size) { - return sctx->index_ring_offset + - align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <= - sctx->index_ring_size_per_ib; + return sctx->index_ring_offset + + align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <= + sctx->index_ring_size_per_ib; } enum si_prim_discard_outcome -si_prepare_prim_discard_or_split_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - bool primitive_restart) +si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info, + bool primitive_restart) { - /* If the compute shader compilation isn't finished, this returns false. 
*/ - if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart)) - return SI_PRIM_DISCARD_DISABLED; - - if (!si_initialize_prim_discard_cmdbuf(sctx)) - return SI_PRIM_DISCARD_DISABLED; - - struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; - unsigned prim = info->mode; - unsigned count = info->count; - unsigned instance_count = info->instance_count; - unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count); - unsigned num_prims = num_prims_per_instance * instance_count; - unsigned out_indexbuf_size = num_prims * 12; - bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size); - const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL; - - /* Split draws at the draw call level if the ring is full. This makes - * better use of the ring space. - */ - if (ring_full && - num_prims > split_prims_draw_level && - instance_count == 1 && /* TODO: support splitting instanced draws */ - (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | - (1 << PIPE_PRIM_TRIANGLE_STRIP))) { - /* Split draws. */ - struct pipe_draw_info split_draw = *info; - split_draw.primitive_restart = primitive_restart; - - unsigned base_start = split_draw.start; - - if (prim == PIPE_PRIM_TRIANGLES) { - unsigned vert_count_per_subdraw = split_prims_draw_level * 3; - assert(vert_count_per_subdraw < count); - - for (unsigned start = 0; start < count; start += vert_count_per_subdraw) { - split_draw.start = base_start + start; - split_draw.count = MIN2(count - start, vert_count_per_subdraw); - - sctx->b.draw_vbo(&sctx->b, &split_draw); - } - } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { - /* No primitive pair can be split, because strips reverse orientation - * for odd primitives. */ - STATIC_ASSERT(split_prims_draw_level % 2 == 0); - - unsigned vert_count_per_subdraw = split_prims_draw_level; - - for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) { - split_draw.start = base_start + start; - split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2); - - sctx->b.draw_vbo(&sctx->b, &split_draw); - - if (start == 0 && - primitive_restart && - sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation) - sctx->preserve_prim_restart_gds_at_flush = true; - } - sctx->preserve_prim_restart_gds_at_flush = false; - } else { - assert(0); - } - - return SI_PRIM_DISCARD_DRAW_SPLIT; - } - - /* Just quit if the draw call doesn't fit into the ring and can't be split. */ - if (out_indexbuf_size > sctx->index_ring_size_per_ib) { - if (SI_PRIM_DISCARD_DEBUG) - puts("PD failed: draw call too big, can't be split"); - return SI_PRIM_DISCARD_DISABLED; - } - - unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL); - unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ + - 24 * (num_subdraws - 1) + /* subdraws */ - 20; /* leave some space at the end */ - unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx); - - if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) - need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */ - else - need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */ - - if (ring_full || - (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) || - !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) { - /* If the current IB is empty but the size is too small, add a NOP - * packet to force a flush and get a bigger IB. 
- */ - if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) && - gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) { - radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(gfx_cs, 0); - } - - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - } - - /* The compute IB is always chained, but we need to call cs_check_space to add more space. */ - struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; - ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false); - assert(compute_has_space); - assert(si_check_ring_space(sctx, out_indexbuf_size)); - return SI_PRIM_DISCARD_ENABLED; + /* If the compute shader compilation isn't finished, this returns false. */ + if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart)) + return SI_PRIM_DISCARD_DISABLED; + + if (!si_initialize_prim_discard_cmdbuf(sctx)) + return SI_PRIM_DISCARD_DISABLED; + + struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; + unsigned prim = info->mode; + unsigned count = info->count; + unsigned instance_count = info->instance_count; + unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count); + unsigned num_prims = num_prims_per_instance * instance_count; + unsigned out_indexbuf_size = num_prims * 12; + bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size); + const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL; + + /* Split draws at the draw call level if the ring is full. This makes + * better use of the ring space. + */ + if (ring_full && num_prims > split_prims_draw_level && + instance_count == 1 && /* TODO: support splitting instanced draws */ + (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP))) { + /* Split draws. */ + struct pipe_draw_info split_draw = *info; + split_draw.primitive_restart = primitive_restart; + + unsigned base_start = split_draw.start; + + if (prim == PIPE_PRIM_TRIANGLES) { + unsigned vert_count_per_subdraw = split_prims_draw_level * 3; + assert(vert_count_per_subdraw < count); + + for (unsigned start = 0; start < count; start += vert_count_per_subdraw) { + split_draw.start = base_start + start; + split_draw.count = MIN2(count - start, vert_count_per_subdraw); + + sctx->b.draw_vbo(&sctx->b, &split_draw); + } + } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { + /* No primitive pair can be split, because strips reverse orientation + * for odd primitives. */ + STATIC_ASSERT(split_prims_draw_level % 2 == 0); + + unsigned vert_count_per_subdraw = split_prims_draw_level; + + for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) { + split_draw.start = base_start + start; + split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2); + + sctx->b.draw_vbo(&sctx->b, &split_draw); + + if (start == 0 && primitive_restart && + sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation) + sctx->preserve_prim_restart_gds_at_flush = true; + } + sctx->preserve_prim_restart_gds_at_flush = false; + } else { + assert(0); + } + + return SI_PRIM_DISCARD_DRAW_SPLIT; + } + + /* Just quit if the draw call doesn't fit into the ring and can't be split. 
*/ + if (out_indexbuf_size > sctx->index_ring_size_per_ib) { + if (SI_PRIM_DISCARD_DEBUG) + puts("PD failed: draw call too big, can't be split"); + return SI_PRIM_DISCARD_DISABLED; + } + + unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL); + unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ + + 24 * (num_subdraws - 1) + /* subdraws */ + 20; /* leave some space at the end */ + unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx); + + if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) + need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */ + else + need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */ + + if (ring_full || + (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) || + !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) { + /* If the current IB is empty but the size is too small, add a NOP + * packet to force a flush and get a bigger IB. + */ + if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) && + gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) { + radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(gfx_cs, 0); + } + + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + } + + /* The compute IB is always chained, but we need to call cs_check_space to add more space. */ + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false); + assert(compute_has_space); + assert(si_check_ring_space(sctx, out_indexbuf_size)); + return SI_PRIM_DISCARD_ENABLED; } void si_compute_signal_gfx(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; - unsigned writeback_L2_flags = 0; - - /* The writeback L2 flags vary with each chip generation. */ - /* CI needs to flush vertex indices to memory. */ - if (sctx->chip_class <= GFX7) - writeback_L2_flags = EVENT_TC_WB_ACTION_ENA; - else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0) - writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA; - - if (!sctx->compute_num_prims_in_batch) - return; - - assert(sctx->compute_rewind_va); - - /* After the queued dispatches are done and vertex counts are written to - * the gfx IB, signal the gfx IB to continue. CP doesn't wait for - * the dispatches to finish, it only adds the CS_DONE event into the event - * queue. - */ - si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags, - sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, - writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : - EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, - NULL, - sctx->compute_rewind_va | - ((uint64_t)sctx->screen->info.address32_hi << 32), - REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */ - SI_NOT_QUERY); - - sctx->compute_rewind_va = 0; - sctx->compute_num_prims_in_batch = 0; + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + unsigned writeback_L2_flags = 0; + + /* The writeback L2 flags vary with each chip generation. */ + /* CI needs to flush vertex indices to memory. 
*/ + if (sctx->chip_class <= GFX7) + writeback_L2_flags = EVENT_TC_WB_ACTION_ENA; + else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0) + writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA; + + if (!sctx->compute_num_prims_in_batch) + return; + + assert(sctx->compute_rewind_va); + + /* After the queued dispatches are done and vertex counts are written to + * the gfx IB, signal the gfx IB to continue. CP doesn't wait for + * the dispatches to finish, it only adds the CS_DONE event into the event + * queue. + */ + si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags, + sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, + writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : EOP_INT_SEL_NONE, + EOP_DATA_SEL_VALUE_32BIT, NULL, + sctx->compute_rewind_va | ((uint64_t)sctx->screen->info.address32_hi << 32), + REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */ + SI_NOT_QUERY); + + sctx->compute_rewind_va = 0; + sctx->compute_num_prims_in_batch = 0; } /* Dispatch a primitive discard compute shader. */ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - unsigned index_size, - unsigned base_vertex, - uint64_t input_indexbuf_va, - unsigned input_indexbuf_num_elements) + const struct pipe_draw_info *info, unsigned index_size, + unsigned base_vertex, uint64_t input_indexbuf_va, + unsigned input_indexbuf_num_elements) { - struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; - struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; - unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count); - if (!num_prims_per_instance) - return; - - unsigned num_prims = num_prims_per_instance * info->instance_count; - unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format; - - switch (info->mode) { - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - vertices_per_prim = 3; - output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32; - gfx10_output_indexbuf_format = V_008F0C_IMG_FORMAT_32_32_32_UINT; - break; - default: - unreachable("unsupported primitive type"); - return; - } - - unsigned out_indexbuf_offset; - uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4; - bool first_dispatch = !sctx->prim_discard_compute_ib_initialized; - - /* Initialize the compute IB if it's empty. */ - if (!sctx->prim_discard_compute_ib_initialized) { - /* 1) State initialization. */ - sctx->compute_gds_offset = 0; - sctx->compute_ib_last_shader = NULL; - - if (sctx->last_ib_barrier_fence) { - assert(!sctx->last_ib_barrier_buf); - sctx->ws->cs_add_fence_dependency(gfx_cs, - sctx->last_ib_barrier_fence, - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY); - } - - /* 2) IB initialization. */ - - /* This needs to be done at the beginning of IBs due to possible - * TTM buffer moves in the kernel. 
- */ - if (sctx->chip_class >= GFX10) { - radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - radeon_emit(cs, 0); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - radeon_emit(cs, /* GCR_CNTL */ - S_586_GLI_INV(V_586_GLI_ALL) | - S_586_GLK_INV(1) | S_586_GLV_INV(1) | - S_586_GL1_INV(1) | - S_586_GL2_INV(1) | S_586_GL2_WB(1) | - S_586_GLM_INV(1) | S_586_GLM_WB(1) | - S_586_SEQ(V_586_SEQ_FORWARD)); - } else { - si_emit_surface_sync(sctx, cs, - S_0085F0_TC_ACTION_ENA(1) | - S_0085F0_TCL1_ACTION_ENA(1) | - S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) | - S_0085F0_SH_ICACHE_ACTION_ENA(1) | - S_0085F0_SH_KCACHE_ACTION_ENA(1)); - } - - /* Restore the GDS prim restart counter if needed. */ - if (sctx->preserve_prim_restart_gds_at_flush) { - si_cp_copy_data(sctx, cs, - COPY_DATA_GDS, NULL, 4, - COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4); - } - - si_emit_initial_compute_regs(sctx, cs); - - radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, - S_00B860_WAVES(sctx->scratch_waves) | - S_00B860_WAVESIZE(0)); /* no scratch */ - - /* Only 1D grids are launched. */ - radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | - S_00B820_NUM_THREAD_PARTIAL(1)); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | - S_00B824_NUM_THREAD_PARTIAL(1)); - - radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2); - radeon_emit(cs, 0); - radeon_emit(cs, 0); - - /* Disable ordered alloc for OA resources. */ - for (unsigned i = 0; i < 2; i++) { - radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3); - radeon_emit(cs, S_031074_INDEX(i)); - radeon_emit(cs, 0); - radeon_emit(cs, S_03107C_ENABLE(0)); - } - - if (sctx->last_ib_barrier_buf) { - assert(!sctx->last_ib_barrier_fence); - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, - RADEON_USAGE_READ, RADEON_PRIO_FENCE); - si_cp_wait_mem(sctx, cs, - sctx->last_ib_barrier_buf->gpu_address + - sctx->last_ib_barrier_buf_offset, 1, 1, - WAIT_REG_MEM_EQUAL); - } - - sctx->prim_discard_compute_ib_initialized = true; - } - - /* Allocate the output index buffer. */ - output_indexbuf_size = align(output_indexbuf_size, - sctx->screen->info.tcc_cache_line_size); - assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib); - out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset; - sctx->index_ring_offset += output_indexbuf_size; - - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE, - RADEON_PRIO_SHADER_RW_BUFFER); - uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset; - - /* Prepare index buffer descriptors. */ - struct si_resource *indexbuf_desc = NULL; - unsigned indexbuf_desc_offset; - unsigned desc_size = 12 * 4; - uint32_t *desc; - - u_upload_alloc(sctx->b.const_uploader, 0, desc_size, - si_optimal_tcc_alignment(sctx, desc_size), - &indexbuf_desc_offset, (struct pipe_resource**)&indexbuf_desc, - (void**)&desc); - radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); - - /* Input index buffer. */ - desc[0] = input_indexbuf_va; - desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | - S_008F04_STRIDE(index_size); - desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? 
index_size : 1); - - if (sctx->chip_class >= GFX10) { - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_FORMAT(index_size == 1 ? V_008F0C_IMG_FORMAT_8_UINT : - index_size == 2 ? V_008F0C_IMG_FORMAT_16_UINT : - V_008F0C_IMG_FORMAT_32_UINT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 : - index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 : - V_008F0C_BUF_DATA_FORMAT_32); - } - - /* Output index buffer. */ - desc[4] = out_indexbuf_va; - desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | - S_008F04_STRIDE(vertices_per_prim * 4); - desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1); - - if (sctx->chip_class >= GFX10) { - desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | - S_008F0C_FORMAT(gfx10_output_indexbuf_format) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | - S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(output_indexbuf_format); - } - - /* Viewport state. */ - struct si_small_prim_cull_info cull_info; - si_get_small_prim_cull_info(sctx, &cull_info); - - desc[8] = fui(cull_info.scale[0]); - desc[9] = fui(cull_info.scale[1]); - desc[10] = fui(cull_info.translate[0]); - desc[11] = fui(cull_info.translate[1]); - - /* Better subpixel precision increases the efficiency of small - * primitive culling. */ - unsigned num_samples = sctx->framebuffer.nr_samples; - unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode; - float small_prim_cull_precision; - - if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) - small_prim_cull_precision = num_samples / 4096.0; - else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) - small_prim_cull_precision = num_samples / 1024.0; - else - small_prim_cull_precision = num_samples / 256.0; - - /* Set user data SGPRs. */ - /* This can't be greater than 14 if we want the fastest launch rate. */ - unsigned user_sgprs = 13; - - uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset; - unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX); - unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX); - uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address; - uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address; - uint64_t vb_desc_va = sctx->vb_descriptors_buffer ? - sctx->vb_descriptors_buffer->gpu_address + - sctx->vb_descriptors_offset : 0; - unsigned gds_offset, gds_size; - struct si_fast_udiv_info32 num_prims_udiv = {}; - - if (info->instance_count > 1) - num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31); - - /* Limitations on how these two are packed in the user SGPR. 
*/ - assert(num_prims_udiv.post_shift < 32); - assert(num_prims_per_instance < 1 << 27); - - si_resource_reference(&indexbuf_desc, NULL); - - bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart; - - if (VERTEX_COUNTER_GDS_MODE == 1) { - gds_offset = sctx->compute_gds_offset; - gds_size = primitive_restart ? 8 : 4; - sctx->compute_gds_offset += gds_size; - - /* Reset the counters in GDS for the first dispatch using WRITE_DATA. - * The remainder of the GDS will be cleared after the dispatch packet - * in parallel with compute shaders. - */ - if (first_dispatch) { - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size/4, 0)); - radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1)); - radeon_emit(cs, gds_offset); - radeon_emit(cs, 0); - radeon_emit(cs, 0); /* value to write */ - if (gds_size == 8) - radeon_emit(cs, 0); - } - } - - /* Set shader registers. */ - struct si_shader *shader = sctx->cs_prim_discard_state.current; - - if (shader != sctx->compute_ib_last_shader) { - radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ, - RADEON_PRIO_SHADER_BINARY); - uint64_t shader_va = shader->bo->gpu_address; - - assert(shader->config.scratch_bytes_per_wave == 0); - assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4); - - radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cs, shader_va >> 8); - radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); - - radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); - radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B848_SGPRS(sctx->chip_class <= GFX9 ? - (shader->config.num_sgprs - 1) / 8 : 0) | - S_00B848_FLOAT_MODE(shader->config.float_mode) | - S_00B848_DX10_CLAMP(1) | - S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) | - S_00B848_WGP_MODE(sctx->chip_class >= GFX10)); - radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | - S_00B84C_USER_SGPR(user_sgprs) | - S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) | - S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) | - S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) | - S_00B84C_LDS_SIZE(shader->config.lds_size)); - - radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, - ac_get_compute_resource_limits(&sctx->screen->info, - WAVES_PER_TG, - MAX_WAVES_PER_SH, - THREADGROUPS_PER_CU)); - sctx->compute_ib_last_shader = shader; - } - - STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0); - - /* Big draw calls are split into smaller dispatches and draw packets. */ - for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) { - unsigned num_subdraw_prims; - - if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims) - num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL; - else - num_subdraw_prims = num_prims - start_prim; - - /* Small dispatches are executed back to back until a specific primitive - * count is reached. Then, a CS_DONE is inserted to signal the gfx IB - * to start drawing the batch. This batching adds latency to the gfx IB, - * but CS_DONE and REWIND are too slow. 
- */ - if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH) - si_compute_signal_gfx(sctx); - - if (sctx->compute_num_prims_in_batch == 0) { - assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi); - sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4; - - if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) { - radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(gfx_cs, 0); - - si_cp_wait_mem(sctx, gfx_cs, - sctx->compute_rewind_va | - (uint64_t)sctx->screen->info.address32_hi << 32, - REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, - WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP); - - /* Use INDIRECT_BUFFER to chain to a different buffer - * to discard the CP prefetch cache. - */ - sctx->ws->cs_check_space(gfx_cs, 0, true); - } else { - radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0)); - radeon_emit(gfx_cs, 0); - } - } - - sctx->compute_num_prims_in_batch += num_subdraw_prims; - - uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4; - uint64_t index_va = out_indexbuf_va + start_prim * 12; - - /* Emit the draw packet into the gfx IB. */ - radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0)); - radeon_emit(gfx_cs, num_prims * vertices_per_prim); - radeon_emit(gfx_cs, index_va); - radeon_emit(gfx_cs, index_va >> 32); - radeon_emit(gfx_cs, 0); - radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA); - - /* Continue with the compute IB. */ - if (start_prim == 0) { - uint32_t gds_prim_restart_continue_bit = 0; - - if (sctx->preserve_prim_restart_gds_at_flush) { - assert(primitive_restart && - info->mode == PIPE_PRIM_TRIANGLE_STRIP); - assert(start_prim < 1 << 31); - gds_prim_restart_continue_bit = 1 << 31; - } - - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs); - radeon_emit(cs, index_buffers_va); - radeon_emit(cs, - VERTEX_COUNTER_GDS_MODE == 0 ? count_va : - VERTEX_COUNTER_GDS_MODE == 1 ? gds_offset : - start_prim | - gds_prim_restart_continue_bit); - radeon_emit(cs, start_prim + num_subdraw_prims - 1); - radeon_emit(cs, count_va); - radeon_emit(cs, vb_desc_va); - radeon_emit(cs, vs_const_desc_va); - radeon_emit(cs, vs_sampler_desc_va); - radeon_emit(cs, base_vertex); - radeon_emit(cs, info->start_instance); - radeon_emit(cs, num_prims_udiv.multiplier); - radeon_emit(cs, num_prims_udiv.post_shift | - (num_prims_per_instance << 5)); - radeon_emit(cs, info->restart_index); - /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */ - radeon_emit(cs, fui(small_prim_cull_precision)); - } else { - assert(VERTEX_COUNTER_GDS_MODE == 2); - /* Only update the SGPRs that changed. */ - radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3); - radeon_emit(cs, start_prim); - radeon_emit(cs, start_prim + num_subdraw_prims - 1); - radeon_emit(cs, count_va); - } - - /* Set grid dimensions. 
*/ - unsigned start_block = start_prim / THREADGROUP_SIZE; - unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE; - unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE; - - radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block); - radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X, - S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) | - S_00B81C_NUM_THREAD_PARTIAL(partial_block_size)); - - radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | - PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size); - radeon_emit(cs, 1); - radeon_emit(cs, 1); - radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | - S_00B800_PARTIAL_TG_EN(!!partial_block_size) | - S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) | - S_00B800_ORDER_MODE(0 /* launch in order */)); - - /* This is only for unordered append. Ordered append writes this from - * the shader. - * - * Note that EOP and EOS events are super slow, so emulating the event - * in a shader is an important optimization. - */ - if (VERTEX_COUNTER_GDS_MODE == 1) { - si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0, - sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, - EOP_INT_SEL_NONE, - EOP_DATA_SEL_GDS, - NULL, - count_va | ((uint64_t)sctx->screen->info.address32_hi << 32), - EOP_DATA_GDS(gds_offset / 4, 1), - SI_NOT_QUERY); - - /* Now that compute shaders are running, clear the remainder of GDS. */ - if (first_dispatch) { - unsigned offset = gds_offset + gds_size; - si_cp_dma_clear_buffer(sctx, cs, NULL, offset, - GDS_SIZE_UNORDERED - offset, - 0, - SI_CPDMA_SKIP_CHECK_CS_SPACE | - SI_CPDMA_SKIP_GFX_SYNC | - SI_CPDMA_SKIP_SYNC_BEFORE, - SI_COHERENCY_NONE, L2_BYPASS); - } - } - first_dispatch = false; - - assert(cs->current.cdw <= cs->current.max_dw); - assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw); - } + struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count); + if (!num_prims_per_instance) + return; + + unsigned num_prims = num_prims_per_instance * info->instance_count; + unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format; + + switch (info->mode) { + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_TRIANGLE_FAN: + vertices_per_prim = 3; + output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32; + gfx10_output_indexbuf_format = V_008F0C_IMG_FORMAT_32_32_32_UINT; + break; + default: + unreachable("unsupported primitive type"); + return; + } + + unsigned out_indexbuf_offset; + uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4; + bool first_dispatch = !sctx->prim_discard_compute_ib_initialized; + + /* Initialize the compute IB if it's empty. */ + if (!sctx->prim_discard_compute_ib_initialized) { + /* 1) State initialization. */ + sctx->compute_gds_offset = 0; + sctx->compute_ib_last_shader = NULL; + + if (sctx->last_ib_barrier_fence) { + assert(!sctx->last_ib_barrier_buf); + sctx->ws->cs_add_fence_dependency(gfx_cs, sctx->last_ib_barrier_fence, + RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY); + } + + /* 2) IB initialization. */ + + /* This needs to be done at the beginning of IBs due to possible + * TTM buffer moves in the kernel. 
+ */ + if (sctx->chip_class >= GFX10) { + radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); + radeon_emit(cs, 0); /* CP_COHER_CNTL */ + radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ + radeon_emit(cs, 0); /* CP_COHER_BASE */ + radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ + radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ + radeon_emit(cs, /* GCR_CNTL */ + S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) | + S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | + S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD)); + } else { + si_emit_surface_sync(sctx, cs, + S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) | + S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) | + S_0085F0_SH_ICACHE_ACTION_ENA(1) | + S_0085F0_SH_KCACHE_ACTION_ENA(1)); + } + + /* Restore the GDS prim restart counter if needed. */ + if (sctx->preserve_prim_restart_gds_at_flush) { + si_cp_copy_data(sctx, cs, COPY_DATA_GDS, NULL, 4, COPY_DATA_SRC_MEM, + sctx->wait_mem_scratch, 4); + } + + si_emit_initial_compute_regs(sctx, cs); + + radeon_set_sh_reg( + cs, R_00B860_COMPUTE_TMPRING_SIZE, + S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */ + + /* Only 1D grids are launched. */ + radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2); + radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | S_00B820_NUM_THREAD_PARTIAL(1)); + radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | S_00B824_NUM_THREAD_PARTIAL(1)); + + radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2); + radeon_emit(cs, 0); + radeon_emit(cs, 0); + + /* Disable ordered alloc for OA resources. */ + for (unsigned i = 0; i < 2; i++) { + radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3); + radeon_emit(cs, S_031074_INDEX(i)); + radeon_emit(cs, 0); + radeon_emit(cs, S_03107C_ENABLE(0)); + } + + if (sctx->last_ib_barrier_buf) { + assert(!sctx->last_ib_barrier_fence); + radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ, + RADEON_PRIO_FENCE); + si_cp_wait_mem(sctx, cs, + sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset, + 1, 1, WAIT_REG_MEM_EQUAL); + } + + sctx->prim_discard_compute_ib_initialized = true; + } + + /* Allocate the output index buffer. */ + output_indexbuf_size = align(output_indexbuf_size, sctx->screen->info.tcc_cache_line_size); + assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib); + out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset; + sctx->index_ring_offset += output_indexbuf_size; + + radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE, + RADEON_PRIO_SHADER_RW_BUFFER); + uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset; + + /* Prepare index buffer descriptors. */ + struct si_resource *indexbuf_desc = NULL; + unsigned indexbuf_desc_offset; + unsigned desc_size = 12 * 4; + uint32_t *desc; + + u_upload_alloc(sctx->b.const_uploader, 0, desc_size, si_optimal_tcc_alignment(sctx, desc_size), + &indexbuf_desc_offset, (struct pipe_resource **)&indexbuf_desc, (void **)&desc); + radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); + + /* Input index buffer. */ + desc[0] = input_indexbuf_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size); + desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? 
index_size : 1); + + if (sctx->chip_class >= GFX10) { + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_FORMAT(index_size == 1 ? V_008F0C_IMG_FORMAT_8_UINT + : index_size == 2 ? V_008F0C_IMG_FORMAT_16_UINT + : V_008F0C_IMG_FORMAT_32_UINT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + desc[3] = + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | + S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 + : index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 + : V_008F0C_BUF_DATA_FORMAT_32); + } + + /* Output index buffer. */ + desc[4] = out_indexbuf_va; + desc[5] = + S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4); + desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1); + + if (sctx->chip_class >= GFX10) { + desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | + S_008F0C_FORMAT(gfx10_output_indexbuf_format) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | + S_008F0C_DATA_FORMAT(output_indexbuf_format); + } + + /* Viewport state. */ + struct si_small_prim_cull_info cull_info; + si_get_small_prim_cull_info(sctx, &cull_info); + + desc[8] = fui(cull_info.scale[0]); + desc[9] = fui(cull_info.scale[1]); + desc[10] = fui(cull_info.translate[0]); + desc[11] = fui(cull_info.translate[1]); + + /* Better subpixel precision increases the efficiency of small + * primitive culling. */ + unsigned num_samples = sctx->framebuffer.nr_samples; + unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode; + float small_prim_cull_precision; + + if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) + small_prim_cull_precision = num_samples / 4096.0; + else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) + small_prim_cull_precision = num_samples / 1024.0; + else + small_prim_cull_precision = num_samples / 256.0; + + /* Set user data SGPRs. */ + /* This can't be greater than 14 if we want the fastest launch rate. */ + unsigned user_sgprs = 13; + + uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset; + unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX); + unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX); + uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address; + uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address; + uint64_t vb_desc_va = sctx->vb_descriptors_buffer + ? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset + : 0; + unsigned gds_offset, gds_size; + struct si_fast_udiv_info32 num_prims_udiv = {}; + + if (info->instance_count > 1) + num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31); + + /* Limitations on how these two are packed in the user SGPR. 
*/ + assert(num_prims_udiv.post_shift < 32); + assert(num_prims_per_instance < 1 << 27); + + si_resource_reference(&indexbuf_desc, NULL); + + bool primitive_restart = sctx->cs_prim_discard_state.current->key.opt.cs_primitive_restart; + + if (VERTEX_COUNTER_GDS_MODE == 1) { + gds_offset = sctx->compute_gds_offset; + gds_size = primitive_restart ? 8 : 4; + sctx->compute_gds_offset += gds_size; + + /* Reset the counters in GDS for the first dispatch using WRITE_DATA. + * The remainder of the GDS will be cleared after the dispatch packet + * in parallel with compute shaders. + */ + if (first_dispatch) { + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size / 4, 0)); + radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1)); + radeon_emit(cs, gds_offset); + radeon_emit(cs, 0); + radeon_emit(cs, 0); /* value to write */ + if (gds_size == 8) + radeon_emit(cs, 0); + } + } + + /* Set shader registers. */ + struct si_shader *shader = sctx->cs_prim_discard_state.current; + + if (shader != sctx->compute_ib_last_shader) { + radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ, + RADEON_PRIO_SHADER_BINARY); + uint64_t shader_va = shader->bo->gpu_address; + + assert(shader->config.scratch_bytes_per_wave == 0); + assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4); + + radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); + radeon_emit(cs, shader_va >> 8); + radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); + + radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); + radeon_emit( + cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B848_SGPRS(sctx->chip_class <= GFX9 ? (shader->config.num_sgprs - 1) / 8 : 0) | + S_00B848_FLOAT_MODE(shader->config.float_mode) | S_00B848_DX10_CLAMP(1) | + S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) | + S_00B848_WGP_MODE(sctx->chip_class >= GFX10)); + radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | S_00B84C_USER_SGPR(user_sgprs) | + S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) | + S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) | + S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) | + S_00B84C_LDS_SIZE(shader->config.lds_size)); + + radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, + ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG, + MAX_WAVES_PER_SH, THREADGROUPS_PER_CU)); + sctx->compute_ib_last_shader = shader; + } + + STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0); + + /* Big draw calls are split into smaller dispatches and draw packets. */ + for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) { + unsigned num_subdraw_prims; + + if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims) + num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL; + else + num_subdraw_prims = num_prims - start_prim; + + /* Small dispatches are executed back to back until a specific primitive + * count is reached. Then, a CS_DONE is inserted to signal the gfx IB + * to start drawing the batch. This batching adds latency to the gfx IB, + * but CS_DONE and REWIND are too slow. 
+ */ + if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH) + si_compute_signal_gfx(sctx); + + if (sctx->compute_num_prims_in_batch == 0) { + assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi); + sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4; + + if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) { + radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(gfx_cs, 0); + + si_cp_wait_mem( + sctx, gfx_cs, + sctx->compute_rewind_va | (uint64_t)sctx->screen->info.address32_hi << 32, + REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP); + + /* Use INDIRECT_BUFFER to chain to a different buffer + * to discard the CP prefetch cache. + */ + sctx->ws->cs_check_space(gfx_cs, 0, true); + } else { + radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0)); + radeon_emit(gfx_cs, 0); + } + } + + sctx->compute_num_prims_in_batch += num_subdraw_prims; + + uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4; + uint64_t index_va = out_indexbuf_va + start_prim * 12; + + /* Emit the draw packet into the gfx IB. */ + radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0)); + radeon_emit(gfx_cs, num_prims * vertices_per_prim); + radeon_emit(gfx_cs, index_va); + radeon_emit(gfx_cs, index_va >> 32); + radeon_emit(gfx_cs, 0); + radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA); + + /* Continue with the compute IB. */ + if (start_prim == 0) { + uint32_t gds_prim_restart_continue_bit = 0; + + if (sctx->preserve_prim_restart_gds_at_flush) { + assert(primitive_restart && info->mode == PIPE_PRIM_TRIANGLE_STRIP); + assert(start_prim < 1 << 31); + gds_prim_restart_continue_bit = 1 << 31; + } + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs); + radeon_emit(cs, index_buffers_va); + radeon_emit(cs, VERTEX_COUNTER_GDS_MODE == 0 + ? count_va + : VERTEX_COUNTER_GDS_MODE == 1 + ? gds_offset + : start_prim | gds_prim_restart_continue_bit); + radeon_emit(cs, start_prim + num_subdraw_prims - 1); + radeon_emit(cs, count_va); + radeon_emit(cs, vb_desc_va); + radeon_emit(cs, vs_const_desc_va); + radeon_emit(cs, vs_sampler_desc_va); + radeon_emit(cs, base_vertex); + radeon_emit(cs, info->start_instance); + radeon_emit(cs, num_prims_udiv.multiplier); + radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5)); + radeon_emit(cs, info->restart_index); + /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */ + radeon_emit(cs, fui(small_prim_cull_precision)); + } else { + assert(VERTEX_COUNTER_GDS_MODE == 2); + /* Only update the SGPRs that changed. */ + radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3); + radeon_emit(cs, start_prim); + radeon_emit(cs, start_prim + num_subdraw_prims - 1); + radeon_emit(cs, count_va); + } + + /* Set grid dimensions. 
*/ + unsigned start_block = start_prim / THREADGROUP_SIZE; + unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE; + unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE; + + radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block); + radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X, + S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) | + S_00B81C_NUM_THREAD_PARTIAL(partial_block_size)); + + radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size); + radeon_emit(cs, 1); + radeon_emit(cs, 1); + radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) | + S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) | + S_00B800_ORDER_MODE(0 /* launch in order */)); + + /* This is only for unordered append. Ordered append writes this from + * the shader. + * + * Note that EOP and EOS events are super slow, so emulating the event + * in a shader is an important optimization. + */ + if (VERTEX_COUNTER_GDS_MODE == 1) { + si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0, + sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, + EOP_INT_SEL_NONE, EOP_DATA_SEL_GDS, NULL, + count_va | ((uint64_t)sctx->screen->info.address32_hi << 32), + EOP_DATA_GDS(gds_offset / 4, 1), SI_NOT_QUERY); + + /* Now that compute shaders are running, clear the remainder of GDS. */ + if (first_dispatch) { + unsigned offset = gds_offset + gds_size; + si_cp_dma_clear_buffer( + sctx, cs, NULL, offset, GDS_SIZE_UNORDERED - offset, 0, + SI_CPDMA_SKIP_CHECK_CS_SPACE | SI_CPDMA_SKIP_GFX_SYNC | SI_CPDMA_SKIP_SYNC_BEFORE, + SI_COHERENCY_NONE, L2_BYPASS); + } + } + first_dispatch = false; + + assert(cs->current.cdw <= cs->current.max_dw); + assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw); + } } diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 2ef41e44ded..391c4f8d50b 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -27,232 +27,221 @@ /* Set this if you want the ME to wait until CP DMA is done. * It should be set on the last CP DMA packet. */ -#define CP_DMA_SYNC (1 << 0) +#define CP_DMA_SYNC (1 << 0) /* Set this if the source data was used as a destination in a previous CP DMA * packet. It's for preventing a read-after-write (RAW) hazard between two * CP DMA packets. */ -#define CP_DMA_RAW_WAIT (1 << 1) -#define CP_DMA_DST_IS_GDS (1 << 2) -#define CP_DMA_CLEAR (1 << 3) -#define CP_DMA_PFP_SYNC_ME (1 << 4) -#define CP_DMA_SRC_IS_GDS (1 << 5) +#define CP_DMA_RAW_WAIT (1 << 1) +#define CP_DMA_DST_IS_GDS (1 << 2) +#define CP_DMA_CLEAR (1 << 3) +#define CP_DMA_PFP_SYNC_ME (1 << 4) +#define CP_DMA_SRC_IS_GDS (1 << 5) /* The max number of bytes that can be copied per packet. */ static inline unsigned cp_dma_max_byte_count(struct si_context *sctx) { - unsigned max = sctx->chip_class >= GFX9 ? - S_414_BYTE_COUNT_GFX9(~0u) : - S_414_BYTE_COUNT_GFX6(~0u); + unsigned max = + sctx->chip_class >= GFX9 ? S_414_BYTE_COUNT_GFX9(~0u) : S_414_BYTE_COUNT_GFX6(~0u); - /* make it aligned for optimal performance */ - return max & ~(SI_CPDMA_ALIGNMENT - 1); + /* make it aligned for optimal performance */ + return max & ~(SI_CPDMA_ALIGNMENT - 1); } - /* Emit a CP DMA packet to do a copy from one buffer to another, or to clear * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, src_va is a 32-bit * clear value. 
*/ -static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, - uint64_t dst_va, uint64_t src_va, unsigned size, - unsigned flags, enum si_cache_policy cache_policy) +static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, uint64_t dst_va, + uint64_t src_va, unsigned size, unsigned flags, + enum si_cache_policy cache_policy) { - uint32_t header = 0, command = 0; - - assert(size <= cp_dma_max_byte_count(sctx)); - assert(sctx->chip_class != GFX6 || cache_policy == L2_BYPASS); - - if (sctx->chip_class >= GFX9) - command |= S_414_BYTE_COUNT_GFX9(size); - else - command |= S_414_BYTE_COUNT_GFX6(size); - - /* Sync flags. */ - if (flags & CP_DMA_SYNC) - header |= S_411_CP_SYNC(1); - else { - if (sctx->chip_class >= GFX9) - command |= S_414_DISABLE_WR_CONFIRM_GFX9(1); - else - command |= S_414_DISABLE_WR_CONFIRM_GFX6(1); - } - - if (flags & CP_DMA_RAW_WAIT) - command |= S_414_RAW_WAIT(1); - - /* Src and dst flags. */ - if (sctx->chip_class >= GFX9 && !(flags & CP_DMA_CLEAR) && - src_va == dst_va) { - header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */ - } else if (flags & CP_DMA_DST_IS_GDS) { - header |= S_411_DST_SEL(V_411_GDS); - /* GDS increments the address, not CP. */ - command |= S_414_DAS(V_414_REGISTER) | - S_414_DAIC(V_414_NO_INCREMENT); - } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) { - header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2) | - S_500_DST_CACHE_POLICY(cache_policy == L2_STREAM); - } - - if (flags & CP_DMA_CLEAR) { - header |= S_411_SRC_SEL(V_411_DATA); - } else if (flags & CP_DMA_SRC_IS_GDS) { - header |= S_411_SRC_SEL(V_411_GDS); - /* Both of these are required for GDS. It does increment the address. */ - command |= S_414_SAS(V_414_REGISTER) | - S_414_SAIC(V_414_NO_INCREMENT); - } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) { - header |= S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) | - S_500_SRC_CACHE_POLICY(cache_policy == L2_STREAM); - } - - if (sctx->chip_class >= GFX7) { - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, header); - radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ - radeon_emit(cs, command); - } else { - header |= S_411_SRC_ADDR_HI(src_va >> 32); - - radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); - radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, header); /* SRC_ADDR_HI [15:0] + flags. */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, command); - } - - /* CP DMA is executed in ME, but index buffers are read by PFP. - * This ensures that ME (CP DMA) is idle before PFP starts fetching - * indices. If we wanted to execute CP DMA in PFP, this packet - * should precede it. - */ - if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME) { - radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); - radeon_emit(cs, 0); - } + uint32_t header = 0, command = 0; + + assert(size <= cp_dma_max_byte_count(sctx)); + assert(sctx->chip_class != GFX6 || cache_policy == L2_BYPASS); + + if (sctx->chip_class >= GFX9) + command |= S_414_BYTE_COUNT_GFX9(size); + else + command |= S_414_BYTE_COUNT_GFX6(size); + + /* Sync flags. 
*/ + if (flags & CP_DMA_SYNC) + header |= S_411_CP_SYNC(1); + else { + if (sctx->chip_class >= GFX9) + command |= S_414_DISABLE_WR_CONFIRM_GFX9(1); + else + command |= S_414_DISABLE_WR_CONFIRM_GFX6(1); + } + + if (flags & CP_DMA_RAW_WAIT) + command |= S_414_RAW_WAIT(1); + + /* Src and dst flags. */ + if (sctx->chip_class >= GFX9 && !(flags & CP_DMA_CLEAR) && src_va == dst_va) { + header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */ + } else if (flags & CP_DMA_DST_IS_GDS) { + header |= S_411_DST_SEL(V_411_GDS); + /* GDS increments the address, not CP. */ + command |= S_414_DAS(V_414_REGISTER) | S_414_DAIC(V_414_NO_INCREMENT); + } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) { + header |= + S_411_DST_SEL(V_411_DST_ADDR_TC_L2) | S_500_DST_CACHE_POLICY(cache_policy == L2_STREAM); + } + + if (flags & CP_DMA_CLEAR) { + header |= S_411_SRC_SEL(V_411_DATA); + } else if (flags & CP_DMA_SRC_IS_GDS) { + header |= S_411_SRC_SEL(V_411_GDS); + /* Both of these are required for GDS. It does increment the address. */ + command |= S_414_SAS(V_414_REGISTER) | S_414_SAIC(V_414_NO_INCREMENT); + } else if (sctx->chip_class >= GFX7 && cache_policy != L2_BYPASS) { + header |= + S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) | S_500_SRC_CACHE_POLICY(cache_policy == L2_STREAM); + } + + if (sctx->chip_class >= GFX7) { + radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(cs, header); + radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ + radeon_emit(cs, command); + } else { + header |= S_411_SRC_ADDR_HI(src_va >> 32); + + radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); + radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(cs, header); /* SRC_ADDR_HI [15:0] + flags. */ + radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ + radeon_emit(cs, command); + } + + /* CP DMA is executed in ME, but index buffers are read by PFP. + * This ensures that ME (CP DMA) is idle before PFP starts fetching + * indices. If we wanted to execute CP DMA in PFP, this packet + * should precede it. + */ + if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME) { + radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cs, 0); + } } void si_cp_dma_wait_for_idle(struct si_context *sctx) { - /* Issue a dummy DMA that copies zero bytes. - * - * The DMA engine will see that there's no work to do and skip this - * DMA request, however, the CP will see the sync flag and still wait - * for all DMAs to complete. - */ - si_emit_cp_dma(sctx, sctx->gfx_cs, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS); + /* Issue a dummy DMA that copies zero bytes. + * + * The DMA engine will see that there's no work to do and skip this + * DMA request, however, the CP will see the sync flag and still wait + * for all DMAs to complete. + */ + si_emit_cp_dma(sctx, sctx->gfx_cs, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS); } static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst, - struct pipe_resource *src, unsigned byte_count, - uint64_t remaining_size, unsigned user_flags, - enum si_coherency coher, bool *is_first, - unsigned *packet_flags) + struct pipe_resource *src, unsigned byte_count, + uint64_t remaining_size, unsigned user_flags, enum si_coherency coher, + bool *is_first, unsigned *packet_flags) { - /* Fast exit for a CPDMA prefetch. 
*/ - if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) { - *is_first = false; - return; - } - - if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) { - /* Count memory usage in so that need_cs_space can take it into account. */ - if (dst) - si_context_add_resource_size(sctx, dst); - if (src) - si_context_add_resource_size(sctx, src); - } - - if (!(user_flags & SI_CPDMA_SKIP_CHECK_CS_SPACE)) - si_need_gfx_cs_space(sctx); - - /* This must be done after need_cs_space. */ - if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) { - if (dst) - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(dst), - RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); - if (src) - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(src), - RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); - } - - /* Flush the caches for the first copy only. - * Also wait for the previous CP DMA operations. - */ - if (!(user_flags & SI_CPDMA_SKIP_GFX_SYNC) && sctx->flags) - sctx->emit_cache_flush(sctx); - - if (!(user_flags & SI_CPDMA_SKIP_SYNC_BEFORE) && *is_first && - !(*packet_flags & CP_DMA_CLEAR)) - *packet_flags |= CP_DMA_RAW_WAIT; - - *is_first = false; - - /* Do the synchronization after the last dma, so that all data - * is written to memory. - */ - if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) && - byte_count == remaining_size) { - *packet_flags |= CP_DMA_SYNC; - - if (coher == SI_COHERENCY_SHADER) - *packet_flags |= CP_DMA_PFP_SYNC_ME; - } + /* Fast exit for a CPDMA prefetch. */ + if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) { + *is_first = false; + return; + } + + if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) { + /* Count memory usage in so that need_cs_space can take it into account. */ + if (dst) + si_context_add_resource_size(sctx, dst); + if (src) + si_context_add_resource_size(sctx, src); + } + + if (!(user_flags & SI_CPDMA_SKIP_CHECK_CS_SPACE)) + si_need_gfx_cs_space(sctx); + + /* This must be done after need_cs_space. */ + if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) { + if (dst) + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(dst), RADEON_USAGE_WRITE, + RADEON_PRIO_CP_DMA); + if (src) + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(src), RADEON_USAGE_READ, + RADEON_PRIO_CP_DMA); + } + + /* Flush the caches for the first copy only. + * Also wait for the previous CP DMA operations. + */ + if (!(user_flags & SI_CPDMA_SKIP_GFX_SYNC) && sctx->flags) + sctx->emit_cache_flush(sctx); + + if (!(user_flags & SI_CPDMA_SKIP_SYNC_BEFORE) && *is_first && !(*packet_flags & CP_DMA_CLEAR)) + *packet_flags |= CP_DMA_RAW_WAIT; + + *is_first = false; + + /* Do the synchronization after the last dma, so that all data + * is written to memory. + */ + if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) && byte_count == remaining_size) { + *packet_flags |= CP_DMA_SYNC; + + if (coher == SI_COHERENCY_SHADER) + *packet_flags |= CP_DMA_PFP_SYNC_ME; + } } void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, - struct pipe_resource *dst, uint64_t offset, - uint64_t size, unsigned value, unsigned user_flags, - enum si_coherency coher, enum si_cache_policy cache_policy) + struct pipe_resource *dst, uint64_t offset, uint64_t size, + unsigned value, unsigned user_flags, enum si_coherency coher, + enum si_cache_policy cache_policy) { - struct si_resource *sdst = si_resource(dst); - uint64_t va = (sdst ? 
sdst->gpu_address : 0) + offset; - bool is_first = true; - - assert(size && size % 4 == 0); - - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - if (sdst) - util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size); - - /* Flush the caches. */ - if (sdst && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) { - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, coher, cache_policy); - } - - while (size) { - unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx)); - unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS); - - si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, user_flags, - coher, &is_first, &dma_flags); - - /* Emit the clear packet. */ - si_emit_cp_dma(sctx, cs, va, value, byte_count, dma_flags, cache_policy); - - size -= byte_count; - va += byte_count; - } - - if (sdst && cache_policy != L2_BYPASS) - sdst->TC_L2_dirty = true; - - /* If it's not a framebuffer fast clear... */ - if (coher == SI_COHERENCY_SHADER) { - sctx->num_cp_dma_calls++; - si_prim_discard_signal_next_compute_ib_start(sctx); - } + struct si_resource *sdst = si_resource(dst); + uint64_t va = (sdst ? sdst->gpu_address : 0) + offset; + bool is_first = true; + + assert(size && size % 4 == 0); + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + if (sdst) + util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size); + + /* Flush the caches. */ + if (sdst && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) { + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | + si_get_flush_flags(sctx, coher, cache_policy); + } + + while (size) { + unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx)); + unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS); + + si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, user_flags, coher, &is_first, + &dma_flags); + + /* Emit the clear packet. */ + si_emit_cp_dma(sctx, cs, va, value, byte_count, dma_flags, cache_policy); + + size -= byte_count; + va += byte_count; + } + + if (sdst && cache_policy != L2_BYPASS) + sdst->TC_L2_dirty = true; + + /* If it's not a framebuffer fast clear... */ + if (coher == SI_COHERENCY_SHADER) { + sctx->num_cp_dma_calls++; + si_prim_discard_signal_next_compute_ib_start(sctx); + } } /** @@ -261,41 +250,34 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, * * \param size Remaining size to the CP DMA alignment. */ -static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, - unsigned user_flags, enum si_coherency coher, - enum si_cache_policy cache_policy, - bool *is_first) +static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, unsigned user_flags, + enum si_coherency coher, enum si_cache_policy cache_policy, + bool *is_first) { - uint64_t va; - unsigned dma_flags = 0; - unsigned scratch_size = SI_CPDMA_ALIGNMENT * 2; - - assert(size < SI_CPDMA_ALIGNMENT); - - /* Use the scratch buffer as the dummy buffer. The 3D engine should be - * idle at this point. 
- */ - if (!sctx->scratch_buffer || - sctx->scratch_buffer->b.b.width0 < scratch_size) { - si_resource_reference(&sctx->scratch_buffer, NULL); - sctx->scratch_buffer = - si_aligned_buffer_create(&sctx->screen->b, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - scratch_size, 256); - if (!sctx->scratch_buffer) - return; - - si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); - } - - si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b, - &sctx->scratch_buffer->b.b, size, size, user_flags, - coher, is_first, &dma_flags); - - va = sctx->scratch_buffer->gpu_address; - si_emit_cp_dma(sctx, sctx->gfx_cs, va, va + SI_CPDMA_ALIGNMENT, size, dma_flags, - cache_policy); + uint64_t va; + unsigned dma_flags = 0; + unsigned scratch_size = SI_CPDMA_ALIGNMENT * 2; + + assert(size < SI_CPDMA_ALIGNMENT); + + /* Use the scratch buffer as the dummy buffer. The 3D engine should be + * idle at this point. + */ + if (!sctx->scratch_buffer || sctx->scratch_buffer->b.b.width0 < scratch_size) { + si_resource_reference(&sctx->scratch_buffer, NULL); + sctx->scratch_buffer = si_aligned_buffer_create(&sctx->screen->b, SI_RESOURCE_FLAG_UNMAPPABLE, + PIPE_USAGE_DEFAULT, scratch_size, 256); + if (!sctx->scratch_buffer) + return; + + si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); + } + + si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b, &sctx->scratch_buffer->b.b, size, size, + user_flags, coher, is_first, &dma_flags); + + va = sctx->scratch_buffer->gpu_address; + si_emit_cp_dma(sctx, sctx->gfx_cs, va, va + SI_CPDMA_ALIGNMENT, size, dma_flags, cache_policy); } /** @@ -304,141 +286,131 @@ static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, * * \param user_flags bitmask of SI_CPDMA_* */ -void si_cp_dma_copy_buffer(struct si_context *sctx, - struct pipe_resource *dst, struct pipe_resource *src, - uint64_t dst_offset, uint64_t src_offset, unsigned size, - unsigned user_flags, enum si_coherency coher, - enum si_cache_policy cache_policy) +void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, + struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, + unsigned size, unsigned user_flags, enum si_coherency coher, + enum si_cache_policy cache_policy) { - uint64_t main_dst_offset, main_src_offset; - unsigned skipped_size = 0; - unsigned realign_size = 0; - unsigned gds_flags = (dst ? 0 : CP_DMA_DST_IS_GDS) | - (src ? 0 : CP_DMA_SRC_IS_GDS); - bool is_first = true; - - assert(size); - - if (dst) { - /* Skip this for the L2 prefetch. */ - if (dst != src || dst_offset != src_offset) { - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset, - dst_offset + size); - } - - dst_offset += si_resource(dst)->gpu_address; - } - if (src) - src_offset += si_resource(src)->gpu_address; - - /* The workarounds aren't needed on Fiji and beyond. */ - if (sctx->family <= CHIP_CARRIZO || - sctx->family == CHIP_STONEY) { - /* If the size is not aligned, we must add a dummy copy at the end - * just to align the internal counter. Otherwise, the DMA engine - * would slow down by an order of magnitude for following copies. - */ - if (size % SI_CPDMA_ALIGNMENT) - realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT); - - /* If the copy begins unaligned, we must start copying from the next - * aligned block and the skipped part should be copied after everything - * else has been copied. 
Only the src alignment matters, not dst. - * - * GDS doesn't need the source address to be aligned. - */ - if (src && src_offset % SI_CPDMA_ALIGNMENT) { - skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT); - /* The main part will be skipped if the size is too small. */ - skipped_size = MIN2(skipped_size, size); - size -= skipped_size; - } - } - - /* Flush the caches. */ - if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) { - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, coher, cache_policy); - } - - /* This is the main part doing the copying. Src is always aligned. */ - main_dst_offset = dst_offset + skipped_size; - main_src_offset = src_offset + skipped_size; - - while (size) { - unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx)); - unsigned dma_flags = gds_flags; - - si_cp_dma_prepare(sctx, dst, src, byte_count, - size + skipped_size + realign_size, - user_flags, coher, &is_first, &dma_flags); - - si_emit_cp_dma(sctx, sctx->gfx_cs, main_dst_offset, main_src_offset, - byte_count, dma_flags, cache_policy); - - size -= byte_count; - main_src_offset += byte_count; - main_dst_offset += byte_count; - } - - /* Copy the part we skipped because src wasn't aligned. */ - if (skipped_size) { - unsigned dma_flags = gds_flags; - - si_cp_dma_prepare(sctx, dst, src, skipped_size, - skipped_size + realign_size, user_flags, - coher, &is_first, &dma_flags); - - si_emit_cp_dma(sctx, sctx->gfx_cs, dst_offset, src_offset, skipped_size, - dma_flags, cache_policy); - } - - /* Finally, realign the engine if the size wasn't aligned. */ - if (realign_size) { - si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher, - cache_policy, &is_first); - } - - if (dst && cache_policy != L2_BYPASS) - si_resource(dst)->TC_L2_dirty = true; - - /* If it's not a prefetch or GDS copy... */ - if (dst && src && (dst != src || dst_offset != src_offset)) { - sctx->num_cp_dma_calls++; - si_prim_discard_signal_next_compute_ib_start(sctx); - } + uint64_t main_dst_offset, main_src_offset; + unsigned skipped_size = 0; + unsigned realign_size = 0; + unsigned gds_flags = (dst ? 0 : CP_DMA_DST_IS_GDS) | (src ? 0 : CP_DMA_SRC_IS_GDS); + bool is_first = true; + + assert(size); + + if (dst) { + /* Skip this for the L2 prefetch. */ + if (dst != src || dst_offset != src_offset) { + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size); + } + + dst_offset += si_resource(dst)->gpu_address; + } + if (src) + src_offset += si_resource(src)->gpu_address; + + /* The workarounds aren't needed on Fiji and beyond. */ + if (sctx->family <= CHIP_CARRIZO || sctx->family == CHIP_STONEY) { + /* If the size is not aligned, we must add a dummy copy at the end + * just to align the internal counter. Otherwise, the DMA engine + * would slow down by an order of magnitude for following copies. + */ + if (size % SI_CPDMA_ALIGNMENT) + realign_size = SI_CPDMA_ALIGNMENT - (size % SI_CPDMA_ALIGNMENT); + + /* If the copy begins unaligned, we must start copying from the next + * aligned block and the skipped part should be copied after everything + * else has been copied. Only the src alignment matters, not dst. + * + * GDS doesn't need the source address to be aligned. 
+ */ + if (src && src_offset % SI_CPDMA_ALIGNMENT) { + skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT); + /* The main part will be skipped if the size is too small. */ + skipped_size = MIN2(skipped_size, size); + size -= skipped_size; + } + } + + /* Flush the caches. */ + if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) { + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | + si_get_flush_flags(sctx, coher, cache_policy); + } + + /* This is the main part doing the copying. Src is always aligned. */ + main_dst_offset = dst_offset + skipped_size; + main_src_offset = src_offset + skipped_size; + + while (size) { + unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx)); + unsigned dma_flags = gds_flags; + + si_cp_dma_prepare(sctx, dst, src, byte_count, size + skipped_size + realign_size, user_flags, + coher, &is_first, &dma_flags); + + si_emit_cp_dma(sctx, sctx->gfx_cs, main_dst_offset, main_src_offset, byte_count, dma_flags, + cache_policy); + + size -= byte_count; + main_src_offset += byte_count; + main_dst_offset += byte_count; + } + + /* Copy the part we skipped because src wasn't aligned. */ + if (skipped_size) { + unsigned dma_flags = gds_flags; + + si_cp_dma_prepare(sctx, dst, src, skipped_size, skipped_size + realign_size, user_flags, + coher, &is_first, &dma_flags); + + si_emit_cp_dma(sctx, sctx->gfx_cs, dst_offset, src_offset, skipped_size, dma_flags, + cache_policy); + } + + /* Finally, realign the engine if the size wasn't aligned. */ + if (realign_size) { + si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher, cache_policy, &is_first); + } + + if (dst && cache_policy != L2_BYPASS) + si_resource(dst)->TC_L2_dirty = true; + + /* If it's not a prefetch or GDS copy... */ + if (dst && src && (dst != src || dst_offset != src_offset)) { + sctx->num_cp_dma_calls++; + si_prim_discard_signal_next_compute_ib_start(sctx); + } } -void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, - uint64_t offset, unsigned size) +void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset, + unsigned size) { - assert(sctx->chip_class >= GFX7); + assert(sctx->chip_class >= GFX7); - si_cp_dma_copy_buffer(sctx, buf, buf, offset, offset, size, - SI_CPDMA_SKIP_ALL, SI_COHERENCY_SHADER, L2_LRU); + si_cp_dma_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL, + SI_COHERENCY_SHADER, L2_LRU); } -static void cik_prefetch_shader_async(struct si_context *sctx, - struct si_pm4_state *state) +static void cik_prefetch_shader_async(struct si_context *sctx, struct si_pm4_state *state) { - struct pipe_resource *bo = &state->bo[0]->b.b; - assert(state->nbo == 1); + struct pipe_resource *bo = &state->bo[0]->b.b; + assert(state->nbo == 1); - cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0); + cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0); } static void cik_prefetch_VBO_descriptors(struct si_context *sctx) { - if (!sctx->vertex_elements || !sctx->vertex_elements->vb_desc_list_alloc_size) - return; + if (!sctx->vertex_elements || !sctx->vertex_elements->vb_desc_list_alloc_size) + return; - cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b, - sctx->vb_descriptors_offset, - sctx->vertex_elements->vb_desc_list_alloc_size); + cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset, + sctx->vertex_elements->vb_desc_list_alloc_size); } /** @@ -449,191 +421,185 @@ static void cik_prefetch_VBO_descriptors(struct si_context 
*sctx) */ void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only) { - unsigned mask = sctx->prefetch_L2_mask; - assert(mask); - - /* Prefetch shaders and VBO descriptors to TC L2. */ - if (sctx->chip_class >= GFX9) { - /* Choose the right spot for the VBO prefetch. */ - if (sctx->queued.named.hs) { - if (mask & SI_PREFETCH_HS) - cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (mask & SI_PREFETCH_VBO_DESCRIPTORS) - cik_prefetch_VBO_descriptors(sctx); - if (vertex_stage_only) { - sctx->prefetch_L2_mask &= ~(SI_PREFETCH_HS | - SI_PREFETCH_VBO_DESCRIPTORS); - return; - } - - if (mask & SI_PREFETCH_GS) - cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (mask & SI_PREFETCH_VS) - cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - } else if (sctx->queued.named.gs) { - if (mask & SI_PREFETCH_GS) - cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (mask & SI_PREFETCH_VBO_DESCRIPTORS) - cik_prefetch_VBO_descriptors(sctx); - if (vertex_stage_only) { - sctx->prefetch_L2_mask &= ~(SI_PREFETCH_GS | - SI_PREFETCH_VBO_DESCRIPTORS); - return; - } - - if (mask & SI_PREFETCH_VS) - cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - } else { - if (mask & SI_PREFETCH_VS) - cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - if (mask & SI_PREFETCH_VBO_DESCRIPTORS) - cik_prefetch_VBO_descriptors(sctx); - if (vertex_stage_only) { - sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | - SI_PREFETCH_VBO_DESCRIPTORS); - return; - } - } - } else { - /* GFX6-GFX8 */ - /* Choose the right spot for the VBO prefetch. */ - if (sctx->tes_shader.cso) { - if (mask & SI_PREFETCH_LS) - cik_prefetch_shader_async(sctx, sctx->queued.named.ls); - if (mask & SI_PREFETCH_VBO_DESCRIPTORS) - cik_prefetch_VBO_descriptors(sctx); - if (vertex_stage_only) { - sctx->prefetch_L2_mask &= ~(SI_PREFETCH_LS | - SI_PREFETCH_VBO_DESCRIPTORS); - return; - } - - if (mask & SI_PREFETCH_HS) - cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (mask & SI_PREFETCH_ES) - cik_prefetch_shader_async(sctx, sctx->queued.named.es); - if (mask & SI_PREFETCH_GS) - cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (mask & SI_PREFETCH_VS) - cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - } else if (sctx->gs_shader.cso) { - if (mask & SI_PREFETCH_ES) - cik_prefetch_shader_async(sctx, sctx->queued.named.es); - if (mask & SI_PREFETCH_VBO_DESCRIPTORS) - cik_prefetch_VBO_descriptors(sctx); - if (vertex_stage_only) { - sctx->prefetch_L2_mask &= ~(SI_PREFETCH_ES | - SI_PREFETCH_VBO_DESCRIPTORS); - return; - } - - if (mask & SI_PREFETCH_GS) - cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (mask & SI_PREFETCH_VS) - cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - } else { - if (mask & SI_PREFETCH_VS) - cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - if (mask & SI_PREFETCH_VBO_DESCRIPTORS) - cik_prefetch_VBO_descriptors(sctx); - if (vertex_stage_only) { - sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | - SI_PREFETCH_VBO_DESCRIPTORS); - return; - } - } - } - - if (mask & SI_PREFETCH_PS) - cik_prefetch_shader_async(sctx, sctx->queued.named.ps); - - sctx->prefetch_L2_mask = 0; + unsigned mask = sctx->prefetch_L2_mask; + assert(mask); + + /* Prefetch shaders and VBO descriptors to TC L2. */ + if (sctx->chip_class >= GFX9) { + /* Choose the right spot for the VBO prefetch. 
*/ + if (sctx->queued.named.hs) { + if (mask & SI_PREFETCH_HS) + cik_prefetch_shader_async(sctx, sctx->queued.named.hs); + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) + cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_HS | SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_GS) + cik_prefetch_shader_async(sctx, sctx->queued.named.gs); + if (mask & SI_PREFETCH_VS) + cik_prefetch_shader_async(sctx, sctx->queued.named.vs); + } else if (sctx->queued.named.gs) { + if (mask & SI_PREFETCH_GS) + cik_prefetch_shader_async(sctx, sctx->queued.named.gs); + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) + cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_GS | SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_VS) + cik_prefetch_shader_async(sctx, sctx->queued.named.vs); + } else { + if (mask & SI_PREFETCH_VS) + cik_prefetch_shader_async(sctx, sctx->queued.named.vs); + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) + cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + } + } else { + /* GFX6-GFX8 */ + /* Choose the right spot for the VBO prefetch. */ + if (sctx->tes_shader.cso) { + if (mask & SI_PREFETCH_LS) + cik_prefetch_shader_async(sctx, sctx->queued.named.ls); + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) + cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_LS | SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_HS) + cik_prefetch_shader_async(sctx, sctx->queued.named.hs); + if (mask & SI_PREFETCH_ES) + cik_prefetch_shader_async(sctx, sctx->queued.named.es); + if (mask & SI_PREFETCH_GS) + cik_prefetch_shader_async(sctx, sctx->queued.named.gs); + if (mask & SI_PREFETCH_VS) + cik_prefetch_shader_async(sctx, sctx->queued.named.vs); + } else if (sctx->gs_shader.cso) { + if (mask & SI_PREFETCH_ES) + cik_prefetch_shader_async(sctx, sctx->queued.named.es); + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) + cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_ES | SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_GS) + cik_prefetch_shader_async(sctx, sctx->queued.named.gs); + if (mask & SI_PREFETCH_VS) + cik_prefetch_shader_async(sctx, sctx->queued.named.vs); + } else { + if (mask & SI_PREFETCH_VS) + cik_prefetch_shader_async(sctx, sctx->queued.named.vs); + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) + cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + } + } + + if (mask & SI_PREFETCH_PS) + cik_prefetch_shader_async(sctx, sctx->queued.named.ps); + + sctx->prefetch_L2_mask = 0; } void si_test_gds(struct si_context *sctx) { - struct pipe_context *ctx = &sctx->b; - struct pipe_resource *src, *dst; - unsigned r[4] = {}; - unsigned offset = debug_get_num_option("OFFSET", 16); - - src = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16); - dst = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16); - si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 0, 4, 0xabcdef01, 0, SI_COHERENCY_SHADER, L2_BYPASS); - si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 4, 4, 0x23456789, 0, SI_COHERENCY_SHADER, L2_BYPASS); - si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 8, 4, 0x87654321, 0, SI_COHERENCY_SHADER, L2_BYPASS); - si_cp_dma_clear_buffer(sctx, 
sctx->gfx_cs, src, 12, 4, 0xfedcba98, 0, SI_COHERENCY_SHADER, L2_BYPASS); - si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, 16, 0xdeadbeef, 0, SI_COHERENCY_SHADER, L2_BYPASS); - - si_cp_dma_copy_buffer(sctx, NULL, src, offset, 0, 16, 0, SI_COHERENCY_NONE, L2_BYPASS); - si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS); - - pipe_buffer_read(ctx, dst, 0, sizeof(r), r); - printf("GDS copy = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3], - r[0] == 0xabcdef01 && r[1] == 0x23456789 && - r[2] == 0x87654321 && r[3] == 0xfedcba98 ? "pass" : "fail"); - - si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, NULL, offset, 16, 0xc1ea4146, 0, SI_COHERENCY_NONE, L2_BYPASS); - si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS); - - pipe_buffer_read(ctx, dst, 0, sizeof(r), r); - printf("GDS clear = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3], - r[0] == 0xc1ea4146 && r[1] == 0xc1ea4146 && - r[2] == 0xc1ea4146 && r[3] == 0xc1ea4146 ? "pass" : "fail"); - - pipe_resource_reference(&src, NULL); - pipe_resource_reference(&dst, NULL); - exit(0); + struct pipe_context *ctx = &sctx->b; + struct pipe_resource *src, *dst; + unsigned r[4] = {}; + unsigned offset = debug_get_num_option("OFFSET", 16); + + src = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16); + dst = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16); + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 0, 4, 0xabcdef01, 0, SI_COHERENCY_SHADER, + L2_BYPASS); + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 4, 4, 0x23456789, 0, SI_COHERENCY_SHADER, + L2_BYPASS); + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 8, 4, 0x87654321, 0, SI_COHERENCY_SHADER, + L2_BYPASS); + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 12, 4, 0xfedcba98, 0, SI_COHERENCY_SHADER, + L2_BYPASS); + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, 16, 0xdeadbeef, 0, SI_COHERENCY_SHADER, + L2_BYPASS); + + si_cp_dma_copy_buffer(sctx, NULL, src, offset, 0, 16, 0, SI_COHERENCY_NONE, L2_BYPASS); + si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS); + + pipe_buffer_read(ctx, dst, 0, sizeof(r), r); + printf("GDS copy = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3], + r[0] == 0xabcdef01 && r[1] == 0x23456789 && r[2] == 0x87654321 && r[3] == 0xfedcba98 + ? "pass" + : "fail"); + + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, NULL, offset, 16, 0xc1ea4146, 0, SI_COHERENCY_NONE, + L2_BYPASS); + si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, SI_COHERENCY_NONE, L2_BYPASS); + + pipe_buffer_read(ctx, dst, 0, sizeof(r), r); + printf("GDS clear = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3], + r[0] == 0xc1ea4146 && r[1] == 0xc1ea4146 && r[2] == 0xc1ea4146 && r[3] == 0xc1ea4146 + ? 
"pass" + : "fail"); + + pipe_resource_reference(&src, NULL); + pipe_resource_reference(&dst, NULL); + exit(0); } -void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, - unsigned offset, unsigned size, unsigned dst_sel, - unsigned engine, const void *data) +void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset, + unsigned size, unsigned dst_sel, unsigned engine, const void *data) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - assert(offset % 4 == 0); - assert(size % 4 == 0); + assert(offset % 4 == 0); + assert(size % 4 == 0); - if (sctx->chip_class == GFX6 && dst_sel == V_370_MEM) - dst_sel = V_370_MEM_GRBM; + if (sctx->chip_class == GFX6 && dst_sel == V_370_MEM) + dst_sel = V_370_MEM_GRBM; - radeon_add_to_buffer_list(sctx, cs, buf, - RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); - uint64_t va = buf->gpu_address + offset; + radeon_add_to_buffer_list(sctx, cs, buf, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); + uint64_t va = buf->gpu_address + offset; - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + size/4, 0)); - radeon_emit(cs, S_370_DST_SEL(dst_sel) | - S_370_WR_CONFIRM(1) | - S_370_ENGINE_SEL(engine)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit_array(cs, (const uint32_t*)data, size/4); + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0)); + radeon_emit(cs, S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit_array(cs, (const uint32_t *)data, size / 4); } -void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, - unsigned dst_sel, struct si_resource *dst, unsigned dst_offset, - unsigned src_sel, struct si_resource *src, unsigned src_offset) +void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned dst_sel, + struct si_resource *dst, unsigned dst_offset, unsigned src_sel, + struct si_resource *src, unsigned src_offset) { - /* cs can point to the compute IB, which has the buffer list in gfx_cs. */ - if (dst) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dst, - RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); - } - if (src) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, src, - RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); - } - - uint64_t dst_va = (dst ? dst->gpu_address : 0ull) + dst_offset; - uint64_t src_va = (src ? src->gpu_address : 0ull) + src_offset; - - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) | - COPY_DATA_DST_SEL(dst_sel) | - COPY_DATA_WR_CONFIRM); - radeon_emit(cs, src_va); - radeon_emit(cs, src_va >> 32); - radeon_emit(cs, dst_va); - radeon_emit(cs, dst_va >> 32); + /* cs can point to the compute IB, which has the buffer list in gfx_cs. */ + if (dst) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); + } + if (src) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, src, RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); + } + + uint64_t dst_va = (dst ? dst->gpu_address : 0ull) + dst_offset; + uint64_t src_va = (src ? 
src->gpu_address : 0ull) + src_offset; + + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM); + radeon_emit(cs, src_va); + radeon_emit(cs, src_va >> 32); + radeon_emit(cs, dst_va); + radeon_emit(cs, dst_va >> 32); } diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index cbd92c02c73..acd86730d0b 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -22,21 +22,20 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_pipe.h" +#include "ac_debug.h" +#include "ac_rtld.h" +#include "driver_ddebug/dd_util.h" #include "si_compute.h" +#include "si_pipe.h" #include "sid.h" #include "sid_tables.h" #include "tgsi/tgsi_from_mesa.h" -#include "driver_ddebug/dd_util.h" #include "util/u_dump.h" #include "util/u_log.h" #include "util/u_memory.h" #include "util/u_string.h" -#include "ac_debug.h" -#include "ac_rtld.h" -static void si_dump_bo_list(struct si_context *sctx, - const struct radeon_saved_cs *saved, FILE *f); +static void si_dump_bo_list(struct si_context *sctx, const struct radeon_saved_cs *saved, FILE *f); DEBUG_GET_ONCE_OPTION(replace_shaders, "RADEON_REPLACE_SHADERS", NULL) @@ -44,155 +43,148 @@ DEBUG_GET_ONCE_OPTION(replace_shaders, "RADEON_REPLACE_SHADERS", NULL) * Store a linearized copy of all chunks of \p cs together with the buffer * list in \p saved. */ -void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, - struct radeon_saved_cs *saved, bool get_buffer_list) +void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, struct radeon_saved_cs *saved, + bool get_buffer_list) { - uint32_t *buf; - unsigned i; - - /* Save the IB chunks. */ - saved->num_dw = cs->prev_dw + cs->current.cdw; - saved->ib = MALLOC(4 * saved->num_dw); - if (!saved->ib) - goto oom; - - buf = saved->ib; - for (i = 0; i < cs->num_prev; ++i) { - memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4); - buf += cs->prev[i].cdw; - } - memcpy(buf, cs->current.buf, cs->current.cdw * 4); - - if (!get_buffer_list) - return; - - /* Save the buffer list. */ - saved->bo_count = ws->cs_get_buffer_list(cs, NULL); - saved->bo_list = CALLOC(saved->bo_count, - sizeof(saved->bo_list[0])); - if (!saved->bo_list) { - FREE(saved->ib); - goto oom; - } - ws->cs_get_buffer_list(cs, saved->bo_list); - - return; + uint32_t *buf; + unsigned i; + + /* Save the IB chunks. */ + saved->num_dw = cs->prev_dw + cs->current.cdw; + saved->ib = MALLOC(4 * saved->num_dw); + if (!saved->ib) + goto oom; + + buf = saved->ib; + for (i = 0; i < cs->num_prev; ++i) { + memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4); + buf += cs->prev[i].cdw; + } + memcpy(buf, cs->current.buf, cs->current.cdw * 4); + + if (!get_buffer_list) + return; + + /* Save the buffer list. 
*/ + saved->bo_count = ws->cs_get_buffer_list(cs, NULL); + saved->bo_list = CALLOC(saved->bo_count, sizeof(saved->bo_list[0])); + if (!saved->bo_list) { + FREE(saved->ib); + goto oom; + } + ws->cs_get_buffer_list(cs, saved->bo_list); + + return; oom: - fprintf(stderr, "%s: out of memory\n", __func__); - memset(saved, 0, sizeof(*saved)); + fprintf(stderr, "%s: out of memory\n", __func__); + memset(saved, 0, sizeof(*saved)); } void si_clear_saved_cs(struct radeon_saved_cs *saved) { - FREE(saved->ib); - FREE(saved->bo_list); + FREE(saved->ib); + FREE(saved->bo_list); - memset(saved, 0, sizeof(*saved)); + memset(saved, 0, sizeof(*saved)); } void si_destroy_saved_cs(struct si_saved_cs *scs) { - si_clear_saved_cs(&scs->gfx); - si_resource_reference(&scs->trace_buf, NULL); - free(scs); + si_clear_saved_cs(&scs->gfx); + si_resource_reference(&scs->trace_buf, NULL); + free(scs); } -static void si_dump_shader(struct si_screen *sscreen, - struct si_shader *shader, FILE *f) +static void si_dump_shader(struct si_screen *sscreen, struct si_shader *shader, FILE *f) { - if (shader->shader_log) - fwrite(shader->shader_log, shader->shader_log_size, 1, f); - else - si_shader_dump(sscreen, shader, NULL, f, false); + if (shader->shader_log) + fwrite(shader->shader_log, shader->shader_log_size, 1, f); + else + si_shader_dump(sscreen, shader, NULL, f, false); - if (shader->bo && sscreen->options.dump_shader_binary) { - unsigned size = shader->bo->b.b.width0; - fprintf(f, "BO: VA=%"PRIx64" Size=%u\n", shader->bo->gpu_address, size); + if (shader->bo && sscreen->options.dump_shader_binary) { + unsigned size = shader->bo->b.b.width0; + fprintf(f, "BO: VA=%" PRIx64 " Size=%u\n", shader->bo->gpu_address, size); - const char *mapped = sscreen->ws->buffer_map(shader->bo->buf, NULL, - PIPE_TRANSFER_UNSYNCHRONIZED | - PIPE_TRANSFER_READ | - RADEON_TRANSFER_TEMPORARY); + const char *mapped = sscreen->ws->buffer_map( + shader->bo->buf, NULL, + PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_READ | RADEON_TRANSFER_TEMPORARY); - for (unsigned i = 0; i < size; i += 4) { - fprintf(f, " %4x: %08x\n", i, *(uint32_t*)(mapped + i)); - } + for (unsigned i = 0; i < size; i += 4) { + fprintf(f, " %4x: %08x\n", i, *(uint32_t *)(mapped + i)); + } - sscreen->ws->buffer_unmap(shader->bo->buf); + sscreen->ws->buffer_unmap(shader->bo->buf); - fprintf(f, "\n"); - } + fprintf(f, "\n"); + } } struct si_log_chunk_shader { - /* The shader destroy code assumes a current context for unlinking of - * PM4 packets etc. - * - * While we should be able to destroy shaders without a context, doing - * so would happen only very rarely and be therefore likely to fail - * just when you're trying to debug something. Let's just remember the - * current context in the chunk. - */ - struct si_context *ctx; - struct si_shader *shader; - - /* For keep-alive reference counts */ - struct si_shader_selector *sel; - struct si_compute *program; + /* The shader destroy code assumes a current context for unlinking of + * PM4 packets etc. + * + * While we should be able to destroy shaders without a context, doing + * so would happen only very rarely and be therefore likely to fail + * just when you're trying to debug something. Let's just remember the + * current context in the chunk. 
+ */ + struct si_context *ctx; + struct si_shader *shader; + + /* For keep-alive reference counts */ + struct si_shader_selector *sel; + struct si_compute *program; }; -static void -si_log_chunk_shader_destroy(void *data) +static void si_log_chunk_shader_destroy(void *data) { - struct si_log_chunk_shader *chunk = data; - si_shader_selector_reference(chunk->ctx, &chunk->sel, NULL); - si_compute_reference(&chunk->program, NULL); - FREE(chunk); + struct si_log_chunk_shader *chunk = data; + si_shader_selector_reference(chunk->ctx, &chunk->sel, NULL); + si_compute_reference(&chunk->program, NULL); + FREE(chunk); } -static void -si_log_chunk_shader_print(void *data, FILE *f) +static void si_log_chunk_shader_print(void *data, FILE *f) { - struct si_log_chunk_shader *chunk = data; - struct si_screen *sscreen = chunk->ctx->screen; - si_dump_shader(sscreen, chunk->shader, f); + struct si_log_chunk_shader *chunk = data; + struct si_screen *sscreen = chunk->ctx->screen; + si_dump_shader(sscreen, chunk->shader, f); } static struct u_log_chunk_type si_log_chunk_type_shader = { - .destroy = si_log_chunk_shader_destroy, - .print = si_log_chunk_shader_print, + .destroy = si_log_chunk_shader_destroy, + .print = si_log_chunk_shader_print, }; -static void si_dump_gfx_shader(struct si_context *ctx, - const struct si_shader_ctx_state *state, - struct u_log_context *log) +static void si_dump_gfx_shader(struct si_context *ctx, const struct si_shader_ctx_state *state, + struct u_log_context *log) { - struct si_shader *current = state->current; + struct si_shader *current = state->current; - if (!state->cso || !current) - return; + if (!state->cso || !current) + return; - struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader); - chunk->ctx = ctx; - chunk->shader = current; - si_shader_selector_reference(ctx, &chunk->sel, current->selector); - u_log_chunk(log, &si_log_chunk_type_shader, chunk); + struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader); + chunk->ctx = ctx; + chunk->shader = current; + si_shader_selector_reference(ctx, &chunk->sel, current->selector); + u_log_chunk(log, &si_log_chunk_type_shader, chunk); } -static void si_dump_compute_shader(struct si_context *ctx, - struct u_log_context *log) +static void si_dump_compute_shader(struct si_context *ctx, struct u_log_context *log) { - const struct si_cs_shader_state *state = &ctx->cs_shader_state; + const struct si_cs_shader_state *state = &ctx->cs_shader_state; - if (!state->program) - return; + if (!state->program) + return; - struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader); - chunk->ctx = ctx; - chunk->shader = &state->program->shader; - si_compute_reference(&chunk->program, state->program); - u_log_chunk(log, &si_log_chunk_type_shader, chunk); + struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader); + chunk->ctx = ctx; + chunk->shader = &state->program->shader; + si_compute_reference(&chunk->program, state->program); + u_log_chunk(log, &si_log_chunk_type_shader, chunk); } /** @@ -203,724 +195,664 @@ static void si_dump_compute_shader(struct si_context *ctx, */ bool si_replace_shader(unsigned num, struct si_shader_binary *binary) { - const char *p = debug_get_option_replace_shaders(); - const char *semicolon; - char *copy = NULL; - FILE *f; - long filesize, nread; - bool replaced = false; - - if (!p) - return false; - - while (*p) { - unsigned long i; - char *endp; - i = strtoul(p, &endp, 0); - - p = endp; - if (*p != ':') { - fprintf(stderr, "RADEON_REPLACE_SHADERS formatted 
badly.\n"); - exit(1); - } - ++p; - - if (i == num) - break; - - p = strchr(p, ';'); - if (!p) - return false; - ++p; - } - if (!*p) - return false; - - semicolon = strchr(p, ';'); - if (semicolon) { - p = copy = strndup(p, semicolon - p); - if (!copy) { - fprintf(stderr, "out of memory\n"); - return false; - } - } - - fprintf(stderr, "radeonsi: replace shader %u by %s\n", num, p); - - f = fopen(p, "r"); - if (!f) { - perror("radeonsi: failed to open file"); - goto out_free; - } - - if (fseek(f, 0, SEEK_END) != 0) - goto file_error; - - filesize = ftell(f); - if (filesize < 0) - goto file_error; - - if (fseek(f, 0, SEEK_SET) != 0) - goto file_error; - - binary->elf_buffer = MALLOC(filesize); - if (!binary->elf_buffer) { - fprintf(stderr, "out of memory\n"); - goto out_close; - } - - nread = fread((void*)binary->elf_buffer, 1, filesize, f); - if (nread != filesize) { - FREE((void*)binary->elf_buffer); - binary->elf_buffer = NULL; - goto file_error; - } - - binary->elf_size = nread; - replaced = true; + const char *p = debug_get_option_replace_shaders(); + const char *semicolon; + char *copy = NULL; + FILE *f; + long filesize, nread; + bool replaced = false; + + if (!p) + return false; + + while (*p) { + unsigned long i; + char *endp; + i = strtoul(p, &endp, 0); + + p = endp; + if (*p != ':') { + fprintf(stderr, "RADEON_REPLACE_SHADERS formatted badly.\n"); + exit(1); + } + ++p; + + if (i == num) + break; + + p = strchr(p, ';'); + if (!p) + return false; + ++p; + } + if (!*p) + return false; + + semicolon = strchr(p, ';'); + if (semicolon) { + p = copy = strndup(p, semicolon - p); + if (!copy) { + fprintf(stderr, "out of memory\n"); + return false; + } + } + + fprintf(stderr, "radeonsi: replace shader %u by %s\n", num, p); + + f = fopen(p, "r"); + if (!f) { + perror("radeonsi: failed to open file"); + goto out_free; + } + + if (fseek(f, 0, SEEK_END) != 0) + goto file_error; + + filesize = ftell(f); + if (filesize < 0) + goto file_error; + + if (fseek(f, 0, SEEK_SET) != 0) + goto file_error; + + binary->elf_buffer = MALLOC(filesize); + if (!binary->elf_buffer) { + fprintf(stderr, "out of memory\n"); + goto out_close; + } + + nread = fread((void *)binary->elf_buffer, 1, filesize, f); + if (nread != filesize) { + FREE((void *)binary->elf_buffer); + binary->elf_buffer = NULL; + goto file_error; + } + + binary->elf_size = nread; + replaced = true; out_close: - fclose(f); + fclose(f); out_free: - free(copy); - return replaced; + free(copy); + return replaced; file_error: - perror("radeonsi: reading shader"); - goto out_close; + perror("radeonsi: reading shader"); + goto out_close; } /* Parsed IBs are difficult to read without colors. Use "less -R file" to * read them, or use "aha -b -f file" to convert them to html. 
*/ -#define COLOR_RESET "\033[0m" -#define COLOR_RED "\033[31m" -#define COLOR_GREEN "\033[1;32m" -#define COLOR_YELLOW "\033[1;33m" -#define COLOR_CYAN "\033[1;36m" - -static void si_dump_mmapped_reg(struct si_context *sctx, FILE *f, - unsigned offset) +#define COLOR_RESET "\033[0m" +#define COLOR_RED "\033[31m" +#define COLOR_GREEN "\033[1;32m" +#define COLOR_YELLOW "\033[1;33m" +#define COLOR_CYAN "\033[1;36m" + +static void si_dump_mmapped_reg(struct si_context *sctx, FILE *f, unsigned offset) { - struct radeon_winsys *ws = sctx->ws; - uint32_t value; + struct radeon_winsys *ws = sctx->ws; + uint32_t value; - if (ws->read_registers(ws, offset, 1, &value)) - ac_dump_reg(f, sctx->chip_class, offset, value, ~0); + if (ws->read_registers(ws, offset, 1, &value)) + ac_dump_reg(f, sctx->chip_class, offset, value, ~0); } static void si_dump_debug_registers(struct si_context *sctx, FILE *f) { - if (!sctx->screen->info.has_read_registers_query) - return; - - fprintf(f, "Memory-mapped registers:\n"); - si_dump_mmapped_reg(sctx, f, R_008010_GRBM_STATUS); - - /* No other registers can be read on DRM < 3.1.0. */ - if (!sctx->screen->info.is_amdgpu || - sctx->screen->info.drm_minor < 1) { - fprintf(f, "\n"); - return; - } - - si_dump_mmapped_reg(sctx, f, R_008008_GRBM_STATUS2); - si_dump_mmapped_reg(sctx, f, R_008014_GRBM_STATUS_SE0); - si_dump_mmapped_reg(sctx, f, R_008018_GRBM_STATUS_SE1); - si_dump_mmapped_reg(sctx, f, R_008038_GRBM_STATUS_SE2); - si_dump_mmapped_reg(sctx, f, R_00803C_GRBM_STATUS_SE3); - si_dump_mmapped_reg(sctx, f, R_00D034_SDMA0_STATUS_REG); - si_dump_mmapped_reg(sctx, f, R_00D834_SDMA1_STATUS_REG); - if (sctx->chip_class <= GFX8) { - si_dump_mmapped_reg(sctx, f, R_000E50_SRBM_STATUS); - si_dump_mmapped_reg(sctx, f, R_000E4C_SRBM_STATUS2); - si_dump_mmapped_reg(sctx, f, R_000E54_SRBM_STATUS3); - } - si_dump_mmapped_reg(sctx, f, R_008680_CP_STAT); - si_dump_mmapped_reg(sctx, f, R_008674_CP_STALLED_STAT1); - si_dump_mmapped_reg(sctx, f, R_008678_CP_STALLED_STAT2); - si_dump_mmapped_reg(sctx, f, R_008670_CP_STALLED_STAT3); - si_dump_mmapped_reg(sctx, f, R_008210_CP_CPC_STATUS); - si_dump_mmapped_reg(sctx, f, R_008214_CP_CPC_BUSY_STAT); - si_dump_mmapped_reg(sctx, f, R_008218_CP_CPC_STALLED_STAT1); - si_dump_mmapped_reg(sctx, f, R_00821C_CP_CPF_STATUS); - si_dump_mmapped_reg(sctx, f, R_008220_CP_CPF_BUSY_STAT); - si_dump_mmapped_reg(sctx, f, R_008224_CP_CPF_STALLED_STAT1); - fprintf(f, "\n"); + if (!sctx->screen->info.has_read_registers_query) + return; + + fprintf(f, "Memory-mapped registers:\n"); + si_dump_mmapped_reg(sctx, f, R_008010_GRBM_STATUS); + + /* No other registers can be read on DRM < 3.1.0. 
*/ + if (!sctx->screen->info.is_amdgpu || sctx->screen->info.drm_minor < 1) { + fprintf(f, "\n"); + return; + } + + si_dump_mmapped_reg(sctx, f, R_008008_GRBM_STATUS2); + si_dump_mmapped_reg(sctx, f, R_008014_GRBM_STATUS_SE0); + si_dump_mmapped_reg(sctx, f, R_008018_GRBM_STATUS_SE1); + si_dump_mmapped_reg(sctx, f, R_008038_GRBM_STATUS_SE2); + si_dump_mmapped_reg(sctx, f, R_00803C_GRBM_STATUS_SE3); + si_dump_mmapped_reg(sctx, f, R_00D034_SDMA0_STATUS_REG); + si_dump_mmapped_reg(sctx, f, R_00D834_SDMA1_STATUS_REG); + if (sctx->chip_class <= GFX8) { + si_dump_mmapped_reg(sctx, f, R_000E50_SRBM_STATUS); + si_dump_mmapped_reg(sctx, f, R_000E4C_SRBM_STATUS2); + si_dump_mmapped_reg(sctx, f, R_000E54_SRBM_STATUS3); + } + si_dump_mmapped_reg(sctx, f, R_008680_CP_STAT); + si_dump_mmapped_reg(sctx, f, R_008674_CP_STALLED_STAT1); + si_dump_mmapped_reg(sctx, f, R_008678_CP_STALLED_STAT2); + si_dump_mmapped_reg(sctx, f, R_008670_CP_STALLED_STAT3); + si_dump_mmapped_reg(sctx, f, R_008210_CP_CPC_STATUS); + si_dump_mmapped_reg(sctx, f, R_008214_CP_CPC_BUSY_STAT); + si_dump_mmapped_reg(sctx, f, R_008218_CP_CPC_STALLED_STAT1); + si_dump_mmapped_reg(sctx, f, R_00821C_CP_CPF_STATUS); + si_dump_mmapped_reg(sctx, f, R_008220_CP_CPF_BUSY_STAT); + si_dump_mmapped_reg(sctx, f, R_008224_CP_CPF_STALLED_STAT1); + fprintf(f, "\n"); } struct si_log_chunk_cs { - struct si_context *ctx; - struct si_saved_cs *cs; - bool dump_bo_list; - unsigned gfx_begin, gfx_end; - unsigned compute_begin, compute_end; + struct si_context *ctx; + struct si_saved_cs *cs; + bool dump_bo_list; + unsigned gfx_begin, gfx_end; + unsigned compute_begin, compute_end; }; static void si_log_chunk_type_cs_destroy(void *data) { - struct si_log_chunk_cs *chunk = data; - si_saved_cs_reference(&chunk->cs, NULL); - free(chunk); + struct si_log_chunk_cs *chunk = data; + si_saved_cs_reference(&chunk->cs, NULL); + free(chunk); } -static void si_parse_current_ib(FILE *f, struct radeon_cmdbuf *cs, - unsigned begin, unsigned end, - int *last_trace_id, unsigned trace_id_count, - const char *name, enum chip_class chip_class) +static void si_parse_current_ib(FILE *f, struct radeon_cmdbuf *cs, unsigned begin, unsigned end, + int *last_trace_id, unsigned trace_id_count, const char *name, + enum chip_class chip_class) { - unsigned orig_end = end; + unsigned orig_end = end; - assert(begin <= end); + assert(begin <= end); - fprintf(f, "------------------ %s begin (dw = %u) ------------------\n", - name, begin); + fprintf(f, "------------------ %s begin (dw = %u) ------------------\n", name, begin); - for (unsigned prev_idx = 0; prev_idx < cs->num_prev; ++prev_idx) { - struct radeon_cmdbuf_chunk *chunk = &cs->prev[prev_idx]; + for (unsigned prev_idx = 0; prev_idx < cs->num_prev; ++prev_idx) { + struct radeon_cmdbuf_chunk *chunk = &cs->prev[prev_idx]; - if (begin < chunk->cdw) { - ac_parse_ib_chunk(f, chunk->buf + begin, - MIN2(end, chunk->cdw) - begin, - last_trace_id, trace_id_count, - chip_class, NULL, NULL); - } + if (begin < chunk->cdw) { + ac_parse_ib_chunk(f, chunk->buf + begin, MIN2(end, chunk->cdw) - begin, last_trace_id, + trace_id_count, chip_class, NULL, NULL); + } - if (end <= chunk->cdw) - return; + if (end <= chunk->cdw) + return; - if (begin < chunk->cdw) - fprintf(f, "\n---------- Next %s Chunk ----------\n\n", - name); + if (begin < chunk->cdw) + fprintf(f, "\n---------- Next %s Chunk ----------\n\n", name); - begin -= MIN2(begin, chunk->cdw); - end -= chunk->cdw; - } + begin -= MIN2(begin, chunk->cdw); + end -= chunk->cdw; + } - assert(end <= 
cs->current.cdw); + assert(end <= cs->current.cdw); - ac_parse_ib_chunk(f, cs->current.buf + begin, end - begin, last_trace_id, - trace_id_count, chip_class, NULL, NULL); + ac_parse_ib_chunk(f, cs->current.buf + begin, end - begin, last_trace_id, trace_id_count, + chip_class, NULL, NULL); - fprintf(f, "------------------- %s end (dw = %u) -------------------\n\n", - name, orig_end); + fprintf(f, "------------------- %s end (dw = %u) -------------------\n\n", name, orig_end); } static void si_log_chunk_type_cs_print(void *data, FILE *f) { - struct si_log_chunk_cs *chunk = data; - struct si_context *ctx = chunk->ctx; - struct si_saved_cs *scs = chunk->cs; - int last_trace_id = -1; - int last_compute_trace_id = -1; - - /* We are expecting that the ddebug pipe has already - * waited for the context, so this buffer should be idle. - * If the GPU is hung, there is no point in waiting for it. - */ - uint32_t *map = ctx->ws->buffer_map(scs->trace_buf->buf, - NULL, - PIPE_TRANSFER_UNSYNCHRONIZED | - PIPE_TRANSFER_READ); - if (map) { - last_trace_id = map[0]; - last_compute_trace_id = map[1]; - } - - if (chunk->gfx_end != chunk->gfx_begin) { - if (chunk->gfx_begin == 0) { - if (ctx->init_config) - ac_parse_ib(f, ctx->init_config->pm4, ctx->init_config->ndw, - NULL, 0, "IB2: Init config", ctx->chip_class, - NULL, NULL); - - if (ctx->init_config_gs_rings) - ac_parse_ib(f, ctx->init_config_gs_rings->pm4, - ctx->init_config_gs_rings->ndw, - NULL, 0, "IB2: Init GS rings", ctx->chip_class, - NULL, NULL); - } - - if (scs->flushed) { - ac_parse_ib(f, scs->gfx.ib + chunk->gfx_begin, - chunk->gfx_end - chunk->gfx_begin, - &last_trace_id, map ? 1 : 0, "IB", ctx->chip_class, - NULL, NULL); - } else { - si_parse_current_ib(f, ctx->gfx_cs, chunk->gfx_begin, - chunk->gfx_end, &last_trace_id, map ? 1 : 0, - "IB", ctx->chip_class); - } - } - - if (chunk->compute_end != chunk->compute_begin) { - assert(ctx->prim_discard_compute_cs); - - if (scs->flushed) { - ac_parse_ib(f, scs->compute.ib + chunk->compute_begin, - chunk->compute_end - chunk->compute_begin, - &last_compute_trace_id, map ? 1 : 0, "Compute IB", ctx->chip_class, - NULL, NULL); - } else { - si_parse_current_ib(f, ctx->prim_discard_compute_cs, chunk->compute_begin, - chunk->compute_end, &last_compute_trace_id, - map ? 1 : 0, "Compute IB", ctx->chip_class); - } - } - - if (chunk->dump_bo_list) { - fprintf(f, "Flushing. Time: "); - util_dump_ns(f, scs->time_flush); - fprintf(f, "\n\n"); - si_dump_bo_list(ctx, &scs->gfx, f); - } + struct si_log_chunk_cs *chunk = data; + struct si_context *ctx = chunk->ctx; + struct si_saved_cs *scs = chunk->cs; + int last_trace_id = -1; + int last_compute_trace_id = -1; + + /* We are expecting that the ddebug pipe has already + * waited for the context, so this buffer should be idle. + * If the GPU is hung, there is no point in waiting for it. 
+ */ + uint32_t *map = ctx->ws->buffer_map(scs->trace_buf->buf, NULL, + PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_READ); + if (map) { + last_trace_id = map[0]; + last_compute_trace_id = map[1]; + } + + if (chunk->gfx_end != chunk->gfx_begin) { + if (chunk->gfx_begin == 0) { + if (ctx->init_config) + ac_parse_ib(f, ctx->init_config->pm4, ctx->init_config->ndw, NULL, 0, + "IB2: Init config", ctx->chip_class, NULL, NULL); + + if (ctx->init_config_gs_rings) + ac_parse_ib(f, ctx->init_config_gs_rings->pm4, ctx->init_config_gs_rings->ndw, NULL, 0, + "IB2: Init GS rings", ctx->chip_class, NULL, NULL); + } + + if (scs->flushed) { + ac_parse_ib(f, scs->gfx.ib + chunk->gfx_begin, chunk->gfx_end - chunk->gfx_begin, + &last_trace_id, map ? 1 : 0, "IB", ctx->chip_class, NULL, NULL); + } else { + si_parse_current_ib(f, ctx->gfx_cs, chunk->gfx_begin, chunk->gfx_end, &last_trace_id, + map ? 1 : 0, "IB", ctx->chip_class); + } + } + + if (chunk->compute_end != chunk->compute_begin) { + assert(ctx->prim_discard_compute_cs); + + if (scs->flushed) { + ac_parse_ib(f, scs->compute.ib + chunk->compute_begin, + chunk->compute_end - chunk->compute_begin, &last_compute_trace_id, map ? 1 : 0, + "Compute IB", ctx->chip_class, NULL, NULL); + } else { + si_parse_current_ib(f, ctx->prim_discard_compute_cs, chunk->compute_begin, + chunk->compute_end, &last_compute_trace_id, map ? 1 : 0, "Compute IB", + ctx->chip_class); + } + } + + if (chunk->dump_bo_list) { + fprintf(f, "Flushing. Time: "); + util_dump_ns(f, scs->time_flush); + fprintf(f, "\n\n"); + si_dump_bo_list(ctx, &scs->gfx, f); + } } static const struct u_log_chunk_type si_log_chunk_type_cs = { - .destroy = si_log_chunk_type_cs_destroy, - .print = si_log_chunk_type_cs_print, + .destroy = si_log_chunk_type_cs_destroy, + .print = si_log_chunk_type_cs_print, }; -static void si_log_cs(struct si_context *ctx, struct u_log_context *log, - bool dump_bo_list) +static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool dump_bo_list) { - assert(ctx->current_saved_cs); + assert(ctx->current_saved_cs); - struct si_saved_cs *scs = ctx->current_saved_cs; - unsigned gfx_cur = ctx->gfx_cs->prev_dw + ctx->gfx_cs->current.cdw; - unsigned compute_cur = 0; + struct si_saved_cs *scs = ctx->current_saved_cs; + unsigned gfx_cur = ctx->gfx_cs->prev_dw + ctx->gfx_cs->current.cdw; + unsigned compute_cur = 0; - if (ctx->prim_discard_compute_cs) - compute_cur = ctx->prim_discard_compute_cs->prev_dw + ctx->prim_discard_compute_cs->current.cdw; + if (ctx->prim_discard_compute_cs) + compute_cur = + ctx->prim_discard_compute_cs->prev_dw + ctx->prim_discard_compute_cs->current.cdw; - if (!dump_bo_list && - gfx_cur == scs->gfx_last_dw && - compute_cur == scs->compute_last_dw) - return; + if (!dump_bo_list && gfx_cur == scs->gfx_last_dw && compute_cur == scs->compute_last_dw) + return; - struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk)); + struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk)); - chunk->ctx = ctx; - si_saved_cs_reference(&chunk->cs, scs); - chunk->dump_bo_list = dump_bo_list; + chunk->ctx = ctx; + si_saved_cs_reference(&chunk->cs, scs); + chunk->dump_bo_list = dump_bo_list; - chunk->gfx_begin = scs->gfx_last_dw; - chunk->gfx_end = gfx_cur; - scs->gfx_last_dw = gfx_cur; + chunk->gfx_begin = scs->gfx_last_dw; + chunk->gfx_end = gfx_cur; + scs->gfx_last_dw = gfx_cur; - chunk->compute_begin = scs->compute_last_dw; - chunk->compute_end = compute_cur; - scs->compute_last_dw = compute_cur; + chunk->compute_begin = scs->compute_last_dw; + chunk->compute_end 
= compute_cur; + scs->compute_last_dw = compute_cur; - u_log_chunk(log, &si_log_chunk_type_cs, chunk); + u_log_chunk(log, &si_log_chunk_type_cs, chunk); } void si_auto_log_cs(void *data, struct u_log_context *log) { - struct si_context *ctx = (struct si_context *)data; - si_log_cs(ctx, log, false); + struct si_context *ctx = (struct si_context *)data; + si_log_cs(ctx, log, false); } void si_log_hw_flush(struct si_context *sctx) { - if (!sctx->log) - return; - - si_log_cs(sctx, sctx->log, true); - - if (&sctx->b == sctx->screen->aux_context) { - /* The aux context isn't captured by the ddebug wrapper, - * so we dump it on a flush-by-flush basis here. - */ - FILE *f = dd_get_debug_file(false); - if (!f) { - fprintf(stderr, "radeonsi: error opening aux context dump file.\n"); - } else { - dd_write_header(f, &sctx->screen->b, 0); - - fprintf(f, "Aux context dump:\n\n"); - u_log_new_page_print(sctx->log, f); - - fclose(f); - } - } + if (!sctx->log) + return; + + si_log_cs(sctx, sctx->log, true); + + if (&sctx->b == sctx->screen->aux_context) { + /* The aux context isn't captured by the ddebug wrapper, + * so we dump it on a flush-by-flush basis here. + */ + FILE *f = dd_get_debug_file(false); + if (!f) { + fprintf(stderr, "radeonsi: error opening aux context dump file.\n"); + } else { + dd_write_header(f, &sctx->screen->b, 0); + + fprintf(f, "Aux context dump:\n\n"); + u_log_new_page_print(sctx->log, f); + + fclose(f); + } + } } static const char *priority_to_string(enum radeon_bo_priority priority) { #define ITEM(x) [RADEON_PRIO_##x] = #x - static const char *table[64] = { - ITEM(FENCE), - ITEM(TRACE), - ITEM(SO_FILLED_SIZE), - ITEM(QUERY), - ITEM(IB1), - ITEM(IB2), - ITEM(DRAW_INDIRECT), - ITEM(INDEX_BUFFER), - ITEM(CP_DMA), - ITEM(CONST_BUFFER), - ITEM(DESCRIPTORS), - ITEM(BORDER_COLORS), - ITEM(SAMPLER_BUFFER), - ITEM(VERTEX_BUFFER), - ITEM(SHADER_RW_BUFFER), - ITEM(COMPUTE_GLOBAL), - ITEM(SAMPLER_TEXTURE), - ITEM(SHADER_RW_IMAGE), - ITEM(SAMPLER_TEXTURE_MSAA), - ITEM(COLOR_BUFFER), - ITEM(DEPTH_BUFFER), - ITEM(COLOR_BUFFER_MSAA), - ITEM(DEPTH_BUFFER_MSAA), - ITEM(SEPARATE_META), - ITEM(SHADER_BINARY), - ITEM(SHADER_RINGS), - ITEM(SCRATCH_BUFFER), - }; + static const char *table[64] = { + ITEM(FENCE), + ITEM(TRACE), + ITEM(SO_FILLED_SIZE), + ITEM(QUERY), + ITEM(IB1), + ITEM(IB2), + ITEM(DRAW_INDIRECT), + ITEM(INDEX_BUFFER), + ITEM(CP_DMA), + ITEM(CONST_BUFFER), + ITEM(DESCRIPTORS), + ITEM(BORDER_COLORS), + ITEM(SAMPLER_BUFFER), + ITEM(VERTEX_BUFFER), + ITEM(SHADER_RW_BUFFER), + ITEM(COMPUTE_GLOBAL), + ITEM(SAMPLER_TEXTURE), + ITEM(SHADER_RW_IMAGE), + ITEM(SAMPLER_TEXTURE_MSAA), + ITEM(COLOR_BUFFER), + ITEM(DEPTH_BUFFER), + ITEM(COLOR_BUFFER_MSAA), + ITEM(DEPTH_BUFFER_MSAA), + ITEM(SEPARATE_META), + ITEM(SHADER_BINARY), + ITEM(SHADER_RINGS), + ITEM(SCRATCH_BUFFER), + }; #undef ITEM - assert(priority < ARRAY_SIZE(table)); - return table[priority]; + assert(priority < ARRAY_SIZE(table)); + return table[priority]; } static int bo_list_compare_va(const struct radeon_bo_list_item *a, - const struct radeon_bo_list_item *b) + const struct radeon_bo_list_item *b) { - return a->vm_address < b->vm_address ? -1 : - a->vm_address > b->vm_address ? 1 : 0; + return a->vm_address < b->vm_address ? -1 : a->vm_address > b->vm_address ? 
1 : 0; } -static void si_dump_bo_list(struct si_context *sctx, - const struct radeon_saved_cs *saved, FILE *f) +static void si_dump_bo_list(struct si_context *sctx, const struct radeon_saved_cs *saved, FILE *f) { - unsigned i,j; - - if (!saved->bo_list) - return; - - /* Sort the list according to VM adddresses first. */ - qsort(saved->bo_list, saved->bo_count, - sizeof(saved->bo_list[0]), (void*)bo_list_compare_va); - - fprintf(f, "Buffer list (in units of pages = 4kB):\n" - COLOR_YELLOW " Size VM start page " - "VM end page Usage" COLOR_RESET "\n"); - - for (i = 0; i < saved->bo_count; i++) { - /* Note: Buffer sizes are expected to be aligned to 4k by the winsys. */ - const unsigned page_size = sctx->screen->info.gart_page_size; - uint64_t va = saved->bo_list[i].vm_address; - uint64_t size = saved->bo_list[i].bo_size; - bool hit = false; - - /* If there's unused virtual memory between 2 buffers, print it. */ - if (i) { - uint64_t previous_va_end = saved->bo_list[i-1].vm_address + - saved->bo_list[i-1].bo_size; - - if (va > previous_va_end) { - fprintf(f, " %10"PRIu64" -- hole --\n", - (va - previous_va_end) / page_size); - } - } - - /* Print the buffer. */ - fprintf(f, " %10"PRIu64" 0x%013"PRIX64" 0x%013"PRIX64" ", - size / page_size, va / page_size, (va + size) / page_size); - - /* Print the usage. */ - for (j = 0; j < 32; j++) { - if (!(saved->bo_list[i].priority_usage & (1u << j))) - continue; - - fprintf(f, "%s%s", !hit ? "" : ", ", priority_to_string(j)); - hit = true; - } - fprintf(f, "\n"); - } - fprintf(f, "\nNote: The holes represent memory not used by the IB.\n" - " Other buffers can still be allocated there.\n\n"); + unsigned i, j; + + if (!saved->bo_list) + return; + + /* Sort the list according to VM adddresses first. */ + qsort(saved->bo_list, saved->bo_count, sizeof(saved->bo_list[0]), (void *)bo_list_compare_va); + + fprintf(f, "Buffer list (in units of pages = 4kB):\n" COLOR_YELLOW + " Size VM start page " + "VM end page Usage" COLOR_RESET "\n"); + + for (i = 0; i < saved->bo_count; i++) { + /* Note: Buffer sizes are expected to be aligned to 4k by the winsys. */ + const unsigned page_size = sctx->screen->info.gart_page_size; + uint64_t va = saved->bo_list[i].vm_address; + uint64_t size = saved->bo_list[i].bo_size; + bool hit = false; + + /* If there's unused virtual memory between 2 buffers, print it. */ + if (i) { + uint64_t previous_va_end = + saved->bo_list[i - 1].vm_address + saved->bo_list[i - 1].bo_size; + + if (va > previous_va_end) { + fprintf(f, " %10" PRIu64 " -- hole --\n", (va - previous_va_end) / page_size); + } + } + + /* Print the buffer. */ + fprintf(f, " %10" PRIu64 " 0x%013" PRIX64 " 0x%013" PRIX64 " ", + size / page_size, va / page_size, (va + size) / page_size); + + /* Print the usage. */ + for (j = 0; j < 32; j++) { + if (!(saved->bo_list[i].priority_usage & (1u << j))) + continue; + + fprintf(f, "%s%s", !hit ? 
"" : ", ", priority_to_string(j)); + hit = true; + } + fprintf(f, "\n"); + } + fprintf(f, "\nNote: The holes represent memory not used by the IB.\n" + " Other buffers can still be allocated there.\n\n"); } static void si_dump_framebuffer(struct si_context *sctx, struct u_log_context *log) { - struct pipe_framebuffer_state *state = &sctx->framebuffer.state; - struct si_texture *tex; - int i; - - for (i = 0; i < state->nr_cbufs; i++) { - if (!state->cbufs[i]) - continue; - - tex = (struct si_texture*)state->cbufs[i]->texture; - u_log_printf(log, COLOR_YELLOW "Color buffer %i:" COLOR_RESET "\n", i); - si_print_texture_info(sctx->screen, tex, log); - u_log_printf(log, "\n"); - } - - if (state->zsbuf) { - tex = (struct si_texture*)state->zsbuf->texture; - u_log_printf(log, COLOR_YELLOW "Depth-stencil buffer:" COLOR_RESET "\n"); - si_print_texture_info(sctx->screen, tex, log); - u_log_printf(log, "\n"); - } + struct pipe_framebuffer_state *state = &sctx->framebuffer.state; + struct si_texture *tex; + int i; + + for (i = 0; i < state->nr_cbufs; i++) { + if (!state->cbufs[i]) + continue; + + tex = (struct si_texture *)state->cbufs[i]->texture; + u_log_printf(log, COLOR_YELLOW "Color buffer %i:" COLOR_RESET "\n", i); + si_print_texture_info(sctx->screen, tex, log); + u_log_printf(log, "\n"); + } + + if (state->zsbuf) { + tex = (struct si_texture *)state->zsbuf->texture; + u_log_printf(log, COLOR_YELLOW "Depth-stencil buffer:" COLOR_RESET "\n"); + si_print_texture_info(sctx->screen, tex, log); + u_log_printf(log, "\n"); + } } typedef unsigned (*slot_remap_func)(unsigned); struct si_log_chunk_desc_list { - /** Pointer to memory map of buffer where the list is uploader */ - uint32_t *gpu_list; - /** Reference of buffer where the list is uploaded, so that gpu_list - * is kept live. */ - struct si_resource *buf; - - const char *shader_name; - const char *elem_name; - slot_remap_func slot_remap; - enum chip_class chip_class; - unsigned element_dw_size; - unsigned num_elements; - - uint32_t list[0]; + /** Pointer to memory map of buffer where the list is uploader */ + uint32_t *gpu_list; + /** Reference of buffer where the list is uploaded, so that gpu_list + * is kept live. */ + struct si_resource *buf; + + const char *shader_name; + const char *elem_name; + slot_remap_func slot_remap; + enum chip_class chip_class; + unsigned element_dw_size; + unsigned num_elements; + + uint32_t list[0]; }; -static void -si_log_chunk_desc_list_destroy(void *data) +static void si_log_chunk_desc_list_destroy(void *data) { - struct si_log_chunk_desc_list *chunk = data; - si_resource_reference(&chunk->buf, NULL); - FREE(chunk); + struct si_log_chunk_desc_list *chunk = data; + si_resource_reference(&chunk->buf, NULL); + FREE(chunk); } -static void -si_log_chunk_desc_list_print(void *data, FILE *f) +static void si_log_chunk_desc_list_print(void *data, FILE *f) { - struct si_log_chunk_desc_list *chunk = data; - unsigned sq_img_rsrc_word0 = chunk->chip_class >= GFX10 ? R_00A000_SQ_IMG_RSRC_WORD0 - : R_008F10_SQ_IMG_RSRC_WORD0; - - for (unsigned i = 0; i < chunk->num_elements; i++) { - unsigned cpu_dw_offset = i * chunk->element_dw_size; - unsigned gpu_dw_offset = chunk->slot_remap(i) * chunk->element_dw_size; - const char *list_note = chunk->gpu_list ? "GPU list" : "CPU list"; - uint32_t *cpu_list = chunk->list + cpu_dw_offset; - uint32_t *gpu_list = chunk->gpu_list ? 
chunk->gpu_list + gpu_dw_offset : cpu_list; - - fprintf(f, COLOR_GREEN "%s%s slot %u (%s):" COLOR_RESET "\n", - chunk->shader_name, chunk->elem_name, i, list_note); - - switch (chunk->element_dw_size) { - case 4: - for (unsigned j = 0; j < 4; j++) - ac_dump_reg(f, chunk->chip_class, - R_008F00_SQ_BUF_RSRC_WORD0 + j*4, - gpu_list[j], 0xffffffff); - break; - case 8: - for (unsigned j = 0; j < 8; j++) - ac_dump_reg(f, chunk->chip_class, - sq_img_rsrc_word0 + j*4, - gpu_list[j], 0xffffffff); - - fprintf(f, COLOR_CYAN " Buffer:" COLOR_RESET "\n"); - for (unsigned j = 0; j < 4; j++) - ac_dump_reg(f, chunk->chip_class, - R_008F00_SQ_BUF_RSRC_WORD0 + j*4, - gpu_list[4+j], 0xffffffff); - break; - case 16: - for (unsigned j = 0; j < 8; j++) - ac_dump_reg(f, chunk->chip_class, - sq_img_rsrc_word0 + j*4, - gpu_list[j], 0xffffffff); - - fprintf(f, COLOR_CYAN " Buffer:" COLOR_RESET "\n"); - for (unsigned j = 0; j < 4; j++) - ac_dump_reg(f, chunk->chip_class, - R_008F00_SQ_BUF_RSRC_WORD0 + j*4, - gpu_list[4+j], 0xffffffff); - - fprintf(f, COLOR_CYAN " FMASK:" COLOR_RESET "\n"); - for (unsigned j = 0; j < 8; j++) - ac_dump_reg(f, chunk->chip_class, - sq_img_rsrc_word0 + j*4, - gpu_list[8+j], 0xffffffff); - - fprintf(f, COLOR_CYAN " Sampler state:" COLOR_RESET "\n"); - for (unsigned j = 0; j < 4; j++) - ac_dump_reg(f, chunk->chip_class, - R_008F30_SQ_IMG_SAMP_WORD0 + j*4, - gpu_list[12+j], 0xffffffff); - break; - } - - if (memcmp(gpu_list, cpu_list, chunk->element_dw_size * 4) != 0) { - fprintf(f, COLOR_RED "!!!!! This slot was corrupted in GPU memory !!!!!" - COLOR_RESET "\n"); - } - - fprintf(f, "\n"); - } - + struct si_log_chunk_desc_list *chunk = data; + unsigned sq_img_rsrc_word0 = + chunk->chip_class >= GFX10 ? R_00A000_SQ_IMG_RSRC_WORD0 : R_008F10_SQ_IMG_RSRC_WORD0; + + for (unsigned i = 0; i < chunk->num_elements; i++) { + unsigned cpu_dw_offset = i * chunk->element_dw_size; + unsigned gpu_dw_offset = chunk->slot_remap(i) * chunk->element_dw_size; + const char *list_note = chunk->gpu_list ? "GPU list" : "CPU list"; + uint32_t *cpu_list = chunk->list + cpu_dw_offset; + uint32_t *gpu_list = chunk->gpu_list ? 
chunk->gpu_list + gpu_dw_offset : cpu_list; + + fprintf(f, COLOR_GREEN "%s%s slot %u (%s):" COLOR_RESET "\n", chunk->shader_name, + chunk->elem_name, i, list_note); + + switch (chunk->element_dw_size) { + case 4: + for (unsigned j = 0; j < 4; j++) + ac_dump_reg(f, chunk->chip_class, R_008F00_SQ_BUF_RSRC_WORD0 + j * 4, gpu_list[j], + 0xffffffff); + break; + case 8: + for (unsigned j = 0; j < 8; j++) + ac_dump_reg(f, chunk->chip_class, sq_img_rsrc_word0 + j * 4, gpu_list[j], 0xffffffff); + + fprintf(f, COLOR_CYAN " Buffer:" COLOR_RESET "\n"); + for (unsigned j = 0; j < 4; j++) + ac_dump_reg(f, chunk->chip_class, R_008F00_SQ_BUF_RSRC_WORD0 + j * 4, gpu_list[4 + j], + 0xffffffff); + break; + case 16: + for (unsigned j = 0; j < 8; j++) + ac_dump_reg(f, chunk->chip_class, sq_img_rsrc_word0 + j * 4, gpu_list[j], 0xffffffff); + + fprintf(f, COLOR_CYAN " Buffer:" COLOR_RESET "\n"); + for (unsigned j = 0; j < 4; j++) + ac_dump_reg(f, chunk->chip_class, R_008F00_SQ_BUF_RSRC_WORD0 + j * 4, gpu_list[4 + j], + 0xffffffff); + + fprintf(f, COLOR_CYAN " FMASK:" COLOR_RESET "\n"); + for (unsigned j = 0; j < 8; j++) + ac_dump_reg(f, chunk->chip_class, sq_img_rsrc_word0 + j * 4, gpu_list[8 + j], + 0xffffffff); + + fprintf(f, COLOR_CYAN " Sampler state:" COLOR_RESET "\n"); + for (unsigned j = 0; j < 4; j++) + ac_dump_reg(f, chunk->chip_class, R_008F30_SQ_IMG_SAMP_WORD0 + j * 4, gpu_list[12 + j], + 0xffffffff); + break; + } + + if (memcmp(gpu_list, cpu_list, chunk->element_dw_size * 4) != 0) { + fprintf(f, COLOR_RED "!!!!! This slot was corrupted in GPU memory !!!!!" COLOR_RESET "\n"); + } + + fprintf(f, "\n"); + } } static const struct u_log_chunk_type si_log_chunk_type_descriptor_list = { - .destroy = si_log_chunk_desc_list_destroy, - .print = si_log_chunk_desc_list_print, + .destroy = si_log_chunk_desc_list_destroy, + .print = si_log_chunk_desc_list_print, }; -static void si_dump_descriptor_list(struct si_screen *screen, - struct si_descriptors *desc, - const char *shader_name, - const char *elem_name, - unsigned element_dw_size, - unsigned num_elements, - slot_remap_func slot_remap, - struct u_log_context *log) +static void si_dump_descriptor_list(struct si_screen *screen, struct si_descriptors *desc, + const char *shader_name, const char *elem_name, + unsigned element_dw_size, unsigned num_elements, + slot_remap_func slot_remap, struct u_log_context *log) { - if (!desc->list) - return; - - /* In some cases, the caller doesn't know how many elements are really - * uploaded. Reduce num_elements to fit in the range of active slots. 
*/ - unsigned active_range_dw_begin = - desc->first_active_slot * desc->element_dw_size; - unsigned active_range_dw_end = - active_range_dw_begin + desc->num_active_slots * desc->element_dw_size; - - while (num_elements > 0) { - int i = slot_remap(num_elements - 1); - unsigned dw_begin = i * element_dw_size; - unsigned dw_end = dw_begin + element_dw_size; - - if (dw_begin >= active_range_dw_begin && dw_end <= active_range_dw_end) - break; - - num_elements--; - } - - struct si_log_chunk_desc_list *chunk = - CALLOC_VARIANT_LENGTH_STRUCT(si_log_chunk_desc_list, - 4 * element_dw_size * num_elements); - chunk->shader_name = shader_name; - chunk->elem_name = elem_name; - chunk->element_dw_size = element_dw_size; - chunk->num_elements = num_elements; - chunk->slot_remap = slot_remap; - chunk->chip_class = screen->info.chip_class; - - si_resource_reference(&chunk->buf, desc->buffer); - chunk->gpu_list = desc->gpu_list; - - for (unsigned i = 0; i < num_elements; ++i) { - memcpy(&chunk->list[i * element_dw_size], - &desc->list[slot_remap(i) * element_dw_size], - 4 * element_dw_size); - } - - u_log_chunk(log, &si_log_chunk_type_descriptor_list, chunk); + if (!desc->list) + return; + + /* In some cases, the caller doesn't know how many elements are really + * uploaded. Reduce num_elements to fit in the range of active slots. */ + unsigned active_range_dw_begin = desc->first_active_slot * desc->element_dw_size; + unsigned active_range_dw_end = + active_range_dw_begin + desc->num_active_slots * desc->element_dw_size; + + while (num_elements > 0) { + int i = slot_remap(num_elements - 1); + unsigned dw_begin = i * element_dw_size; + unsigned dw_end = dw_begin + element_dw_size; + + if (dw_begin >= active_range_dw_begin && dw_end <= active_range_dw_end) + break; + + num_elements--; + } + + struct si_log_chunk_desc_list *chunk = + CALLOC_VARIANT_LENGTH_STRUCT(si_log_chunk_desc_list, 4 * element_dw_size * num_elements); + chunk->shader_name = shader_name; + chunk->elem_name = elem_name; + chunk->element_dw_size = element_dw_size; + chunk->num_elements = num_elements; + chunk->slot_remap = slot_remap; + chunk->chip_class = screen->info.chip_class; + + si_resource_reference(&chunk->buf, desc->buffer); + chunk->gpu_list = desc->gpu_list; + + for (unsigned i = 0; i < num_elements; ++i) { + memcpy(&chunk->list[i * element_dw_size], &desc->list[slot_remap(i) * element_dw_size], + 4 * element_dw_size); + } + + u_log_chunk(log, &si_log_chunk_type_descriptor_list, chunk); } static unsigned si_identity(unsigned slot) { - return slot; + return slot; } -static void si_dump_descriptors(struct si_context *sctx, - enum pipe_shader_type processor, - const struct si_shader_info *info, - struct u_log_context *log) +static void si_dump_descriptors(struct si_context *sctx, enum pipe_shader_type processor, + const struct si_shader_info *info, struct u_log_context *log) { - struct si_descriptors *descs = - &sctx->descriptors[SI_DESCS_FIRST_SHADER + - processor * SI_NUM_SHADER_DESCS]; - static const char *shader_name[] = {"VS", "PS", "GS", "TCS", "TES", "CS"}; - const char *name = shader_name[processor]; - unsigned enabled_constbuf, enabled_shaderbuf, enabled_samplers; - unsigned enabled_images; - - if (info) { - enabled_constbuf = info->const_buffers_declared; - enabled_shaderbuf = info->shader_buffers_declared; - enabled_samplers = info->samplers_declared; - enabled_images = info->images_declared; - } else { - enabled_constbuf = sctx->const_and_shader_buffers[processor].enabled_mask >> - SI_NUM_SHADER_BUFFERS; - 
enabled_shaderbuf = sctx->const_and_shader_buffers[processor].enabled_mask & - u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS); - enabled_shaderbuf = util_bitreverse(enabled_shaderbuf) >> - (32 - SI_NUM_SHADER_BUFFERS); - enabled_samplers = sctx->samplers[processor].enabled_mask; - enabled_images = sctx->images[processor].enabled_mask; - } - - if (processor == PIPE_SHADER_VERTEX && - sctx->vb_descriptors_buffer && - sctx->vb_descriptors_gpu_list && - sctx->vertex_elements) { - assert(info); /* only CS may not have an info struct */ - struct si_descriptors desc = {}; - - desc.buffer = sctx->vb_descriptors_buffer; - desc.list = sctx->vb_descriptors_gpu_list; - desc.gpu_list = sctx->vb_descriptors_gpu_list; - desc.element_dw_size = 4; - desc.num_active_slots = sctx->vertex_elements->vb_desc_list_alloc_size / 16; - - si_dump_descriptor_list(sctx->screen, &desc, name, - " - Vertex buffer", 4, info->num_inputs, - si_identity, log); - } - - si_dump_descriptor_list(sctx->screen, - &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS], - name, " - Constant buffer", 4, - util_last_bit(enabled_constbuf), - si_get_constbuf_slot, log); - si_dump_descriptor_list(sctx->screen, - &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS], - name, " - Shader buffer", 4, - util_last_bit(enabled_shaderbuf), - si_get_shaderbuf_slot, log); - si_dump_descriptor_list(sctx->screen, - &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES], - name, " - Sampler", 16, - util_last_bit(enabled_samplers), - si_get_sampler_slot, log); - si_dump_descriptor_list(sctx->screen, - &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES], - name, " - Image", 8, - util_last_bit(enabled_images), - si_get_image_slot, log); + struct si_descriptors *descs = + &sctx->descriptors[SI_DESCS_FIRST_SHADER + processor * SI_NUM_SHADER_DESCS]; + static const char *shader_name[] = {"VS", "PS", "GS", "TCS", "TES", "CS"}; + const char *name = shader_name[processor]; + unsigned enabled_constbuf, enabled_shaderbuf, enabled_samplers; + unsigned enabled_images; + + if (info) { + enabled_constbuf = info->const_buffers_declared; + enabled_shaderbuf = info->shader_buffers_declared; + enabled_samplers = info->samplers_declared; + enabled_images = info->images_declared; + } else { + enabled_constbuf = + sctx->const_and_shader_buffers[processor].enabled_mask >> SI_NUM_SHADER_BUFFERS; + enabled_shaderbuf = sctx->const_and_shader_buffers[processor].enabled_mask & + u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS); + enabled_shaderbuf = util_bitreverse(enabled_shaderbuf) >> (32 - SI_NUM_SHADER_BUFFERS); + enabled_samplers = sctx->samplers[processor].enabled_mask; + enabled_images = sctx->images[processor].enabled_mask; + } + + if (processor == PIPE_SHADER_VERTEX && sctx->vb_descriptors_buffer && + sctx->vb_descriptors_gpu_list && sctx->vertex_elements) { + assert(info); /* only CS may not have an info struct */ + struct si_descriptors desc = {}; + + desc.buffer = sctx->vb_descriptors_buffer; + desc.list = sctx->vb_descriptors_gpu_list; + desc.gpu_list = sctx->vb_descriptors_gpu_list; + desc.element_dw_size = 4; + desc.num_active_slots = sctx->vertex_elements->vb_desc_list_alloc_size / 16; + + si_dump_descriptor_list(sctx->screen, &desc, name, " - Vertex buffer", 4, info->num_inputs, + si_identity, log); + } + + si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS], name, + " - Constant buffer", 4, util_last_bit(enabled_constbuf), + si_get_constbuf_slot, log); + si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS], name, + " - Shader 
buffer", 4, util_last_bit(enabled_shaderbuf), + si_get_shaderbuf_slot, log); + si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES], name, + " - Sampler", 16, util_last_bit(enabled_samplers), si_get_sampler_slot, + log); + si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES], name, + " - Image", 8, util_last_bit(enabled_images), si_get_image_slot, log); } static void si_dump_gfx_descriptors(struct si_context *sctx, - const struct si_shader_ctx_state *state, - struct u_log_context *log) + const struct si_shader_ctx_state *state, + struct u_log_context *log) { - if (!state->cso || !state->current) - return; + if (!state->cso || !state->current) + return; - si_dump_descriptors(sctx, state->cso->type, &state->cso->info, log); + si_dump_descriptors(sctx, state->cso->type, &state->cso->info, log); } -static void si_dump_compute_descriptors(struct si_context *sctx, - struct u_log_context *log) +static void si_dump_compute_descriptors(struct si_context *sctx, struct u_log_context *log) { - if (!sctx->cs_shader_state.program) - return; + if (!sctx->cs_shader_state.program) + return; - si_dump_descriptors(sctx, PIPE_SHADER_COMPUTE, NULL, log); + si_dump_descriptors(sctx, PIPE_SHADER_COMPUTE, NULL, log); } struct si_shader_inst { - const char *text; /* start of disassembly for this instruction */ - unsigned textlen; - unsigned size; /* instruction size = 4 or 8 */ - uint64_t addr; /* instruction address */ + const char *text; /* start of disassembly for this instruction */ + unsigned textlen; + unsigned size; /* instruction size = 4 or 8 */ + uint64_t addr; /* instruction address */ }; /** @@ -933,344 +865,323 @@ struct si_shader_inst { * The caller must keep \p rtld_binary alive as long as \p instructions are * used and then close it afterwards. */ -static void si_add_split_disasm(struct si_screen *screen, - struct ac_rtld_binary *rtld_binary, - struct si_shader_binary *binary, - uint64_t *addr, - unsigned *num, - struct si_shader_inst *instructions, - enum pipe_shader_type shader_type, - unsigned wave_size) +static void si_add_split_disasm(struct si_screen *screen, struct ac_rtld_binary *rtld_binary, + struct si_shader_binary *binary, uint64_t *addr, unsigned *num, + struct si_shader_inst *instructions, + enum pipe_shader_type shader_type, unsigned wave_size) { - if (!ac_rtld_open(rtld_binary, (struct ac_rtld_open_info){ - .info = &screen->info, - .shader_type = tgsi_processor_to_shader_stage(shader_type), - .wave_size = wave_size, - .num_parts = 1, - .elf_ptrs = &binary->elf_buffer, - .elf_sizes = &binary->elf_size })) - return; - - const char *disasm; - size_t nbytes; - if (!ac_rtld_get_section_by_name(rtld_binary, ".AMDGPU.disasm", - &disasm, &nbytes)) - return; - - const char *end = disasm + nbytes; - while (disasm < end) { - const char *semicolon = memchr(disasm, ';', end - disasm); - if (!semicolon) - break; - - struct si_shader_inst *inst = &instructions[(*num)++]; - const char *inst_end = memchr(semicolon + 1, '\n', end - semicolon - 1); - if (!inst_end) - inst_end = end; - - inst->text = disasm; - inst->textlen = inst_end - disasm; - - inst->addr = *addr; - /* More than 16 chars after ";" means the instruction is 8 bytes long. */ - inst->size = inst_end - semicolon > 16 ? 
8 : 4; - *addr += inst->size; - - if (inst_end == end) - break; - disasm = inst_end + 1; - } + if (!ac_rtld_open(rtld_binary, (struct ac_rtld_open_info){ + .info = &screen->info, + .shader_type = tgsi_processor_to_shader_stage(shader_type), + .wave_size = wave_size, + .num_parts = 1, + .elf_ptrs = &binary->elf_buffer, + .elf_sizes = &binary->elf_size})) + return; + + const char *disasm; + size_t nbytes; + if (!ac_rtld_get_section_by_name(rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes)) + return; + + const char *end = disasm + nbytes; + while (disasm < end) { + const char *semicolon = memchr(disasm, ';', end - disasm); + if (!semicolon) + break; + + struct si_shader_inst *inst = &instructions[(*num)++]; + const char *inst_end = memchr(semicolon + 1, '\n', end - semicolon - 1); + if (!inst_end) + inst_end = end; + + inst->text = disasm; + inst->textlen = inst_end - disasm; + + inst->addr = *addr; + /* More than 16 chars after ";" means the instruction is 8 bytes long. */ + inst->size = inst_end - semicolon > 16 ? 8 : 4; + *addr += inst->size; + + if (inst_end == end) + break; + disasm = inst_end + 1; + } } /* If the shader is being executed, print its asm instructions, and annotate * those that are being executed right now with information about waves that * execute them. This is most useful during a GPU hang. */ -static void si_print_annotated_shader(struct si_shader *shader, - struct ac_wave_info *waves, - unsigned num_waves, - FILE *f) +static void si_print_annotated_shader(struct si_shader *shader, struct ac_wave_info *waves, + unsigned num_waves, FILE *f) { - if (!shader) - return; - - struct si_screen *screen = shader->selector->screen; - enum pipe_shader_type shader_type = shader->selector->type; - uint64_t start_addr = shader->bo->gpu_address; - uint64_t end_addr = start_addr + shader->bo->b.b.width0; - unsigned i; - - /* See if any wave executes the shader. */ - for (i = 0; i < num_waves; i++) { - if (start_addr <= waves[i].pc && waves[i].pc <= end_addr) - break; - } - if (i == num_waves) - return; /* the shader is not being executed */ - - /* Remember the first found wave. The waves are sorted according to PC. */ - waves = &waves[i]; - num_waves -= i; - - /* Get the list of instructions. - * Buffer size / 4 is the upper bound of the instruction count. - */ - unsigned num_inst = 0; - uint64_t inst_addr = start_addr; - unsigned wave_size = si_get_shader_wave_size(shader); - struct ac_rtld_binary rtld_binaries[5] = {}; - struct si_shader_inst *instructions = - calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst)); - - if (shader->prolog) { - si_add_split_disasm(screen, &rtld_binaries[0], &shader->prolog->binary, - &inst_addr, &num_inst, instructions, shader_type, wave_size); - } - if (shader->previous_stage) { - si_add_split_disasm(screen, &rtld_binaries[1], &shader->previous_stage->binary, - &inst_addr, &num_inst, instructions, shader_type, wave_size); - } - if (shader->prolog2) { - si_add_split_disasm(screen, &rtld_binaries[2], &shader->prolog2->binary, - &inst_addr, &num_inst, instructions, shader_type, wave_size); - } - si_add_split_disasm(screen, &rtld_binaries[3], &shader->binary, - &inst_addr, &num_inst, instructions, shader_type, wave_size); - if (shader->epilog) { - si_add_split_disasm(screen, &rtld_binaries[4], &shader->epilog->binary, - &inst_addr, &num_inst, instructions, shader_type, wave_size); - } - - fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n", - si_get_shader_name(shader)); - - /* Print instructions with annotations. 
*/ - for (i = 0; i < num_inst; i++) { - struct si_shader_inst *inst = &instructions[i]; - - fprintf(f, "%.*s [PC=0x%"PRIx64", size=%u]\n", - inst->textlen, inst->text, inst->addr, inst->size); - - /* Print which waves execute the instruction right now. */ - while (num_waves && inst->addr == waves->pc) { - fprintf(f, - " " COLOR_GREEN "^ SE%u SH%u CU%u " - "SIMD%u WAVE%u EXEC=%016"PRIx64 " ", - waves->se, waves->sh, waves->cu, waves->simd, - waves->wave, waves->exec); - - if (inst->size == 4) { - fprintf(f, "INST32=%08X" COLOR_RESET "\n", - waves->inst_dw0); - } else { - fprintf(f, "INST64=%08X %08X" COLOR_RESET "\n", - waves->inst_dw0, waves->inst_dw1); - } - - waves->matched = true; - waves = &waves[1]; - num_waves--; - } - } - - fprintf(f, "\n\n"); - free(instructions); - for (unsigned i = 0; i < ARRAY_SIZE(rtld_binaries); ++i) - ac_rtld_close(&rtld_binaries[i]); + if (!shader) + return; + + struct si_screen *screen = shader->selector->screen; + enum pipe_shader_type shader_type = shader->selector->type; + uint64_t start_addr = shader->bo->gpu_address; + uint64_t end_addr = start_addr + shader->bo->b.b.width0; + unsigned i; + + /* See if any wave executes the shader. */ + for (i = 0; i < num_waves; i++) { + if (start_addr <= waves[i].pc && waves[i].pc <= end_addr) + break; + } + if (i == num_waves) + return; /* the shader is not being executed */ + + /* Remember the first found wave. The waves are sorted according to PC. */ + waves = &waves[i]; + num_waves -= i; + + /* Get the list of instructions. + * Buffer size / 4 is the upper bound of the instruction count. + */ + unsigned num_inst = 0; + uint64_t inst_addr = start_addr; + unsigned wave_size = si_get_shader_wave_size(shader); + struct ac_rtld_binary rtld_binaries[5] = {}; + struct si_shader_inst *instructions = + calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst)); + + if (shader->prolog) { + si_add_split_disasm(screen, &rtld_binaries[0], &shader->prolog->binary, &inst_addr, &num_inst, + instructions, shader_type, wave_size); + } + if (shader->previous_stage) { + si_add_split_disasm(screen, &rtld_binaries[1], &shader->previous_stage->binary, &inst_addr, + &num_inst, instructions, shader_type, wave_size); + } + if (shader->prolog2) { + si_add_split_disasm(screen, &rtld_binaries[2], &shader->prolog2->binary, &inst_addr, + &num_inst, instructions, shader_type, wave_size); + } + si_add_split_disasm(screen, &rtld_binaries[3], &shader->binary, &inst_addr, &num_inst, + instructions, shader_type, wave_size); + if (shader->epilog) { + si_add_split_disasm(screen, &rtld_binaries[4], &shader->epilog->binary, &inst_addr, &num_inst, + instructions, shader_type, wave_size); + } + + fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n", + si_get_shader_name(shader)); + + /* Print instructions with annotations. */ + for (i = 0; i < num_inst; i++) { + struct si_shader_inst *inst = &instructions[i]; + + fprintf(f, "%.*s [PC=0x%" PRIx64 ", size=%u]\n", inst->textlen, inst->text, inst->addr, + inst->size); + + /* Print which waves execute the instruction right now. 
*/ + while (num_waves && inst->addr == waves->pc) { + fprintf(f, + " " COLOR_GREEN "^ SE%u SH%u CU%u " + "SIMD%u WAVE%u EXEC=%016" PRIx64 " ", + waves->se, waves->sh, waves->cu, waves->simd, waves->wave, waves->exec); + + if (inst->size == 4) { + fprintf(f, "INST32=%08X" COLOR_RESET "\n", waves->inst_dw0); + } else { + fprintf(f, "INST64=%08X %08X" COLOR_RESET "\n", waves->inst_dw0, waves->inst_dw1); + } + + waves->matched = true; + waves = &waves[1]; + num_waves--; + } + } + + fprintf(f, "\n\n"); + free(instructions); + for (unsigned i = 0; i < ARRAY_SIZE(rtld_binaries); ++i) + ac_rtld_close(&rtld_binaries[i]); } static void si_dump_annotated_shaders(struct si_context *sctx, FILE *f) { - struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]; - unsigned num_waves = ac_get_wave_info(sctx->chip_class, waves); - - fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET - "\n\n", num_waves); - - si_print_annotated_shader(sctx->vs_shader.current, waves, num_waves, f); - si_print_annotated_shader(sctx->tcs_shader.current, waves, num_waves, f); - si_print_annotated_shader(sctx->tes_shader.current, waves, num_waves, f); - si_print_annotated_shader(sctx->gs_shader.current, waves, num_waves, f); - si_print_annotated_shader(sctx->ps_shader.current, waves, num_waves, f); - - /* Print waves executing shaders that are not currently bound. */ - unsigned i; - bool found = false; - for (i = 0; i < num_waves; i++) { - if (waves[i].matched) - continue; - - if (!found) { - fprintf(f, COLOR_CYAN - "Waves not executing currently-bound shaders:" - COLOR_RESET "\n"); - found = true; - } - fprintf(f, " SE%u SH%u CU%u SIMD%u WAVE%u EXEC=%016"PRIx64 - " INST=%08X %08X PC=%"PRIx64"\n", - waves[i].se, waves[i].sh, waves[i].cu, waves[i].simd, - waves[i].wave, waves[i].exec, waves[i].inst_dw0, - waves[i].inst_dw1, waves[i].pc); - } - if (found) - fprintf(f, "\n\n"); + struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]; + unsigned num_waves = ac_get_wave_info(sctx->chip_class, waves); + + fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET "\n\n", num_waves); + + si_print_annotated_shader(sctx->vs_shader.current, waves, num_waves, f); + si_print_annotated_shader(sctx->tcs_shader.current, waves, num_waves, f); + si_print_annotated_shader(sctx->tes_shader.current, waves, num_waves, f); + si_print_annotated_shader(sctx->gs_shader.current, waves, num_waves, f); + si_print_annotated_shader(sctx->ps_shader.current, waves, num_waves, f); + + /* Print waves executing shaders that are not currently bound. 
*/ + unsigned i; + bool found = false; + for (i = 0; i < num_waves; i++) { + if (waves[i].matched) + continue; + + if (!found) { + fprintf(f, COLOR_CYAN "Waves not executing currently-bound shaders:" COLOR_RESET "\n"); + found = true; + } + fprintf(f, + " SE%u SH%u CU%u SIMD%u WAVE%u EXEC=%016" PRIx64 " INST=%08X %08X PC=%" PRIx64 + "\n", + waves[i].se, waves[i].sh, waves[i].cu, waves[i].simd, waves[i].wave, waves[i].exec, + waves[i].inst_dw0, waves[i].inst_dw1, waves[i].pc); + } + if (found) + fprintf(f, "\n\n"); } static void si_dump_command(const char *title, const char *command, FILE *f) { - char line[2000]; + char line[2000]; - FILE *p = popen(command, "r"); - if (!p) - return; + FILE *p = popen(command, "r"); + if (!p) + return; - fprintf(f, COLOR_YELLOW "%s: " COLOR_RESET "\n", title); - while (fgets(line, sizeof(line), p)) - fputs(line, f); - fprintf(f, "\n\n"); - pclose(p); + fprintf(f, COLOR_YELLOW "%s: " COLOR_RESET "\n", title); + while (fgets(line, sizeof(line), p)) + fputs(line, f); + fprintf(f, "\n\n"); + pclose(p); } -static void si_dump_debug_state(struct pipe_context *ctx, FILE *f, - unsigned flags) +static void si_dump_debug_state(struct pipe_context *ctx, FILE *f, unsigned flags) { - struct si_context *sctx = (struct si_context*)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (sctx->log) - u_log_flush(sctx->log); + if (sctx->log) + u_log_flush(sctx->log); - if (flags & PIPE_DUMP_DEVICE_STATUS_REGISTERS) { - si_dump_debug_registers(sctx, f); + if (flags & PIPE_DUMP_DEVICE_STATUS_REGISTERS) { + si_dump_debug_registers(sctx, f); - si_dump_annotated_shaders(sctx, f); - si_dump_command("Active waves (raw data)", "umr -O halt_waves -wa | column -t", f); - si_dump_command("Wave information", "umr -O halt_waves,bits -wa", f); - } + si_dump_annotated_shaders(sctx, f); + si_dump_command("Active waves (raw data)", "umr -O halt_waves -wa | column -t", f); + si_dump_command("Wave information", "umr -O halt_waves,bits -wa", f); + } } void si_log_draw_state(struct si_context *sctx, struct u_log_context *log) { - struct si_shader_ctx_state *tcs_shader; - - if (!log) - return; - - tcs_shader = &sctx->tcs_shader; - if (sctx->tes_shader.cso && !sctx->tcs_shader.cso) - tcs_shader = &sctx->fixed_func_tcs_shader; - - si_dump_framebuffer(sctx, log); - - si_dump_gfx_shader(sctx, &sctx->vs_shader, log); - si_dump_gfx_shader(sctx, tcs_shader, log); - si_dump_gfx_shader(sctx, &sctx->tes_shader, log); - si_dump_gfx_shader(sctx, &sctx->gs_shader, log); - si_dump_gfx_shader(sctx, &sctx->ps_shader, log); - - si_dump_descriptor_list(sctx->screen, - &sctx->descriptors[SI_DESCS_RW_BUFFERS], - "", "RW buffers", 4, - sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots, - si_identity, log); - si_dump_gfx_descriptors(sctx, &sctx->vs_shader, log); - si_dump_gfx_descriptors(sctx, tcs_shader, log); - si_dump_gfx_descriptors(sctx, &sctx->tes_shader, log); - si_dump_gfx_descriptors(sctx, &sctx->gs_shader, log); - si_dump_gfx_descriptors(sctx, &sctx->ps_shader, log); + struct si_shader_ctx_state *tcs_shader; + + if (!log) + return; + + tcs_shader = &sctx->tcs_shader; + if (sctx->tes_shader.cso && !sctx->tcs_shader.cso) + tcs_shader = &sctx->fixed_func_tcs_shader; + + si_dump_framebuffer(sctx, log); + + si_dump_gfx_shader(sctx, &sctx->vs_shader, log); + si_dump_gfx_shader(sctx, tcs_shader, log); + si_dump_gfx_shader(sctx, &sctx->tes_shader, log); + si_dump_gfx_shader(sctx, &sctx->gs_shader, log); + si_dump_gfx_shader(sctx, &sctx->ps_shader, log); + + si_dump_descriptor_list(sctx->screen, 
&sctx->descriptors[SI_DESCS_RW_BUFFERS], "", "RW buffers", + 4, sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots, si_identity, + log); + si_dump_gfx_descriptors(sctx, &sctx->vs_shader, log); + si_dump_gfx_descriptors(sctx, tcs_shader, log); + si_dump_gfx_descriptors(sctx, &sctx->tes_shader, log); + si_dump_gfx_descriptors(sctx, &sctx->gs_shader, log); + si_dump_gfx_descriptors(sctx, &sctx->ps_shader, log); } void si_log_compute_state(struct si_context *sctx, struct u_log_context *log) { - if (!log) - return; + if (!log) + return; - si_dump_compute_shader(sctx, log); - si_dump_compute_descriptors(sctx, log); + si_dump_compute_shader(sctx, log); + si_dump_compute_descriptors(sctx, log); } -static void si_dump_dma(struct si_context *sctx, - struct radeon_saved_cs *saved, FILE *f) +static void si_dump_dma(struct si_context *sctx, struct radeon_saved_cs *saved, FILE *f) { - static const char ib_name[] = "sDMA IB"; - unsigned i; + static const char ib_name[] = "sDMA IB"; + unsigned i; - si_dump_bo_list(sctx, saved, f); + si_dump_bo_list(sctx, saved, f); - fprintf(f, "------------------ %s begin ------------------\n", ib_name); + fprintf(f, "------------------ %s begin ------------------\n", ib_name); - for (i = 0; i < saved->num_dw; ++i) { - fprintf(f, " %08x\n", saved->ib[i]); - } + for (i = 0; i < saved->num_dw; ++i) { + fprintf(f, " %08x\n", saved->ib[i]); + } - fprintf(f, "------------------- %s end -------------------\n", ib_name); - fprintf(f, "\n"); + fprintf(f, "------------------- %s end -------------------\n", ib_name); + fprintf(f, "\n"); - fprintf(f, "SDMA Dump Done.\n"); + fprintf(f, "SDMA Dump Done.\n"); } -void si_check_vm_faults(struct si_context *sctx, - struct radeon_saved_cs *saved, enum ring_type ring) +void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, enum ring_type ring) { - struct pipe_screen *screen = sctx->b.screen; - FILE *f; - uint64_t addr; - char cmd_line[4096]; - - if (!ac_vm_fault_occured(sctx->chip_class, - &sctx->dmesg_timestamp, &addr)) - return; - - f = dd_get_debug_file(false); - if (!f) - return; - - fprintf(f, "VM fault report.\n\n"); - if (os_get_command_line(cmd_line, sizeof(cmd_line))) - fprintf(f, "Command: %s\n", cmd_line); - fprintf(f, "Driver vendor: %s\n", screen->get_vendor(screen)); - fprintf(f, "Device vendor: %s\n", screen->get_device_vendor(screen)); - fprintf(f, "Device name: %s\n\n", screen->get_name(screen)); - fprintf(f, "Failing VM page: 0x%08"PRIx64"\n\n", addr); - - if (sctx->apitrace_call_number) - fprintf(f, "Last apitrace call: %u\n\n", - sctx->apitrace_call_number); - - switch (ring) { - case RING_GFX: { - struct u_log_context log; - u_log_context_init(&log); - - si_log_draw_state(sctx, &log); - si_log_compute_state(sctx, &log); - si_log_cs(sctx, &log, true); - - u_log_new_page_print(&log, f); - u_log_context_destroy(&log); - break; - } - case RING_DMA: - si_dump_dma(sctx, saved, f); - break; - - default: - break; - } - - fclose(f); - - fprintf(stderr, "Detected a VM fault, exiting...\n"); - exit(0); + struct pipe_screen *screen = sctx->b.screen; + FILE *f; + uint64_t addr; + char cmd_line[4096]; + + if (!ac_vm_fault_occured(sctx->chip_class, &sctx->dmesg_timestamp, &addr)) + return; + + f = dd_get_debug_file(false); + if (!f) + return; + + fprintf(f, "VM fault report.\n\n"); + if (os_get_command_line(cmd_line, sizeof(cmd_line))) + fprintf(f, "Command: %s\n", cmd_line); + fprintf(f, "Driver vendor: %s\n", screen->get_vendor(screen)); + fprintf(f, "Device vendor: %s\n", 
screen->get_device_vendor(screen)); + fprintf(f, "Device name: %s\n\n", screen->get_name(screen)); + fprintf(f, "Failing VM page: 0x%08" PRIx64 "\n\n", addr); + + if (sctx->apitrace_call_number) + fprintf(f, "Last apitrace call: %u\n\n", sctx->apitrace_call_number); + + switch (ring) { + case RING_GFX: { + struct u_log_context log; + u_log_context_init(&log); + + si_log_draw_state(sctx, &log); + si_log_compute_state(sctx, &log); + si_log_cs(sctx, &log, true); + + u_log_new_page_print(&log, f); + u_log_context_destroy(&log); + break; + } + case RING_DMA: + si_dump_dma(sctx, saved, f); + break; + + default: + break; + } + + fclose(f); + + fprintf(stderr, "Detected a VM fault, exiting...\n"); + exit(0); } void si_init_debug_functions(struct si_context *sctx) { - sctx->b.dump_debug_state = si_dump_debug_state; - - /* Set the initial dmesg timestamp for this context, so that - * only new messages will be checked for VM faults. - */ - if (sctx->screen->debug_flags & DBG(CHECK_VM)) - ac_vm_fault_occured(sctx->chip_class, - &sctx->dmesg_timestamp, NULL); + sctx->b.dump_debug_state = si_dump_debug_state; + + /* Set the initial dmesg timestamp for this context, so that + * only new messages will be checked for VM faults. + */ + if (sctx->screen->debug_flags & DBG(CHECK_VM)) + ac_vm_fault_occured(sctx->chip_class, &sctx->dmesg_timestamp, NULL); } diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h b/src/gallium/drivers/radeonsi/si_debug_options.h index b0e8db8646a..83c7425e094 100644 --- a/src/gallium/drivers/radeonsi/si_debug_options.h +++ b/src/gallium/drivers/radeonsi/si_debug_options.h @@ -1,9 +1,11 @@ OPT_BOOL(aux_debug, false, "Generate ddebug_dumps for the auxiliary context") OPT_BOOL(sync_compile, false, "Always compile synchronously (will cause stalls)") OPT_BOOL(dump_shader_binary, false, "Dump shader binary as part of ddebug_dumps") -OPT_BOOL(debug_disassembly, false, "Report shader disassembly as part of driver debug messages (for shader db)") +OPT_BOOL(debug_disassembly, false, + "Report shader disassembly as part of driver debug messages (for shader db)") OPT_BOOL(halt_shaders, false, "Halt shaders at the start (will hang)") -OPT_BOOL(vs_fetch_always_opencode, false, "Always open code vertex fetches (less efficient, purely for testing)") +OPT_BOOL(vs_fetch_always_opencode, false, + "Always open code vertex fetches (less efficient, purely for testing)") OPT_BOOL(prim_restart_tri_strips_only, false, "Only enable primitive restart for triangle strips") #undef OPT_BOOL diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index fa2174bac5d..bf3ede49b39 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -55,14 +55,12 @@ #include "si_pipe.h" #include "sid.h" - +#include "util/format/u_format.h" #include "util/hash_table.h" #include "util/u_idalloc.h" -#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_upload_mgr.h" - /* NULL image and buffer descriptor for textures (alpha = 1) and images * (alpha = 0). * @@ -75,221 +73,197 @@ * This is the only reason why the buffer descriptor must be in words [4:7]. 
*/ static uint32_t null_texture_descriptor[8] = { - 0, - 0, - 0, - S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) | - S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D) - /* the rest must contain zeros, which is also used by the buffer - * descriptor */ + 0, 0, 0, S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) | S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D) + /* the rest must contain zeros, which is also used by the buffer + * descriptor */ }; static uint32_t null_image_descriptor[8] = { - 0, - 0, - 0, - S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D) - /* the rest must contain zeros, which is also used by the buffer - * descriptor */ + 0, 0, 0, S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D) + /* the rest must contain zeros, which is also used by the buffer + * descriptor */ }; static uint64_t si_desc_extract_buffer_address(const uint32_t *desc) { - uint64_t va = desc[0] | - ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32); + uint64_t va = desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32); - /* Sign-extend the 48-bit address. */ - va <<= 16; - va = (int64_t)va >> 16; - return va; + /* Sign-extend the 48-bit address. */ + va <<= 16; + va = (int64_t)va >> 16; + return va; } -static void si_init_descriptor_list(uint32_t *desc_list, - unsigned element_dw_size, - unsigned num_elements, - const uint32_t *null_descriptor) +static void si_init_descriptor_list(uint32_t *desc_list, unsigned element_dw_size, + unsigned num_elements, const uint32_t *null_descriptor) { - int i; + int i; - /* Initialize the array to NULL descriptors if the element size is 8. */ - if (null_descriptor) { - assert(element_dw_size % 8 == 0); - for (i = 0; i < num_elements * element_dw_size / 8; i++) - memcpy(desc_list + i * 8, null_descriptor, 8 * 4); - } + /* Initialize the array to NULL descriptors if the element size is 8. */ + if (null_descriptor) { + assert(element_dw_size % 8 == 0); + for (i = 0; i < num_elements * element_dw_size / 8; i++) + memcpy(desc_list + i * 8, null_descriptor, 8 * 4); + } } -static void si_init_descriptors(struct si_descriptors *desc, - short shader_userdata_rel_index, - unsigned element_dw_size, - unsigned num_elements) +static void si_init_descriptors(struct si_descriptors *desc, short shader_userdata_rel_index, + unsigned element_dw_size, unsigned num_elements) { - desc->list = CALLOC(num_elements, element_dw_size * 4); - desc->element_dw_size = element_dw_size; - desc->num_elements = num_elements; - desc->shader_userdata_offset = shader_userdata_rel_index * 4; - desc->slot_index_to_bind_directly = -1; + desc->list = CALLOC(num_elements, element_dw_size * 4); + desc->element_dw_size = element_dw_size; + desc->num_elements = num_elements; + desc->shader_userdata_offset = shader_userdata_rel_index * 4; + desc->slot_index_to_bind_directly = -1; } static void si_release_descriptors(struct si_descriptors *desc) { - si_resource_reference(&desc->buffer, NULL); - FREE(desc->list); + si_resource_reference(&desc->buffer, NULL); + FREE(desc->list); } -static bool si_upload_descriptors(struct si_context *sctx, - struct si_descriptors *desc) +static bool si_upload_descriptors(struct si_context *sctx, struct si_descriptors *desc) { - unsigned slot_size = desc->element_dw_size * 4; - unsigned first_slot_offset = desc->first_active_slot * slot_size; - unsigned upload_size = desc->num_active_slots * slot_size; + unsigned slot_size = desc->element_dw_size * 4; + unsigned first_slot_offset = desc->first_active_slot * slot_size; + unsigned upload_size = desc->num_active_slots * slot_size; - /* Skip the upload if no shader is using the descriptors. 
dirty_mask - * will stay dirty and the descriptors will be uploaded when there is - * a shader using them. - */ - if (!upload_size) - return true; + /* Skip the upload if no shader is using the descriptors. dirty_mask + * will stay dirty and the descriptors will be uploaded when there is + * a shader using them. + */ + if (!upload_size) + return true; - /* If there is just one active descriptor, bind it directly. */ - if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly && - desc->num_active_slots == 1) { - uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly * - desc->element_dw_size]; + /* If there is just one active descriptor, bind it directly. */ + if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly && + desc->num_active_slots == 1) { + uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly * desc->element_dw_size]; - /* The buffer is already in the buffer list. */ - si_resource_reference(&desc->buffer, NULL); - desc->gpu_list = NULL; - desc->gpu_address = si_desc_extract_buffer_address(descriptor); - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); - return true; - } + /* The buffer is already in the buffer list. */ + si_resource_reference(&desc->buffer, NULL); + desc->gpu_list = NULL; + desc->gpu_address = si_desc_extract_buffer_address(descriptor); + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + return true; + } - uint32_t *ptr; - unsigned buffer_offset; - u_upload_alloc(sctx->b.const_uploader, first_slot_offset, upload_size, - si_optimal_tcc_alignment(sctx, upload_size), - &buffer_offset, (struct pipe_resource**)&desc->buffer, - (void**)&ptr); - if (!desc->buffer) { - desc->gpu_address = 0; - return false; /* skip the draw call */ - } + uint32_t *ptr; + unsigned buffer_offset; + u_upload_alloc(sctx->b.const_uploader, first_slot_offset, upload_size, + si_optimal_tcc_alignment(sctx, upload_size), &buffer_offset, + (struct pipe_resource **)&desc->buffer, (void **)&ptr); + if (!desc->buffer) { + desc->gpu_address = 0; + return false; /* skip the draw call */ + } - util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset, - upload_size); - desc->gpu_list = ptr - first_slot_offset / 4; + util_memcpy_cpu_to_le32(ptr, (char *)desc->list + first_slot_offset, upload_size); + desc->gpu_list = ptr - first_slot_offset / 4; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, - RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); - /* The shader pointer should point to slot 0. */ - buffer_offset -= first_slot_offset; - desc->gpu_address = desc->buffer->gpu_address + buffer_offset; + /* The shader pointer should point to slot 0. 
*/ + buffer_offset -= first_slot_offset; + desc->gpu_address = desc->buffer->gpu_address + buffer_offset; - assert(desc->buffer->flags & RADEON_FLAG_32BIT); - assert((desc->buffer->gpu_address >> 32) == sctx->screen->info.address32_hi); - assert((desc->gpu_address >> 32) == sctx->screen->info.address32_hi); + assert(desc->buffer->flags & RADEON_FLAG_32BIT); + assert((desc->buffer->gpu_address >> 32) == sctx->screen->info.address32_hi); + assert((desc->gpu_address >> 32) == sctx->screen->info.address32_hi); - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); - return true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + return true; } -static void -si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc) +static void si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc) { - if (!desc->buffer) - return; + if (!desc->buffer) + return; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, - RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, desc->buffer, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); } /* SAMPLER VIEWS */ -static inline enum radeon_bo_priority -si_get_sampler_view_priority(struct si_resource *res) +static inline enum radeon_bo_priority si_get_sampler_view_priority(struct si_resource *res) { - if (res->b.b.target == PIPE_BUFFER) - return RADEON_PRIO_SAMPLER_BUFFER; + if (res->b.b.target == PIPE_BUFFER) + return RADEON_PRIO_SAMPLER_BUFFER; - if (res->b.b.nr_samples > 1) - return RADEON_PRIO_SAMPLER_TEXTURE_MSAA; + if (res->b.b.nr_samples > 1) + return RADEON_PRIO_SAMPLER_TEXTURE_MSAA; - return RADEON_PRIO_SAMPLER_TEXTURE; + return RADEON_PRIO_SAMPLER_TEXTURE; } -static struct si_descriptors * -si_sampler_and_image_descriptors(struct si_context *sctx, unsigned shader) +static struct si_descriptors *si_sampler_and_image_descriptors(struct si_context *sctx, + unsigned shader) { - return &sctx->descriptors[si_sampler_and_image_descriptors_idx(shader)]; + return &sctx->descriptors[si_sampler_and_image_descriptors_idx(shader)]; } static void si_release_sampler_views(struct si_samplers *samplers) { - int i; + int i; - for (i = 0; i < ARRAY_SIZE(samplers->views); i++) { - pipe_sampler_view_reference(&samplers->views[i], NULL); - } + for (i = 0; i < ARRAY_SIZE(samplers->views); i++) { + pipe_sampler_view_reference(&samplers->views[i], NULL); + } } -static void si_sampler_view_add_buffer(struct si_context *sctx, - struct pipe_resource *resource, - enum radeon_bo_usage usage, - bool is_stencil_sampler, - bool check_mem) +static void si_sampler_view_add_buffer(struct si_context *sctx, struct pipe_resource *resource, + enum radeon_bo_usage usage, bool is_stencil_sampler, + bool check_mem) { - struct si_texture *tex = (struct si_texture*)resource; - enum radeon_bo_priority priority; + struct si_texture *tex = (struct si_texture *)resource; + enum radeon_bo_priority priority; - if (!resource) - return; + if (!resource) + return; - /* Use the flushed depth texture if direct sampling is unsupported. */ - if (resource->target != PIPE_BUFFER && - tex->is_depth && !si_can_sample_zs(tex, is_stencil_sampler)) - tex = tex->flushed_depth_texture; + /* Use the flushed depth texture if direct sampling is unsupported. 
*/ + if (resource->target != PIPE_BUFFER && tex->is_depth && + !si_can_sample_zs(tex, is_stencil_sampler)) + tex = tex->flushed_depth_texture; - priority = si_get_sampler_view_priority(&tex->buffer); - radeon_add_to_gfx_buffer_list_check_mem(sctx, &tex->buffer, usage, priority, - check_mem); + priority = si_get_sampler_view_priority(&tex->buffer); + radeon_add_to_gfx_buffer_list_check_mem(sctx, &tex->buffer, usage, priority, check_mem); - if (resource->target == PIPE_BUFFER) - return; + if (resource->target == PIPE_BUFFER) + return; - /* Add separate DCC. */ - if (tex->dcc_separate_buffer) { - radeon_add_to_gfx_buffer_list_check_mem(sctx, tex->dcc_separate_buffer, - usage, RADEON_PRIO_SEPARATE_META, check_mem); - } + /* Add separate DCC. */ + if (tex->dcc_separate_buffer) { + radeon_add_to_gfx_buffer_list_check_mem(sctx, tex->dcc_separate_buffer, usage, + RADEON_PRIO_SEPARATE_META, check_mem); + } } -static void si_sampler_views_begin_new_cs(struct si_context *sctx, - struct si_samplers *samplers) +static void si_sampler_views_begin_new_cs(struct si_context *sctx, struct si_samplers *samplers) { - unsigned mask = samplers->enabled_mask; + unsigned mask = samplers->enabled_mask; - /* Add buffers to the CS. */ - while (mask) { - int i = u_bit_scan(&mask); - struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[i]; + /* Add buffers to the CS. */ + while (mask) { + int i = u_bit_scan(&mask); + struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[i]; - si_sampler_view_add_buffer(sctx, sview->base.texture, - RADEON_USAGE_READ, - sview->is_stencil_sampler, false); - } + si_sampler_view_add_buffer(sctx, sview->base.texture, RADEON_USAGE_READ, + sview->is_stencil_sampler, false); + } } /* Set buffer descriptor fields that can be changed by reallocations. */ -static void si_set_buf_desc_address(struct si_resource *buf, - uint64_t offset, uint32_t *state) +static void si_set_buf_desc_address(struct si_resource *buf, uint64_t offset, uint32_t *state) { - uint64_t va = buf->gpu_address + offset; + uint64_t va = buf->gpu_address + offset; - state[0] = va; - state[1] &= C_008F04_BASE_ADDRESS_HI; - state[1] |= S_008F04_BASE_ADDRESS_HI(va >> 32); + state[0] = va; + state[1] &= C_008F04_BASE_ADDRESS_HI; + state[1] |= S_008F04_BASE_ADDRESS_HI(va >> 32); } /* Set texture descriptor fields that can be changed by reallocations. @@ -302,1316 +276,1195 @@ static void si_set_buf_desc_address(struct si_resource *buf, * \param is_stencil select between separate Z & Stencil * \param state descriptor to update */ -void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, - struct si_texture *tex, - const struct legacy_surf_level *base_level_info, - unsigned base_level, unsigned first_level, - unsigned block_width, bool is_stencil, - uint32_t *state) -{ - uint64_t va, meta_va = 0; - - if (tex->is_depth && !si_can_sample_zs(tex, is_stencil)) { - tex = tex->flushed_depth_texture; - is_stencil = false; - } - - va = tex->buffer.gpu_address; - - if (sscreen->info.chip_class >= GFX9) { - /* Only stencil_offset needs to be added here. */ - if (is_stencil) - va += tex->surface.u.gfx9.stencil_offset; - else - va += tex->surface.u.gfx9.surf_offset; - } else { - va += base_level_info->offset; - } - - state[0] = va >> 8; - state[1] &= C_008F14_BASE_ADDRESS_HI; - state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40); - - /* Only macrotiled modes can set tile swizzle. - * GFX9 doesn't use (legacy) base_level_info. 
- */ - if (sscreen->info.chip_class >= GFX9 || - base_level_info->mode == RADEON_SURF_MODE_2D) - state[0] |= tex->surface.tile_swizzle; - - if (sscreen->info.chip_class >= GFX8) { - state[6] &= C_008F28_COMPRESSION_EN; - - if (vi_dcc_enabled(tex, first_level)) { - meta_va = (!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + - tex->surface.dcc_offset; - - if (sscreen->info.chip_class == GFX8) { - meta_va += base_level_info->dcc_offset; - assert(base_level_info->mode == RADEON_SURF_MODE_2D); - } - - unsigned dcc_tile_swizzle = tex->surface.tile_swizzle << 8; - dcc_tile_swizzle &= tex->surface.dcc_alignment - 1; - meta_va |= dcc_tile_swizzle; - } else if (vi_tc_compat_htile_enabled(tex, first_level, - is_stencil ? PIPE_MASK_S : PIPE_MASK_Z)) { - meta_va = tex->buffer.gpu_address + tex->surface.htile_offset; - } - - if (meta_va) - state[6] |= S_008F28_COMPRESSION_EN(1); - } - - if (sscreen->info.chip_class >= GFX8 && sscreen->info.chip_class <= GFX9) - state[7] = meta_va >> 8; - - if (sscreen->info.chip_class >= GFX10) { - state[3] &= C_00A00C_SW_MODE; - - if (is_stencil) { - state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); - } else { - state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode); - } - - state[6] &= C_00A018_META_DATA_ADDRESS_LO & - C_00A018_META_PIPE_ALIGNED; - - if (meta_va) { - struct gfx9_surf_meta_flags meta; - - if (tex->surface.dcc_offset) - meta = tex->surface.u.gfx9.dcc; - else - meta = tex->surface.u.gfx9.htile; - - state[6] |= S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) | - S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8); - } - - state[7] = meta_va >> 16; - } else if (sscreen->info.chip_class == GFX9) { - state[3] &= C_008F1C_SW_MODE; - state[4] &= C_008F20_PITCH; - - if (is_stencil) { - state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); - state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.stencil.epitch); - } else { - state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode); - state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.surf.epitch); - } - - state[5] &= C_008F24_META_DATA_ADDRESS & - C_008F24_META_PIPE_ALIGNED & - C_008F24_META_RB_ALIGNED; - if (meta_va) { - struct gfx9_surf_meta_flags meta; - - if (tex->surface.dcc_offset) - meta = tex->surface.u.gfx9.dcc; - else - meta = tex->surface.u.gfx9.htile; - - state[5] |= S_008F24_META_DATA_ADDRESS(meta_va >> 40) | - S_008F24_META_PIPE_ALIGNED(meta.pipe_aligned) | - S_008F24_META_RB_ALIGNED(meta.rb_aligned); - } - } else { - /* GFX6-GFX8 */ - unsigned pitch = base_level_info->nblk_x * block_width; - unsigned index = si_tile_mode_index(tex, base_level, is_stencil); - - state[3] &= C_008F1C_TILING_INDEX; - state[3] |= S_008F1C_TILING_INDEX(index); - state[4] &= C_008F20_PITCH; - state[4] |= S_008F20_PITCH(pitch - 1); - } +void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture *tex, + const struct legacy_surf_level *base_level_info, + unsigned base_level, unsigned first_level, unsigned block_width, + bool is_stencil, uint32_t *state) +{ + uint64_t va, meta_va = 0; + + if (tex->is_depth && !si_can_sample_zs(tex, is_stencil)) { + tex = tex->flushed_depth_texture; + is_stencil = false; + } + + va = tex->buffer.gpu_address; + + if (sscreen->info.chip_class >= GFX9) { + /* Only stencil_offset needs to be added here. 
*/ + if (is_stencil) + va += tex->surface.u.gfx9.stencil_offset; + else + va += tex->surface.u.gfx9.surf_offset; + } else { + va += base_level_info->offset; + } + + state[0] = va >> 8; + state[1] &= C_008F14_BASE_ADDRESS_HI; + state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40); + + /* Only macrotiled modes can set tile swizzle. + * GFX9 doesn't use (legacy) base_level_info. + */ + if (sscreen->info.chip_class >= GFX9 || base_level_info->mode == RADEON_SURF_MODE_2D) + state[0] |= tex->surface.tile_swizzle; + + if (sscreen->info.chip_class >= GFX8) { + state[6] &= C_008F28_COMPRESSION_EN; + + if (vi_dcc_enabled(tex, first_level)) { + meta_va = + (!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + tex->surface.dcc_offset; + + if (sscreen->info.chip_class == GFX8) { + meta_va += base_level_info->dcc_offset; + assert(base_level_info->mode == RADEON_SURF_MODE_2D); + } + + unsigned dcc_tile_swizzle = tex->surface.tile_swizzle << 8; + dcc_tile_swizzle &= tex->surface.dcc_alignment - 1; + meta_va |= dcc_tile_swizzle; + } else if (vi_tc_compat_htile_enabled(tex, first_level, + is_stencil ? PIPE_MASK_S : PIPE_MASK_Z)) { + meta_va = tex->buffer.gpu_address + tex->surface.htile_offset; + } + + if (meta_va) + state[6] |= S_008F28_COMPRESSION_EN(1); + } + + if (sscreen->info.chip_class >= GFX8 && sscreen->info.chip_class <= GFX9) + state[7] = meta_va >> 8; + + if (sscreen->info.chip_class >= GFX10) { + state[3] &= C_00A00C_SW_MODE; + + if (is_stencil) { + state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); + } else { + state[3] |= S_00A00C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode); + } + + state[6] &= C_00A018_META_DATA_ADDRESS_LO & C_00A018_META_PIPE_ALIGNED; + + if (meta_va) { + struct gfx9_surf_meta_flags meta; + + if (tex->surface.dcc_offset) + meta = tex->surface.u.gfx9.dcc; + else + meta = tex->surface.u.gfx9.htile; + + state[6] |= S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) | + S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8); + } + + state[7] = meta_va >> 16; + } else if (sscreen->info.chip_class == GFX9) { + state[3] &= C_008F1C_SW_MODE; + state[4] &= C_008F20_PITCH; + + if (is_stencil) { + state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); + state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.stencil.epitch); + } else { + state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode); + state[4] |= S_008F20_PITCH(tex->surface.u.gfx9.surf.epitch); + } + + state[5] &= + C_008F24_META_DATA_ADDRESS & C_008F24_META_PIPE_ALIGNED & C_008F24_META_RB_ALIGNED; + if (meta_va) { + struct gfx9_surf_meta_flags meta; + + if (tex->surface.dcc_offset) + meta = tex->surface.u.gfx9.dcc; + else + meta = tex->surface.u.gfx9.htile; + + state[5] |= S_008F24_META_DATA_ADDRESS(meta_va >> 40) | + S_008F24_META_PIPE_ALIGNED(meta.pipe_aligned) | + S_008F24_META_RB_ALIGNED(meta.rb_aligned); + } + } else { + /* GFX6-GFX8 */ + unsigned pitch = base_level_info->nblk_x * block_width; + unsigned index = si_tile_mode_index(tex, base_level, is_stencil); + + state[3] &= C_008F1C_TILING_INDEX; + state[3] |= S_008F1C_TILING_INDEX(index); + state[4] &= C_008F20_PITCH; + state[4] |= S_008F20_PITCH(pitch - 1); + } } static void si_set_sampler_state_desc(struct si_sampler_state *sstate, - struct si_sampler_view *sview, - struct si_texture *tex, - uint32_t *desc) -{ - if (sview && sview->is_integer) - memcpy(desc, sstate->integer_val, 4*4); - else if (tex && tex->upgraded_depth && - (!sview || !sview->is_stencil_sampler)) - memcpy(desc, sstate->upgraded_depth_val, 4*4); - else - memcpy(desc, 
sstate->val, 4*4); -} - -static void si_set_sampler_view_desc(struct si_context *sctx, - struct si_sampler_view *sview, - struct si_sampler_state *sstate, - uint32_t *desc) -{ - struct pipe_sampler_view *view = &sview->base; - struct si_texture *tex = (struct si_texture *)view->texture; - bool is_buffer = tex->buffer.b.b.target == PIPE_BUFFER; - - if (unlikely(!is_buffer && sview->dcc_incompatible)) { - if (vi_dcc_enabled(tex, view->u.tex.first_level)) - if (!si_texture_disable_dcc(sctx, tex)) - si_decompress_dcc(sctx, tex); - - sview->dcc_incompatible = false; - } - - assert(tex); /* views with texture == NULL aren't supported */ - memcpy(desc, sview->state, 8*4); - - if (is_buffer) { - si_set_buf_desc_address(&tex->buffer, - sview->base.u.buf.offset, - desc + 4); - } else { - bool is_separate_stencil = tex->db_compatible && - sview->is_stencil_sampler; - - si_set_mutable_tex_desc_fields(sctx->screen, tex, - sview->base_level_info, - sview->base_level, - sview->base.u.tex.first_level, - sview->block_width, - is_separate_stencil, - desc); - } - - if (!is_buffer && tex->surface.fmask_size) { - memcpy(desc + 8, sview->fmask_state, 8*4); - } else { - /* Disable FMASK and bind sampler state in [12:15]. */ - memcpy(desc + 8, null_texture_descriptor, 4*4); - - if (sstate) - si_set_sampler_state_desc(sstate, sview, - is_buffer ? NULL : tex, - desc + 12); - } + struct si_sampler_view *sview, struct si_texture *tex, + uint32_t *desc) +{ + if (sview && sview->is_integer) + memcpy(desc, sstate->integer_val, 4 * 4); + else if (tex && tex->upgraded_depth && (!sview || !sview->is_stencil_sampler)) + memcpy(desc, sstate->upgraded_depth_val, 4 * 4); + else + memcpy(desc, sstate->val, 4 * 4); +} + +static void si_set_sampler_view_desc(struct si_context *sctx, struct si_sampler_view *sview, + struct si_sampler_state *sstate, uint32_t *desc) +{ + struct pipe_sampler_view *view = &sview->base; + struct si_texture *tex = (struct si_texture *)view->texture; + bool is_buffer = tex->buffer.b.b.target == PIPE_BUFFER; + + if (unlikely(!is_buffer && sview->dcc_incompatible)) { + if (vi_dcc_enabled(tex, view->u.tex.first_level)) + if (!si_texture_disable_dcc(sctx, tex)) + si_decompress_dcc(sctx, tex); + + sview->dcc_incompatible = false; + } + + assert(tex); /* views with texture == NULL aren't supported */ + memcpy(desc, sview->state, 8 * 4); + + if (is_buffer) { + si_set_buf_desc_address(&tex->buffer, sview->base.u.buf.offset, desc + 4); + } else { + bool is_separate_stencil = tex->db_compatible && sview->is_stencil_sampler; + + si_set_mutable_tex_desc_fields(sctx->screen, tex, sview->base_level_info, sview->base_level, + sview->base.u.tex.first_level, sview->block_width, + is_separate_stencil, desc); + } + + if (!is_buffer && tex->surface.fmask_size) { + memcpy(desc + 8, sview->fmask_state, 8 * 4); + } else { + /* Disable FMASK and bind sampler state in [12:15]. */ + memcpy(desc + 8, null_texture_descriptor, 4 * 4); + + if (sstate) + si_set_sampler_state_desc(sstate, sview, is_buffer ? NULL : tex, desc + 12); + } } static bool color_needs_decompression(struct si_texture *tex) { - return tex->surface.fmask_size || - (tex->dirty_level_mask && - (tex->cmask_buffer || tex->surface.dcc_offset)); + return tex->surface.fmask_size || + (tex->dirty_level_mask && (tex->cmask_buffer || tex->surface.dcc_offset)); } static bool depth_needs_decompression(struct si_texture *tex) { - /* If the depth/stencil texture is TC-compatible, no decompression - * will be done. 
The decompression function will only flush DB caches - * to make it coherent with shaders. That's necessary because the driver - * doesn't flush DB caches in any other case. - */ - return tex->db_compatible; -} - -static void si_set_sampler_view(struct si_context *sctx, - unsigned shader, - unsigned slot, struct pipe_sampler_view *view, - bool disallow_early_out) -{ - struct si_samplers *samplers = &sctx->samplers[shader]; - struct si_sampler_view *sview = (struct si_sampler_view*)view; - struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader); - unsigned desc_slot = si_get_sampler_slot(slot); - uint32_t *desc = descs->list + desc_slot * 16; - - if (samplers->views[slot] == view && !disallow_early_out) - return; - - if (view) { - struct si_texture *tex = (struct si_texture *)view->texture; - - si_set_sampler_view_desc(sctx, sview, - samplers->sampler_states[slot], desc); - - if (tex->buffer.b.b.target == PIPE_BUFFER) { - tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW; - samplers->needs_depth_decompress_mask &= ~(1u << slot); - samplers->needs_color_decompress_mask &= ~(1u << slot); - } else { - if (depth_needs_decompression(tex)) { - samplers->needs_depth_decompress_mask |= 1u << slot; - } else { - samplers->needs_depth_decompress_mask &= ~(1u << slot); - } - if (color_needs_decompression(tex)) { - samplers->needs_color_decompress_mask |= 1u << slot; - } else { - samplers->needs_color_decompress_mask &= ~(1u << slot); - } - - if (tex->surface.dcc_offset && - p_atomic_read(&tex->framebuffers_bound)) - sctx->need_check_render_feedback = true; - } - - pipe_sampler_view_reference(&samplers->views[slot], view); - samplers->enabled_mask |= 1u << slot; - - /* Since this can flush, it must be done after enabled_mask is - * updated. */ - si_sampler_view_add_buffer(sctx, view->texture, - RADEON_USAGE_READ, - sview->is_stencil_sampler, true); - } else { - pipe_sampler_view_reference(&samplers->views[slot], NULL); - memcpy(desc, null_texture_descriptor, 8*4); - /* Only clear the lower dwords of FMASK. */ - memcpy(desc + 8, null_texture_descriptor, 4*4); - /* Re-set the sampler state if we are transitioning from FMASK. 
*/ - if (samplers->sampler_states[slot]) - si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL, - desc + 12); - - samplers->enabled_mask &= ~(1u << slot); - samplers->needs_depth_decompress_mask &= ~(1u << slot); - samplers->needs_color_decompress_mask &= ~(1u << slot); - } - - sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); -} - -static void si_update_shader_needs_decompress_mask(struct si_context *sctx, - unsigned shader) -{ - struct si_samplers *samplers = &sctx->samplers[shader]; - unsigned shader_bit = 1 << shader; - - if (samplers->needs_depth_decompress_mask || - samplers->needs_color_decompress_mask || - sctx->images[shader].needs_color_decompress_mask) - sctx->shader_needs_decompress_mask |= shader_bit; - else - sctx->shader_needs_decompress_mask &= ~shader_bit; -} - -static void si_set_sampler_views(struct pipe_context *ctx, - enum pipe_shader_type shader, unsigned start, - unsigned count, - struct pipe_sampler_view **views) -{ - struct si_context *sctx = (struct si_context *)ctx; - int i; - - if (!count || shader >= SI_NUM_SHADERS) - return; - - if (views) { - for (i = 0; i < count; i++) - si_set_sampler_view(sctx, shader, start + i, views[i], false); - } else { - for (i = 0; i < count; i++) - si_set_sampler_view(sctx, shader, start + i, NULL, false); - } - - si_update_shader_needs_decompress_mask(sctx, shader); -} - -static void -si_samplers_update_needs_color_decompress_mask(struct si_samplers *samplers) -{ - unsigned mask = samplers->enabled_mask; - - while (mask) { - int i = u_bit_scan(&mask); - struct pipe_resource *res = samplers->views[i]->texture; - - if (res && res->target != PIPE_BUFFER) { - struct si_texture *tex = (struct si_texture *)res; - - if (color_needs_decompression(tex)) { - samplers->needs_color_decompress_mask |= 1u << i; - } else { - samplers->needs_color_decompress_mask &= ~(1u << i); - } - } - } + /* If the depth/stencil texture is TC-compatible, no decompression + * will be done. The decompression function will only flush DB caches + * to make it coherent with shaders. That's necessary because the driver + * doesn't flush DB caches in any other case. 
+ */ + return tex->db_compatible; +} + +static void si_set_sampler_view(struct si_context *sctx, unsigned shader, unsigned slot, + struct pipe_sampler_view *view, bool disallow_early_out) +{ + struct si_samplers *samplers = &sctx->samplers[shader]; + struct si_sampler_view *sview = (struct si_sampler_view *)view; + struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader); + unsigned desc_slot = si_get_sampler_slot(slot); + uint32_t *desc = descs->list + desc_slot * 16; + + if (samplers->views[slot] == view && !disallow_early_out) + return; + + if (view) { + struct si_texture *tex = (struct si_texture *)view->texture; + + si_set_sampler_view_desc(sctx, sview, samplers->sampler_states[slot], desc); + + if (tex->buffer.b.b.target == PIPE_BUFFER) { + tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW; + samplers->needs_depth_decompress_mask &= ~(1u << slot); + samplers->needs_color_decompress_mask &= ~(1u << slot); + } else { + if (depth_needs_decompression(tex)) { + samplers->needs_depth_decompress_mask |= 1u << slot; + } else { + samplers->needs_depth_decompress_mask &= ~(1u << slot); + } + if (color_needs_decompression(tex)) { + samplers->needs_color_decompress_mask |= 1u << slot; + } else { + samplers->needs_color_decompress_mask &= ~(1u << slot); + } + + if (tex->surface.dcc_offset && p_atomic_read(&tex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + } + + pipe_sampler_view_reference(&samplers->views[slot], view); + samplers->enabled_mask |= 1u << slot; + + /* Since this can flush, it must be done after enabled_mask is + * updated. */ + si_sampler_view_add_buffer(sctx, view->texture, RADEON_USAGE_READ, sview->is_stencil_sampler, + true); + } else { + pipe_sampler_view_reference(&samplers->views[slot], NULL); + memcpy(desc, null_texture_descriptor, 8 * 4); + /* Only clear the lower dwords of FMASK. */ + memcpy(desc + 8, null_texture_descriptor, 4 * 4); + /* Re-set the sampler state if we are transitioning from FMASK. 
*/ + if (samplers->sampler_states[slot]) + si_set_sampler_state_desc(samplers->sampler_states[slot], NULL, NULL, desc + 12); + + samplers->enabled_mask &= ~(1u << slot); + samplers->needs_depth_decompress_mask &= ~(1u << slot); + samplers->needs_color_decompress_mask &= ~(1u << slot); + } + + sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); +} + +static void si_update_shader_needs_decompress_mask(struct si_context *sctx, unsigned shader) +{ + struct si_samplers *samplers = &sctx->samplers[shader]; + unsigned shader_bit = 1 << shader; + + if (samplers->needs_depth_decompress_mask || samplers->needs_color_decompress_mask || + sctx->images[shader].needs_color_decompress_mask) + sctx->shader_needs_decompress_mask |= shader_bit; + else + sctx->shader_needs_decompress_mask &= ~shader_bit; +} + +static void si_set_sampler_views(struct pipe_context *ctx, enum pipe_shader_type shader, + unsigned start, unsigned count, struct pipe_sampler_view **views) +{ + struct si_context *sctx = (struct si_context *)ctx; + int i; + + if (!count || shader >= SI_NUM_SHADERS) + return; + + if (views) { + for (i = 0; i < count; i++) + si_set_sampler_view(sctx, shader, start + i, views[i], false); + } else { + for (i = 0; i < count; i++) + si_set_sampler_view(sctx, shader, start + i, NULL, false); + } + + si_update_shader_needs_decompress_mask(sctx, shader); +} + +static void si_samplers_update_needs_color_decompress_mask(struct si_samplers *samplers) +{ + unsigned mask = samplers->enabled_mask; + + while (mask) { + int i = u_bit_scan(&mask); + struct pipe_resource *res = samplers->views[i]->texture; + + if (res && res->target != PIPE_BUFFER) { + struct si_texture *tex = (struct si_texture *)res; + + if (color_needs_decompression(tex)) { + samplers->needs_color_decompress_mask |= 1u << i; + } else { + samplers->needs_color_decompress_mask &= ~(1u << i); + } + } + } } /* IMAGE VIEWS */ -static void -si_release_image_views(struct si_images *images) +static void si_release_image_views(struct si_images *images) { - unsigned i; + unsigned i; - for (i = 0; i < SI_NUM_IMAGES; ++i) { - struct pipe_image_view *view = &images->views[i]; + for (i = 0; i < SI_NUM_IMAGES; ++i) { + struct pipe_image_view *view = &images->views[i]; - pipe_resource_reference(&view->resource, NULL); - } + pipe_resource_reference(&view->resource, NULL); + } } -static void -si_image_views_begin_new_cs(struct si_context *sctx, struct si_images *images) +static void si_image_views_begin_new_cs(struct si_context *sctx, struct si_images *images) { - uint mask = images->enabled_mask; + uint mask = images->enabled_mask; + + /* Add buffers to the CS. */ + while (mask) { + int i = u_bit_scan(&mask); + struct pipe_image_view *view = &images->views[i]; - /* Add buffers to the CS. 
*/ - while (mask) { - int i = u_bit_scan(&mask); - struct pipe_image_view *view = &images->views[i]; + assert(view->resource); - assert(view->resource); + si_sampler_view_add_buffer(sctx, view->resource, RADEON_USAGE_READWRITE, false, false); + } +} - si_sampler_view_add_buffer(sctx, view->resource, - RADEON_USAGE_READWRITE, false, false); - } -} - -static void -si_disable_shader_image(struct si_context *ctx, unsigned shader, unsigned slot) -{ - struct si_images *images = &ctx->images[shader]; - - if (images->enabled_mask & (1u << slot)) { - struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader); - unsigned desc_slot = si_get_image_slot(slot); - - pipe_resource_reference(&images->views[slot].resource, NULL); - images->needs_color_decompress_mask &= ~(1 << slot); - - memcpy(descs->list + desc_slot*8, null_image_descriptor, 8*4); - images->enabled_mask &= ~(1u << slot); - ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); - } -} - -static void -si_mark_image_range_valid(const struct pipe_image_view *view) -{ - struct si_resource *res = si_resource(view->resource); - - if (res->b.b.target != PIPE_BUFFER) - return; - - util_range_add(&res->b.b, &res->valid_buffer_range, - view->u.buf.offset, - view->u.buf.offset + view->u.buf.size); -} - -static void si_set_shader_image_desc(struct si_context *ctx, - const struct pipe_image_view *view, - bool skip_decompress, - uint32_t *desc, uint32_t *fmask_desc) -{ - struct si_screen *screen = ctx->screen; - struct si_resource *res; - - res = si_resource(view->resource); - - if (res->b.b.target == PIPE_BUFFER || - view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { - if (view->access & PIPE_IMAGE_ACCESS_WRITE) - si_mark_image_range_valid(view); - - si_make_buffer_descriptor(screen, res, - view->format, - view->u.buf.offset, - view->u.buf.size, desc); - si_set_buf_desc_address(res, view->u.buf.offset, desc + 4); - } else { - static const unsigned char swizzle[4] = { 0, 1, 2, 3 }; - struct si_texture *tex = (struct si_texture *)res; - unsigned level = view->u.tex.level; - unsigned width, height, depth, hw_level; - bool uses_dcc = vi_dcc_enabled(tex, level); - unsigned access = view->access; - - assert(!tex->is_depth); - assert(fmask_desc || tex->surface.fmask_offset == 0); - - if (uses_dcc && !skip_decompress && - (access & PIPE_IMAGE_ACCESS_WRITE || - !vi_dcc_formats_compatible(screen, res->b.b.format, view->format))) { - /* If DCC can't be disabled, at least decompress it. - * The decompression is relatively cheap if the surface - * has been decompressed already. - */ - if (!si_texture_disable_dcc(ctx, tex)) - si_decompress_dcc(ctx, tex); - } - - if (ctx->chip_class >= GFX9) { - /* Always set the base address. The swizzle modes don't - * allow setting mipmap level offsets as the base. - */ - width = res->b.b.width0; - height = res->b.b.height0; - depth = res->b.b.depth0; - hw_level = level; - } else { - /* Always force the base level to the selected level. - * - * This is required for 3D textures, where otherwise - * selecting a single slice for non-layered bindings - * fails. It doesn't hurt the other targets. 
- */ - width = u_minify(res->b.b.width0, level); - height = u_minify(res->b.b.height0, level); - depth = u_minify(res->b.b.depth0, level); - hw_level = 0; - } - - screen->make_texture_descriptor(screen, tex, - false, res->b.b.target, - view->format, swizzle, - hw_level, hw_level, - view->u.tex.first_layer, - view->u.tex.last_layer, - width, height, depth, - desc, fmask_desc); - si_set_mutable_tex_desc_fields(screen, tex, - &tex->surface.u.legacy.level[level], - level, level, - util_format_get_blockwidth(view->format), - false, desc); - } -} - -static void si_set_shader_image(struct si_context *ctx, - unsigned shader, - unsigned slot, const struct pipe_image_view *view, - bool skip_decompress) -{ - struct si_images *images = &ctx->images[shader]; - struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader); - struct si_resource *res; - - if (!view || !view->resource) { - si_disable_shader_image(ctx, shader, slot); - return; - } - - res = si_resource(view->resource); - - if (&images->views[slot] != view) - util_copy_image_view(&images->views[slot], view); - - si_set_shader_image_desc(ctx, view, skip_decompress, - descs->list + si_get_image_slot(slot) * 8, - descs->list + si_get_image_slot(slot + SI_NUM_IMAGES) * 8); - - if (res->b.b.target == PIPE_BUFFER || - view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { - images->needs_color_decompress_mask &= ~(1 << slot); - res->bind_history |= PIPE_BIND_SHADER_IMAGE; - } else { - struct si_texture *tex = (struct si_texture *)res; - unsigned level = view->u.tex.level; - - if (color_needs_decompression(tex)) { - images->needs_color_decompress_mask |= 1 << slot; - } else { - images->needs_color_decompress_mask &= ~(1 << slot); - } - - if (vi_dcc_enabled(tex, level) && - p_atomic_read(&tex->framebuffers_bound)) - ctx->need_check_render_feedback = true; - } - - images->enabled_mask |= 1u << slot; - ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); - - /* Since this can flush, it must be done after enabled_mask is updated. */ - si_sampler_view_add_buffer(ctx, &res->b.b, - (view->access & PIPE_IMAGE_ACCESS_WRITE) ? 
- RADEON_USAGE_READWRITE : RADEON_USAGE_READ, - false, true); -} - -static void -si_set_shader_images(struct pipe_context *pipe, - enum pipe_shader_type shader, - unsigned start_slot, unsigned count, - const struct pipe_image_view *views) -{ - struct si_context *ctx = (struct si_context *)pipe; - unsigned i, slot; - - assert(shader < SI_NUM_SHADERS); - - if (!count) - return; - - assert(start_slot + count <= SI_NUM_IMAGES); - - if (views) { - for (i = 0, slot = start_slot; i < count; ++i, ++slot) - si_set_shader_image(ctx, shader, slot, &views[i], false); - } else { - for (i = 0, slot = start_slot; i < count; ++i, ++slot) - si_set_shader_image(ctx, shader, slot, NULL, false); - } - - si_update_shader_needs_decompress_mask(ctx, shader); -} - -static void -si_images_update_needs_color_decompress_mask(struct si_images *images) +static void si_disable_shader_image(struct si_context *ctx, unsigned shader, unsigned slot) { - unsigned mask = images->enabled_mask; + struct si_images *images = &ctx->images[shader]; + + if (images->enabled_mask & (1u << slot)) { + struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader); + unsigned desc_slot = si_get_image_slot(slot); + + pipe_resource_reference(&images->views[slot].resource, NULL); + images->needs_color_decompress_mask &= ~(1 << slot); + + memcpy(descs->list + desc_slot * 8, null_image_descriptor, 8 * 4); + images->enabled_mask &= ~(1u << slot); + ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); + } +} - while (mask) { - int i = u_bit_scan(&mask); - struct pipe_resource *res = images->views[i].resource; +static void si_mark_image_range_valid(const struct pipe_image_view *view) +{ + struct si_resource *res = si_resource(view->resource); - if (res && res->target != PIPE_BUFFER) { - struct si_texture *tex = (struct si_texture *)res; + if (res->b.b.target != PIPE_BUFFER) + return; - if (color_needs_decompression(tex)) { - images->needs_color_decompress_mask |= 1 << i; - } else { - images->needs_color_decompress_mask &= ~(1 << i); - } - } - } + util_range_add(&res->b.b, &res->valid_buffer_range, view->u.buf.offset, + view->u.buf.offset + view->u.buf.size); +} + +static void si_set_shader_image_desc(struct si_context *ctx, const struct pipe_image_view *view, + bool skip_decompress, uint32_t *desc, uint32_t *fmask_desc) +{ + struct si_screen *screen = ctx->screen; + struct si_resource *res; + + res = si_resource(view->resource); + + if (res->b.b.target == PIPE_BUFFER || view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { + if (view->access & PIPE_IMAGE_ACCESS_WRITE) + si_mark_image_range_valid(view); + + si_make_buffer_descriptor(screen, res, view->format, view->u.buf.offset, view->u.buf.size, + desc); + si_set_buf_desc_address(res, view->u.buf.offset, desc + 4); + } else { + static const unsigned char swizzle[4] = {0, 1, 2, 3}; + struct si_texture *tex = (struct si_texture *)res; + unsigned level = view->u.tex.level; + unsigned width, height, depth, hw_level; + bool uses_dcc = vi_dcc_enabled(tex, level); + unsigned access = view->access; + + assert(!tex->is_depth); + assert(fmask_desc || tex->surface.fmask_offset == 0); + + if (uses_dcc && !skip_decompress && + (access & PIPE_IMAGE_ACCESS_WRITE || + !vi_dcc_formats_compatible(screen, res->b.b.format, view->format))) { + /* If DCC can't be disabled, at least decompress it. + * The decompression is relatively cheap if the surface + * has been decompressed already. 
+ */ + if (!si_texture_disable_dcc(ctx, tex)) + si_decompress_dcc(ctx, tex); + } + + if (ctx->chip_class >= GFX9) { + /* Always set the base address. The swizzle modes don't + * allow setting mipmap level offsets as the base. + */ + width = res->b.b.width0; + height = res->b.b.height0; + depth = res->b.b.depth0; + hw_level = level; + } else { + /* Always force the base level to the selected level. + * + * This is required for 3D textures, where otherwise + * selecting a single slice for non-layered bindings + * fails. It doesn't hurt the other targets. + */ + width = u_minify(res->b.b.width0, level); + height = u_minify(res->b.b.height0, level); + depth = u_minify(res->b.b.depth0, level); + hw_level = 0; + } + + screen->make_texture_descriptor( + screen, tex, false, res->b.b.target, view->format, swizzle, hw_level, hw_level, + view->u.tex.first_layer, view->u.tex.last_layer, width, height, depth, desc, fmask_desc); + si_set_mutable_tex_desc_fields(screen, tex, &tex->surface.u.legacy.level[level], level, level, + util_format_get_blockwidth(view->format), false, desc); + } +} + +static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigned slot, + const struct pipe_image_view *view, bool skip_decompress) +{ + struct si_images *images = &ctx->images[shader]; + struct si_descriptors *descs = si_sampler_and_image_descriptors(ctx, shader); + struct si_resource *res; + + if (!view || !view->resource) { + si_disable_shader_image(ctx, shader, slot); + return; + } + + res = si_resource(view->resource); + + if (&images->views[slot] != view) + util_copy_image_view(&images->views[slot], view); + + si_set_shader_image_desc(ctx, view, skip_decompress, descs->list + si_get_image_slot(slot) * 8, + descs->list + si_get_image_slot(slot + SI_NUM_IMAGES) * 8); + + if (res->b.b.target == PIPE_BUFFER || view->shader_access & SI_IMAGE_ACCESS_AS_BUFFER) { + images->needs_color_decompress_mask &= ~(1 << slot); + res->bind_history |= PIPE_BIND_SHADER_IMAGE; + } else { + struct si_texture *tex = (struct si_texture *)res; + unsigned level = view->u.tex.level; + + if (color_needs_decompression(tex)) { + images->needs_color_decompress_mask |= 1 << slot; + } else { + images->needs_color_decompress_mask &= ~(1 << slot); + } + + if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound)) + ctx->need_check_render_feedback = true; + } + + images->enabled_mask |= 1u << slot; + ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); + + /* Since this can flush, it must be done after enabled_mask is updated. */ + si_sampler_view_add_buffer( + ctx, &res->b.b, + (view->access & PIPE_IMAGE_ACCESS_WRITE) ? 
RADEON_USAGE_READWRITE : RADEON_USAGE_READ, false, + true); +} + +static void si_set_shader_images(struct pipe_context *pipe, enum pipe_shader_type shader, + unsigned start_slot, unsigned count, + const struct pipe_image_view *views) +{ + struct si_context *ctx = (struct si_context *)pipe; + unsigned i, slot; + + assert(shader < SI_NUM_SHADERS); + + if (!count) + return; + + assert(start_slot + count <= SI_NUM_IMAGES); + + if (views) { + for (i = 0, slot = start_slot; i < count; ++i, ++slot) + si_set_shader_image(ctx, shader, slot, &views[i], false); + } else { + for (i = 0, slot = start_slot; i < count; ++i, ++slot) + si_set_shader_image(ctx, shader, slot, NULL, false); + } + + si_update_shader_needs_decompress_mask(ctx, shader); +} + +static void si_images_update_needs_color_decompress_mask(struct si_images *images) +{ + unsigned mask = images->enabled_mask; + + while (mask) { + int i = u_bit_scan(&mask); + struct pipe_resource *res = images->views[i].resource; + + if (res && res->target != PIPE_BUFFER) { + struct si_texture *tex = (struct si_texture *)res; + + if (color_needs_decompression(tex)) { + images->needs_color_decompress_mask |= 1 << i; + } else { + images->needs_color_decompress_mask &= ~(1 << i); + } + } + } } void si_update_ps_colorbuf0_slot(struct si_context *sctx) { - struct si_buffer_resources *buffers = &sctx->rw_buffers; - struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; - unsigned slot = SI_PS_IMAGE_COLORBUF0; - struct pipe_surface *surf = NULL; - - /* si_texture_disable_dcc can get us here again. */ - if (sctx->blitter->running) - return; - - /* See whether FBFETCH is used and color buffer 0 is set. */ - if (sctx->ps_shader.cso && - sctx->ps_shader.cso->info.uses_fbfetch && - sctx->framebuffer.state.nr_cbufs && - sctx->framebuffer.state.cbufs[0]) - surf = sctx->framebuffer.state.cbufs[0]; - - /* Return if FBFETCH transitions from disabled to disabled. */ - if (!buffers->buffers[slot] && !surf) - return; - - sctx->ps_uses_fbfetch = surf != NULL; - si_update_ps_iter_samples(sctx); - - if (surf) { - struct si_texture *tex = (struct si_texture*)surf->texture; - struct pipe_image_view view = {0}; - - assert(tex); - assert(!tex->is_depth); - - /* Disable DCC, because the texture is used as both a sampler - * and color buffer. - */ - si_texture_disable_dcc(sctx, tex); - - if (tex->buffer.b.b.nr_samples <= 1 && tex->cmask_buffer) { - /* Disable CMASK. */ - assert(tex->cmask_buffer != &tex->buffer); - si_eliminate_fast_color_clear(sctx, tex); - si_texture_discard_cmask(sctx->screen, tex); - } - - view.resource = surf->texture; - view.format = surf->format; - view.access = PIPE_IMAGE_ACCESS_READ; - view.u.tex.first_layer = surf->u.tex.first_layer; - view.u.tex.last_layer = surf->u.tex.last_layer; - view.u.tex.level = surf->u.tex.level; - - /* Set the descriptor. */ - uint32_t *desc = descs->list + slot*4; - memset(desc, 0, 16 * 4); - si_set_shader_image_desc(sctx, &view, true, desc, desc + 8); - - pipe_resource_reference(&buffers->buffers[slot], &tex->buffer.b.b); - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - &tex->buffer, RADEON_USAGE_READ, - RADEON_PRIO_SHADER_RW_IMAGE); - buffers->enabled_mask |= 1u << slot; - } else { - /* Clear the descriptor. 
*/ - memset(descs->list + slot*4, 0, 8*4); - pipe_resource_reference(&buffers->buffers[slot], NULL); - buffers->enabled_mask &= ~(1u << slot); - } - - sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; + struct si_buffer_resources *buffers = &sctx->rw_buffers; + struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; + unsigned slot = SI_PS_IMAGE_COLORBUF0; + struct pipe_surface *surf = NULL; + + /* si_texture_disable_dcc can get us here again. */ + if (sctx->blitter->running) + return; + + /* See whether FBFETCH is used and color buffer 0 is set. */ + if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_fbfetch && + sctx->framebuffer.state.nr_cbufs && sctx->framebuffer.state.cbufs[0]) + surf = sctx->framebuffer.state.cbufs[0]; + + /* Return if FBFETCH transitions from disabled to disabled. */ + if (!buffers->buffers[slot] && !surf) + return; + + sctx->ps_uses_fbfetch = surf != NULL; + si_update_ps_iter_samples(sctx); + + if (surf) { + struct si_texture *tex = (struct si_texture *)surf->texture; + struct pipe_image_view view = {0}; + + assert(tex); + assert(!tex->is_depth); + + /* Disable DCC, because the texture is used as both a sampler + * and color buffer. + */ + si_texture_disable_dcc(sctx, tex); + + if (tex->buffer.b.b.nr_samples <= 1 && tex->cmask_buffer) { + /* Disable CMASK. */ + assert(tex->cmask_buffer != &tex->buffer); + si_eliminate_fast_color_clear(sctx, tex); + si_texture_discard_cmask(sctx->screen, tex); + } + + view.resource = surf->texture; + view.format = surf->format; + view.access = PIPE_IMAGE_ACCESS_READ; + view.u.tex.first_layer = surf->u.tex.first_layer; + view.u.tex.last_layer = surf->u.tex.last_layer; + view.u.tex.level = surf->u.tex.level; + + /* Set the descriptor. */ + uint32_t *desc = descs->list + slot * 4; + memset(desc, 0, 16 * 4); + si_set_shader_image_desc(sctx, &view, true, desc, desc + 8); + + pipe_resource_reference(&buffers->buffers[slot], &tex->buffer.b.b); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READ, + RADEON_PRIO_SHADER_RW_IMAGE); + buffers->enabled_mask |= 1u << slot; + } else { + /* Clear the descriptor. 
*/ + memset(descs->list + slot * 4, 0, 8 * 4); + pipe_resource_reference(&buffers->buffers[slot], NULL); + buffers->enabled_mask &= ~(1u << slot); + } + + sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; } /* SAMPLER STATES */ -static void si_bind_sampler_states(struct pipe_context *ctx, - enum pipe_shader_type shader, +static void si_bind_sampler_states(struct pipe_context *ctx, enum pipe_shader_type shader, unsigned start, unsigned count, void **states) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_samplers *samplers = &sctx->samplers[shader]; - struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, shader); - struct si_sampler_state **sstates = (struct si_sampler_state**)states; - int i; + struct si_context *sctx = (struct si_context *)ctx; + struct si_samplers *samplers = &sctx->samplers[shader]; + struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, shader); + struct si_sampler_state **sstates = (struct si_sampler_state **)states; + int i; - if (!count || shader >= SI_NUM_SHADERS || !sstates) - return; + if (!count || shader >= SI_NUM_SHADERS || !sstates) + return; - for (i = 0; i < count; i++) { - unsigned slot = start + i; - unsigned desc_slot = si_get_sampler_slot(slot); + for (i = 0; i < count; i++) { + unsigned slot = start + i; + unsigned desc_slot = si_get_sampler_slot(slot); - if (!sstates[i] || - sstates[i] == samplers->sampler_states[slot]) - continue; + if (!sstates[i] || sstates[i] == samplers->sampler_states[slot]) + continue; #ifndef NDEBUG - assert(sstates[i]->magic == SI_SAMPLER_STATE_MAGIC); + assert(sstates[i]->magic == SI_SAMPLER_STATE_MAGIC); #endif - samplers->sampler_states[slot] = sstates[i]; + samplers->sampler_states[slot] = sstates[i]; - /* If FMASK is bound, don't overwrite it. - * The sampler state will be set after FMASK is unbound. - */ - struct si_sampler_view *sview = - (struct si_sampler_view *)samplers->views[slot]; + /* If FMASK is bound, don't overwrite it. + * The sampler state will be set after FMASK is unbound. 
+ */ + struct si_sampler_view *sview = (struct si_sampler_view *)samplers->views[slot]; - struct si_texture *tex = NULL; + struct si_texture *tex = NULL; - if (sview && sview->base.texture && - sview->base.texture->target != PIPE_BUFFER) - tex = (struct si_texture *)sview->base.texture; + if (sview && sview->base.texture && sview->base.texture->target != PIPE_BUFFER) + tex = (struct si_texture *)sview->base.texture; - if (tex && tex->surface.fmask_size) - continue; + if (tex && tex->surface.fmask_size) + continue; - si_set_sampler_state_desc(sstates[i], sview, tex, - desc->list + desc_slot * 16 + 12); + si_set_sampler_state_desc(sstates[i], sview, tex, desc->list + desc_slot * 16 + 12); - sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); - } + sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); + } } /* BUFFER RESOURCES */ static void si_init_buffer_resources(struct si_buffer_resources *buffers, - struct si_descriptors *descs, - unsigned num_buffers, - short shader_userdata_rel_index, - enum radeon_bo_priority priority, - enum radeon_bo_priority priority_constbuf) + struct si_descriptors *descs, unsigned num_buffers, + short shader_userdata_rel_index, + enum radeon_bo_priority priority, + enum radeon_bo_priority priority_constbuf) { - buffers->priority = priority; - buffers->priority_constbuf = priority_constbuf; - buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*)); - buffers->offsets = CALLOC(num_buffers, sizeof(buffers->offsets[0])); + buffers->priority = priority; + buffers->priority_constbuf = priority_constbuf; + buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource *)); + buffers->offsets = CALLOC(num_buffers, sizeof(buffers->offsets[0])); - si_init_descriptors(descs, shader_userdata_rel_index, 4, num_buffers); + si_init_descriptors(descs, shader_userdata_rel_index, 4, num_buffers); } static void si_release_buffer_resources(struct si_buffer_resources *buffers, - struct si_descriptors *descs) + struct si_descriptors *descs) { - int i; + int i; - for (i = 0; i < descs->num_elements; i++) { - pipe_resource_reference(&buffers->buffers[i], NULL); - } + for (i = 0; i < descs->num_elements; i++) { + pipe_resource_reference(&buffers->buffers[i], NULL); + } - FREE(buffers->buffers); - FREE(buffers->offsets); + FREE(buffers->buffers); + FREE(buffers->offsets); } static void si_buffer_resources_begin_new_cs(struct si_context *sctx, - struct si_buffer_resources *buffers) + struct si_buffer_resources *buffers) { - unsigned mask = buffers->enabled_mask; + unsigned mask = buffers->enabled_mask; - /* Add buffers to the CS. */ - while (mask) { - int i = u_bit_scan(&mask); + /* Add buffers to the CS. */ + while (mask) { + int i = u_bit_scan(&mask); - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(buffers->buffers[i]), - buffers->writable_mask & (1u << i) ? RADEON_USAGE_READWRITE : - RADEON_USAGE_READ, - i < SI_NUM_SHADER_BUFFERS ? buffers->priority : - buffers->priority_constbuf); - } + radeon_add_to_buffer_list( + sctx, sctx->gfx_cs, si_resource(buffers->buffers[i]), + buffers->writable_mask & (1u << i) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ, + i < SI_NUM_SHADER_BUFFERS ? 
buffers->priority : buffers->priority_constbuf); + } } static void si_get_buffer_from_descriptors(struct si_buffer_resources *buffers, - struct si_descriptors *descs, - unsigned idx, struct pipe_resource **buf, - unsigned *offset, unsigned *size) + struct si_descriptors *descs, unsigned idx, + struct pipe_resource **buf, unsigned *offset, + unsigned *size) { - pipe_resource_reference(buf, buffers->buffers[idx]); - if (*buf) { - struct si_resource *res = si_resource(*buf); - const uint32_t *desc = descs->list + idx * 4; - uint64_t va; + pipe_resource_reference(buf, buffers->buffers[idx]); + if (*buf) { + struct si_resource *res = si_resource(*buf); + const uint32_t *desc = descs->list + idx * 4; + uint64_t va; - *size = desc[2]; + *size = desc[2]; - assert(G_008F04_STRIDE(desc[1]) == 0); - va = si_desc_extract_buffer_address(desc); + assert(G_008F04_STRIDE(desc[1]) == 0); + va = si_desc_extract_buffer_address(desc); - assert(va >= res->gpu_address && va + *size <= res->gpu_address + res->bo_size); - *offset = va - res->gpu_address; - } + assert(va >= res->gpu_address && va + *size <= res->gpu_address + res->bo_size); + *offset = va - res->gpu_address; + } } /* VERTEX BUFFERS */ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx) { - int count = sctx->num_vertex_elements; - int i; + int count = sctx->num_vertex_elements; + int i; - for (i = 0; i < count; i++) { - int vb = sctx->vertex_elements->vertex_buffer_index[i]; + for (i = 0; i < count; i++) { + int vb = sctx->vertex_elements->vertex_buffer_index[i]; - if (vb >= ARRAY_SIZE(sctx->vertex_buffer)) - continue; - if (!sctx->vertex_buffer[vb].buffer.resource) - continue; + if (vb >= ARRAY_SIZE(sctx->vertex_buffer)) + continue; + if (!sctx->vertex_buffer[vb].buffer.resource) + continue; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(sctx->vertex_buffer[vb].buffer.resource), - RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); - } + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + si_resource(sctx->vertex_buffer[vb].buffer.resource), + RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); + } - if (!sctx->vb_descriptors_buffer) - return; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - sctx->vb_descriptors_buffer, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); + if (!sctx->vb_descriptors_buffer) + return; + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->vb_descriptors_buffer, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); } bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) { - unsigned i, count = sctx->num_vertex_elements; - uint32_t *ptr; - - if (!sctx->vertex_buffers_dirty || !count) - return true; - - struct si_vertex_elements *velems = sctx->vertex_elements; - unsigned alloc_size = velems->vb_desc_list_alloc_size; - - if (alloc_size) { - /* Vertex buffer descriptors are the only ones which are uploaded - * directly through a staging buffer and don't go through - * the fine-grained upload path. 
- */ - u_upload_alloc(sctx->b.const_uploader, 0, - alloc_size, - si_optimal_tcc_alignment(sctx, alloc_size), - &sctx->vb_descriptors_offset, - (struct pipe_resource**)&sctx->vb_descriptors_buffer, - (void**)&ptr); - if (!sctx->vb_descriptors_buffer) { - sctx->vb_descriptors_offset = 0; - sctx->vb_descriptors_gpu_list = NULL; - return false; - } - - sctx->vb_descriptors_gpu_list = ptr; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - sctx->vb_descriptors_buffer, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); - sctx->vertex_buffer_pointer_dirty = true; - sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; - } else { - si_resource_reference(&sctx->vb_descriptors_buffer, NULL); - sctx->vertex_buffer_pointer_dirty = false; - sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS; - } - - assert(count <= SI_MAX_ATTRIBS); - - unsigned first_vb_use_mask = velems->first_vb_use_mask; - unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs; - - for (i = 0; i < count; i++) { - struct pipe_vertex_buffer *vb; - struct si_resource *buf; - unsigned vbo_index = velems->vertex_buffer_index[i]; - uint32_t *desc = i < num_vbos_in_user_sgprs ? - &sctx->vb_descriptor_user_sgprs[i * 4] : - &ptr[(i - num_vbos_in_user_sgprs) * 4]; - - vb = &sctx->vertex_buffer[vbo_index]; - buf = si_resource(vb->buffer.resource); - if (!buf) { - memset(desc, 0, 16); - continue; - } - - int64_t offset = (int64_t)((int)vb->buffer_offset) + - velems->src_offset[i]; - - if (offset >= buf->b.b.width0) { - assert(offset < buf->b.b.width0); - memset(desc, 0, 16); - continue; - } - - uint64_t va = buf->gpu_address + offset; - - int64_t num_records = (int64_t)buf->b.b.width0 - offset; - if (sctx->chip_class != GFX8 && vb->stride) { - /* Round up by rounding down and adding 1 */ - num_records = (num_records - velems->format_size[i]) / - vb->stride + 1; - } - assert(num_records >= 0 && num_records <= UINT_MAX); - - uint32_t rsrc_word3 = velems->rsrc_word3[i]; - - /* OOB_SELECT chooses the out-of-bounds check: - * - 1: index >= NUM_RECORDS (Structured) - * - 3: offset >= NUM_RECORDS (Raw) - */ - if (sctx->chip_class >= GFX10) - rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW); - - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(vb->stride); - desc[2] = num_records; - desc[3] = rsrc_word3; - - if (first_vb_use_mask & (1 << i)) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(vb->buffer.resource), - RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); - } - } - - /* Don't flush the const cache. It would have a very negative effect - * on performance (confirmed by testing). New descriptors are always - * uploaded to a fresh new buffer, so I don't think flushing the const - * cache is needed. */ - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); - sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0; - sctx->vertex_buffers_dirty = false; - return true; + unsigned i, count = sctx->num_vertex_elements; + uint32_t *ptr; + + if (!sctx->vertex_buffers_dirty || !count) + return true; + + struct si_vertex_elements *velems = sctx->vertex_elements; + unsigned alloc_size = velems->vb_desc_list_alloc_size; + + if (alloc_size) { + /* Vertex buffer descriptors are the only ones which are uploaded + * directly through a staging buffer and don't go through + * the fine-grained upload path. 
+ */ + u_upload_alloc(sctx->b.const_uploader, 0, alloc_size, + si_optimal_tcc_alignment(sctx, alloc_size), &sctx->vb_descriptors_offset, + (struct pipe_resource **)&sctx->vb_descriptors_buffer, (void **)&ptr); + if (!sctx->vb_descriptors_buffer) { + sctx->vb_descriptors_offset = 0; + sctx->vb_descriptors_gpu_list = NULL; + return false; + } + + sctx->vb_descriptors_gpu_list = ptr; + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->vb_descriptors_buffer, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); + sctx->vertex_buffer_pointer_dirty = true; + sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; + } else { + si_resource_reference(&sctx->vb_descriptors_buffer, NULL); + sctx->vertex_buffer_pointer_dirty = false; + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS; + } + + assert(count <= SI_MAX_ATTRIBS); + + unsigned first_vb_use_mask = velems->first_vb_use_mask; + unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs; + + for (i = 0; i < count; i++) { + struct pipe_vertex_buffer *vb; + struct si_resource *buf; + unsigned vbo_index = velems->vertex_buffer_index[i]; + uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4] + : &ptr[(i - num_vbos_in_user_sgprs) * 4]; + + vb = &sctx->vertex_buffer[vbo_index]; + buf = si_resource(vb->buffer.resource); + if (!buf) { + memset(desc, 0, 16); + continue; + } + + int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->src_offset[i]; + + if (offset >= buf->b.b.width0) { + assert(offset < buf->b.b.width0); + memset(desc, 0, 16); + continue; + } + + uint64_t va = buf->gpu_address + offset; + + int64_t num_records = (int64_t)buf->b.b.width0 - offset; + if (sctx->chip_class != GFX8 && vb->stride) { + /* Round up by rounding down and adding 1 */ + num_records = (num_records - velems->format_size[i]) / vb->stride + 1; + } + assert(num_records >= 0 && num_records <= UINT_MAX); + + uint32_t rsrc_word3 = velems->rsrc_word3[i]; + + /* OOB_SELECT chooses the out-of-bounds check: + * - 1: index >= NUM_RECORDS (Structured) + * - 3: offset >= NUM_RECORDS (Raw) + */ + if (sctx->chip_class >= GFX10) + rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED + : V_008F0C_OOB_SELECT_RAW); + + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(vb->stride); + desc[2] = num_records; + desc[3] = rsrc_word3; + + if (first_vb_use_mask & (1 << i)) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(vb->buffer.resource), + RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); + } + } + + /* Don't flush the const cache. It would have a very negative effect + * on performance (confirmed by testing). New descriptors are always + * uploaded to a fresh new buffer, so I don't think flushing the const + * cache is needed. 
*/ + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0; + sctx->vertex_buffers_dirty = false; + return true; } - /* CONSTANT BUFFERS */ -static struct si_descriptors * -si_const_and_shader_buffer_descriptors(struct si_context *sctx, unsigned shader) -{ - return &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(shader)]; -} - -void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf, - const uint8_t *ptr, unsigned size, uint32_t *const_offset) -{ - void *tmp; - - u_upload_alloc(sctx->b.const_uploader, 0, size, - si_optimal_tcc_alignment(sctx, size), - const_offset, - (struct pipe_resource**)buf, &tmp); - if (*buf) - util_memcpy_cpu_to_le32(tmp, ptr, size); -} - -static void si_set_constant_buffer(struct si_context *sctx, - struct si_buffer_resources *buffers, - unsigned descriptors_idx, - uint slot, const struct pipe_constant_buffer *input) -{ - struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; - assert(slot < descs->num_elements); - pipe_resource_reference(&buffers->buffers[slot], NULL); - - /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD is buggy - * with a NULL buffer). We need to use a dummy buffer instead. */ - if (sctx->chip_class == GFX7 && - (!input || (!input->buffer && !input->user_buffer))) - input = &sctx->null_const_buf; - - if (input && (input->buffer || input->user_buffer)) { - struct pipe_resource *buffer = NULL; - uint64_t va; - unsigned buffer_offset; - - /* Upload the user buffer if needed. */ - if (input->user_buffer) { - si_upload_const_buffer(sctx, - (struct si_resource**)&buffer, input->user_buffer, - input->buffer_size, &buffer_offset); - if (!buffer) { - /* Just unbind on failure. */ - si_set_constant_buffer(sctx, buffers, descriptors_idx, slot, NULL); - return; - } - } else { - pipe_resource_reference(&buffer, input->buffer); - buffer_offset = input->buffer_offset; - } - - va = si_resource(buffer)->gpu_address + buffer_offset; - - /* Set the descriptor. */ - uint32_t *desc = descs->list + slot*4; - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(0); - desc[2] = input->buffer_size; - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (sctx->chip_class >= GFX10) { - desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - } - - buffers->buffers[slot] = buffer; - buffers->offsets[slot] = buffer_offset; - radeon_add_to_gfx_buffer_list_check_mem(sctx, - si_resource(buffer), - RADEON_USAGE_READ, - buffers->priority_constbuf, true); - buffers->enabled_mask |= 1u << slot; - } else { - /* Clear the descriptor. 
*/ - memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4); - buffers->enabled_mask &= ~(1u << slot); - } - - sctx->descriptors_dirty |= 1u << descriptors_idx; -} - -static void si_pipe_set_constant_buffer(struct pipe_context *ctx, - enum pipe_shader_type shader, uint slot, - const struct pipe_constant_buffer *input) -{ - struct si_context *sctx = (struct si_context *)ctx; - - if (shader >= SI_NUM_SHADERS) - return; - - if (slot == 0 && input && input->buffer && - !(si_resource(input->buffer)->flags & RADEON_FLAG_32BIT)) { - assert(!"constant buffer 0 must have a 32-bit VM address, use const_uploader"); - return; - } - - if (input && input->buffer) - si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER; - - slot = si_get_constbuf_slot(slot); - si_set_constant_buffer(sctx, &sctx->const_and_shader_buffers[shader], - si_const_and_shader_buffer_descriptors_idx(shader), - slot, input); -} - -void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, - uint slot, struct pipe_constant_buffer *cbuf) -{ - cbuf->user_buffer = NULL; - si_get_buffer_from_descriptors( - &sctx->const_and_shader_buffers[shader], - si_const_and_shader_buffer_descriptors(sctx, shader), - si_get_constbuf_slot(slot), - &cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size); +static struct si_descriptors *si_const_and_shader_buffer_descriptors(struct si_context *sctx, + unsigned shader) +{ + return &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(shader)]; +} + +void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf, const uint8_t *ptr, + unsigned size, uint32_t *const_offset) +{ + void *tmp; + + u_upload_alloc(sctx->b.const_uploader, 0, size, si_optimal_tcc_alignment(sctx, size), + const_offset, (struct pipe_resource **)buf, &tmp); + if (*buf) + util_memcpy_cpu_to_le32(tmp, ptr, size); +} + +static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_resources *buffers, + unsigned descriptors_idx, uint slot, + const struct pipe_constant_buffer *input) +{ + struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; + assert(slot < descs->num_elements); + pipe_resource_reference(&buffers->buffers[slot], NULL); + + /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD is buggy + * with a NULL buffer). We need to use a dummy buffer instead. */ + if (sctx->chip_class == GFX7 && (!input || (!input->buffer && !input->user_buffer))) + input = &sctx->null_const_buf; + + if (input && (input->buffer || input->user_buffer)) { + struct pipe_resource *buffer = NULL; + uint64_t va; + unsigned buffer_offset; + + /* Upload the user buffer if needed. */ + if (input->user_buffer) { + si_upload_const_buffer(sctx, (struct si_resource **)&buffer, input->user_buffer, + input->buffer_size, &buffer_offset); + if (!buffer) { + /* Just unbind on failure. */ + si_set_constant_buffer(sctx, buffers, descriptors_idx, slot, NULL); + return; + } + } else { + pipe_resource_reference(&buffer, input->buffer); + buffer_offset = input->buffer_offset; + } + + va = si_resource(buffer)->gpu_address + buffer_offset; + + /* Set the descriptor. 
*/ + uint32_t *desc = descs->list + slot * 4; + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0); + desc[2] = input->buffer_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (sctx->chip_class >= GFX10) { + desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); + } else { + desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + + buffers->buffers[slot] = buffer; + buffers->offsets[slot] = buffer_offset; + radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_READ, + buffers->priority_constbuf, true); + buffers->enabled_mask |= 1u << slot; + } else { + /* Clear the descriptor. */ + memset(descs->list + slot * 4, 0, sizeof(uint32_t) * 4); + buffers->enabled_mask &= ~(1u << slot); + } + + sctx->descriptors_dirty |= 1u << descriptors_idx; +} + +static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shader_type shader, + uint slot, const struct pipe_constant_buffer *input) +{ + struct si_context *sctx = (struct si_context *)ctx; + + if (shader >= SI_NUM_SHADERS) + return; + + if (slot == 0 && input && input->buffer && + !(si_resource(input->buffer)->flags & RADEON_FLAG_32BIT)) { + assert(!"constant buffer 0 must have a 32-bit VM address, use const_uploader"); + return; + } + + if (input && input->buffer) + si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER; + + slot = si_get_constbuf_slot(slot); + si_set_constant_buffer(sctx, &sctx->const_and_shader_buffers[shader], + si_const_and_shader_buffer_descriptors_idx(shader), slot, input); +} + +void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot, + struct pipe_constant_buffer *cbuf) +{ + cbuf->user_buffer = NULL; + si_get_buffer_from_descriptors( + &sctx->const_and_shader_buffers[shader], si_const_and_shader_buffer_descriptors(sctx, shader), + si_get_constbuf_slot(slot), &cbuf->buffer, &cbuf->buffer_offset, &cbuf->buffer_size); } /* SHADER BUFFERS */ -static void si_set_shader_buffer(struct si_context *sctx, - struct si_buffer_resources *buffers, - unsigned descriptors_idx, - uint slot, const struct pipe_shader_buffer *sbuffer, - bool writable, enum radeon_bo_priority priority) -{ - struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; - uint32_t *desc = descs->list + slot * 4; - - if (!sbuffer || !sbuffer->buffer) { - pipe_resource_reference(&buffers->buffers[slot], NULL); - memset(desc, 0, sizeof(uint32_t) * 4); - buffers->enabled_mask &= ~(1u << slot); - buffers->writable_mask &= ~(1u << slot); - sctx->descriptors_dirty |= 1u << descriptors_idx; - return; - } - - struct si_resource *buf = si_resource(sbuffer->buffer); - uint64_t va = buf->gpu_address + sbuffer->buffer_offset; - - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(0); - desc[2] = sbuffer->buffer_size; - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (sctx->chip_class >= GFX10) { - desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - 
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - } - - pipe_resource_reference(&buffers->buffers[slot], &buf->b.b); - buffers->offsets[slot] = sbuffer->buffer_offset; - radeon_add_to_gfx_buffer_list_check_mem(sctx, buf, - writable ? RADEON_USAGE_READWRITE : - RADEON_USAGE_READ, - priority, true); - if (writable) - buffers->writable_mask |= 1u << slot; - else - buffers->writable_mask &= ~(1u << slot); - - buffers->enabled_mask |= 1u << slot; - sctx->descriptors_dirty |= 1u << descriptors_idx; - - util_range_add(&buf->b.b, &buf->valid_buffer_range, sbuffer->buffer_offset, - sbuffer->buffer_offset + sbuffer->buffer_size); -} - -static void si_set_shader_buffers(struct pipe_context *ctx, - enum pipe_shader_type shader, - unsigned start_slot, unsigned count, - const struct pipe_shader_buffer *sbuffers, - unsigned writable_bitmask) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader]; - unsigned descriptors_idx = si_const_and_shader_buffer_descriptors_idx(shader); - unsigned i; - - assert(start_slot + count <= SI_NUM_SHADER_BUFFERS); - - for (i = 0; i < count; ++i) { - const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL; - unsigned slot = si_get_shaderbuf_slot(start_slot + i); - - if (sbuffer && sbuffer->buffer) - si_resource(sbuffer->buffer)->bind_history |= PIPE_BIND_SHADER_BUFFER; - - si_set_shader_buffer(sctx, buffers, descriptors_idx, slot, sbuffer, - !!(writable_bitmask & (1u << i)), - buffers->priority); - } -} - -void si_get_shader_buffers(struct si_context *sctx, - enum pipe_shader_type shader, - uint start_slot, uint count, - struct pipe_shader_buffer *sbuf) -{ - struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader]; - struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader); - - for (unsigned i = 0; i < count; ++i) { - si_get_buffer_from_descriptors( - buffers, descs, - si_get_shaderbuf_slot(start_slot + i), - &sbuf[i].buffer, &sbuf[i].buffer_offset, - &sbuf[i].buffer_size); - } +static void si_set_shader_buffer(struct si_context *sctx, struct si_buffer_resources *buffers, + unsigned descriptors_idx, uint slot, + const struct pipe_shader_buffer *sbuffer, bool writable, + enum radeon_bo_priority priority) +{ + struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; + uint32_t *desc = descs->list + slot * 4; + + if (!sbuffer || !sbuffer->buffer) { + pipe_resource_reference(&buffers->buffers[slot], NULL); + memset(desc, 0, sizeof(uint32_t) * 4); + buffers->enabled_mask &= ~(1u << slot); + buffers->writable_mask &= ~(1u << slot); + sctx->descriptors_dirty |= 1u << descriptors_idx; + return; + } + + struct si_resource *buf = si_resource(sbuffer->buffer); + uint64_t va = buf->gpu_address + sbuffer->buffer_offset; + + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0); + desc[2] = sbuffer->buffer_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (sctx->chip_class >= GFX10) { + desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); + } else { + desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + + pipe_resource_reference(&buffers->buffers[slot], &buf->b.b); + buffers->offsets[slot] = sbuffer->buffer_offset; + 
radeon_add_to_gfx_buffer_list_check_mem( + sctx, buf, writable ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ, priority, true); + if (writable) + buffers->writable_mask |= 1u << slot; + else + buffers->writable_mask &= ~(1u << slot); + + buffers->enabled_mask |= 1u << slot; + sctx->descriptors_dirty |= 1u << descriptors_idx; + + util_range_add(&buf->b.b, &buf->valid_buffer_range, sbuffer->buffer_offset, + sbuffer->buffer_offset + sbuffer->buffer_size); +} + +static void si_set_shader_buffers(struct pipe_context *ctx, enum pipe_shader_type shader, + unsigned start_slot, unsigned count, + const struct pipe_shader_buffer *sbuffers, + unsigned writable_bitmask) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader]; + unsigned descriptors_idx = si_const_and_shader_buffer_descriptors_idx(shader); + unsigned i; + + assert(start_slot + count <= SI_NUM_SHADER_BUFFERS); + + for (i = 0; i < count; ++i) { + const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL; + unsigned slot = si_get_shaderbuf_slot(start_slot + i); + + if (sbuffer && sbuffer->buffer) + si_resource(sbuffer->buffer)->bind_history |= PIPE_BIND_SHADER_BUFFER; + + si_set_shader_buffer(sctx, buffers, descriptors_idx, slot, sbuffer, + !!(writable_bitmask & (1u << i)), buffers->priority); + } +} + +void si_get_shader_buffers(struct si_context *sctx, enum pipe_shader_type shader, uint start_slot, + uint count, struct pipe_shader_buffer *sbuf) +{ + struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader]; + struct si_descriptors *descs = si_const_and_shader_buffer_descriptors(sctx, shader); + + for (unsigned i = 0; i < count; ++i) { + si_get_buffer_from_descriptors(buffers, descs, si_get_shaderbuf_slot(start_slot + i), + &sbuf[i].buffer, &sbuf[i].buffer_offset, &sbuf[i].buffer_size); + } } /* RING BUFFERS */ -void si_set_rw_buffer(struct si_context *sctx, - uint slot, const struct pipe_constant_buffer *input) +void si_set_rw_buffer(struct si_context *sctx, uint slot, const struct pipe_constant_buffer *input) { - si_set_constant_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, - slot, input); + si_set_constant_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, slot, input); } void si_set_rw_shader_buffer(struct si_context *sctx, uint slot, - const struct pipe_shader_buffer *sbuffer) -{ - si_set_shader_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, - slot, sbuffer, true, RADEON_PRIO_SHADER_RW_BUFFER); -} - -void si_set_ring_buffer(struct si_context *sctx, uint slot, - struct pipe_resource *buffer, - unsigned stride, unsigned num_records, - bool add_tid, bool swizzle, - unsigned element_size, unsigned index_stride, uint64_t offset) -{ - struct si_buffer_resources *buffers = &sctx->rw_buffers; - struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; - - /* The stride field in the resource descriptor has 14 bits */ - assert(stride < (1 << 14)); - - assert(slot < descs->num_elements); - pipe_resource_reference(&buffers->buffers[slot], NULL); - - if (buffer) { - uint64_t va; - - va = si_resource(buffer)->gpu_address + offset; - - switch (element_size) { - default: - assert(!"Unsupported ring buffer element size"); - case 0: - case 2: - element_size = 0; - break; - case 4: - element_size = 1; - break; - case 8: - element_size = 2; - break; - case 16: - element_size = 3; - break; - } - - switch (index_stride) { - default: - assert(!"Unsupported ring buffer index stride"); - case 0: - case 8: - 
index_stride = 0; - break; - case 16: - index_stride = 1; - break; - case 32: - index_stride = 2; - break; - case 64: - index_stride = 3; - break; - } - - if (sctx->chip_class >= GFX8 && stride) - num_records *= stride; - - /* Set the descriptor. */ - uint32_t *desc = descs->list + slot*4; - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(stride) | - S_008F04_SWIZZLE_ENABLE(swizzle); - desc[2] = num_records; - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | - S_008F0C_INDEX_STRIDE(index_stride) | - S_008F0C_ADD_TID_ENABLE(add_tid); - - if (sctx->chip_class >= GFX9) - assert(!swizzle || element_size == 1); /* always 4 bytes on GFX9 */ - else - desc[3] |= S_008F0C_ELEMENT_SIZE(element_size); - - if (sctx->chip_class >= GFX10) { - desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - } - - pipe_resource_reference(&buffers->buffers[slot], buffer); - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(buffer), - RADEON_USAGE_READWRITE, buffers->priority); - buffers->enabled_mask |= 1u << slot; - } else { - /* Clear the descriptor. */ - memset(descs->list + slot*4, 0, sizeof(uint32_t) * 4); - buffers->enabled_mask &= ~(1u << slot); - } - - sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; + const struct pipe_shader_buffer *sbuffer) +{ + si_set_shader_buffer(sctx, &sctx->rw_buffers, SI_DESCS_RW_BUFFERS, slot, sbuffer, true, + RADEON_PRIO_SHADER_RW_BUFFER); +} + +void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource *buffer, + unsigned stride, unsigned num_records, bool add_tid, bool swizzle, + unsigned element_size, unsigned index_stride, uint64_t offset) +{ + struct si_buffer_resources *buffers = &sctx->rw_buffers; + struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; + + /* The stride field in the resource descriptor has 14 bits */ + assert(stride < (1 << 14)); + + assert(slot < descs->num_elements); + pipe_resource_reference(&buffers->buffers[slot], NULL); + + if (buffer) { + uint64_t va; + + va = si_resource(buffer)->gpu_address + offset; + + switch (element_size) { + default: + assert(!"Unsupported ring buffer element size"); + case 0: + case 2: + element_size = 0; + break; + case 4: + element_size = 1; + break; + case 8: + element_size = 2; + break; + case 16: + element_size = 3; + break; + } + + switch (index_stride) { + default: + assert(!"Unsupported ring buffer index stride"); + case 0: + case 8: + index_stride = 0; + break; + case 16: + index_stride = 1; + break; + case 32: + index_stride = 2; + break; + case 64: + index_stride = 3; + break; + } + + if (sctx->chip_class >= GFX8 && stride) + num_records *= stride; + + /* Set the descriptor. 
*/ + uint32_t *desc = descs->list + slot * 4; + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride) | + S_008F04_SWIZZLE_ENABLE(swizzle); + desc[2] = num_records; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_INDEX_STRIDE(index_stride) | S_008F0C_ADD_TID_ENABLE(add_tid); + + if (sctx->chip_class >= GFX9) + assert(!swizzle || element_size == 1); /* always 4 bytes on GFX9 */ + else + desc[3] |= S_008F0C_ELEMENT_SIZE(element_size); + + if (sctx->chip_class >= GFX10) { + desc[3] |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1); + } else { + desc[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + + pipe_resource_reference(&buffers->buffers[slot], buffer); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(buffer), RADEON_USAGE_READWRITE, + buffers->priority); + buffers->enabled_mask |= 1u << slot; + } else { + /* Clear the descriptor. */ + memset(descs->list + slot * 4, 0, sizeof(uint32_t) * 4); + buffers->enabled_mask &= ~(1u << slot); + } + + sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; } /* INTERNAL CONST BUFFERS */ -static void si_set_polygon_stipple(struct pipe_context *ctx, - const struct pipe_poly_stipple *state) +static void si_set_polygon_stipple(struct pipe_context *ctx, const struct pipe_poly_stipple *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_constant_buffer cb = {}; - unsigned stipple[32]; - int i; + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_constant_buffer cb = {}; + unsigned stipple[32]; + int i; - for (i = 0; i < 32; i++) - stipple[i] = util_bitreverse(state->stipple[i]); + for (i = 0; i < 32; i++) + stipple[i] = util_bitreverse(state->stipple[i]); - cb.user_buffer = stipple; - cb.buffer_size = sizeof(stipple); + cb.user_buffer = stipple; + cb.buffer_size = sizeof(stipple); - si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &cb); + si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &cb); } /* TEXTURE METADATA ENABLE/DISABLE */ -static void -si_resident_handles_update_needs_color_decompress(struct si_context *sctx) +static void si_resident_handles_update_needs_color_decompress(struct si_context *sctx) { - util_dynarray_clear(&sctx->resident_tex_needs_color_decompress); - util_dynarray_clear(&sctx->resident_img_needs_color_decompress); + util_dynarray_clear(&sctx->resident_tex_needs_color_decompress); + util_dynarray_clear(&sctx->resident_img_needs_color_decompress); - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - struct pipe_resource *res = (*tex_handle)->view->texture; - struct si_texture *tex; + util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) { + struct pipe_resource *res = (*tex_handle)->view->texture; + struct si_texture *tex; - if (!res || res->target == PIPE_BUFFER) - continue; + if (!res || res->target == PIPE_BUFFER) + continue; - tex = (struct si_texture *)res; - if (!color_needs_decompression(tex)) - continue; + tex = (struct si_texture *)res; + if (!color_needs_decompression(tex)) + continue; - util_dynarray_append(&sctx->resident_tex_needs_color_decompress, - struct si_texture_handle *, *tex_handle); - } + util_dynarray_append(&sctx->resident_tex_needs_color_decompress, struct 
si_texture_handle *, + *tex_handle); + } - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - struct pipe_image_view *view = &(*img_handle)->view; - struct pipe_resource *res = view->resource; - struct si_texture *tex; + util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) { + struct pipe_image_view *view = &(*img_handle)->view; + struct pipe_resource *res = view->resource; + struct si_texture *tex; - if (!res || res->target == PIPE_BUFFER) - continue; + if (!res || res->target == PIPE_BUFFER) + continue; - tex = (struct si_texture *)res; - if (!color_needs_decompression(tex)) - continue; + tex = (struct si_texture *)res; + if (!color_needs_decompression(tex)) + continue; - util_dynarray_append(&sctx->resident_img_needs_color_decompress, - struct si_image_handle *, *img_handle); - } + util_dynarray_append(&sctx->resident_img_needs_color_decompress, struct si_image_handle *, + *img_handle); + } } /* CMASK can be enabled (for fast clear) and disabled (for texture export) @@ -1620,13 +1473,13 @@ si_resident_handles_update_needs_color_decompress(struct si_context *sctx) */ void si_update_needs_color_decompress_masks(struct si_context *sctx) { - for (int i = 0; i < SI_NUM_SHADERS; ++i) { - si_samplers_update_needs_color_decompress_mask(&sctx->samplers[i]); - si_images_update_needs_color_decompress_mask(&sctx->images[i]); - si_update_shader_needs_decompress_mask(sctx, i); - } + for (int i = 0; i < SI_NUM_SHADERS; ++i) { + si_samplers_update_needs_color_decompress_mask(&sctx->samplers[i]); + si_images_update_needs_color_decompress_mask(&sctx->images[i]); + si_update_shader_needs_decompress_mask(sctx, i); + } - si_resident_handles_update_needs_color_decompress(sctx); + si_resident_handles_update_needs_color_decompress(sctx); } /* BUFFER DISCARD/INVALIDATION */ @@ -1634,33 +1487,27 @@ void si_update_needs_color_decompress_masks(struct si_context *sctx) /* Reset descriptors of buffer resources after \p buf has been invalidated. * If buf == NULL, reset all descriptors. */ -static void si_reset_buffer_resources(struct si_context *sctx, - struct si_buffer_resources *buffers, - unsigned descriptors_idx, - unsigned slot_mask, - struct pipe_resource *buf, - enum radeon_bo_priority priority) -{ - struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; - unsigned mask = buffers->enabled_mask & slot_mask; - - while (mask) { - unsigned i = u_bit_scan(&mask); - struct pipe_resource *buffer = buffers->buffers[i]; - - if (buffer && (!buf || buffer == buf)) { - si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], - descs->list + i*4); - sctx->descriptors_dirty |= 1u << descriptors_idx; - - radeon_add_to_gfx_buffer_list_check_mem(sctx, - si_resource(buffer), - buffers->writable_mask & (1u << i) ? 
- RADEON_USAGE_READWRITE : - RADEON_USAGE_READ, - priority, true); - } - } +static void si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_resources *buffers, + unsigned descriptors_idx, unsigned slot_mask, + struct pipe_resource *buf, enum radeon_bo_priority priority) +{ + struct si_descriptors *descs = &sctx->descriptors[descriptors_idx]; + unsigned mask = buffers->enabled_mask & slot_mask; + + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_resource *buffer = buffers->buffers[i]; + + if (buffer && (!buf || buffer == buf)) { + si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], descs->list + i * 4); + sctx->descriptors_dirty |= 1u << descriptors_idx; + + radeon_add_to_gfx_buffer_list_check_mem( + sctx, si_resource(buffer), + buffers->writable_mask & (1u << i) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ, + priority, true); + } + } } /* Update all buffer bindings where the buffer is bound, including @@ -1671,436 +1518,389 @@ static void si_reset_buffer_resources(struct si_context *sctx, */ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf) { - struct si_resource *buffer = si_resource(buf); - unsigned i, shader; - unsigned num_elems = sctx->num_vertex_elements; - - /* We changed the buffer, now we need to bind it where the old one - * was bound. This consists of 2 things: - * 1) Updating the resource descriptor and dirtying it. - * 2) Adding a relocation to the CS, so that it's usable. - */ - - /* Vertex buffers. */ - if (!buffer) { - if (num_elems) - sctx->vertex_buffers_dirty = true; - } else if (buffer->bind_history & PIPE_BIND_VERTEX_BUFFER) { - for (i = 0; i < num_elems; i++) { - int vb = sctx->vertex_elements->vertex_buffer_index[i]; - - if (vb >= ARRAY_SIZE(sctx->vertex_buffer)) - continue; - if (!sctx->vertex_buffer[vb].buffer.resource) - continue; - - if (sctx->vertex_buffer[vb].buffer.resource == buf) { - sctx->vertex_buffers_dirty = true; - break; - } - } - } - - /* Streamout buffers. (other internal buffers can't be invalidated) */ - if (!buffer || buffer->bind_history & PIPE_BIND_STREAM_OUTPUT) { - for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) { - struct si_buffer_resources *buffers = &sctx->rw_buffers; - struct si_descriptors *descs = - &sctx->descriptors[SI_DESCS_RW_BUFFERS]; - struct pipe_resource *buffer = buffers->buffers[i]; - - if (!buffer || (buf && buffer != buf)) - continue; - - si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], - descs->list + i*4); - sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; - - radeon_add_to_gfx_buffer_list_check_mem(sctx, - si_resource(buffer), - RADEON_USAGE_WRITE, - RADEON_PRIO_SHADER_RW_BUFFER, - true); - - /* Update the streamout state. */ - if (sctx->streamout.begin_emitted) - si_emit_streamout_end(sctx); - sctx->streamout.append_bitmask = - sctx->streamout.enabled_mask; - si_streamout_buffers_dirty(sctx); - } - } - - /* Constant and shader buffers. 
*/ - if (!buffer || buffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) { - for (shader = 0; shader < SI_NUM_SHADERS; shader++) - si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], - si_const_and_shader_buffer_descriptors_idx(shader), - u_bit_consecutive(SI_NUM_SHADER_BUFFERS, SI_NUM_CONST_BUFFERS), - buf, - sctx->const_and_shader_buffers[shader].priority_constbuf); - } - - if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_BUFFER) { - for (shader = 0; shader < SI_NUM_SHADERS; shader++) - si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], - si_const_and_shader_buffer_descriptors_idx(shader), - u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS), - buf, - sctx->const_and_shader_buffers[shader].priority); - } - - if (!buffer || buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) { - /* Texture buffers - update bindings. */ - for (shader = 0; shader < SI_NUM_SHADERS; shader++) { - struct si_samplers *samplers = &sctx->samplers[shader]; - struct si_descriptors *descs = - si_sampler_and_image_descriptors(sctx, shader); - unsigned mask = samplers->enabled_mask; - - while (mask) { - unsigned i = u_bit_scan(&mask); - struct pipe_resource *buffer = samplers->views[i]->texture; - - if (buffer && buffer->target == PIPE_BUFFER && - (!buf || buffer == buf)) { - unsigned desc_slot = si_get_sampler_slot(i); - - si_set_buf_desc_address(si_resource(buffer), - samplers->views[i]->u.buf.offset, - descs->list + desc_slot * 16 + 4); - sctx->descriptors_dirty |= - 1u << si_sampler_and_image_descriptors_idx(shader); - - radeon_add_to_gfx_buffer_list_check_mem( - sctx, si_resource(buffer), - RADEON_USAGE_READ, - RADEON_PRIO_SAMPLER_BUFFER, true); - } - } - } - } - - /* Shader images */ - if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_IMAGE) { - for (shader = 0; shader < SI_NUM_SHADERS; ++shader) { - struct si_images *images = &sctx->images[shader]; - struct si_descriptors *descs = - si_sampler_and_image_descriptors(sctx, shader); - unsigned mask = images->enabled_mask; - - while (mask) { - unsigned i = u_bit_scan(&mask); - struct pipe_resource *buffer = images->views[i].resource; - - if (buffer && buffer->target == PIPE_BUFFER && - (!buf || buffer == buf)) { - unsigned desc_slot = si_get_image_slot(i); - - if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE) - si_mark_image_range_valid(&images->views[i]); - - si_set_buf_desc_address(si_resource(buffer), - images->views[i].u.buf.offset, - descs->list + desc_slot * 8 + 4); - sctx->descriptors_dirty |= - 1u << si_sampler_and_image_descriptors_idx(shader); - - radeon_add_to_gfx_buffer_list_check_mem( - sctx, si_resource(buffer), - RADEON_USAGE_READWRITE, - RADEON_PRIO_SAMPLER_BUFFER, true); - } - } - } - } - - /* Bindless texture handles */ - if (!buffer || buffer->texture_handle_allocated) { - struct si_descriptors *descs = &sctx->bindless_descriptors; - - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - struct pipe_sampler_view *view = (*tex_handle)->view; - unsigned desc_slot = (*tex_handle)->desc_slot; - struct pipe_resource *buffer = view->texture; - - if (buffer && buffer->target == PIPE_BUFFER && - (!buf || buffer == buf)) { - si_set_buf_desc_address(si_resource(buffer), - view->u.buf.offset, - descs->list + - desc_slot * 16 + 4); - - (*tex_handle)->desc_dirty = true; - sctx->bindless_descriptors_dirty = true; - - radeon_add_to_gfx_buffer_list_check_mem( - sctx, si_resource(buffer), - RADEON_USAGE_READ, - RADEON_PRIO_SAMPLER_BUFFER, true); - } - } - } - - /* Bindless 
image handles */ - if (!buffer || buffer->image_handle_allocated) { - struct si_descriptors *descs = &sctx->bindless_descriptors; - - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - struct pipe_image_view *view = &(*img_handle)->view; - unsigned desc_slot = (*img_handle)->desc_slot; - struct pipe_resource *buffer = view->resource; - - if (buffer && buffer->target == PIPE_BUFFER && - (!buf || buffer == buf)) { - if (view->access & PIPE_IMAGE_ACCESS_WRITE) - si_mark_image_range_valid(view); - - si_set_buf_desc_address(si_resource(buffer), - view->u.buf.offset, - descs->list + - desc_slot * 16 + 4); - - (*img_handle)->desc_dirty = true; - sctx->bindless_descriptors_dirty = true; - - radeon_add_to_gfx_buffer_list_check_mem( - sctx, si_resource(buffer), - RADEON_USAGE_READWRITE, - RADEON_PRIO_SAMPLER_BUFFER, true); - } - } - } - - if (buffer) { - /* Do the same for other contexts. They will invoke this function - * with buffer == NULL. - */ - unsigned new_counter = p_atomic_inc_return(&sctx->screen->dirty_buf_counter); - - /* Skip the update for the current context, because we have already updated - * the buffer bindings. - */ - if (new_counter == sctx->last_dirty_buf_counter + 1) - sctx->last_dirty_buf_counter = new_counter; - } -} - -static void si_upload_bindless_descriptor(struct si_context *sctx, - unsigned desc_slot, - unsigned num_dwords) -{ - struct si_descriptors *desc = &sctx->bindless_descriptors; - unsigned desc_slot_offset = desc_slot * 16; - uint32_t *data; - uint64_t va; - - data = desc->list + desc_slot_offset; - va = desc->gpu_address + desc_slot_offset * 4; - - si_cp_write_data(sctx, desc->buffer, va - desc->buffer->gpu_address, - num_dwords * 4, V_370_TC_L2, V_370_ME, data); + struct si_resource *buffer = si_resource(buf); + unsigned i, shader; + unsigned num_elems = sctx->num_vertex_elements; + + /* We changed the buffer, now we need to bind it where the old one + * was bound. This consists of 2 things: + * 1) Updating the resource descriptor and dirtying it. + * 2) Adding a relocation to the CS, so that it's usable. + */ + + /* Vertex buffers. */ + if (!buffer) { + if (num_elems) + sctx->vertex_buffers_dirty = true; + } else if (buffer->bind_history & PIPE_BIND_VERTEX_BUFFER) { + for (i = 0; i < num_elems; i++) { + int vb = sctx->vertex_elements->vertex_buffer_index[i]; + + if (vb >= ARRAY_SIZE(sctx->vertex_buffer)) + continue; + if (!sctx->vertex_buffer[vb].buffer.resource) + continue; + + if (sctx->vertex_buffer[vb].buffer.resource == buf) { + sctx->vertex_buffers_dirty = true; + break; + } + } + } + + /* Streamout buffers. (other internal buffers can't be invalidated) */ + if (!buffer || buffer->bind_history & PIPE_BIND_STREAM_OUTPUT) { + for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) { + struct si_buffer_resources *buffers = &sctx->rw_buffers; + struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS]; + struct pipe_resource *buffer = buffers->buffers[i]; + + if (!buffer || (buf && buffer != buf)) + continue; + + si_set_buf_desc_address(si_resource(buffer), buffers->offsets[i], descs->list + i * 4); + sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; + + radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_WRITE, + RADEON_PRIO_SHADER_RW_BUFFER, true); + + /* Update the streamout state. 
*/ + if (sctx->streamout.begin_emitted) + si_emit_streamout_end(sctx); + sctx->streamout.append_bitmask = sctx->streamout.enabled_mask; + si_streamout_buffers_dirty(sctx); + } + } + + /* Constant and shader buffers. */ + if (!buffer || buffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) { + for (shader = 0; shader < SI_NUM_SHADERS; shader++) + si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], + si_const_and_shader_buffer_descriptors_idx(shader), + u_bit_consecutive(SI_NUM_SHADER_BUFFERS, SI_NUM_CONST_BUFFERS), + buf, sctx->const_and_shader_buffers[shader].priority_constbuf); + } + + if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_BUFFER) { + for (shader = 0; shader < SI_NUM_SHADERS; shader++) + si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], + si_const_and_shader_buffer_descriptors_idx(shader), + u_bit_consecutive(0, SI_NUM_SHADER_BUFFERS), buf, + sctx->const_and_shader_buffers[shader].priority); + } + + if (!buffer || buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) { + /* Texture buffers - update bindings. */ + for (shader = 0; shader < SI_NUM_SHADERS; shader++) { + struct si_samplers *samplers = &sctx->samplers[shader]; + struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader); + unsigned mask = samplers->enabled_mask; + + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_resource *buffer = samplers->views[i]->texture; + + if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) { + unsigned desc_slot = si_get_sampler_slot(i); + + si_set_buf_desc_address(si_resource(buffer), samplers->views[i]->u.buf.offset, + descs->list + desc_slot * 16 + 4); + sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); + + radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_READ, + RADEON_PRIO_SAMPLER_BUFFER, true); + } + } + } + } + + /* Shader images */ + if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_IMAGE) { + for (shader = 0; shader < SI_NUM_SHADERS; ++shader) { + struct si_images *images = &sctx->images[shader]; + struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader); + unsigned mask = images->enabled_mask; + + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_resource *buffer = images->views[i].resource; + + if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) { + unsigned desc_slot = si_get_image_slot(i); + + if (images->views[i].access & PIPE_IMAGE_ACCESS_WRITE) + si_mark_image_range_valid(&images->views[i]); + + si_set_buf_desc_address(si_resource(buffer), images->views[i].u.buf.offset, + descs->list + desc_slot * 8 + 4); + sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); + + radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), + RADEON_USAGE_READWRITE, + RADEON_PRIO_SAMPLER_BUFFER, true); + } + } + } + } + + /* Bindless texture handles */ + if (!buffer || buffer->texture_handle_allocated) { + struct si_descriptors *descs = &sctx->bindless_descriptors; + + util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) { + struct pipe_sampler_view *view = (*tex_handle)->view; + unsigned desc_slot = (*tex_handle)->desc_slot; + struct pipe_resource *buffer = view->texture; + + if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) { + si_set_buf_desc_address(si_resource(buffer), view->u.buf.offset, + descs->list + desc_slot * 16 + 4); + + (*tex_handle)->desc_dirty = true; + 
sctx->bindless_descriptors_dirty = true; + + radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer), RADEON_USAGE_READ, + RADEON_PRIO_SAMPLER_BUFFER, true); + } + } + } + + /* Bindless image handles */ + if (!buffer || buffer->image_handle_allocated) { + struct si_descriptors *descs = &sctx->bindless_descriptors; + + util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) { + struct pipe_image_view *view = &(*img_handle)->view; + unsigned desc_slot = (*img_handle)->desc_slot; + struct pipe_resource *buffer = view->resource; + + if (buffer && buffer->target == PIPE_BUFFER && (!buf || buffer == buf)) { + if (view->access & PIPE_IMAGE_ACCESS_WRITE) + si_mark_image_range_valid(view); + + si_set_buf_desc_address(si_resource(buffer), view->u.buf.offset, + descs->list + desc_slot * 16 + 4); + + (*img_handle)->desc_dirty = true; + sctx->bindless_descriptors_dirty = true; + + radeon_add_to_gfx_buffer_list_check_mem( + sctx, si_resource(buffer), RADEON_USAGE_READWRITE, RADEON_PRIO_SAMPLER_BUFFER, true); + } + } + } + + if (buffer) { + /* Do the same for other contexts. They will invoke this function + * with buffer == NULL. + */ + unsigned new_counter = p_atomic_inc_return(&sctx->screen->dirty_buf_counter); + + /* Skip the update for the current context, because we have already updated + * the buffer bindings. + */ + if (new_counter == sctx->last_dirty_buf_counter + 1) + sctx->last_dirty_buf_counter = new_counter; + } +} + +static void si_upload_bindless_descriptor(struct si_context *sctx, unsigned desc_slot, + unsigned num_dwords) +{ + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot_offset = desc_slot * 16; + uint32_t *data; + uint64_t va; + + data = desc->list + desc_slot_offset; + va = desc->gpu_address + desc_slot_offset * 4; + + si_cp_write_data(sctx, desc->buffer, va - desc->buffer->gpu_address, num_dwords * 4, V_370_TC_L2, + V_370_ME, data); } static void si_upload_bindless_descriptors(struct si_context *sctx) { - if (!sctx->bindless_descriptors_dirty) - return; + if (!sctx->bindless_descriptors_dirty) + return; - /* Wait for graphics/compute to be idle before updating the resident - * descriptors directly in memory, in case the GPU is using them. - */ - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH; - sctx->emit_cache_flush(sctx); + /* Wait for graphics/compute to be idle before updating the resident + * descriptors directly in memory, in case the GPU is using them. 
+ */ + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; + sctx->emit_cache_flush(sctx); - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - unsigned desc_slot = (*tex_handle)->desc_slot; + util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) { + unsigned desc_slot = (*tex_handle)->desc_slot; - if (!(*tex_handle)->desc_dirty) - continue; + if (!(*tex_handle)->desc_dirty) + continue; - si_upload_bindless_descriptor(sctx, desc_slot, 16); - (*tex_handle)->desc_dirty = false; - } + si_upload_bindless_descriptor(sctx, desc_slot, 16); + (*tex_handle)->desc_dirty = false; + } - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - unsigned desc_slot = (*img_handle)->desc_slot; + util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) { + unsigned desc_slot = (*img_handle)->desc_slot; - if (!(*img_handle)->desc_dirty) - continue; + if (!(*img_handle)->desc_dirty) + continue; - si_upload_bindless_descriptor(sctx, desc_slot, 8); - (*img_handle)->desc_dirty = false; - } + si_upload_bindless_descriptor(sctx, desc_slot, 8); + (*img_handle)->desc_dirty = false; + } - /* Invalidate L1 because it doesn't know that L2 changed. */ - sctx->flags |= SI_CONTEXT_INV_SCACHE; - sctx->emit_cache_flush(sctx); + /* Invalidate L1 because it doesn't know that L2 changed. */ + sctx->flags |= SI_CONTEXT_INV_SCACHE; + sctx->emit_cache_flush(sctx); - sctx->bindless_descriptors_dirty = false; + sctx->bindless_descriptors_dirty = false; } /* Update mutable image descriptor fields of all resident textures. */ static void si_update_bindless_texture_descriptor(struct si_context *sctx, - struct si_texture_handle *tex_handle) + struct si_texture_handle *tex_handle) { - struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view; - struct si_descriptors *desc = &sctx->bindless_descriptors; - unsigned desc_slot_offset = tex_handle->desc_slot * 16; - uint32_t desc_list[16]; + struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view; + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot_offset = tex_handle->desc_slot * 16; + uint32_t desc_list[16]; - if (sview->base.texture->target == PIPE_BUFFER) - return; + if (sview->base.texture->target == PIPE_BUFFER) + return; - memcpy(desc_list, desc->list + desc_slot_offset, sizeof(desc_list)); - si_set_sampler_view_desc(sctx, sview, &tex_handle->sstate, - desc->list + desc_slot_offset); + memcpy(desc_list, desc->list + desc_slot_offset, sizeof(desc_list)); + si_set_sampler_view_desc(sctx, sview, &tex_handle->sstate, desc->list + desc_slot_offset); - if (memcmp(desc_list, desc->list + desc_slot_offset, - sizeof(desc_list))) { - tex_handle->desc_dirty = true; - sctx->bindless_descriptors_dirty = true; - } + if (memcmp(desc_list, desc->list + desc_slot_offset, sizeof(desc_list))) { + tex_handle->desc_dirty = true; + sctx->bindless_descriptors_dirty = true; + } } static void si_update_bindless_image_descriptor(struct si_context *sctx, - struct si_image_handle *img_handle) + struct si_image_handle *img_handle) { - struct si_descriptors *desc = &sctx->bindless_descriptors; - unsigned desc_slot_offset = img_handle->desc_slot * 16; - struct pipe_image_view *view = &img_handle->view; - struct pipe_resource *res = view->resource; - uint32_t image_desc[16]; - unsigned desc_size = (res->nr_samples >= 2 ? 
16 : 8) * 4; + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot_offset = img_handle->desc_slot * 16; + struct pipe_image_view *view = &img_handle->view; + struct pipe_resource *res = view->resource; + uint32_t image_desc[16]; + unsigned desc_size = (res->nr_samples >= 2 ? 16 : 8) * 4; - if (res->target == PIPE_BUFFER) - return; + if (res->target == PIPE_BUFFER) + return; - memcpy(image_desc, desc->list + desc_slot_offset, desc_size); - si_set_shader_image_desc(sctx, view, true, - desc->list + desc_slot_offset, - desc->list + desc_slot_offset + 8); + memcpy(image_desc, desc->list + desc_slot_offset, desc_size); + si_set_shader_image_desc(sctx, view, true, desc->list + desc_slot_offset, + desc->list + desc_slot_offset + 8); - if (memcmp(image_desc, desc->list + desc_slot_offset, desc_size)) { - img_handle->desc_dirty = true; - sctx->bindless_descriptors_dirty = true; - } + if (memcmp(image_desc, desc->list + desc_slot_offset, desc_size)) { + img_handle->desc_dirty = true; + sctx->bindless_descriptors_dirty = true; + } } static void si_update_all_resident_texture_descriptors(struct si_context *sctx) { - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - si_update_bindless_texture_descriptor(sctx, *tex_handle); - } + util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) { + si_update_bindless_texture_descriptor(sctx, *tex_handle); + } - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - si_update_bindless_image_descriptor(sctx, *img_handle); - } + util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) { + si_update_bindless_image_descriptor(sctx, *img_handle); + } - si_upload_bindless_descriptors(sctx); + si_upload_bindless_descriptors(sctx); } /* Update mutable image descriptor fields of all bound textures. */ void si_update_all_texture_descriptors(struct si_context *sctx) { - unsigned shader; + unsigned shader; - for (shader = 0; shader < SI_NUM_SHADERS; shader++) { - struct si_samplers *samplers = &sctx->samplers[shader]; - struct si_images *images = &sctx->images[shader]; - unsigned mask; + for (shader = 0; shader < SI_NUM_SHADERS; shader++) { + struct si_samplers *samplers = &sctx->samplers[shader]; + struct si_images *images = &sctx->images[shader]; + unsigned mask; - /* Images. */ - mask = images->enabled_mask; - while (mask) { - unsigned i = u_bit_scan(&mask); - struct pipe_image_view *view = &images->views[i]; + /* Images. */ + mask = images->enabled_mask; + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_image_view *view = &images->views[i]; - if (!view->resource || - view->resource->target == PIPE_BUFFER) - continue; + if (!view->resource || view->resource->target == PIPE_BUFFER) + continue; - si_set_shader_image(sctx, shader, i, view, true); - } + si_set_shader_image(sctx, shader, i, view, true); + } - /* Sampler views. */ - mask = samplers->enabled_mask; - while (mask) { - unsigned i = u_bit_scan(&mask); - struct pipe_sampler_view *view = samplers->views[i]; + /* Sampler views. 
*/ + mask = samplers->enabled_mask; + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_sampler_view *view = samplers->views[i]; - if (!view || - !view->texture || - view->texture->target == PIPE_BUFFER) - continue; + if (!view || !view->texture || view->texture->target == PIPE_BUFFER) + continue; - si_set_sampler_view(sctx, shader, i, - samplers->views[i], true); - } + si_set_sampler_view(sctx, shader, i, samplers->views[i], true); + } - si_update_shader_needs_decompress_mask(sctx, shader); - } + si_update_shader_needs_decompress_mask(sctx, shader); + } - si_update_all_resident_texture_descriptors(sctx); - si_update_ps_colorbuf0_slot(sctx); + si_update_all_resident_texture_descriptors(sctx); + si_update_ps_colorbuf0_slot(sctx); } /* SHADER USER DATA */ -static void si_mark_shader_pointers_dirty(struct si_context *sctx, - unsigned shader) +static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shader) { - sctx->shader_pointers_dirty |= - u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, - SI_NUM_SHADER_DESCS); + sctx->shader_pointers_dirty |= + u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, SI_NUM_SHADER_DESCS); - if (shader == PIPE_SHADER_VERTEX) { - sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; - sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 && - sctx->screen->num_vbos_in_user_sgprs; - } + if (shader == PIPE_SHADER_VERTEX) { + sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; + sctx->vertex_buffer_user_sgprs_dirty = + sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs; + } - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); } static void si_shader_pointers_begin_new_cs(struct si_context *sctx) { - sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS); - sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; - sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 && - sctx->screen->num_vbos_in_user_sgprs; - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); - sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; - sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; + sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS); + sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL; + sctx->vertex_buffer_user_sgprs_dirty = + sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs; + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; + sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; } /* Set a base register address for user data constants in the given shader. * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*. 
*/ -static void si_set_user_data_base(struct si_context *sctx, - unsigned shader, uint32_t new_base) +static void si_set_user_data_base(struct si_context *sctx, unsigned shader, uint32_t new_base) { - uint32_t *base = &sctx->shader_pointers.sh_base[shader]; + uint32_t *base = &sctx->shader_pointers.sh_base[shader]; - if (*base != new_base) { - *base = new_base; + if (*base != new_base) { + *base = new_base; - if (new_base) - si_mark_shader_pointers_dirty(sctx, shader); + if (new_base) + si_mark_shader_pointers_dirty(sctx, shader); - /* Any change in enabled shader stages requires re-emitting - * the VS state SGPR, because it contains the clamp_vertex_color - * state, which can be done in VS, TES, and GS. - */ - sctx->last_vs_state = ~0; - } + /* Any change in enabled shader stages requires re-emitting + * the VS state SGPR, because it contains the clamp_vertex_color + * state, which can be done in VS, TES, and GS. + */ + sctx->last_vs_state = ~0; + } } /* This must be called when these are changed between enabled and disabled @@ -2110,922 +1910,822 @@ static void si_set_user_data_base(struct si_context *sctx, */ void si_shader_change_notify(struct si_context *sctx) { - /* VS can be bound as VS, ES, or LS. */ - if (sctx->tes_shader.cso) { - if (sctx->chip_class >= GFX10) { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B430_SPI_SHADER_USER_DATA_HS_0); - } else if (sctx->chip_class == GFX9) { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B430_SPI_SHADER_USER_DATA_LS_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B530_SPI_SHADER_USER_DATA_LS_0); - } - } else if (sctx->chip_class >= GFX10) { - if (sctx->ngg || sctx->gs_shader.cso) { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - } - } else if (sctx->gs_shader.cso) { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B330_SPI_SHADER_USER_DATA_ES_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - } - - /* TES can be bound as ES, VS, or not bound. 
*/ - if (sctx->tes_shader.cso) { - if (sctx->chip_class >= GFX10) { - if (sctx->ngg || sctx->gs_shader.cso) { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - } - } else if (sctx->gs_shader.cso) { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, - R_00B330_SPI_SHADER_USER_DATA_ES_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - } - } else { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0); - } -} - -static void si_emit_shader_pointer_head(struct radeon_cmdbuf *cs, - unsigned sh_offset, - unsigned pointer_count) -{ - radeon_emit(cs, PKT3(PKT3_SET_SH_REG, pointer_count, 0)); - radeon_emit(cs, (sh_offset - SI_SH_REG_OFFSET) >> 2); -} - -static void si_emit_shader_pointer_body(struct si_screen *sscreen, - struct radeon_cmdbuf *cs, - uint64_t va) -{ - radeon_emit(cs, va); - - assert(va == 0 || (va >> 32) == sscreen->info.address32_hi); -} - -static void si_emit_shader_pointer(struct si_context *sctx, - struct si_descriptors *desc, - unsigned sh_base) -{ - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned sh_offset = sh_base + desc->shader_userdata_offset; - - si_emit_shader_pointer_head(cs, sh_offset, 1); - si_emit_shader_pointer_body(sctx->screen, cs, desc->gpu_address); -} - -static void si_emit_consecutive_shader_pointers(struct si_context *sctx, - unsigned pointer_mask, - unsigned sh_base) -{ - if (!sh_base) - return; - - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned mask = sctx->shader_pointers_dirty & pointer_mask; - - while (mask) { - int start, count; - u_bit_scan_consecutive_range(&mask, &start, &count); - - struct si_descriptors *descs = &sctx->descriptors[start]; - unsigned sh_offset = sh_base + descs->shader_userdata_offset; - - si_emit_shader_pointer_head(cs, sh_offset, count); - for (int i = 0; i < count; i++) - si_emit_shader_pointer_body(sctx->screen, cs, - descs[i].gpu_address); - } -} - -static void si_emit_global_shader_pointers(struct si_context *sctx, - struct si_descriptors *descs) -{ - if (sctx->chip_class >= GFX10) { - si_emit_shader_pointer(sctx, descs, - R_00B030_SPI_SHADER_USER_DATA_PS_0); - /* HW VS stage only used in non-NGG mode. */ - si_emit_shader_pointer(sctx, descs, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - si_emit_shader_pointer(sctx, descs, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - si_emit_shader_pointer(sctx, descs, - R_00B430_SPI_SHADER_USER_DATA_HS_0); - return; - } else if (sctx->chip_class == GFX9) { - /* Broadcast it to all shader stages. */ - si_emit_shader_pointer(sctx, descs, - R_00B530_SPI_SHADER_USER_DATA_COMMON_0); - return; - } - - si_emit_shader_pointer(sctx, descs, - R_00B030_SPI_SHADER_USER_DATA_PS_0); - si_emit_shader_pointer(sctx, descs, - R_00B130_SPI_SHADER_USER_DATA_VS_0); - si_emit_shader_pointer(sctx, descs, - R_00B330_SPI_SHADER_USER_DATA_ES_0); - si_emit_shader_pointer(sctx, descs, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - si_emit_shader_pointer(sctx, descs, - R_00B430_SPI_SHADER_USER_DATA_HS_0); - si_emit_shader_pointer(sctx, descs, - R_00B530_SPI_SHADER_USER_DATA_LS_0); + /* VS can be bound as VS, ES, or LS. 
*/ + if (sctx->tes_shader.cso) { + if (sctx->chip_class >= GFX10) { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B430_SPI_SHADER_USER_DATA_HS_0); + } else if (sctx->chip_class == GFX9) { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B430_SPI_SHADER_USER_DATA_LS_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B530_SPI_SHADER_USER_DATA_LS_0); + } + } else if (sctx->chip_class >= GFX10) { + if (sctx->ngg || sctx->gs_shader.cso) { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B230_SPI_SHADER_USER_DATA_GS_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0); + } + } else if (sctx->gs_shader.cso) { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B330_SPI_SHADER_USER_DATA_ES_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0); + } + + /* TES can be bound as ES, VS, or not bound. */ + if (sctx->tes_shader.cso) { + if (sctx->chip_class >= GFX10) { + if (sctx->ngg || sctx->gs_shader.cso) { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B230_SPI_SHADER_USER_DATA_GS_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B130_SPI_SHADER_USER_DATA_VS_0); + } + } else if (sctx->gs_shader.cso) { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B330_SPI_SHADER_USER_DATA_ES_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, R_00B130_SPI_SHADER_USER_DATA_VS_0); + } + } else { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0); + } +} + +static void si_emit_shader_pointer_head(struct radeon_cmdbuf *cs, unsigned sh_offset, + unsigned pointer_count) +{ + radeon_emit(cs, PKT3(PKT3_SET_SH_REG, pointer_count, 0)); + radeon_emit(cs, (sh_offset - SI_SH_REG_OFFSET) >> 2); +} + +static void si_emit_shader_pointer_body(struct si_screen *sscreen, struct radeon_cmdbuf *cs, + uint64_t va) +{ + radeon_emit(cs, va); + + assert(va == 0 || (va >> 32) == sscreen->info.address32_hi); +} + +static void si_emit_shader_pointer(struct si_context *sctx, struct si_descriptors *desc, + unsigned sh_base) +{ + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned sh_offset = sh_base + desc->shader_userdata_offset; + + si_emit_shader_pointer_head(cs, sh_offset, 1); + si_emit_shader_pointer_body(sctx->screen, cs, desc->gpu_address); +} + +static void si_emit_consecutive_shader_pointers(struct si_context *sctx, unsigned pointer_mask, + unsigned sh_base) +{ + if (!sh_base) + return; + + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned mask = sctx->shader_pointers_dirty & pointer_mask; + + while (mask) { + int start, count; + u_bit_scan_consecutive_range(&mask, &start, &count); + + struct si_descriptors *descs = &sctx->descriptors[start]; + unsigned sh_offset = sh_base + descs->shader_userdata_offset; + + si_emit_shader_pointer_head(cs, sh_offset, count); + for (int i = 0; i < count; i++) + si_emit_shader_pointer_body(sctx->screen, cs, descs[i].gpu_address); + } +} + +static void si_emit_global_shader_pointers(struct si_context *sctx, struct si_descriptors *descs) +{ + if (sctx->chip_class >= GFX10) { + si_emit_shader_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0); + /* HW VS stage only used in non-NGG mode. */ + si_emit_shader_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0); + si_emit_shader_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0); + si_emit_shader_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0); + return; + } else if (sctx->chip_class == GFX9) { + /* Broadcast it to all shader stages. 
*/ + si_emit_shader_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_COMMON_0); + return; + } + + si_emit_shader_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0); + si_emit_shader_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0); + si_emit_shader_pointer(sctx, descs, R_00B330_SPI_SHADER_USER_DATA_ES_0); + si_emit_shader_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0); + si_emit_shader_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0); + si_emit_shader_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_LS_0); } void si_emit_graphics_shader_pointers(struct si_context *sctx) { - uint32_t *sh_base = sctx->shader_pointers.sh_base; - - if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) { - si_emit_global_shader_pointers(sctx, - &sctx->descriptors[SI_DESCS_RW_BUFFERS]); - } - - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX), - sh_base[PIPE_SHADER_VERTEX]); - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL), - sh_base[PIPE_SHADER_TESS_EVAL]); - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT), - sh_base[PIPE_SHADER_FRAGMENT]); - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL), - sh_base[PIPE_SHADER_TESS_CTRL]); - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY), - sh_base[PIPE_SHADER_GEOMETRY]); - - sctx->shader_pointers_dirty &= - ~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE); - - if (sctx->vertex_buffer_pointer_dirty && sctx->num_vertex_elements) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - /* Find the location of the VB descriptor pointer. */ - unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR; - if (sctx->chip_class >= GFX9) { - if (sctx->tes_shader.cso) - sh_dw_offset = GFX9_TCS_NUM_USER_SGPR; - else if (sctx->gs_shader.cso) - sh_dw_offset = GFX9_VSGS_NUM_USER_SGPR; - } - - unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + sh_dw_offset * 4; - si_emit_shader_pointer_head(cs, sh_offset, 1); - si_emit_shader_pointer_body(sctx->screen, cs, - sctx->vb_descriptors_buffer->gpu_address + - sctx->vb_descriptors_offset); - sctx->vertex_buffer_pointer_dirty = false; - } - - if (sctx->vertex_buffer_user_sgprs_dirty && - sctx->num_vertex_elements && - sctx->screen->num_vbos_in_user_sgprs) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned num_desc = MIN2(sctx->num_vertex_elements, - sctx->screen->num_vbos_in_user_sgprs); - unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4; - - si_emit_shader_pointer_head(cs, sh_offset, num_desc * 4); - radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_desc * 4); - sctx->vertex_buffer_user_sgprs_dirty = false; - } - - if (sctx->graphics_bindless_pointer_dirty) { - si_emit_global_shader_pointers(sctx, - &sctx->bindless_descriptors); - sctx->graphics_bindless_pointer_dirty = false; - } + uint32_t *sh_base = sctx->shader_pointers.sh_base; + + if (sctx->shader_pointers_dirty & (1 << SI_DESCS_RW_BUFFERS)) { + si_emit_global_shader_pointers(sctx, &sctx->descriptors[SI_DESCS_RW_BUFFERS]); + } + + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX), + sh_base[PIPE_SHADER_VERTEX]); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL), + sh_base[PIPE_SHADER_TESS_EVAL]); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT), + sh_base[PIPE_SHADER_FRAGMENT]); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL), + sh_base[PIPE_SHADER_TESS_CTRL]); + 
si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY), + sh_base[PIPE_SHADER_GEOMETRY]); + + sctx->shader_pointers_dirty &= ~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE); + + if (sctx->vertex_buffer_pointer_dirty && sctx->num_vertex_elements) { + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + /* Find the location of the VB descriptor pointer. */ + unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR; + if (sctx->chip_class >= GFX9) { + if (sctx->tes_shader.cso) + sh_dw_offset = GFX9_TCS_NUM_USER_SGPR; + else if (sctx->gs_shader.cso) + sh_dw_offset = GFX9_VSGS_NUM_USER_SGPR; + } + + unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + sh_dw_offset * 4; + si_emit_shader_pointer_head(cs, sh_offset, 1); + si_emit_shader_pointer_body( + sctx->screen, cs, sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset); + sctx->vertex_buffer_pointer_dirty = false; + } + + if (sctx->vertex_buffer_user_sgprs_dirty && sctx->num_vertex_elements && + sctx->screen->num_vbos_in_user_sgprs) { + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned num_desc = MIN2(sctx->num_vertex_elements, sctx->screen->num_vbos_in_user_sgprs); + unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4; + + si_emit_shader_pointer_head(cs, sh_offset, num_desc * 4); + radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_desc * 4); + sctx->vertex_buffer_user_sgprs_dirty = false; + } + + if (sctx->graphics_bindless_pointer_dirty) { + si_emit_global_shader_pointers(sctx, &sctx->bindless_descriptors); + sctx->graphics_bindless_pointer_dirty = false; + } } void si_emit_compute_shader_pointers(struct si_context *sctx) { - unsigned base = R_00B900_COMPUTE_USER_DATA_0; + unsigned base = R_00B900_COMPUTE_USER_DATA_0; - si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE), - R_00B900_COMPUTE_USER_DATA_0); - sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE); + si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE), + R_00B900_COMPUTE_USER_DATA_0); + sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE); - if (sctx->compute_bindless_pointer_dirty) { - si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base); - sctx->compute_bindless_pointer_dirty = false; - } + if (sctx->compute_bindless_pointer_dirty) { + si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base); + sctx->compute_bindless_pointer_dirty = false; + } } /* BINDLESS */ -static void si_init_bindless_descriptors(struct si_context *sctx, - struct si_descriptors *desc, - short shader_userdata_rel_index, - unsigned num_elements) +static void si_init_bindless_descriptors(struct si_context *sctx, struct si_descriptors *desc, + short shader_userdata_rel_index, unsigned num_elements) { - ASSERTED unsigned desc_slot; + ASSERTED unsigned desc_slot; - si_init_descriptors(desc, shader_userdata_rel_index, 16, num_elements); - sctx->bindless_descriptors.num_active_slots = num_elements; + si_init_descriptors(desc, shader_userdata_rel_index, 16, num_elements); + sctx->bindless_descriptors.num_active_slots = num_elements; - /* The first bindless descriptor is stored at slot 1, because 0 is not - * considered to be a valid handle. - */ - sctx->num_bindless_descriptors = 1; + /* The first bindless descriptor is stored at slot 1, because 0 is not + * considered to be a valid handle. + */ + sctx->num_bindless_descriptors = 1; - /* Track which bindless slots are used (or not). 
*/ - util_idalloc_init(&sctx->bindless_used_slots); - util_idalloc_resize(&sctx->bindless_used_slots, num_elements); + /* Track which bindless slots are used (or not). */ + util_idalloc_init(&sctx->bindless_used_slots); + util_idalloc_resize(&sctx->bindless_used_slots, num_elements); - /* Reserve slot 0 because it's an invalid handle for bindless. */ - desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); - assert(desc_slot == 0); + /* Reserve slot 0 because it's an invalid handle for bindless. */ + desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); + assert(desc_slot == 0); } static void si_release_bindless_descriptors(struct si_context *sctx) { - si_release_descriptors(&sctx->bindless_descriptors); - util_idalloc_fini(&sctx->bindless_used_slots); + si_release_descriptors(&sctx->bindless_descriptors); + util_idalloc_fini(&sctx->bindless_used_slots); } static unsigned si_get_first_free_bindless_slot(struct si_context *sctx) { - struct si_descriptors *desc = &sctx->bindless_descriptors; - unsigned desc_slot; + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot; - desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); - if (desc_slot >= desc->num_elements) { - /* The array of bindless descriptors is full, resize it. */ - unsigned slot_size = desc->element_dw_size * 4; - unsigned new_num_elements = desc->num_elements * 2; + desc_slot = util_idalloc_alloc(&sctx->bindless_used_slots); + if (desc_slot >= desc->num_elements) { + /* The array of bindless descriptors is full, resize it. */ + unsigned slot_size = desc->element_dw_size * 4; + unsigned new_num_elements = desc->num_elements * 2; - desc->list = REALLOC(desc->list, desc->num_elements * slot_size, - new_num_elements * slot_size); - desc->num_elements = new_num_elements; - desc->num_active_slots = new_num_elements; - } + desc->list = + REALLOC(desc->list, desc->num_elements * slot_size, new_num_elements * slot_size); + desc->num_elements = new_num_elements; + desc->num_active_slots = new_num_elements; + } - assert(desc_slot); - return desc_slot; + assert(desc_slot); + return desc_slot; } -static unsigned -si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list, - unsigned size) +static unsigned si_create_bindless_descriptor(struct si_context *sctx, uint32_t *desc_list, + unsigned size) { - struct si_descriptors *desc = &sctx->bindless_descriptors; - unsigned desc_slot, desc_slot_offset; + struct si_descriptors *desc = &sctx->bindless_descriptors; + unsigned desc_slot, desc_slot_offset; - /* Find a free slot. */ - desc_slot = si_get_first_free_bindless_slot(sctx); + /* Find a free slot. */ + desc_slot = si_get_first_free_bindless_slot(sctx); - /* For simplicity, sampler and image bindless descriptors use fixed - * 16-dword slots for now. Image descriptors only need 8-dword but this - * doesn't really matter because no real apps use image handles. - */ - desc_slot_offset = desc_slot * 16; + /* For simplicity, sampler and image bindless descriptors use fixed + * 16-dword slots for now. Image descriptors only need 8-dword but this + * doesn't really matter because no real apps use image handles. + */ + desc_slot_offset = desc_slot * 16; - /* Copy the descriptor into the array. */ - memcpy(desc->list + desc_slot_offset, desc_list, size); + /* Copy the descriptor into the array. */ + memcpy(desc->list + desc_slot_offset, desc_list, size); - /* Re-upload the whole array of bindless descriptors into a new buffer. 
- */ - if (!si_upload_descriptors(sctx, desc)) - return 0; + /* Re-upload the whole array of bindless descriptors into a new buffer. + */ + if (!si_upload_descriptors(sctx, desc)) + return 0; - /* Make sure to re-emit the shader pointers for all stages. */ - sctx->graphics_bindless_pointer_dirty = true; - sctx->compute_bindless_pointer_dirty = true; + /* Make sure to re-emit the shader pointers for all stages. */ + sctx->graphics_bindless_pointer_dirty = true; + sctx->compute_bindless_pointer_dirty = true; - return desc_slot; + return desc_slot; } -static void si_update_bindless_buffer_descriptor(struct si_context *sctx, - unsigned desc_slot, - struct pipe_resource *resource, - uint64_t offset, - bool *desc_dirty) +static void si_update_bindless_buffer_descriptor(struct si_context *sctx, unsigned desc_slot, + struct pipe_resource *resource, uint64_t offset, + bool *desc_dirty) { - struct si_descriptors *desc = &sctx->bindless_descriptors; - struct si_resource *buf = si_resource(resource); - unsigned desc_slot_offset = desc_slot * 16; - uint32_t *desc_list = desc->list + desc_slot_offset + 4; - uint64_t old_desc_va; + struct si_descriptors *desc = &sctx->bindless_descriptors; + struct si_resource *buf = si_resource(resource); + unsigned desc_slot_offset = desc_slot * 16; + uint32_t *desc_list = desc->list + desc_slot_offset + 4; + uint64_t old_desc_va; - assert(resource->target == PIPE_BUFFER); + assert(resource->target == PIPE_BUFFER); - /* Retrieve the old buffer addr from the descriptor. */ - old_desc_va = si_desc_extract_buffer_address(desc_list); + /* Retrieve the old buffer addr from the descriptor. */ + old_desc_va = si_desc_extract_buffer_address(desc_list); - if (old_desc_va != buf->gpu_address + offset) { - /* The buffer has been invalidated when the handle wasn't - * resident, update the descriptor and the dirty flag. - */ - si_set_buf_desc_address(buf, offset, &desc_list[0]); + if (old_desc_va != buf->gpu_address + offset) { + /* The buffer has been invalidated when the handle wasn't + * resident, update the descriptor and the dirty flag. 
+ */ + si_set_buf_desc_address(buf, offset, &desc_list[0]); - *desc_dirty = true; - } + *desc_dirty = true; + } } -static uint64_t si_create_texture_handle(struct pipe_context *ctx, - struct pipe_sampler_view *view, - const struct pipe_sampler_state *state) +static uint64_t si_create_texture_handle(struct pipe_context *ctx, struct pipe_sampler_view *view, + const struct pipe_sampler_state *state) { - struct si_sampler_view *sview = (struct si_sampler_view *)view; - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture_handle *tex_handle; - struct si_sampler_state *sstate; - uint32_t desc_list[16]; - uint64_t handle; + struct si_sampler_view *sview = (struct si_sampler_view *)view; + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture_handle *tex_handle; + struct si_sampler_state *sstate; + uint32_t desc_list[16]; + uint64_t handle; - tex_handle = CALLOC_STRUCT(si_texture_handle); - if (!tex_handle) - return 0; + tex_handle = CALLOC_STRUCT(si_texture_handle); + if (!tex_handle) + return 0; - memset(desc_list, 0, sizeof(desc_list)); - si_init_descriptor_list(&desc_list[0], 16, 1, null_texture_descriptor); + memset(desc_list, 0, sizeof(desc_list)); + si_init_descriptor_list(&desc_list[0], 16, 1, null_texture_descriptor); - sstate = ctx->create_sampler_state(ctx, state); - if (!sstate) { - FREE(tex_handle); - return 0; - } + sstate = ctx->create_sampler_state(ctx, state); + if (!sstate) { + FREE(tex_handle); + return 0; + } - si_set_sampler_view_desc(sctx, sview, sstate, &desc_list[0]); - memcpy(&tex_handle->sstate, sstate, sizeof(*sstate)); - ctx->delete_sampler_state(ctx, sstate); + si_set_sampler_view_desc(sctx, sview, sstate, &desc_list[0]); + memcpy(&tex_handle->sstate, sstate, sizeof(*sstate)); + ctx->delete_sampler_state(ctx, sstate); - tex_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, - sizeof(desc_list)); - if (!tex_handle->desc_slot) { - FREE(tex_handle); - return 0; - } + tex_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, sizeof(desc_list)); + if (!tex_handle->desc_slot) { + FREE(tex_handle); + return 0; + } - handle = tex_handle->desc_slot; + handle = tex_handle->desc_slot; - if (!_mesa_hash_table_insert(sctx->tex_handles, - (void *)(uintptr_t)handle, - tex_handle)) { - FREE(tex_handle); - return 0; - } + if (!_mesa_hash_table_insert(sctx->tex_handles, (void *)(uintptr_t)handle, tex_handle)) { + FREE(tex_handle); + return 0; + } - pipe_sampler_view_reference(&tex_handle->view, view); + pipe_sampler_view_reference(&tex_handle->view, view); - si_resource(sview->base.texture)->texture_handle_allocated = true; + si_resource(sview->base.texture)->texture_handle_allocated = true; - return handle; + return handle; } static void si_delete_texture_handle(struct pipe_context *ctx, uint64_t handle) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture_handle *tex_handle; - struct hash_entry *entry; - - entry = _mesa_hash_table_search(sctx->tex_handles, - (void *)(uintptr_t)handle); - if (!entry) - return; - - tex_handle = (struct si_texture_handle *)entry->data; - - /* Allow this descriptor slot to be re-used. 
*/ - util_idalloc_free(&sctx->bindless_used_slots, tex_handle->desc_slot); - - pipe_sampler_view_reference(&tex_handle->view, NULL); - _mesa_hash_table_remove(sctx->tex_handles, entry); - FREE(tex_handle); -} - -static void si_make_texture_handle_resident(struct pipe_context *ctx, - uint64_t handle, bool resident) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_texture_handle *tex_handle; - struct si_sampler_view *sview; - struct hash_entry *entry; - - entry = _mesa_hash_table_search(sctx->tex_handles, - (void *)(uintptr_t)handle); - if (!entry) - return; - - tex_handle = (struct si_texture_handle *)entry->data; - sview = (struct si_sampler_view *)tex_handle->view; - - if (resident) { - if (sview->base.texture->target != PIPE_BUFFER) { - struct si_texture *tex = - (struct si_texture *)sview->base.texture; - - if (depth_needs_decompression(tex)) { - util_dynarray_append( - &sctx->resident_tex_needs_depth_decompress, - struct si_texture_handle *, - tex_handle); - } - - if (color_needs_decompression(tex)) { - util_dynarray_append( - &sctx->resident_tex_needs_color_decompress, - struct si_texture_handle *, - tex_handle); - } - - if (tex->surface.dcc_offset && - p_atomic_read(&tex->framebuffers_bound)) - sctx->need_check_render_feedback = true; - - si_update_bindless_texture_descriptor(sctx, tex_handle); - } else { - si_update_bindless_buffer_descriptor(sctx, - tex_handle->desc_slot, - sview->base.texture, - sview->base.u.buf.offset, - &tex_handle->desc_dirty); - } - - /* Re-upload the descriptor if it has been updated while it - * wasn't resident. - */ - if (tex_handle->desc_dirty) - sctx->bindless_descriptors_dirty = true; - - /* Add the texture handle to the per-context list. */ - util_dynarray_append(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle); - - /* Add the buffers to the current CS in case si_begin_new_cs() - * is not going to be called. - */ - si_sampler_view_add_buffer(sctx, sview->base.texture, - RADEON_USAGE_READ, - sview->is_stencil_sampler, false); - } else { - /* Remove the texture handle from the per-context list. 
*/ - util_dynarray_delete_unordered(&sctx->resident_tex_handles, - struct si_texture_handle *, - tex_handle); - - if (sview->base.texture->target != PIPE_BUFFER) { - util_dynarray_delete_unordered( - &sctx->resident_tex_needs_depth_decompress, - struct si_texture_handle *, tex_handle); - - util_dynarray_delete_unordered( - &sctx->resident_tex_needs_color_decompress, - struct si_texture_handle *, tex_handle); - } - } -} - -static uint64_t si_create_image_handle(struct pipe_context *ctx, - const struct pipe_image_view *view) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_image_handle *img_handle; - uint32_t desc_list[16]; - uint64_t handle; - - if (!view || !view->resource) - return 0; - - img_handle = CALLOC_STRUCT(si_image_handle); - if (!img_handle) - return 0; - - memset(desc_list, 0, sizeof(desc_list)); - si_init_descriptor_list(&desc_list[0], 8, 2, null_image_descriptor); - - si_set_shader_image_desc(sctx, view, false, &desc_list[0], &desc_list[8]); - - img_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, - sizeof(desc_list)); - if (!img_handle->desc_slot) { - FREE(img_handle); - return 0; - } - - handle = img_handle->desc_slot; - - if (!_mesa_hash_table_insert(sctx->img_handles, - (void *)(uintptr_t)handle, - img_handle)) { - FREE(img_handle); - return 0; - } - - util_copy_image_view(&img_handle->view, view); - - si_resource(view->resource)->image_handle_allocated = true; - - return handle; + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture_handle *tex_handle; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->tex_handles, (void *)(uintptr_t)handle); + if (!entry) + return; + + tex_handle = (struct si_texture_handle *)entry->data; + + /* Allow this descriptor slot to be re-used. */ + util_idalloc_free(&sctx->bindless_used_slots, tex_handle->desc_slot); + + pipe_sampler_view_reference(&tex_handle->view, NULL); + _mesa_hash_table_remove(sctx->tex_handles, entry); + FREE(tex_handle); +} + +static void si_make_texture_handle_resident(struct pipe_context *ctx, uint64_t handle, + bool resident) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture_handle *tex_handle; + struct si_sampler_view *sview; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->tex_handles, (void *)(uintptr_t)handle); + if (!entry) + return; + + tex_handle = (struct si_texture_handle *)entry->data; + sview = (struct si_sampler_view *)tex_handle->view; + + if (resident) { + if (sview->base.texture->target != PIPE_BUFFER) { + struct si_texture *tex = (struct si_texture *)sview->base.texture; + + if (depth_needs_decompression(tex)) { + util_dynarray_append(&sctx->resident_tex_needs_depth_decompress, + struct si_texture_handle *, tex_handle); + } + + if (color_needs_decompression(tex)) { + util_dynarray_append(&sctx->resident_tex_needs_color_decompress, + struct si_texture_handle *, tex_handle); + } + + if (tex->surface.dcc_offset && p_atomic_read(&tex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + + si_update_bindless_texture_descriptor(sctx, tex_handle); + } else { + si_update_bindless_buffer_descriptor(sctx, tex_handle->desc_slot, sview->base.texture, + sview->base.u.buf.offset, &tex_handle->desc_dirty); + } + + /* Re-upload the descriptor if it has been updated while it + * wasn't resident. + */ + if (tex_handle->desc_dirty) + sctx->bindless_descriptors_dirty = true; + + /* Add the texture handle to the per-context list. 
*/ + util_dynarray_append(&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle); + + /* Add the buffers to the current CS in case si_begin_new_cs() + * is not going to be called. + */ + si_sampler_view_add_buffer(sctx, sview->base.texture, RADEON_USAGE_READ, + sview->is_stencil_sampler, false); + } else { + /* Remove the texture handle from the per-context list. */ + util_dynarray_delete_unordered(&sctx->resident_tex_handles, struct si_texture_handle *, + tex_handle); + + if (sview->base.texture->target != PIPE_BUFFER) { + util_dynarray_delete_unordered(&sctx->resident_tex_needs_depth_decompress, + struct si_texture_handle *, tex_handle); + + util_dynarray_delete_unordered(&sctx->resident_tex_needs_color_decompress, + struct si_texture_handle *, tex_handle); + } + } +} + +static uint64_t si_create_image_handle(struct pipe_context *ctx, const struct pipe_image_view *view) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_image_handle *img_handle; + uint32_t desc_list[16]; + uint64_t handle; + + if (!view || !view->resource) + return 0; + + img_handle = CALLOC_STRUCT(si_image_handle); + if (!img_handle) + return 0; + + memset(desc_list, 0, sizeof(desc_list)); + si_init_descriptor_list(&desc_list[0], 8, 2, null_image_descriptor); + + si_set_shader_image_desc(sctx, view, false, &desc_list[0], &desc_list[8]); + + img_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list, sizeof(desc_list)); + if (!img_handle->desc_slot) { + FREE(img_handle); + return 0; + } + + handle = img_handle->desc_slot; + + if (!_mesa_hash_table_insert(sctx->img_handles, (void *)(uintptr_t)handle, img_handle)) { + FREE(img_handle); + return 0; + } + + util_copy_image_view(&img_handle->view, view); + + si_resource(view->resource)->image_handle_allocated = true; + + return handle; } static void si_delete_image_handle(struct pipe_context *ctx, uint64_t handle) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_image_handle *img_handle; - struct hash_entry *entry; - - entry = _mesa_hash_table_search(sctx->img_handles, - (void *)(uintptr_t)handle); - if (!entry) - return; - - img_handle = (struct si_image_handle *)entry->data; - - util_copy_image_view(&img_handle->view, NULL); - _mesa_hash_table_remove(sctx->img_handles, entry); - FREE(img_handle); -} - -static void si_make_image_handle_resident(struct pipe_context *ctx, - uint64_t handle, unsigned access, - bool resident) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_image_handle *img_handle; - struct pipe_image_view *view; - struct si_resource *res; - struct hash_entry *entry; - - entry = _mesa_hash_table_search(sctx->img_handles, - (void *)(uintptr_t)handle); - if (!entry) - return; - - img_handle = (struct si_image_handle *)entry->data; - view = &img_handle->view; - res = si_resource(view->resource); - - if (resident) { - if (res->b.b.target != PIPE_BUFFER) { - struct si_texture *tex = (struct si_texture *)res; - unsigned level = view->u.tex.level; - - if (color_needs_decompression(tex)) { - util_dynarray_append( - &sctx->resident_img_needs_color_decompress, - struct si_image_handle *, - img_handle); - } - - if (vi_dcc_enabled(tex, level) && - p_atomic_read(&tex->framebuffers_bound)) - sctx->need_check_render_feedback = true; - - si_update_bindless_image_descriptor(sctx, img_handle); - } else { - si_update_bindless_buffer_descriptor(sctx, - img_handle->desc_slot, - view->resource, - view->u.buf.offset, - &img_handle->desc_dirty); - } - - /* Re-upload the descriptor if it has been updated 
while it - * wasn't resident. - */ - if (img_handle->desc_dirty) - sctx->bindless_descriptors_dirty = true; - - /* Add the image handle to the per-context list. */ - util_dynarray_append(&sctx->resident_img_handles, - struct si_image_handle *, img_handle); - - /* Add the buffers to the current CS in case si_begin_new_cs() - * is not going to be called. - */ - si_sampler_view_add_buffer(sctx, view->resource, - (access & PIPE_IMAGE_ACCESS_WRITE) ? - RADEON_USAGE_READWRITE : - RADEON_USAGE_READ, false, false); - } else { - /* Remove the image handle from the per-context list. */ - util_dynarray_delete_unordered(&sctx->resident_img_handles, - struct si_image_handle *, - img_handle); - - if (res->b.b.target != PIPE_BUFFER) { - util_dynarray_delete_unordered( - &sctx->resident_img_needs_color_decompress, - struct si_image_handle *, - img_handle); - } - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_image_handle *img_handle; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->img_handles, (void *)(uintptr_t)handle); + if (!entry) + return; + + img_handle = (struct si_image_handle *)entry->data; + + util_copy_image_view(&img_handle->view, NULL); + _mesa_hash_table_remove(sctx->img_handles, entry); + FREE(img_handle); +} + +static void si_make_image_handle_resident(struct pipe_context *ctx, uint64_t handle, + unsigned access, bool resident) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_image_handle *img_handle; + struct pipe_image_view *view; + struct si_resource *res; + struct hash_entry *entry; + + entry = _mesa_hash_table_search(sctx->img_handles, (void *)(uintptr_t)handle); + if (!entry) + return; + + img_handle = (struct si_image_handle *)entry->data; + view = &img_handle->view; + res = si_resource(view->resource); + + if (resident) { + if (res->b.b.target != PIPE_BUFFER) { + struct si_texture *tex = (struct si_texture *)res; + unsigned level = view->u.tex.level; + + if (color_needs_decompression(tex)) { + util_dynarray_append(&sctx->resident_img_needs_color_decompress, + struct si_image_handle *, img_handle); + } + + if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound)) + sctx->need_check_render_feedback = true; + + si_update_bindless_image_descriptor(sctx, img_handle); + } else { + si_update_bindless_buffer_descriptor(sctx, img_handle->desc_slot, view->resource, + view->u.buf.offset, &img_handle->desc_dirty); + } + + /* Re-upload the descriptor if it has been updated while it + * wasn't resident. + */ + if (img_handle->desc_dirty) + sctx->bindless_descriptors_dirty = true; + + /* Add the image handle to the per-context list. */ + util_dynarray_append(&sctx->resident_img_handles, struct si_image_handle *, img_handle); + + /* Add the buffers to the current CS in case si_begin_new_cs() + * is not going to be called. + */ + si_sampler_view_add_buffer( + sctx, view->resource, + (access & PIPE_IMAGE_ACCESS_WRITE) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ, false, + false); + } else { + /* Remove the image handle from the per-context list. 
*/ + util_dynarray_delete_unordered(&sctx->resident_img_handles, struct si_image_handle *, + img_handle); + + if (res->b.b.target != PIPE_BUFFER) { + util_dynarray_delete_unordered(&sctx->resident_img_needs_color_decompress, + struct si_image_handle *, img_handle); + } + } } static void si_resident_buffers_add_all_to_bo_list(struct si_context *sctx) { - unsigned num_resident_tex_handles, num_resident_img_handles; + unsigned num_resident_tex_handles, num_resident_img_handles; - num_resident_tex_handles = sctx->resident_tex_handles.size / - sizeof(struct si_texture_handle *); - num_resident_img_handles = sctx->resident_img_handles.size / - sizeof(struct si_image_handle *); + num_resident_tex_handles = sctx->resident_tex_handles.size / sizeof(struct si_texture_handle *); + num_resident_img_handles = sctx->resident_img_handles.size / sizeof(struct si_image_handle *); - /* Add all resident texture handles. */ - util_dynarray_foreach(&sctx->resident_tex_handles, - struct si_texture_handle *, tex_handle) { - struct si_sampler_view *sview = - (struct si_sampler_view *)(*tex_handle)->view; + /* Add all resident texture handles. */ + util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) { + struct si_sampler_view *sview = (struct si_sampler_view *)(*tex_handle)->view; - si_sampler_view_add_buffer(sctx, sview->base.texture, - RADEON_USAGE_READ, - sview->is_stencil_sampler, false); - } + si_sampler_view_add_buffer(sctx, sview->base.texture, RADEON_USAGE_READ, + sview->is_stencil_sampler, false); + } - /* Add all resident image handles. */ - util_dynarray_foreach(&sctx->resident_img_handles, - struct si_image_handle *, img_handle) { - struct pipe_image_view *view = &(*img_handle)->view; + /* Add all resident image handles. */ + util_dynarray_foreach (&sctx->resident_img_handles, struct si_image_handle *, img_handle) { + struct pipe_image_view *view = &(*img_handle)->view; - si_sampler_view_add_buffer(sctx, view->resource, - RADEON_USAGE_READWRITE, - false, false); - } + si_sampler_view_add_buffer(sctx, view->resource, RADEON_USAGE_READWRITE, false, false); + } - sctx->num_resident_handles += num_resident_tex_handles + - num_resident_img_handles; - assert(sctx->bo_list_add_all_resident_resources); - sctx->bo_list_add_all_resident_resources = false; + sctx->num_resident_handles += num_resident_tex_handles + num_resident_img_handles; + assert(sctx->bo_list_add_all_resident_resources); + sctx->bo_list_add_all_resident_resources = false; } /* INIT/DEINIT/UPLOAD */ void si_init_all_descriptors(struct si_context *sctx) { - int i; - unsigned first_shader = - sctx->has_graphics ? 
0 : PIPE_SHADER_COMPUTE; - - for (i = first_shader; i < SI_NUM_SHADERS; i++) { - bool is_2nd = sctx->chip_class >= GFX9 && - (i == PIPE_SHADER_TESS_CTRL || - i == PIPE_SHADER_GEOMETRY); - unsigned num_sampler_slots = SI_NUM_IMAGE_SLOTS / 2 + SI_NUM_SAMPLERS; - unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS; - int rel_dw_offset; - struct si_descriptors *desc; - - if (is_2nd) { - if (i == PIPE_SHADER_TESS_CTRL) { - rel_dw_offset = (R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS - - R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4; - } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */ - rel_dw_offset = (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - - R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4; - } else { - rel_dw_offset = (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - - R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4; - } - } else { - rel_dw_offset = SI_SGPR_CONST_AND_SHADER_BUFFERS; - } - desc = si_const_and_shader_buffer_descriptors(sctx, i); - si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc, - num_buffer_slots, rel_dw_offset, - RADEON_PRIO_SHADER_RW_BUFFER, - RADEON_PRIO_CONST_BUFFER); - desc->slot_index_to_bind_directly = si_get_constbuf_slot(0); - - if (is_2nd) { - if (i == PIPE_SHADER_TESS_CTRL) { - rel_dw_offset = (R_00B40C_SPI_SHADER_USER_DATA_ADDR_HI_HS - - R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4; - } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */ - rel_dw_offset = (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - - R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4; - } else { - rel_dw_offset = (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - - R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4; - } - } else { - rel_dw_offset = SI_SGPR_SAMPLERS_AND_IMAGES; - } - - desc = si_sampler_and_image_descriptors(sctx, i); - si_init_descriptors(desc, rel_dw_offset, 16, num_sampler_slots); - - int j; - for (j = 0; j < SI_NUM_IMAGE_SLOTS; j++) - memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4); - for (; j < SI_NUM_IMAGE_SLOTS + SI_NUM_SAMPLERS * 2; j++) - memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4); - } - - si_init_buffer_resources(&sctx->rw_buffers, - &sctx->descriptors[SI_DESCS_RW_BUFFERS], - SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS, - /* The second priority is used by - * const buffers in RW buffer slots. */ - RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER); - sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS; - - /* Initialize an array of 1024 bindless descriptors, when the limit is - * reached, just make it larger and re-upload the whole array. - */ - si_init_bindless_descriptors(sctx, &sctx->bindless_descriptors, - SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, - 1024); - - sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS); - - /* Set pipe_context functions. */ - sctx->b.bind_sampler_states = si_bind_sampler_states; - sctx->b.set_shader_images = si_set_shader_images; - sctx->b.set_constant_buffer = si_pipe_set_constant_buffer; - sctx->b.set_shader_buffers = si_set_shader_buffers; - sctx->b.set_sampler_views = si_set_sampler_views; - sctx->b.create_texture_handle = si_create_texture_handle; - sctx->b.delete_texture_handle = si_delete_texture_handle; - sctx->b.make_texture_handle_resident = si_make_texture_handle_resident; - sctx->b.create_image_handle = si_create_image_handle; - sctx->b.delete_image_handle = si_delete_image_handle; - sctx->b.make_image_handle_resident = si_make_image_handle_resident; - - if (!sctx->has_graphics) - return; - - sctx->b.set_polygon_stipple = si_set_polygon_stipple; - - /* Shader user data. 
*/ - sctx->atoms.s.shader_pointers.emit = si_emit_graphics_shader_pointers; - - /* Set default and immutable mappings. */ - if (sctx->ngg) { - assert(sctx->chip_class >= GFX10); - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B230_SPI_SHADER_USER_DATA_GS_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0); - } - - if (sctx->chip_class == GFX9) { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, - R_00B430_SPI_SHADER_USER_DATA_LS_0); - si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, - R_00B330_SPI_SHADER_USER_DATA_ES_0); - } else { - si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, - R_00B430_SPI_SHADER_USER_DATA_HS_0); - si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, - R_00B230_SPI_SHADER_USER_DATA_GS_0); - } - si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0); + int i; + unsigned first_shader = sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE; + + for (i = first_shader; i < SI_NUM_SHADERS; i++) { + bool is_2nd = + sctx->chip_class >= GFX9 && (i == PIPE_SHADER_TESS_CTRL || i == PIPE_SHADER_GEOMETRY); + unsigned num_sampler_slots = SI_NUM_IMAGE_SLOTS / 2 + SI_NUM_SAMPLERS; + unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS; + int rel_dw_offset; + struct si_descriptors *desc; + + if (is_2nd) { + if (i == PIPE_SHADER_TESS_CTRL) { + rel_dw_offset = + (R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS - R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4; + } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */ + rel_dw_offset = + (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4; + } else { + rel_dw_offset = + (R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS - R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4; + } + } else { + rel_dw_offset = SI_SGPR_CONST_AND_SHADER_BUFFERS; + } + desc = si_const_and_shader_buffer_descriptors(sctx, i); + si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc, num_buffer_slots, + rel_dw_offset, RADEON_PRIO_SHADER_RW_BUFFER, + RADEON_PRIO_CONST_BUFFER); + desc->slot_index_to_bind_directly = si_get_constbuf_slot(0); + + if (is_2nd) { + if (i == PIPE_SHADER_TESS_CTRL) { + rel_dw_offset = + (R_00B40C_SPI_SHADER_USER_DATA_ADDR_HI_HS - R_00B430_SPI_SHADER_USER_DATA_LS_0) / 4; + } else if (sctx->chip_class >= GFX10) { /* PIPE_SHADER_GEOMETRY */ + rel_dw_offset = + (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - R_00B230_SPI_SHADER_USER_DATA_GS_0) / 4; + } else { + rel_dw_offset = + (R_00B20C_SPI_SHADER_USER_DATA_ADDR_HI_GS - R_00B330_SPI_SHADER_USER_DATA_ES_0) / 4; + } + } else { + rel_dw_offset = SI_SGPR_SAMPLERS_AND_IMAGES; + } + + desc = si_sampler_and_image_descriptors(sctx, i); + si_init_descriptors(desc, rel_dw_offset, 16, num_sampler_slots); + + int j; + for (j = 0; j < SI_NUM_IMAGE_SLOTS; j++) + memcpy(desc->list + j * 8, null_image_descriptor, 8 * 4); + for (; j < SI_NUM_IMAGE_SLOTS + SI_NUM_SAMPLERS * 2; j++) + memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4); + } + + si_init_buffer_resources(&sctx->rw_buffers, &sctx->descriptors[SI_DESCS_RW_BUFFERS], + SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS, + /* The second priority is used by + * const buffers in RW buffer slots. */ + RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER); + sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS; + + /* Initialize an array of 1024 bindless descriptors, when the limit is + * reached, just make it larger and re-upload the whole array. 
+ */ + si_init_bindless_descriptors(sctx, &sctx->bindless_descriptors, + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, 1024); + + sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS); + + /* Set pipe_context functions. */ + sctx->b.bind_sampler_states = si_bind_sampler_states; + sctx->b.set_shader_images = si_set_shader_images; + sctx->b.set_constant_buffer = si_pipe_set_constant_buffer; + sctx->b.set_shader_buffers = si_set_shader_buffers; + sctx->b.set_sampler_views = si_set_sampler_views; + sctx->b.create_texture_handle = si_create_texture_handle; + sctx->b.delete_texture_handle = si_delete_texture_handle; + sctx->b.make_texture_handle_resident = si_make_texture_handle_resident; + sctx->b.create_image_handle = si_create_image_handle; + sctx->b.delete_image_handle = si_delete_image_handle; + sctx->b.make_image_handle_resident = si_make_image_handle_resident; + + if (!sctx->has_graphics) + return; + + sctx->b.set_polygon_stipple = si_set_polygon_stipple; + + /* Shader user data. */ + sctx->atoms.s.shader_pointers.emit = si_emit_graphics_shader_pointers; + + /* Set default and immutable mappings. */ + if (sctx->ngg) { + assert(sctx->chip_class >= GFX10); + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B230_SPI_SHADER_USER_DATA_GS_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0); + } + + if (sctx->chip_class == GFX9) { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_LS_0); + si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B330_SPI_SHADER_USER_DATA_ES_0); + } else { + si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_HS_0); + si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0); + } + si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0); } static bool si_upload_shader_descriptors(struct si_context *sctx, unsigned mask) { - unsigned dirty = sctx->descriptors_dirty & mask; + unsigned dirty = sctx->descriptors_dirty & mask; - /* Assume nothing will go wrong: */ - sctx->shader_pointers_dirty |= dirty; + /* Assume nothing will go wrong: */ + sctx->shader_pointers_dirty |= dirty; - while (dirty) { - unsigned i = u_bit_scan(&dirty); + while (dirty) { + unsigned i = u_bit_scan(&dirty); - if (!si_upload_descriptors(sctx, &sctx->descriptors[i])) - return false; - } + if (!si_upload_descriptors(sctx, &sctx->descriptors[i])) + return false; + } - sctx->descriptors_dirty &= ~mask; + sctx->descriptors_dirty &= ~mask; - si_upload_bindless_descriptors(sctx); + si_upload_bindless_descriptors(sctx); - return true; + return true; } bool si_upload_graphics_shader_descriptors(struct si_context *sctx) { - const unsigned mask = u_bit_consecutive(0, SI_DESCS_FIRST_COMPUTE); - return si_upload_shader_descriptors(sctx, mask); + const unsigned mask = u_bit_consecutive(0, SI_DESCS_FIRST_COMPUTE); + return si_upload_shader_descriptors(sctx, mask); } bool si_upload_compute_shader_descriptors(struct si_context *sctx) { - /* Does not update rw_buffers as that is not needed for compute shaders - * and the input buffer is using the same SGPR's anyway. - */ - const unsigned mask = u_bit_consecutive(SI_DESCS_FIRST_COMPUTE, - SI_NUM_DESCS - SI_DESCS_FIRST_COMPUTE); - return si_upload_shader_descriptors(sctx, mask); + /* Does not update rw_buffers as that is not needed for compute shaders + * and the input buffer is using the same SGPR's anyway. 
+ */ + const unsigned mask = + u_bit_consecutive(SI_DESCS_FIRST_COMPUTE, SI_NUM_DESCS - SI_DESCS_FIRST_COMPUTE); + return si_upload_shader_descriptors(sctx, mask); } void si_release_all_descriptors(struct si_context *sctx) { - int i; + int i; - for (i = 0; i < SI_NUM_SHADERS; i++) { - si_release_buffer_resources(&sctx->const_and_shader_buffers[i], - si_const_and_shader_buffer_descriptors(sctx, i)); - si_release_sampler_views(&sctx->samplers[i]); - si_release_image_views(&sctx->images[i]); - } - si_release_buffer_resources(&sctx->rw_buffers, - &sctx->descriptors[SI_DESCS_RW_BUFFERS]); - for (i = 0; i < SI_NUM_VERTEX_BUFFERS; i++) - pipe_vertex_buffer_unreference(&sctx->vertex_buffer[i]); + for (i = 0; i < SI_NUM_SHADERS; i++) { + si_release_buffer_resources(&sctx->const_and_shader_buffers[i], + si_const_and_shader_buffer_descriptors(sctx, i)); + si_release_sampler_views(&sctx->samplers[i]); + si_release_image_views(&sctx->images[i]); + } + si_release_buffer_resources(&sctx->rw_buffers, &sctx->descriptors[SI_DESCS_RW_BUFFERS]); + for (i = 0; i < SI_NUM_VERTEX_BUFFERS; i++) + pipe_vertex_buffer_unreference(&sctx->vertex_buffer[i]); - for (i = 0; i < SI_NUM_DESCS; ++i) - si_release_descriptors(&sctx->descriptors[i]); + for (i = 0; i < SI_NUM_DESCS; ++i) + si_release_descriptors(&sctx->descriptors[i]); - si_resource_reference(&sctx->vb_descriptors_buffer, NULL); - sctx->vb_descriptors_gpu_list = NULL; /* points into a mapped buffer */ + si_resource_reference(&sctx->vb_descriptors_buffer, NULL); + sctx->vb_descriptors_gpu_list = NULL; /* points into a mapped buffer */ - si_release_bindless_descriptors(sctx); + si_release_bindless_descriptors(sctx); } void si_gfx_resources_add_all_to_bo_list(struct si_context *sctx) { - for (unsigned i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) { - si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[i]); - si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i]); - si_image_views_begin_new_cs(sctx, &sctx->images[i]); - } - si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers); - si_vertex_buffers_begin_new_cs(sctx); + for (unsigned i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) { + si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[i]); + si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i]); + si_image_views_begin_new_cs(sctx, &sctx->images[i]); + } + si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers); + si_vertex_buffers_begin_new_cs(sctx); - if (sctx->bo_list_add_all_resident_resources) - si_resident_buffers_add_all_to_bo_list(sctx); + if (sctx->bo_list_add_all_resident_resources) + si_resident_buffers_add_all_to_bo_list(sctx); - assert(sctx->bo_list_add_all_gfx_resources); - sctx->bo_list_add_all_gfx_resources = false; + assert(sctx->bo_list_add_all_gfx_resources); + sctx->bo_list_add_all_gfx_resources = false; } void si_compute_resources_add_all_to_bo_list(struct si_context *sctx) { - unsigned sh = PIPE_SHADER_COMPUTE; + unsigned sh = PIPE_SHADER_COMPUTE; - si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[sh]); - si_sampler_views_begin_new_cs(sctx, &sctx->samplers[sh]); - si_image_views_begin_new_cs(sctx, &sctx->images[sh]); - si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers); + si_buffer_resources_begin_new_cs(sctx, &sctx->const_and_shader_buffers[sh]); + si_sampler_views_begin_new_cs(sctx, &sctx->samplers[sh]); + si_image_views_begin_new_cs(sctx, &sctx->images[sh]); + si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers); - if (sctx->bo_list_add_all_resident_resources) - 
si_resident_buffers_add_all_to_bo_list(sctx); + if (sctx->bo_list_add_all_resident_resources) + si_resident_buffers_add_all_to_bo_list(sctx); - assert(sctx->bo_list_add_all_compute_resources); - sctx->bo_list_add_all_compute_resources = false; + assert(sctx->bo_list_add_all_compute_resources); + sctx->bo_list_add_all_compute_resources = false; } void si_all_descriptors_begin_new_cs(struct si_context *sctx) { - for (unsigned i = 0; i < SI_NUM_DESCS; ++i) - si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]); - si_descriptors_begin_new_cs(sctx, &sctx->bindless_descriptors); + for (unsigned i = 0; i < SI_NUM_DESCS; ++i) + si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]); + si_descriptors_begin_new_cs(sctx, &sctx->bindless_descriptors); - si_shader_pointers_begin_new_cs(sctx); + si_shader_pointers_begin_new_cs(sctx); - sctx->bo_list_add_all_resident_resources = true; - sctx->bo_list_add_all_gfx_resources = true; - sctx->bo_list_add_all_compute_resources = true; + sctx->bo_list_add_all_resident_resources = true; + sctx->bo_list_add_all_gfx_resources = true; + sctx->bo_list_add_all_compute_resources = true; } -void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, - uint64_t new_active_mask) +void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, uint64_t new_active_mask) { - struct si_descriptors *desc = &sctx->descriptors[desc_idx]; + struct si_descriptors *desc = &sctx->descriptors[desc_idx]; - /* Ignore no-op updates and updates that disable all slots. */ - if (!new_active_mask || - new_active_mask == u_bit_consecutive64(desc->first_active_slot, - desc->num_active_slots)) - return; + /* Ignore no-op updates and updates that disable all slots. */ + if (!new_active_mask || + new_active_mask == u_bit_consecutive64(desc->first_active_slot, desc->num_active_slots)) + return; - int first, count; - u_bit_scan_consecutive_range64(&new_active_mask, &first, &count); - assert(new_active_mask == 0); + int first, count; + u_bit_scan_consecutive_range64(&new_active_mask, &first, &count); + assert(new_active_mask == 0); - /* Upload/dump descriptors if slots are being enabled. */ - if (first < desc->first_active_slot || - first + count > desc->first_active_slot + desc->num_active_slots) - sctx->descriptors_dirty |= 1u << desc_idx; + /* Upload/dump descriptors if slots are being enabled. 
*/ + if (first < desc->first_active_slot || + first + count > desc->first_active_slot + desc->num_active_slots) + sctx->descriptors_dirty |= 1u << desc_idx; - desc->first_active_slot = first; - desc->num_active_slots = count; + desc->first_active_slot = first; + desc->num_active_slots = count; } -void si_set_active_descriptors_for_shader(struct si_context *sctx, - struct si_shader_selector *sel) +void si_set_active_descriptors_for_shader(struct si_context *sctx, struct si_shader_selector *sel) { - if (!sel) - return; + if (!sel) + return; - si_set_active_descriptors(sctx, - si_const_and_shader_buffer_descriptors_idx(sel->type), - sel->active_const_and_shader_buffers); - si_set_active_descriptors(sctx, - si_sampler_and_image_descriptors_idx(sel->type), - sel->active_samplers_and_images); + si_set_active_descriptors(sctx, si_const_and_shader_buffer_descriptors_idx(sel->type), + sel->active_const_and_shader_buffers); + si_set_active_descriptors(sctx, si_sampler_and_image_descriptors_idx(sel->type), + sel->active_samplers_and_images); } diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c index c58b2b103be..673c3310a1a 100644 --- a/src/gallium/drivers/radeonsi/si_dma_cs.c +++ b/src/gallium/drivers/radeonsi/si_dma_cs.c @@ -27,304 +27,279 @@ static void si_dma_emit_wait_idle(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; + struct radeon_cmdbuf *cs = sctx->sdma_cs; - /* NOP waits for idle. */ - if (sctx->chip_class >= GFX7) - radeon_emit(cs, 0x00000000); /* NOP */ - else - radeon_emit(cs, 0xf0000000); /* NOP */ + /* NOP waits for idle. */ + if (sctx->chip_class >= GFX7) + radeon_emit(cs, 0x00000000); /* NOP */ + else + radeon_emit(cs, 0xf0000000); /* NOP */ } -void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, - uint64_t offset) +void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; - uint64_t va = dst->gpu_address + offset; + struct radeon_cmdbuf *cs = sctx->sdma_cs; + uint64_t va = dst->gpu_address + offset; - if (sctx->chip_class == GFX6) { - unreachable("SI DMA doesn't support the timestamp packet."); - return; - } + if (sctx->chip_class == GFX6) { + unreachable("SI DMA doesn't support the timestamp packet."); + return; + } - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8); + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. 
*/ + util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8); - assert(va % 8 == 0); + assert(va % 8 == 0); - si_need_dma_space(sctx, 4, dst, NULL); - si_dma_emit_wait_idle(sctx); + si_need_dma_space(sctx, 4, dst, NULL); + si_dma_emit_wait_idle(sctx); - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, - SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, - 0)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit( + cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); } -void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, - uint64_t offset, uint64_t size, unsigned clear_value) +void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, + uint64_t size, unsigned clear_value) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; - unsigned i, ncopy, csize; - struct si_resource *sdst = si_resource(dst); - - assert(offset % 4 == 0); - assert(size); - assert(size % 4 == 0); - - if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE || - sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS)) { - sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4); - return; - } - - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size); - - offset += sdst->gpu_address; - - if (sctx->chip_class == GFX6) { - /* the same maximum size as for copying */ - ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); - si_need_dma_space(sctx, ncopy * 4, sdst, NULL); - - for (i = 0; i < ncopy; i++) { - csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); - radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, - csize / 4)); - radeon_emit(cs, offset); - radeon_emit(cs, clear_value); - radeon_emit(cs, (offset >> 32) << 16); - offset += csize; - size -= csize; - } - return; - } - - /* The following code is for Sea Islands and later. */ - /* the same maximum size as for copying */ - ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE); - si_need_dma_space(sctx, ncopy * 5, sdst, NULL); - - for (i = 0; i < ncopy; i++) { - csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE); - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, - 0x8000 /* dword copy */)); - radeon_emit(cs, offset); - radeon_emit(cs, offset >> 32); - radeon_emit(cs, clear_value); - /* dw count */ - radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc); - offset += csize; - size -= csize; - } + struct radeon_cmdbuf *cs = sctx->sdma_cs; + unsigned i, ncopy, csize; + struct si_resource *sdst = si_resource(dst); + + assert(offset % 4 == 0); + assert(size); + assert(size % 4 == 0); + + if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE || + sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS)) { + sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4); + return; + } + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. 
*/ + util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size); + + offset += sdst->gpu_address; + + if (sctx->chip_class == GFX6) { + /* the same maximum size as for copying */ + ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); + si_need_dma_space(sctx, ncopy * 4, sdst, NULL); + + for (i = 0; i < ncopy; i++) { + csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); + radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, csize / 4)); + radeon_emit(cs, offset); + radeon_emit(cs, clear_value); + radeon_emit(cs, (offset >> 32) << 16); + offset += csize; + size -= csize; + } + return; + } + + /* The following code is for Sea Islands and later. */ + /* the same maximum size as for copying */ + ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE); + si_need_dma_space(sctx, ncopy * 5, sdst, NULL); + + for (i = 0; i < ncopy; i++) { + csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE); + radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, 0x8000 /* dword copy */)); + radeon_emit(cs, offset); + radeon_emit(cs, offset >> 32); + radeon_emit(cs, clear_value); + /* dw count */ + radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc); + offset += csize; + size -= csize; + } } void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, - struct pipe_resource *src, uint64_t dst_offset, - uint64_t src_offset, uint64_t size) + struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, + uint64_t size) { - struct radeon_cmdbuf *cs = sctx->sdma_cs; - unsigned i, ncopy, csize; - struct si_resource *sdst = si_resource(dst); - struct si_resource *ssrc = si_resource(src); - - if (!cs || - dst->flags & PIPE_RESOURCE_FLAG_SPARSE || - src->flags & PIPE_RESOURCE_FLAG_SPARSE) { - si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size); - return; - } - - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(dst, &sdst->valid_buffer_range, dst_offset, - dst_offset + size); - - dst_offset += sdst->gpu_address; - src_offset += ssrc->gpu_address; - - if (sctx->chip_class == GFX6) { - unsigned max_size, sub_cmd, shift; - - /* see whether we should use the dword-aligned or byte-aligned copy */ - if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) { - sub_cmd = SI_DMA_COPY_DWORD_ALIGNED; - shift = 2; - max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE; - } else { - sub_cmd = SI_DMA_COPY_BYTE_ALIGNED; - shift = 0; - max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE; - } - - ncopy = DIV_ROUND_UP(size, max_size); - si_need_dma_space(sctx, ncopy * 5, sdst, ssrc); - - for (i = 0; i < ncopy; i++) { - csize = MIN2(size, max_size); - radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, - csize >> shift)); - radeon_emit(cs, dst_offset); - radeon_emit(cs, src_offset); - radeon_emit(cs, (dst_offset >> 32UL) & 0xff); - radeon_emit(cs, (src_offset >> 32UL) & 0xff); - dst_offset += csize; - src_offset += csize; - size -= csize; - } - return; - } - - /* The following code is for CI and later. */ - unsigned align = ~0u; - ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE); - - /* Align copy size to dw if src/dst address are dw aligned */ - if ((src_offset & 0x3) == 0 && - (dst_offset & 0x3) == 0 && - size > 4 && - (size & 3) != 0) { - align = ~0x3u; - ncopy++; - } - - si_need_dma_space(sctx, ncopy * 7, sdst, ssrc); - - for (i = 0; i < ncopy; i++) { - csize = size >= 4 ? 
MIN2(size & align, CIK_SDMA_COPY_MAX_SIZE) : size; - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, - CIK_SDMA_COPY_SUB_OPCODE_LINEAR, - 0)); - radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize); - radeon_emit(cs, 0); /* src/dst endian swap */ - radeon_emit(cs, src_offset); - radeon_emit(cs, src_offset >> 32); - radeon_emit(cs, dst_offset); - radeon_emit(cs, dst_offset >> 32); - dst_offset += csize; - src_offset += csize; - size -= csize; - } + struct radeon_cmdbuf *cs = sctx->sdma_cs; + unsigned i, ncopy, csize; + struct si_resource *sdst = si_resource(dst); + struct si_resource *ssrc = si_resource(src); + + if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE || src->flags & PIPE_RESOURCE_FLAG_SPARSE) { + si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size); + return; + } + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + util_range_add(dst, &sdst->valid_buffer_range, dst_offset, dst_offset + size); + + dst_offset += sdst->gpu_address; + src_offset += ssrc->gpu_address; + + if (sctx->chip_class == GFX6) { + unsigned max_size, sub_cmd, shift; + + /* see whether we should use the dword-aligned or byte-aligned copy */ + if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) { + sub_cmd = SI_DMA_COPY_DWORD_ALIGNED; + shift = 2; + max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE; + } else { + sub_cmd = SI_DMA_COPY_BYTE_ALIGNED; + shift = 0; + max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE; + } + + ncopy = DIV_ROUND_UP(size, max_size); + si_need_dma_space(sctx, ncopy * 5, sdst, ssrc); + + for (i = 0; i < ncopy; i++) { + csize = MIN2(size, max_size); + radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize >> shift)); + radeon_emit(cs, dst_offset); + radeon_emit(cs, src_offset); + radeon_emit(cs, (dst_offset >> 32UL) & 0xff); + radeon_emit(cs, (src_offset >> 32UL) & 0xff); + dst_offset += csize; + src_offset += csize; + size -= csize; + } + return; + } + + /* The following code is for CI and later. */ + unsigned align = ~0u; + ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE); + + /* Align copy size to dw if src/dst address are dw aligned */ + if ((src_offset & 0x3) == 0 && (dst_offset & 0x3) == 0 && size > 4 && (size & 3) != 0) { + align = ~0x3u; + ncopy++; + } + + si_need_dma_space(sctx, ncopy * 7, sdst, ssrc); + + for (i = 0; i < ncopy; i++) { + csize = size >= 4 ? MIN2(size & align, CIK_SDMA_COPY_MAX_SIZE) : size; + radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR, 0)); + radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize); + radeon_emit(cs, 0); /* src/dst endian swap */ + radeon_emit(cs, src_offset); + radeon_emit(cs, src_offset >> 32); + radeon_emit(cs, dst_offset); + radeon_emit(cs, dst_offset >> 32); + dst_offset += csize; + src_offset += csize; + size -= csize; + } } -void si_need_dma_space(struct si_context *ctx, unsigned num_dw, - struct si_resource *dst, struct si_resource *src) +void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst, + struct si_resource *src) { - struct radeon_winsys *ws = ctx->ws; - uint64_t vram = ctx->sdma_cs->used_vram; - uint64_t gtt = ctx->sdma_cs->used_gart; - - if (dst) { - vram += dst->vram_usage; - gtt += dst->gart_usage; - } - if (src) { - vram += src->vram_usage; - gtt += src->gart_usage; - } - - /* Flush the GFX IB if DMA depends on it. 
*/ - if (!ctx->sdma_uploads_in_progress && - radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) && - ((dst && - ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf, - RADEON_USAGE_READWRITE)) || - (src && - ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf, - RADEON_USAGE_WRITE)))) - si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - - /* Flush if there's not enough space, or if the memory usage per IB - * is too large. - * - * IBs using too little memory are limited by the IB submission overhead. - * IBs using too much memory are limited by the kernel/TTM overhead. - * Too long IBs create CPU-GPU pipeline bubbles and add latency. - * - * This heuristic makes sure that DMA requests are executed - * very soon after the call is made and lowers memory usage. - * It improves texture upload performance by keeping the DMA - * engine busy while uploads are being submitted. - */ - num_dw++; /* for emit_wait_idle below */ - if (!ctx->sdma_uploads_in_progress && - (!ws->cs_check_space(ctx->sdma_cs, num_dw, false) || - ctx->sdma_cs->used_vram + ctx->sdma_cs->used_gart > 64 * 1024 * 1024 || - !radeon_cs_memory_below_limit(ctx->screen, ctx->sdma_cs, vram, gtt))) { - si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL); - assert((num_dw + ctx->sdma_cs->current.cdw) <= ctx->sdma_cs->current.max_dw); - } - - /* Wait for idle if either buffer has been used in the IB before to - * prevent read-after-write hazards. - */ - if ((dst && - ws->cs_is_buffer_referenced(ctx->sdma_cs, dst->buf, - RADEON_USAGE_READWRITE)) || - (src && - ws->cs_is_buffer_referenced(ctx->sdma_cs, src->buf, - RADEON_USAGE_WRITE))) - si_dma_emit_wait_idle(ctx); - - unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED; - if (dst) { - ws->cs_add_buffer(ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync, - dst->domains, 0); - } - if (src) { - ws->cs_add_buffer(ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync, - src->domains, 0); - } - - /* this function is called before all DMA calls, so increment this. */ - ctx->num_dma_calls++; + struct radeon_winsys *ws = ctx->ws; + uint64_t vram = ctx->sdma_cs->used_vram; + uint64_t gtt = ctx->sdma_cs->used_gart; + + if (dst) { + vram += dst->vram_usage; + gtt += dst->gart_usage; + } + if (src) { + vram += src->vram_usage; + gtt += src->gart_usage; + } + + /* Flush the GFX IB if DMA depends on it. */ + if (!ctx->sdma_uploads_in_progress && radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) && + ((dst && ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf, RADEON_USAGE_READWRITE)) || + (src && ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf, RADEON_USAGE_WRITE)))) + si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + + /* Flush if there's not enough space, or if the memory usage per IB + * is too large. + * + * IBs using too little memory are limited by the IB submission overhead. + * IBs using too much memory are limited by the kernel/TTM overhead. + * Too long IBs create CPU-GPU pipeline bubbles and add latency. + * + * This heuristic makes sure that DMA requests are executed + * very soon after the call is made and lowers memory usage. + * It improves texture upload performance by keeping the DMA + * engine busy while uploads are being submitted. 
+ */ + num_dw++; /* for emit_wait_idle below */ + if (!ctx->sdma_uploads_in_progress && + (!ws->cs_check_space(ctx->sdma_cs, num_dw, false) || + ctx->sdma_cs->used_vram + ctx->sdma_cs->used_gart > 64 * 1024 * 1024 || + !radeon_cs_memory_below_limit(ctx->screen, ctx->sdma_cs, vram, gtt))) { + si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL); + assert((num_dw + ctx->sdma_cs->current.cdw) <= ctx->sdma_cs->current.max_dw); + } + + /* Wait for idle if either buffer has been used in the IB before to + * prevent read-after-write hazards. + */ + if ((dst && ws->cs_is_buffer_referenced(ctx->sdma_cs, dst->buf, RADEON_USAGE_READWRITE)) || + (src && ws->cs_is_buffer_referenced(ctx->sdma_cs, src->buf, RADEON_USAGE_WRITE))) + si_dma_emit_wait_idle(ctx); + + unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED; + if (dst) { + ws->cs_add_buffer(ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync, dst->domains, 0); + } + if (src) { + ws->cs_add_buffer(ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync, src->domains, 0); + } + + /* this function is called before all DMA calls, so increment this. */ + ctx->num_dma_calls++; } -void si_flush_dma_cs(struct si_context *ctx, unsigned flags, - struct pipe_fence_handle **fence) +void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence) { - struct radeon_cmdbuf *cs = ctx->sdma_cs; - struct radeon_saved_cs saved; - bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0; - - if (!radeon_emitted(cs, 0)) { - if (fence) - ctx->ws->fence_reference(fence, ctx->last_sdma_fence); - return; - } - - if (check_vm) - si_save_cs(ctx->ws, cs, &saved, true); - - ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence); - if (fence) - ctx->ws->fence_reference(fence, ctx->last_sdma_fence); - - if (check_vm) { - /* Use conservative timeout 800ms, after which we won't wait any - * longer and assume the GPU is hung. - */ - ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800*1000*1000); - - si_check_vm_faults(ctx, &saved, RING_DMA); - si_clear_saved_cs(&saved); - } + struct radeon_cmdbuf *cs = ctx->sdma_cs; + struct radeon_saved_cs saved; + bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0; + + if (!radeon_emitted(cs, 0)) { + if (fence) + ctx->ws->fence_reference(fence, ctx->last_sdma_fence); + return; + } + + if (check_vm) + si_save_cs(ctx->ws, cs, &saved, true); + + ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence); + if (fence) + ctx->ws->fence_reference(fence, ctx->last_sdma_fence); + + if (check_vm) { + /* Use conservative timeout 800ms, after which we won't wait any + * longer and assume the GPU is hung. 
+ */ + ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800 * 1000 * 1000); + + si_check_vm_faults(ctx, &saved, RING_DMA); + si_clear_saved_cs(&saved); + } } -void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, - uint64_t offset, uint64_t size, unsigned value) +void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset, + uint64_t size, unsigned value) { - struct si_context *ctx = (struct si_context*)sscreen->aux_context; + struct si_context *ctx = (struct si_context *)sscreen->aux_context; - simple_mtx_lock(&sscreen->aux_context_lock); - si_sdma_clear_buffer(ctx, dst, offset, size, value); - sscreen->aux_context->flush(sscreen->aux_context, NULL, 0); - simple_mtx_unlock(&sscreen->aux_context_lock); + simple_mtx_lock(&sscreen->aux_context_lock); + si_sdma_clear_buffer(ctx, dst, offset, size, value); + sscreen->aux_context->flush(sscreen->aux_context, NULL, 0); + simple_mtx_unlock(&sscreen->aux_context_lock); } diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c index 26b5fc4bdba..91d1bed505d 100644 --- a/src/gallium/drivers/radeonsi/si_fence.c +++ b/src/gallium/drivers/radeonsi/si_fence.c @@ -23,34 +23,33 @@ * */ -#include - +#include "si_build_pm4.h" #include "util/os_time.h" #include "util/u_memory.h" #include "util/u_queue.h" #include "util/u_upload_mgr.h" -#include "si_build_pm4.h" +#include struct si_fine_fence { - struct si_resource *buf; - unsigned offset; + struct si_resource *buf; + unsigned offset; }; struct si_multi_fence { - struct pipe_reference reference; - struct pipe_fence_handle *gfx; - struct pipe_fence_handle *sdma; - struct tc_unflushed_batch_token *tc_token; - struct util_queue_fence ready; - - /* If the context wasn't flushed at fence creation, this is non-NULL. */ - struct { - struct si_context *ctx; - unsigned ib_index; - } gfx_unflushed; - - struct si_fine_fence fine; + struct pipe_reference reference; + struct pipe_fence_handle *gfx; + struct pipe_fence_handle *sdma; + struct tc_unflushed_batch_token *tc_token; + struct util_queue_fence ready; + + /* If the context wasn't flushed at fence creation, this is non-NULL. */ + struct { + struct si_context *ctx; + unsigned ib_index; + } gfx_unflushed; + + struct si_fine_fence fine; }; /** @@ -66,591 +65,554 @@ struct si_multi_fence { * \param old_value Previous fence value (for a bug workaround) * \param new_value Fence value to write for this event. */ -void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, - unsigned event, unsigned event_flags, - unsigned dst_sel, unsigned int_sel, unsigned data_sel, - struct si_resource *buf, uint64_t va, - uint32_t new_fence, unsigned query_type) +void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigned event, + unsigned event_flags, unsigned dst_sel, unsigned int_sel, unsigned data_sel, + struct si_resource *buf, uint64_t va, uint32_t new_fence, + unsigned query_type) { - unsigned op = EVENT_TYPE(event) | - EVENT_INDEX(event == V_028A90_CS_DONE || - event == V_028A90_PS_DONE ? 6 : 5) | - event_flags; - unsigned sel = EOP_DST_SEL(dst_sel) | - EOP_INT_SEL(int_sel) | - EOP_DATA_SEL(data_sel); - bool compute_ib = !ctx->has_graphics || - cs == ctx->prim_discard_compute_cs; - - if (ctx->chip_class >= GFX9 || - (compute_ib && ctx->chip_class >= GFX7)) { - /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion - * counters) must immediately precede every timestamp event to - * prevent a GPU hang on GFX9. 
- * - * Occlusion queries don't need to do it here, because they - * always do ZPASS_DONE before the timestamp. - */ - if (ctx->chip_class == GFX9 && !compute_ib && - query_type != PIPE_QUERY_OCCLUSION_COUNTER && - query_type != PIPE_QUERY_OCCLUSION_PREDICATE && - query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { - struct si_resource *scratch = ctx->eop_bug_scratch; - - assert(16 * ctx->screen->info.num_render_backends <= - scratch->b.b.width0); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); - radeon_emit(cs, scratch->gpu_address); - radeon_emit(cs, scratch->gpu_address >> 32); - - radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch, - RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); - } - - radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, ctx->chip_class >= GFX9 ? 6 : 5, 0)); - radeon_emit(cs, op); - radeon_emit(cs, sel); - radeon_emit(cs, va); /* address lo */ - radeon_emit(cs, va >> 32); /* address hi */ - radeon_emit(cs, new_fence); /* immediate data lo */ - radeon_emit(cs, 0); /* immediate data hi */ - if (ctx->chip_class >= GFX9) - radeon_emit(cs, 0); /* unused */ - } else { - if (ctx->chip_class == GFX7 || - ctx->chip_class == GFX8) { - struct si_resource *scratch = ctx->eop_bug_scratch; - uint64_t va = scratch->gpu_address; - - /* Two EOP events are required to make all engines go idle - * (and optional cache flushes executed) before the timestamp - * is written. - */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, op); - radeon_emit(cs, va); - radeon_emit(cs, ((va >> 32) & 0xffff) | sel); - radeon_emit(cs, 0); /* immediate data */ - radeon_emit(cs, 0); /* unused */ - - radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch, - RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); - } - - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, op); - radeon_emit(cs, va); - radeon_emit(cs, ((va >> 32) & 0xffff) | sel); - radeon_emit(cs, new_fence); /* immediate data */ - radeon_emit(cs, 0); /* unused */ - } - - if (buf) { - radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_WRITE, - RADEON_PRIO_QUERY); - } + unsigned op = EVENT_TYPE(event) | + EVENT_INDEX(event == V_028A90_CS_DONE || event == V_028A90_PS_DONE ? 6 : 5) | + event_flags; + unsigned sel = EOP_DST_SEL(dst_sel) | EOP_INT_SEL(int_sel) | EOP_DATA_SEL(data_sel); + bool compute_ib = !ctx->has_graphics || cs == ctx->prim_discard_compute_cs; + + if (ctx->chip_class >= GFX9 || (compute_ib && ctx->chip_class >= GFX7)) { + /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion + * counters) must immediately precede every timestamp event to + * prevent a GPU hang on GFX9. + * + * Occlusion queries don't need to do it here, because they + * always do ZPASS_DONE before the timestamp. + */ + if (ctx->chip_class == GFX9 && !compute_ib && query_type != PIPE_QUERY_OCCLUSION_COUNTER && + query_type != PIPE_QUERY_OCCLUSION_PREDICATE && + query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { + struct si_resource *scratch = ctx->eop_bug_scratch; + + assert(16 * ctx->screen->info.num_render_backends <= scratch->b.b.width0); + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); + radeon_emit(cs, scratch->gpu_address); + radeon_emit(cs, scratch->gpu_address >> 32); + + radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); + } + + radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, ctx->chip_class >= GFX9 ? 
6 : 5, 0)); + radeon_emit(cs, op); + radeon_emit(cs, sel); + radeon_emit(cs, va); /* address lo */ + radeon_emit(cs, va >> 32); /* address hi */ + radeon_emit(cs, new_fence); /* immediate data lo */ + radeon_emit(cs, 0); /* immediate data hi */ + if (ctx->chip_class >= GFX9) + radeon_emit(cs, 0); /* unused */ + } else { + if (ctx->chip_class == GFX7 || ctx->chip_class == GFX8) { + struct si_resource *scratch = ctx->eop_bug_scratch; + uint64_t va = scratch->gpu_address; + + /* Two EOP events are required to make all engines go idle + * (and optional cache flushes executed) before the timestamp + * is written. + */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); + radeon_emit(cs, op); + radeon_emit(cs, va); + radeon_emit(cs, ((va >> 32) & 0xffff) | sel); + radeon_emit(cs, 0); /* immediate data */ + radeon_emit(cs, 0); /* unused */ + + radeon_add_to_buffer_list(ctx, ctx->gfx_cs, scratch, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); + } + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); + radeon_emit(cs, op); + radeon_emit(cs, va); + radeon_emit(cs, ((va >> 32) & 0xffff) | sel); + radeon_emit(cs, new_fence); /* immediate data */ + radeon_emit(cs, 0); /* unused */ + } + + if (buf) { + radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); + } } unsigned si_cp_write_fence_dwords(struct si_screen *screen) { - unsigned dwords = 6; + unsigned dwords = 6; - if (screen->info.chip_class == GFX7 || - screen->info.chip_class == GFX8) - dwords *= 2; + if (screen->info.chip_class == GFX7 || screen->info.chip_class == GFX8) + dwords *= 2; - return dwords; + return dwords; } -void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, - uint64_t va, uint32_t ref, uint32_t mask, unsigned flags) +void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, uint64_t va, uint32_t ref, + uint32_t mask, unsigned flags) { - radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); - radeon_emit(cs, WAIT_REG_MEM_MEM_SPACE(1) | flags); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, ref); /* reference value */ - radeon_emit(cs, mask); /* mask */ - radeon_emit(cs, 4); /* poll interval */ + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(cs, WAIT_REG_MEM_MEM_SPACE(1) | flags); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, ref); /* reference value */ + radeon_emit(cs, mask); /* mask */ + radeon_emit(cs, 4); /* poll interval */ } -static void si_add_fence_dependency(struct si_context *sctx, - struct pipe_fence_handle *fence) +static void si_add_fence_dependency(struct si_context *sctx, struct pipe_fence_handle *fence) { - struct radeon_winsys *ws = sctx->ws; + struct radeon_winsys *ws = sctx->ws; - if (sctx->sdma_cs) - ws->cs_add_fence_dependency(sctx->sdma_cs, fence, 0); - ws->cs_add_fence_dependency(sctx->gfx_cs, fence, 0); + if (sctx->sdma_cs) + ws->cs_add_fence_dependency(sctx->sdma_cs, fence, 0); + ws->cs_add_fence_dependency(sctx->gfx_cs, fence, 0); } -static void si_add_syncobj_signal(struct si_context *sctx, - struct pipe_fence_handle *fence) +static void si_add_syncobj_signal(struct si_context *sctx, struct pipe_fence_handle *fence) { - sctx->ws->cs_add_syncobj_signal(sctx->gfx_cs, fence); + sctx->ws->cs_add_syncobj_signal(sctx->gfx_cs, fence); } -static void si_fence_reference(struct pipe_screen *screen, - struct pipe_fence_handle **dst, - struct pipe_fence_handle *src) +static void si_fence_reference(struct pipe_screen *screen, struct pipe_fence_handle **dst, + struct pipe_fence_handle *src) { - struct 
radeon_winsys *ws = ((struct si_screen*)screen)->ws; - struct si_multi_fence **sdst = (struct si_multi_fence **)dst; - struct si_multi_fence *ssrc = (struct si_multi_fence *)src; - - if (pipe_reference(&(*sdst)->reference, &ssrc->reference)) { - ws->fence_reference(&(*sdst)->gfx, NULL); - ws->fence_reference(&(*sdst)->sdma, NULL); - tc_unflushed_batch_token_reference(&(*sdst)->tc_token, NULL); - si_resource_reference(&(*sdst)->fine.buf, NULL); - FREE(*sdst); - } - *sdst = ssrc; + struct radeon_winsys *ws = ((struct si_screen *)screen)->ws; + struct si_multi_fence **sdst = (struct si_multi_fence **)dst; + struct si_multi_fence *ssrc = (struct si_multi_fence *)src; + + if (pipe_reference(&(*sdst)->reference, &ssrc->reference)) { + ws->fence_reference(&(*sdst)->gfx, NULL); + ws->fence_reference(&(*sdst)->sdma, NULL); + tc_unflushed_batch_token_reference(&(*sdst)->tc_token, NULL); + si_resource_reference(&(*sdst)->fine.buf, NULL); + FREE(*sdst); + } + *sdst = ssrc; } static struct si_multi_fence *si_create_multi_fence() { - struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence); - if (!fence) - return NULL; + struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence); + if (!fence) + return NULL; - pipe_reference_init(&fence->reference, 1); - util_queue_fence_init(&fence->ready); + pipe_reference_init(&fence->reference, 1); + util_queue_fence_init(&fence->ready); - return fence; + return fence; } struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx, - struct tc_unflushed_batch_token *tc_token) + struct tc_unflushed_batch_token *tc_token) { - struct si_multi_fence *fence = si_create_multi_fence(); - if (!fence) - return NULL; + struct si_multi_fence *fence = si_create_multi_fence(); + if (!fence) + return NULL; - util_queue_fence_reset(&fence->ready); - tc_unflushed_batch_token_reference(&fence->tc_token, tc_token); + util_queue_fence_reset(&fence->ready); + tc_unflushed_batch_token_reference(&fence->tc_token, tc_token); - return (struct pipe_fence_handle *)fence; + return (struct pipe_fence_handle *)fence; } -static bool si_fine_fence_signaled(struct radeon_winsys *rws, - const struct si_fine_fence *fine) +static bool si_fine_fence_signaled(struct radeon_winsys *rws, const struct si_fine_fence *fine) { - char *map = rws->buffer_map(fine->buf->buf, NULL, PIPE_TRANSFER_READ | - PIPE_TRANSFER_UNSYNCHRONIZED); - if (!map) - return false; + char *map = + rws->buffer_map(fine->buf->buf, NULL, PIPE_TRANSFER_READ | PIPE_TRANSFER_UNSYNCHRONIZED); + if (!map) + return false; - uint32_t *fence = (uint32_t*)(map + fine->offset); - return *fence != 0; + uint32_t *fence = (uint32_t *)(map + fine->offset); + return *fence != 0; } -static void si_fine_fence_set(struct si_context *ctx, - struct si_fine_fence *fine, - unsigned flags) +static void si_fine_fence_set(struct si_context *ctx, struct si_fine_fence *fine, unsigned flags) { - uint32_t *fence_ptr; - - assert(util_bitcount(flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) == 1); - - /* Use cached system memory for the fence. 
*/ - u_upload_alloc(ctx->cached_gtt_allocator, 0, 4, 4, - &fine->offset, (struct pipe_resource **)&fine->buf, (void **)&fence_ptr); - if (!fine->buf) - return; - - *fence_ptr = 0; - - if (flags & PIPE_FLUSH_TOP_OF_PIPE) { - uint32_t value = 0x80000000; - - si_cp_write_data(ctx, fine->buf, fine->offset, 4, - V_370_MEM, V_370_PFP, &value); - } else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) { - uint64_t fence_va = fine->buf->gpu_address + fine->offset; - - radeon_add_to_buffer_list(ctx, ctx->gfx_cs, fine->buf, - RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); - si_cp_release_mem(ctx, ctx->gfx_cs, - V_028A90_BOTTOM_OF_PIPE_TS, 0, - EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, - NULL, fence_va, 0x80000000, - PIPE_QUERY_GPU_FINISHED); - } else { - assert(false); - } + uint32_t *fence_ptr; + + assert(util_bitcount(flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) == 1); + + /* Use cached system memory for the fence. */ + u_upload_alloc(ctx->cached_gtt_allocator, 0, 4, 4, &fine->offset, + (struct pipe_resource **)&fine->buf, (void **)&fence_ptr); + if (!fine->buf) + return; + + *fence_ptr = 0; + + if (flags & PIPE_FLUSH_TOP_OF_PIPE) { + uint32_t value = 0x80000000; + + si_cp_write_data(ctx, fine->buf, fine->offset, 4, V_370_MEM, V_370_PFP, &value); + } else if (flags & PIPE_FLUSH_BOTTOM_OF_PIPE) { + uint64_t fence_va = fine->buf->gpu_address + fine->offset; + + radeon_add_to_buffer_list(ctx, ctx->gfx_cs, fine->buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); + si_cp_release_mem(ctx, ctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, + EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, NULL, fence_va, 0x80000000, + PIPE_QUERY_GPU_FINISHED); + } else { + assert(false); + } } -static bool si_fence_finish(struct pipe_screen *screen, - struct pipe_context *ctx, - struct pipe_fence_handle *fence, - uint64_t timeout) +static bool si_fence_finish(struct pipe_screen *screen, struct pipe_context *ctx, + struct pipe_fence_handle *fence, uint64_t timeout) { - struct radeon_winsys *rws = ((struct si_screen*)screen)->ws; - struct si_multi_fence *sfence = (struct si_multi_fence *)fence; - struct si_context *sctx; - int64_t abs_timeout = os_time_get_absolute_timeout(timeout); - - ctx = threaded_context_unwrap_sync(ctx); - sctx = (struct si_context*)(ctx ? ctx : NULL); - - if (!util_queue_fence_is_signalled(&sfence->ready)) { - if (sfence->tc_token) { - /* Ensure that si_flush_from_st will be called for - * this fence, but only if we're in the API thread - * where the context is current. - * - * Note that the batch containing the flush may already - * be in flight in the driver thread, so the fence - * may not be ready yet when this call returns. - */ - threaded_context_flush(ctx, sfence->tc_token, - timeout == 0); - } - - if (!timeout) - return false; - - if (timeout == PIPE_TIMEOUT_INFINITE) { - util_queue_fence_wait(&sfence->ready); - } else { - if (!util_queue_fence_wait_timeout(&sfence->ready, abs_timeout)) - return false; - } - - if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { - int64_t time = os_time_get_nano(); - timeout = abs_timeout > time ? abs_timeout - time : 0; - } - } - - if (sfence->sdma) { - if (!rws->fence_wait(rws, sfence->sdma, timeout)) - return false; - - /* Recompute the timeout after waiting. */ - if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { - int64_t time = os_time_get_nano(); - timeout = abs_timeout > time ? 
abs_timeout - time : 0; - } - } - - if (!sfence->gfx) - return true; - - if (sfence->fine.buf && - si_fine_fence_signaled(rws, &sfence->fine)) { - rws->fence_reference(&sfence->gfx, NULL); - si_resource_reference(&sfence->fine.buf, NULL); - return true; - } - - /* Flush the gfx IB if it hasn't been flushed yet. */ - if (sctx && sfence->gfx_unflushed.ctx == sctx && - sfence->gfx_unflushed.ib_index == sctx->num_gfx_cs_flushes) { - /* Section 4.1.2 (Signaling) of the OpenGL 4.6 (Core profile) - * spec says: - * - * "If the sync object being blocked upon will not be - * signaled in finite time (for example, by an associated - * fence command issued previously, but not yet flushed to - * the graphics pipeline), then ClientWaitSync may hang - * forever. To help prevent this behavior, if - * ClientWaitSync is called and all of the following are - * true: - * - * * the SYNC_FLUSH_COMMANDS_BIT bit is set in flags, - * * sync is unsignaled when ClientWaitSync is called, - * * and the calls to ClientWaitSync and FenceSync were - * issued from the same context, - * - * then the GL will behave as if the equivalent of Flush - * were inserted immediately after the creation of sync." - * - * This means we need to flush for such fences even when we're - * not going to wait. - */ - si_flush_gfx_cs(sctx, - (timeout ? 0 : PIPE_FLUSH_ASYNC) | - RADEON_FLUSH_START_NEXT_GFX_IB_NOW, - NULL); - sfence->gfx_unflushed.ctx = NULL; - - if (!timeout) - return false; - - /* Recompute the timeout after all that. */ - if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { - int64_t time = os_time_get_nano(); - timeout = abs_timeout > time ? abs_timeout - time : 0; - } - } - - if (rws->fence_wait(rws, sfence->gfx, timeout)) - return true; - - /* Re-check in case the GPU is slow or hangs, but the commands before - * the fine-grained fence have completed. */ - if (sfence->fine.buf && - si_fine_fence_signaled(rws, &sfence->fine)) - return true; - - return false; + struct radeon_winsys *rws = ((struct si_screen *)screen)->ws; + struct si_multi_fence *sfence = (struct si_multi_fence *)fence; + struct si_context *sctx; + int64_t abs_timeout = os_time_get_absolute_timeout(timeout); + + ctx = threaded_context_unwrap_sync(ctx); + sctx = (struct si_context *)(ctx ? ctx : NULL); + + if (!util_queue_fence_is_signalled(&sfence->ready)) { + if (sfence->tc_token) { + /* Ensure that si_flush_from_st will be called for + * this fence, but only if we're in the API thread + * where the context is current. + * + * Note that the batch containing the flush may already + * be in flight in the driver thread, so the fence + * may not be ready yet when this call returns. + */ + threaded_context_flush(ctx, sfence->tc_token, timeout == 0); + } + + if (!timeout) + return false; + + if (timeout == PIPE_TIMEOUT_INFINITE) { + util_queue_fence_wait(&sfence->ready); + } else { + if (!util_queue_fence_wait_timeout(&sfence->ready, abs_timeout)) + return false; + } + + if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { + int64_t time = os_time_get_nano(); + timeout = abs_timeout > time ? abs_timeout - time : 0; + } + } + + if (sfence->sdma) { + if (!rws->fence_wait(rws, sfence->sdma, timeout)) + return false; + + /* Recompute the timeout after waiting. */ + if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { + int64_t time = os_time_get_nano(); + timeout = abs_timeout > time ? 
abs_timeout - time : 0; + } + } + + if (!sfence->gfx) + return true; + + if (sfence->fine.buf && si_fine_fence_signaled(rws, &sfence->fine)) { + rws->fence_reference(&sfence->gfx, NULL); + si_resource_reference(&sfence->fine.buf, NULL); + return true; + } + + /* Flush the gfx IB if it hasn't been flushed yet. */ + if (sctx && sfence->gfx_unflushed.ctx == sctx && + sfence->gfx_unflushed.ib_index == sctx->num_gfx_cs_flushes) { + /* Section 4.1.2 (Signaling) of the OpenGL 4.6 (Core profile) + * spec says: + * + * "If the sync object being blocked upon will not be + * signaled in finite time (for example, by an associated + * fence command issued previously, but not yet flushed to + * the graphics pipeline), then ClientWaitSync may hang + * forever. To help prevent this behavior, if + * ClientWaitSync is called and all of the following are + * true: + * + * * the SYNC_FLUSH_COMMANDS_BIT bit is set in flags, + * * sync is unsignaled when ClientWaitSync is called, + * * and the calls to ClientWaitSync and FenceSync were + * issued from the same context, + * + * then the GL will behave as if the equivalent of Flush + * were inserted immediately after the creation of sync." + * + * This means we need to flush for such fences even when we're + * not going to wait. + */ + si_flush_gfx_cs(sctx, (timeout ? 0 : PIPE_FLUSH_ASYNC) | RADEON_FLUSH_START_NEXT_GFX_IB_NOW, + NULL); + sfence->gfx_unflushed.ctx = NULL; + + if (!timeout) + return false; + + /* Recompute the timeout after all that. */ + if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { + int64_t time = os_time_get_nano(); + timeout = abs_timeout > time ? abs_timeout - time : 0; + } + } + + if (rws->fence_wait(rws, sfence->gfx, timeout)) + return true; + + /* Re-check in case the GPU is slow or hangs, but the commands before + * the fine-grained fence have completed. 
*/ + if (sfence->fine.buf && si_fine_fence_signaled(rws, &sfence->fine)) + return true; + + return false; } -static void si_create_fence_fd(struct pipe_context *ctx, - struct pipe_fence_handle **pfence, int fd, - enum pipe_fd_type type) +static void si_create_fence_fd(struct pipe_context *ctx, struct pipe_fence_handle **pfence, int fd, + enum pipe_fd_type type) { - struct si_screen *sscreen = (struct si_screen*)ctx->screen; - struct radeon_winsys *ws = sscreen->ws; - struct si_multi_fence *sfence; + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + struct radeon_winsys *ws = sscreen->ws; + struct si_multi_fence *sfence; - *pfence = NULL; + *pfence = NULL; - sfence = si_create_multi_fence(); - if (!sfence) - return; + sfence = si_create_multi_fence(); + if (!sfence) + return; - switch (type) { - case PIPE_FD_TYPE_NATIVE_SYNC: - if (!sscreen->info.has_fence_to_handle) - goto finish; + switch (type) { + case PIPE_FD_TYPE_NATIVE_SYNC: + if (!sscreen->info.has_fence_to_handle) + goto finish; - sfence->gfx = ws->fence_import_sync_file(ws, fd); - break; + sfence->gfx = ws->fence_import_sync_file(ws, fd); + break; - case PIPE_FD_TYPE_SYNCOBJ: - if (!sscreen->info.has_syncobj) - goto finish; + case PIPE_FD_TYPE_SYNCOBJ: + if (!sscreen->info.has_syncobj) + goto finish; - sfence->gfx = ws->fence_import_syncobj(ws, fd); - break; + sfence->gfx = ws->fence_import_syncobj(ws, fd); + break; - default: - unreachable("bad fence fd type when importing"); - } + default: + unreachable("bad fence fd type when importing"); + } finish: - if (!sfence->gfx) { - FREE(sfence); - return; - } + if (!sfence->gfx) { + FREE(sfence); + return; + } - *pfence = (struct pipe_fence_handle*)sfence; + *pfence = (struct pipe_fence_handle *)sfence; } -static int si_fence_get_fd(struct pipe_screen *screen, - struct pipe_fence_handle *fence) +static int si_fence_get_fd(struct pipe_screen *screen, struct pipe_fence_handle *fence) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct radeon_winsys *ws = sscreen->ws; - struct si_multi_fence *sfence = (struct si_multi_fence *)fence; - int gfx_fd = -1, sdma_fd = -1; - - if (!sscreen->info.has_fence_to_handle) - return -1; - - util_queue_fence_wait(&sfence->ready); - - /* Deferred fences aren't supported. */ - assert(!sfence->gfx_unflushed.ctx); - if (sfence->gfx_unflushed.ctx) - return -1; - - if (sfence->sdma) { - sdma_fd = ws->fence_export_sync_file(ws, sfence->sdma); - if (sdma_fd == -1) - return -1; - } - if (sfence->gfx) { - gfx_fd = ws->fence_export_sync_file(ws, sfence->gfx); - if (gfx_fd == -1) { - if (sdma_fd != -1) - close(sdma_fd); - return -1; - } - } - - /* If we don't have FDs at this point, it means we don't have fences - * either. */ - if (sdma_fd == -1 && gfx_fd == -1) - return ws->export_signalled_sync_file(ws); - if (sdma_fd == -1) - return gfx_fd; - if (gfx_fd == -1) - return sdma_fd; - - /* Get a fence that will be a combination of both fences. */ - sync_accumulate("radeonsi", &gfx_fd, sdma_fd); - close(sdma_fd); - return gfx_fd; + struct si_screen *sscreen = (struct si_screen *)screen; + struct radeon_winsys *ws = sscreen->ws; + struct si_multi_fence *sfence = (struct si_multi_fence *)fence; + int gfx_fd = -1, sdma_fd = -1; + + if (!sscreen->info.has_fence_to_handle) + return -1; + + util_queue_fence_wait(&sfence->ready); + + /* Deferred fences aren't supported. 
*/ + assert(!sfence->gfx_unflushed.ctx); + if (sfence->gfx_unflushed.ctx) + return -1; + + if (sfence->sdma) { + sdma_fd = ws->fence_export_sync_file(ws, sfence->sdma); + if (sdma_fd == -1) + return -1; + } + if (sfence->gfx) { + gfx_fd = ws->fence_export_sync_file(ws, sfence->gfx); + if (gfx_fd == -1) { + if (sdma_fd != -1) + close(sdma_fd); + return -1; + } + } + + /* If we don't have FDs at this point, it means we don't have fences + * either. */ + if (sdma_fd == -1 && gfx_fd == -1) + return ws->export_signalled_sync_file(ws); + if (sdma_fd == -1) + return gfx_fd; + if (gfx_fd == -1) + return sdma_fd; + + /* Get a fence that will be a combination of both fences. */ + sync_accumulate("radeonsi", &gfx_fd, sdma_fd); + close(sdma_fd); + return gfx_fd; } -static void si_flush_from_st(struct pipe_context *ctx, - struct pipe_fence_handle **fence, - unsigned flags) +static void si_flush_from_st(struct pipe_context *ctx, struct pipe_fence_handle **fence, + unsigned flags) { - struct pipe_screen *screen = ctx->screen; - struct si_context *sctx = (struct si_context *)ctx; - struct radeon_winsys *ws = sctx->ws; - struct pipe_fence_handle *gfx_fence = NULL; - struct pipe_fence_handle *sdma_fence = NULL; - bool deferred_fence = false; - struct si_fine_fence fine = {}; - unsigned rflags = PIPE_FLUSH_ASYNC; - - if (flags & PIPE_FLUSH_END_OF_FRAME) - rflags |= PIPE_FLUSH_END_OF_FRAME; - - if (flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) { - assert(flags & PIPE_FLUSH_DEFERRED); - assert(fence); - - si_fine_fence_set(sctx, &fine, flags); - } - - /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */ - if (sctx->sdma_cs) - si_flush_dma_cs(sctx, rflags, fence ? &sdma_fence : NULL); - - if (!radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size)) { - if (fence) - ws->fence_reference(&gfx_fence, sctx->last_gfx_fence); - if (!(flags & PIPE_FLUSH_DEFERRED)) - ws->cs_sync_flush(sctx->gfx_cs); - } else { - /* Instead of flushing, create a deferred fence. Constraints: - * - The state tracker must allow a deferred flush. - * - The state tracker must request a fence. - * - fence_get_fd is not allowed. - * Thread safety in fence_finish must be ensured by the state tracker. - */ - if (flags & PIPE_FLUSH_DEFERRED && - !(flags & PIPE_FLUSH_FENCE_FD) && - fence) { - gfx_fence = sctx->ws->cs_get_next_fence(sctx->gfx_cs); - deferred_fence = true; - } else { - si_flush_gfx_cs(sctx, rflags, fence ? &gfx_fence : NULL); - } - } - - /* Both engines can signal out of order, so we need to keep both fences. */ - if (fence) { - struct si_multi_fence *multi_fence; - - if (flags & TC_FLUSH_ASYNC) { - multi_fence = (struct si_multi_fence *)*fence; - assert(multi_fence); - } else { - multi_fence = si_create_multi_fence(); - if (!multi_fence) { - ws->fence_reference(&sdma_fence, NULL); - ws->fence_reference(&gfx_fence, NULL); - goto finish; - } - - screen->fence_reference(screen, fence, NULL); - *fence = (struct pipe_fence_handle*)multi_fence; - } - - /* If both fences are NULL, fence_finish will always return true. 
*/ - multi_fence->gfx = gfx_fence; - multi_fence->sdma = sdma_fence; - - if (deferred_fence) { - multi_fence->gfx_unflushed.ctx = sctx; - multi_fence->gfx_unflushed.ib_index = sctx->num_gfx_cs_flushes; - } - - multi_fence->fine = fine; - fine.buf = NULL; - - if (flags & TC_FLUSH_ASYNC) { - util_queue_fence_signal(&multi_fence->ready); - tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL); - } - } - assert(!fine.buf); + struct pipe_screen *screen = ctx->screen; + struct si_context *sctx = (struct si_context *)ctx; + struct radeon_winsys *ws = sctx->ws; + struct pipe_fence_handle *gfx_fence = NULL; + struct pipe_fence_handle *sdma_fence = NULL; + bool deferred_fence = false; + struct si_fine_fence fine = {}; + unsigned rflags = PIPE_FLUSH_ASYNC; + + if (flags & PIPE_FLUSH_END_OF_FRAME) + rflags |= PIPE_FLUSH_END_OF_FRAME; + + if (flags & (PIPE_FLUSH_TOP_OF_PIPE | PIPE_FLUSH_BOTTOM_OF_PIPE)) { + assert(flags & PIPE_FLUSH_DEFERRED); + assert(fence); + + si_fine_fence_set(sctx, &fine, flags); + } + + /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */ + if (sctx->sdma_cs) + si_flush_dma_cs(sctx, rflags, fence ? &sdma_fence : NULL); + + if (!radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size)) { + if (fence) + ws->fence_reference(&gfx_fence, sctx->last_gfx_fence); + if (!(flags & PIPE_FLUSH_DEFERRED)) + ws->cs_sync_flush(sctx->gfx_cs); + } else { + /* Instead of flushing, create a deferred fence. Constraints: + * - The state tracker must allow a deferred flush. + * - The state tracker must request a fence. + * - fence_get_fd is not allowed. + * Thread safety in fence_finish must be ensured by the state tracker. + */ + if (flags & PIPE_FLUSH_DEFERRED && !(flags & PIPE_FLUSH_FENCE_FD) && fence) { + gfx_fence = sctx->ws->cs_get_next_fence(sctx->gfx_cs); + deferred_fence = true; + } else { + si_flush_gfx_cs(sctx, rflags, fence ? &gfx_fence : NULL); + } + } + + /* Both engines can signal out of order, so we need to keep both fences. */ + if (fence) { + struct si_multi_fence *multi_fence; + + if (flags & TC_FLUSH_ASYNC) { + multi_fence = (struct si_multi_fence *)*fence; + assert(multi_fence); + } else { + multi_fence = si_create_multi_fence(); + if (!multi_fence) { + ws->fence_reference(&sdma_fence, NULL); + ws->fence_reference(&gfx_fence, NULL); + goto finish; + } + + screen->fence_reference(screen, fence, NULL); + *fence = (struct pipe_fence_handle *)multi_fence; + } + + /* If both fences are NULL, fence_finish will always return true. 
*/ + multi_fence->gfx = gfx_fence; + multi_fence->sdma = sdma_fence; + + if (deferred_fence) { + multi_fence->gfx_unflushed.ctx = sctx; + multi_fence->gfx_unflushed.ib_index = sctx->num_gfx_cs_flushes; + } + + multi_fence->fine = fine; + fine.buf = NULL; + + if (flags & TC_FLUSH_ASYNC) { + util_queue_fence_signal(&multi_fence->ready); + tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL); + } + } + assert(!fine.buf); finish: - if (!(flags & (PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC))) { - if (sctx->sdma_cs) - ws->cs_sync_flush(sctx->sdma_cs); - ws->cs_sync_flush(sctx->gfx_cs); - } + if (!(flags & (PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC))) { + if (sctx->sdma_cs) + ws->cs_sync_flush(sctx->sdma_cs); + ws->cs_sync_flush(sctx->gfx_cs); + } } -static void si_fence_server_signal(struct pipe_context *ctx, - struct pipe_fence_handle *fence) +static void si_fence_server_signal(struct pipe_context *ctx, struct pipe_fence_handle *fence) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_multi_fence *sfence = (struct si_multi_fence *)fence; - - /* We should have at least one syncobj to signal */ - assert(sfence->sdma || sfence->gfx); - - if (sfence->sdma) - si_add_syncobj_signal(sctx, sfence->sdma); - if (sfence->gfx) - si_add_syncobj_signal(sctx, sfence->gfx); - - /** - * The spec does not require a flush here. We insert a flush - * because syncobj based signals are not directly placed into - * the command stream. Instead the signal happens when the - * submission associated with the syncobj finishes execution. - * - * Therefore, we must make sure that we flush the pipe to avoid - * new work being emitted and getting executed before the signal - * operation. - * - * Set sctx->initial_gfx_cs_size to force IB submission even if - * it is empty. - */ - sctx->initial_gfx_cs_size = 0; - si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC); + struct si_context *sctx = (struct si_context *)ctx; + struct si_multi_fence *sfence = (struct si_multi_fence *)fence; + + /* We should have at least one syncobj to signal */ + assert(sfence->sdma || sfence->gfx); + + if (sfence->sdma) + si_add_syncobj_signal(sctx, sfence->sdma); + if (sfence->gfx) + si_add_syncobj_signal(sctx, sfence->gfx); + + /** + * The spec does not require a flush here. We insert a flush + * because syncobj based signals are not directly placed into + * the command stream. Instead the signal happens when the + * submission associated with the syncobj finishes execution. + * + * Therefore, we must make sure that we flush the pipe to avoid + * new work being emitted and getting executed before the signal + * operation. + * + * Set sctx->initial_gfx_cs_size to force IB submission even if + * it is empty. + */ + sctx->initial_gfx_cs_size = 0; + si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC); } -static void si_fence_server_sync(struct pipe_context *ctx, - struct pipe_fence_handle *fence) +static void si_fence_server_sync(struct pipe_context *ctx, struct pipe_fence_handle *fence) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_multi_fence *sfence = (struct si_multi_fence *)fence; - - util_queue_fence_wait(&sfence->ready); - - /* Unflushed fences from the same context are no-ops. */ - if (sfence->gfx_unflushed.ctx && - sfence->gfx_unflushed.ctx == sctx) - return; - - /* All unflushed commands will not start execution before - * this fence dependency is signalled. 
- * - * Therefore we must flush before inserting the dependency - */ - si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC); - - if (sfence->sdma) - si_add_fence_dependency(sctx, sfence->sdma); - if (sfence->gfx) - si_add_fence_dependency(sctx, sfence->gfx); + struct si_context *sctx = (struct si_context *)ctx; + struct si_multi_fence *sfence = (struct si_multi_fence *)fence; + + util_queue_fence_wait(&sfence->ready); + + /* Unflushed fences from the same context are no-ops. */ + if (sfence->gfx_unflushed.ctx && sfence->gfx_unflushed.ctx == sctx) + return; + + /* All unflushed commands will not start execution before + * this fence dependency is signalled. + * + * Therefore we must flush before inserting the dependency + */ + si_flush_from_st(ctx, NULL, PIPE_FLUSH_ASYNC); + + if (sfence->sdma) + si_add_fence_dependency(sctx, sfence->sdma); + if (sfence->gfx) + si_add_fence_dependency(sctx, sfence->gfx); } void si_init_fence_functions(struct si_context *ctx) { - ctx->b.flush = si_flush_from_st; - ctx->b.create_fence_fd = si_create_fence_fd; - ctx->b.fence_server_sync = si_fence_server_sync; - ctx->b.fence_server_signal = si_fence_server_signal; + ctx->b.flush = si_flush_from_st; + ctx->b.create_fence_fd = si_create_fence_fd; + ctx->b.fence_server_sync = si_fence_server_sync; + ctx->b.fence_server_signal = si_fence_server_signal; } void si_init_screen_fence_functions(struct si_screen *screen) { - screen->b.fence_finish = si_fence_finish; - screen->b.fence_reference = si_fence_reference; - screen->b.fence_get_fd = si_fence_get_fd; + screen->b.fence_finish = si_fence_finish; + screen->b.fence_reference = si_fence_reference; + screen->b.fence_get_fd = si_fence_get_fd; } diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index f0a00b17e7e..2a4a23cec13 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -22,981 +22,947 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_pipe.h" -#include "radeon/radeon_video.h" -#include "radeon/radeon_vce.h" +#include "compiler/nir/nir.h" #include "radeon/radeon_uvd_enc.h" -#include "vl/vl_decoder.h" -#include "vl/vl_video_buffer.h" +#include "radeon/radeon_vce.h" +#include "radeon/radeon_video.h" +#include "si_pipe.h" #include "util/u_screen.h" #include "util/u_video.h" -#include "compiler/nir/nir.h" - +#include "vl/vl_decoder.h" +#include "vl/vl_video_buffer.h" #include static const char *si_get_vendor(struct pipe_screen *pscreen) { - /* Don't change this. Games such as Alien Isolation are broken if this - * returns "Advanced Micro Devices, Inc." - */ - return "X.Org"; + /* Don't change this. Games such as Alien Isolation are broken if this + * returns "Advanced Micro Devices, Inc." + */ + return "X.Org"; } static const char *si_get_device_vendor(struct pipe_screen *pscreen) { - return "AMD"; + return "AMD"; } static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) { - struct si_screen *sscreen = (struct si_screen *)pscreen; - - switch (param) { - /* Supported features (boolean caps). 
*/ - case PIPE_CAP_ACCELERATED: - case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: - case PIPE_CAP_ANISOTROPIC_FILTER: - case PIPE_CAP_POINT_SPRITE: - case PIPE_CAP_OCCLUSION_QUERY: - case PIPE_CAP_TEXTURE_MIRROR_CLAMP: - case PIPE_CAP_TEXTURE_SHADOW_LOD: - case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE: - case PIPE_CAP_BLEND_EQUATION_SEPARATE: - case PIPE_CAP_TEXTURE_SWIZZLE: - case PIPE_CAP_DEPTH_CLIP_DISABLE: - case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE: - case PIPE_CAP_SHADER_STENCIL_EXPORT: - case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: - case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: - case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: - case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: - case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES: - case PIPE_CAP_VERTEX_SHADER_SATURATE: - case PIPE_CAP_SEAMLESS_CUBE_MAP: - case PIPE_CAP_PRIMITIVE_RESTART: - case PIPE_CAP_CONDITIONAL_RENDER: - case PIPE_CAP_TEXTURE_BARRIER: - case PIPE_CAP_INDEP_BLEND_ENABLE: - case PIPE_CAP_INDEP_BLEND_FUNC: - case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: - case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: - case PIPE_CAP_START_INSTANCE: - case PIPE_CAP_NPOT_TEXTURES: - case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: - case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: - case PIPE_CAP_VERTEX_COLOR_CLAMPED: - case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: - case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: - case PIPE_CAP_TGSI_INSTANCEID: - case PIPE_CAP_COMPUTE: - case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: - case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: - case PIPE_CAP_QUERY_PIPELINE_STATISTICS: - case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: - case PIPE_CAP_CUBE_MAP_ARRAY: - case PIPE_CAP_SAMPLE_SHADING: - case PIPE_CAP_DRAW_INDIRECT: - case PIPE_CAP_CLIP_HALFZ: - case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: - case PIPE_CAP_POLYGON_OFFSET_CLAMP: - case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: - case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: - case PIPE_CAP_TGSI_TEXCOORD: - case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: - case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: - case PIPE_CAP_TEXTURE_FLOAT_LINEAR: - case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: - case PIPE_CAP_SHAREABLE_SHADERS: - case PIPE_CAP_DEPTH_BOUNDS_TEST: - case PIPE_CAP_SAMPLER_VIEW_TARGET: - case PIPE_CAP_TEXTURE_QUERY_LOD: - case PIPE_CAP_TEXTURE_GATHER_SM5: - case PIPE_CAP_TGSI_TXQS: - case PIPE_CAP_FORCE_PERSAMPLE_INTERP: - case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: - case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: - case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: - case PIPE_CAP_INVALIDATE_BUFFER: - case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: - case PIPE_CAP_QUERY_BUFFER_OBJECT: - case PIPE_CAP_QUERY_MEMORY_INFO: - case PIPE_CAP_TGSI_PACK_HALF_FLOAT: - case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: - case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: - case PIPE_CAP_GENERATE_MIPMAP: - case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED: - case PIPE_CAP_STRING_MARKER: - case PIPE_CAP_CLEAR_TEXTURE: - case PIPE_CAP_CULL_DISTANCE: - case PIPE_CAP_TGSI_ARRAY_COMPONENTS: - case PIPE_CAP_TGSI_CAN_READ_OUTPUTS: - case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: - case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: - case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: - case PIPE_CAP_DOUBLES: - case PIPE_CAP_TGSI_TEX_TXF_LZ: - case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT: - case PIPE_CAP_BINDLESS_TEXTURE: - case PIPE_CAP_QUERY_TIMESTAMP: - case PIPE_CAP_QUERY_TIME_ELAPSED: - case PIPE_CAP_NIR_SAMPLERS_AS_DEREF: - case PIPE_CAP_MEMOBJ: - case PIPE_CAP_LOAD_CONSTBUF: - case 
PIPE_CAP_INT64: - case PIPE_CAP_INT64_DIVMOD: - case PIPE_CAP_TGSI_CLOCK: - case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: - case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: - case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET: - case PIPE_CAP_TGSI_BALLOT: - case PIPE_CAP_TGSI_VOTE: - case PIPE_CAP_FBFETCH: - case PIPE_CAP_COMPUTE_GRID_INFO_LAST_BLOCK: - case PIPE_CAP_IMAGE_LOAD_FORMATTED: - case PIPE_CAP_PREFER_COMPUTE_FOR_MULTIMEDIA: - case PIPE_CAP_TGSI_DIV: - case PIPE_CAP_PACKED_UNIFORMS: - case PIPE_CAP_SHADER_SAMPLES_IDENTICAL: - case PIPE_CAP_GL_SPIRV: - case PIPE_CAP_DRAW_INFO_START_WITH_USER_INDICES: - return 1; - - case PIPE_CAP_QUERY_SO_OVERFLOW: - return !sscreen->use_ngg_streamout; - - case PIPE_CAP_POST_DEPTH_COVERAGE: - return sscreen->info.chip_class >= GFX10; - - case PIPE_CAP_GRAPHICS: - return sscreen->info.has_graphics; - - case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: - return !SI_BIG_ENDIAN && sscreen->info.has_userptr; - - case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: - return sscreen->info.has_gpu_reset_status_query; - - case PIPE_CAP_TEXTURE_MULTISAMPLE: - return sscreen->info.has_2d_tiling; - - case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: - return SI_MAP_BUFFER_ALIGNMENT; - - case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: - case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: - case PIPE_CAP_MAX_VERTEX_STREAMS: - case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_MAX_WINDOW_RECTANGLES: - return 4; - - case PIPE_CAP_GLSL_FEATURE_LEVEL: - case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: - if (!sscreen->info.has_indirect_compute_dispatch) - return 420; - return 460; - - case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: - /* Optimal number for good TexSubImage performance on Polaris10. */ - return 64 * 1024 * 1024; - - case PIPE_CAP_GL_BEGIN_END_BUFFER_SIZE: - return 4096 * 1024; - - case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: - case PIPE_CAP_MAX_SHADER_BUFFER_SIZE: - return MIN2(sscreen->info.max_alloc_size, INT_MAX); - - case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: - return LLVM_VERSION_MAJOR < 9 && !sscreen->info.has_unaligned_shader_loads; - - case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: - return sscreen->info.has_sparse_vm_mappings ? - RADEON_SPARSE_PAGE_SIZE : 0; - - - case PIPE_CAP_UMA: - return 0; - - case PIPE_CAP_FENCE_SIGNAL: - return sscreen->info.has_syncobj; - - case PIPE_CAP_CONSTBUF0_FLAGS: - return SI_RESOURCE_FLAG_32BIT; - - case PIPE_CAP_NATIVE_FENCE_FD: - return sscreen->info.has_fence_to_handle; - - case PIPE_CAP_DRAW_PARAMETERS: - case PIPE_CAP_MULTI_DRAW_INDIRECT: - case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: - return sscreen->has_draw_indirect_multi; - - case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: - return 30; - - case PIPE_CAP_MAX_VARYINGS: - return 32; - - case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: - return sscreen->info.chip_class <= GFX8 ? - PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0; - - /* Stream output. */ - case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: - case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: - return 32*4; - - /* Geometry shader output. */ - case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: - /* gfx9 has to report 256 to make piglit/gs-max-output pass. - * gfx8 and earlier can do 1024. 
- */ - return 256; - case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: - return 4095; - case PIPE_CAP_MAX_GS_INVOCATIONS: - /* The closed driver exposes 127, but 125 is the greatest - * number that works. */ - return 125; - - case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: - return 2048; - - /* Texturing. */ - case PIPE_CAP_MAX_TEXTURE_2D_SIZE: - return 16384; - case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: - return 15; /* 16384 */ - case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: - if (sscreen->info.chip_class >= GFX10) - return 14; - /* textures support 8192, but layered rendering supports 2048 */ - return 12; - case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: - if (sscreen->info.chip_class >= GFX10) - return 8192; - /* textures support 8192, but layered rendering supports 2048 */ - return 2048; - - /* Viewports and render targets. */ - case PIPE_CAP_MAX_VIEWPORTS: - return SI_MAX_VIEWPORTS; - case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: - case PIPE_CAP_RASTERIZER_SUBPIXEL_BITS: - case PIPE_CAP_MAX_RENDER_TARGETS: - return 8; - case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS: - return sscreen->info.has_eqaa_surface_allocator ? 2 : 0; - - case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_MIN_TEXEL_OFFSET: - return -32; - - case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_MAX_TEXEL_OFFSET: - return 31; - - case PIPE_CAP_ENDIANNESS: - return PIPE_ENDIAN_LITTLE; - - case PIPE_CAP_VENDOR_ID: - return ATI_VENDOR_ID; - case PIPE_CAP_DEVICE_ID: - return sscreen->info.pci_id; - case PIPE_CAP_VIDEO_MEMORY: - return sscreen->info.vram_size >> 20; - case PIPE_CAP_PCI_GROUP: - return sscreen->info.pci_domain; - case PIPE_CAP_PCI_BUS: - return sscreen->info.pci_bus; - case PIPE_CAP_PCI_DEVICE: - return sscreen->info.pci_dev; - case PIPE_CAP_PCI_FUNCTION: - return sscreen->info.pci_func; - case PIPE_CAP_TGSI_ATOMINC_WRAP: - return LLVM_VERSION_MAJOR >= 10; - - default: - return u_pipe_screen_get_param_defaults(pscreen, param); - } + struct si_screen *sscreen = (struct si_screen *)pscreen; + + switch (param) { + /* Supported features (boolean caps). 
*/ + case PIPE_CAP_ACCELERATED: + case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: + case PIPE_CAP_ANISOTROPIC_FILTER: + case PIPE_CAP_POINT_SPRITE: + case PIPE_CAP_OCCLUSION_QUERY: + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + case PIPE_CAP_TEXTURE_SHADOW_LOD: + case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE: + case PIPE_CAP_BLEND_EQUATION_SEPARATE: + case PIPE_CAP_TEXTURE_SWIZZLE: + case PIPE_CAP_DEPTH_CLIP_DISABLE: + case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE: + case PIPE_CAP_SHADER_STENCIL_EXPORT: + case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: + case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: + case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: + case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: + case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES: + case PIPE_CAP_VERTEX_SHADER_SATURATE: + case PIPE_CAP_SEAMLESS_CUBE_MAP: + case PIPE_CAP_PRIMITIVE_RESTART: + case PIPE_CAP_CONDITIONAL_RENDER: + case PIPE_CAP_TEXTURE_BARRIER: + case PIPE_CAP_INDEP_BLEND_ENABLE: + case PIPE_CAP_INDEP_BLEND_FUNC: + case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: + case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: + case PIPE_CAP_START_INSTANCE: + case PIPE_CAP_NPOT_TEXTURES: + case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: + case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: + case PIPE_CAP_VERTEX_COLOR_CLAMPED: + case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: + case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: + case PIPE_CAP_TGSI_INSTANCEID: + case PIPE_CAP_COMPUTE: + case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: + case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: + case PIPE_CAP_QUERY_PIPELINE_STATISTICS: + case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: + case PIPE_CAP_CUBE_MAP_ARRAY: + case PIPE_CAP_SAMPLE_SHADING: + case PIPE_CAP_DRAW_INDIRECT: + case PIPE_CAP_CLIP_HALFZ: + case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: + case PIPE_CAP_POLYGON_OFFSET_CLAMP: + case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: + case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + case PIPE_CAP_TGSI_TEXCOORD: + case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: + case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_SHAREABLE_SHADERS: + case PIPE_CAP_DEPTH_BOUNDS_TEST: + case PIPE_CAP_SAMPLER_VIEW_TARGET: + case PIPE_CAP_TEXTURE_QUERY_LOD: + case PIPE_CAP_TEXTURE_GATHER_SM5: + case PIPE_CAP_TGSI_TXQS: + case PIPE_CAP_FORCE_PERSAMPLE_INTERP: + case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_INVALIDATE_BUFFER: + case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: + case PIPE_CAP_QUERY_BUFFER_OBJECT: + case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: + case PIPE_CAP_GENERATE_MIPMAP: + case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED: + case PIPE_CAP_STRING_MARKER: + case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_CULL_DISTANCE: + case PIPE_CAP_TGSI_ARRAY_COMPONENTS: + case PIPE_CAP_TGSI_CAN_READ_OUTPUTS: + case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: + case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: + case PIPE_CAP_DOUBLES: + case PIPE_CAP_TGSI_TEX_TXF_LZ: + case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT: + case PIPE_CAP_BINDLESS_TEXTURE: + case PIPE_CAP_QUERY_TIMESTAMP: + case PIPE_CAP_QUERY_TIME_ELAPSED: + case PIPE_CAP_NIR_SAMPLERS_AS_DEREF: + case PIPE_CAP_MEMOBJ: + case PIPE_CAP_LOAD_CONSTBUF: + case 
PIPE_CAP_INT64: + case PIPE_CAP_INT64_DIVMOD: + case PIPE_CAP_TGSI_CLOCK: + case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: + case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: + case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET: + case PIPE_CAP_TGSI_BALLOT: + case PIPE_CAP_TGSI_VOTE: + case PIPE_CAP_FBFETCH: + case PIPE_CAP_COMPUTE_GRID_INFO_LAST_BLOCK: + case PIPE_CAP_IMAGE_LOAD_FORMATTED: + case PIPE_CAP_PREFER_COMPUTE_FOR_MULTIMEDIA: + case PIPE_CAP_TGSI_DIV: + case PIPE_CAP_PACKED_UNIFORMS: + case PIPE_CAP_SHADER_SAMPLES_IDENTICAL: + case PIPE_CAP_GL_SPIRV: + case PIPE_CAP_DRAW_INFO_START_WITH_USER_INDICES: + return 1; + + case PIPE_CAP_QUERY_SO_OVERFLOW: + return !sscreen->use_ngg_streamout; + + case PIPE_CAP_POST_DEPTH_COVERAGE: + return sscreen->info.chip_class >= GFX10; + + case PIPE_CAP_GRAPHICS: + return sscreen->info.has_graphics; + + case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + return !SI_BIG_ENDIAN && sscreen->info.has_userptr; + + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + return sscreen->info.has_gpu_reset_status_query; + + case PIPE_CAP_TEXTURE_MULTISAMPLE: + return sscreen->info.has_2d_tiling; + + case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: + return SI_MAP_BUFFER_ALIGNMENT; + + case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: + case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: + case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: + case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: + case PIPE_CAP_MAX_VERTEX_STREAMS: + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: + case PIPE_CAP_MAX_WINDOW_RECTANGLES: + return 4; + + case PIPE_CAP_GLSL_FEATURE_LEVEL: + case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: + if (!sscreen->info.has_indirect_compute_dispatch) + return 420; + return 460; + + case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: + /* Optimal number for good TexSubImage performance on Polaris10. */ + return 64 * 1024 * 1024; + + case PIPE_CAP_GL_BEGIN_END_BUFFER_SIZE: + return 4096 * 1024; + + case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: + case PIPE_CAP_MAX_SHADER_BUFFER_SIZE: + return MIN2(sscreen->info.max_alloc_size, INT_MAX); + + case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: + return LLVM_VERSION_MAJOR < 9 && !sscreen->info.has_unaligned_shader_loads; + + case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: + return sscreen->info.has_sparse_vm_mappings ? RADEON_SPARSE_PAGE_SIZE : 0; + + case PIPE_CAP_UMA: + return 0; + + case PIPE_CAP_FENCE_SIGNAL: + return sscreen->info.has_syncobj; + + case PIPE_CAP_CONSTBUF0_FLAGS: + return SI_RESOURCE_FLAG_32BIT; + + case PIPE_CAP_NATIVE_FENCE_FD: + return sscreen->info.has_fence_to_handle; + + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: + return sscreen->has_draw_indirect_multi; + + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + return 30; + + case PIPE_CAP_MAX_VARYINGS: + return 32; + + case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: + return sscreen->info.chip_class <= GFX8 ? PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0; + + /* Stream output. */ + case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: + case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: + return 32 * 4; + + /* Geometry shader output. */ + case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: + /* gfx9 has to report 256 to make piglit/gs-max-output pass. + * gfx8 and earlier can do 1024. 
+ */ + return 256; + case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: + return 4095; + case PIPE_CAP_MAX_GS_INVOCATIONS: + /* The closed driver exposes 127, but 125 is the greatest + * number that works. */ + return 125; + + case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: + return 2048; + + /* Texturing. */ + case PIPE_CAP_MAX_TEXTURE_2D_SIZE: + return 16384; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 15; /* 16384 */ + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + if (sscreen->info.chip_class >= GFX10) + return 14; + /* textures support 8192, but layered rendering supports 2048 */ + return 12; + case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: + if (sscreen->info.chip_class >= GFX10) + return 8192; + /* textures support 8192, but layered rendering supports 2048 */ + return 2048; + + /* Viewports and render targets. */ + case PIPE_CAP_MAX_VIEWPORTS: + return SI_MAX_VIEWPORTS; + case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: + case PIPE_CAP_RASTERIZER_SUBPIXEL_BITS: + case PIPE_CAP_MAX_RENDER_TARGETS: + return 8; + case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS: + return sscreen->info.has_eqaa_surface_allocator ? 2 : 0; + + case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: + case PIPE_CAP_MIN_TEXEL_OFFSET: + return -32; + + case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: + case PIPE_CAP_MAX_TEXEL_OFFSET: + return 31; + + case PIPE_CAP_ENDIANNESS: + return PIPE_ENDIAN_LITTLE; + + case PIPE_CAP_VENDOR_ID: + return ATI_VENDOR_ID; + case PIPE_CAP_DEVICE_ID: + return sscreen->info.pci_id; + case PIPE_CAP_VIDEO_MEMORY: + return sscreen->info.vram_size >> 20; + case PIPE_CAP_PCI_GROUP: + return sscreen->info.pci_domain; + case PIPE_CAP_PCI_BUS: + return sscreen->info.pci_bus; + case PIPE_CAP_PCI_DEVICE: + return sscreen->info.pci_dev; + case PIPE_CAP_PCI_FUNCTION: + return sscreen->info.pci_func; + case PIPE_CAP_TGSI_ATOMINC_WRAP: + return LLVM_VERSION_MAJOR >= 10; + + default: + return u_pipe_screen_get_param_defaults(pscreen, param); + } } -static float si_get_paramf(struct pipe_screen* pscreen, enum pipe_capf param) +static float si_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param) { - switch (param) { - case PIPE_CAPF_MAX_LINE_WIDTH: - case PIPE_CAPF_MAX_LINE_WIDTH_AA: - /* This depends on the quant mode, though the precise interactions - * are unknown. */ - return 2048; - case PIPE_CAPF_MAX_POINT_WIDTH: - case PIPE_CAPF_MAX_POINT_WIDTH_AA: - return SI_MAX_POINT_SIZE; - case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: - return 16.0f; - case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: - return 16.0f; - case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE: - case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE: - case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY: - return 0.0f; - } - return 0.0f; + switch (param) { + case PIPE_CAPF_MAX_LINE_WIDTH: + case PIPE_CAPF_MAX_LINE_WIDTH_AA: + /* This depends on the quant mode, though the precise interactions + * are unknown. 
*/ + return 2048; + case PIPE_CAPF_MAX_POINT_WIDTH: + case PIPE_CAPF_MAX_POINT_WIDTH_AA: + return SI_MAX_POINT_SIZE; + case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: + return 16.0f; + case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: + return 16.0f; + case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY: + return 0.0f; + } + return 0.0f; } -static int si_get_shader_param(struct pipe_screen* pscreen, - enum pipe_shader_type shader, - enum pipe_shader_cap param) +static int si_get_shader_param(struct pipe_screen *pscreen, enum pipe_shader_type shader, + enum pipe_shader_cap param) { - struct si_screen *sscreen = (struct si_screen *)pscreen; - - switch(shader) - { - case PIPE_SHADER_FRAGMENT: - case PIPE_SHADER_VERTEX: - case PIPE_SHADER_GEOMETRY: - case PIPE_SHADER_TESS_CTRL: - case PIPE_SHADER_TESS_EVAL: - break; - case PIPE_SHADER_COMPUTE: - switch (param) { - case PIPE_SHADER_CAP_SUPPORTED_IRS: { - int ir = 1 << PIPE_SHADER_IR_NATIVE; - - if (sscreen->info.has_indirect_compute_dispatch) - ir |= 1 << PIPE_SHADER_IR_NIR; - - return ir; - } - - case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: { - uint64_t max_const_buffer_size; - pscreen->get_compute_param(pscreen, PIPE_SHADER_IR_NIR, - PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, - &max_const_buffer_size); - return MIN2(max_const_buffer_size, INT_MAX); - } - default: - /* If compute shaders don't require a special value - * for this cap, we can return the same value we - * do for other shader types. */ - break; - } - break; - default: - return 0; - } - - switch (param) { - /* Shader limits. */ - case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: - case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: - return 16384; - case PIPE_SHADER_CAP_MAX_INPUTS: - return shader == PIPE_SHADER_VERTEX ? SI_MAX_ATTRIBS : 32; - case PIPE_SHADER_CAP_MAX_OUTPUTS: - return shader == PIPE_SHADER_FRAGMENT ? 8 : 32; - case PIPE_SHADER_CAP_MAX_TEMPS: - return 256; /* Max native temporaries. */ - case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: - return MIN2(sscreen->info.max_alloc_size, INT_MAX - 3); /* aligned to 4 */ - case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: - return SI_NUM_CONST_BUFFERS; - case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: - case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: - return SI_NUM_SAMPLERS; - case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: - return SI_NUM_SHADER_BUFFERS; - case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: - return SI_NUM_IMAGES; - case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: - return 0; - case PIPE_SHADER_CAP_PREFERRED_IR: - return PIPE_SHADER_IR_NIR; - case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: - return 4; - - /* Supported boolean features. */ - case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: - case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: - case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: - case PIPE_SHADER_CAP_INTEGERS: - case PIPE_SHADER_CAP_INT64_ATOMICS: - case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: - case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: - case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: - return 1; - - case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: - /* TODO: Indirect indexing of GS inputs is unimplemented. 
*/ - if (shader == PIPE_SHADER_GEOMETRY) - return 0; - - if (shader == PIPE_SHADER_VERTEX && - !sscreen->llvm_has_working_vgpr_indexing) - return 0; - - /* TCS and TES load inputs directly from LDS or offchip - * memory, so indirect indexing is always supported. - * PS has to support indirect indexing, because we can't - * lower that to TEMPs for INTERP instructions. - */ - return 1; - - case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: - return sscreen->llvm_has_working_vgpr_indexing || - /* TCS stores outputs directly to memory. */ - shader == PIPE_SHADER_TESS_CTRL; - - /* Unsupported boolean features. */ - case PIPE_SHADER_CAP_FP16: - case PIPE_SHADER_CAP_SUBROUTINES: - case PIPE_SHADER_CAP_SUPPORTED_IRS: - case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: - case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: - return 0; - } - return 0; + struct si_screen *sscreen = (struct si_screen *)pscreen; + + switch (shader) { + case PIPE_SHADER_FRAGMENT: + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_GEOMETRY: + case PIPE_SHADER_TESS_CTRL: + case PIPE_SHADER_TESS_EVAL: + break; + case PIPE_SHADER_COMPUTE: + switch (param) { + case PIPE_SHADER_CAP_SUPPORTED_IRS: { + int ir = 1 << PIPE_SHADER_IR_NATIVE; + + if (sscreen->info.has_indirect_compute_dispatch) + ir |= 1 << PIPE_SHADER_IR_NIR; + + return ir; + } + + case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: { + uint64_t max_const_buffer_size; + pscreen->get_compute_param(pscreen, PIPE_SHADER_IR_NIR, + PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, &max_const_buffer_size); + return MIN2(max_const_buffer_size, INT_MAX); + } + default: + /* If compute shaders don't require a special value + * for this cap, we can return the same value we + * do for other shader types. */ + break; + } + break; + default: + return 0; + } + + switch (param) { + /* Shader limits. */ + case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: + case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: + return 16384; + case PIPE_SHADER_CAP_MAX_INPUTS: + return shader == PIPE_SHADER_VERTEX ? SI_MAX_ATTRIBS : 32; + case PIPE_SHADER_CAP_MAX_OUTPUTS: + return shader == PIPE_SHADER_FRAGMENT ? 8 : 32; + case PIPE_SHADER_CAP_MAX_TEMPS: + return 256; /* Max native temporaries. */ + case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: + return MIN2(sscreen->info.max_alloc_size, INT_MAX - 3); /* aligned to 4 */ + case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: + return SI_NUM_CONST_BUFFERS; + case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: + case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: + return SI_NUM_SAMPLERS; + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: + return SI_NUM_SHADER_BUFFERS; + case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: + return SI_NUM_IMAGES; + case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: + return 0; + case PIPE_SHADER_CAP_PREFERRED_IR: + return PIPE_SHADER_IR_NIR; + case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: + return 4; + + /* Supported boolean features. 
*/ + case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: + case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: + case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: + case PIPE_SHADER_CAP_INTEGERS: + case PIPE_SHADER_CAP_INT64_ATOMICS: + case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: + case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: + case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: + return 1; + + case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: + /* TODO: Indirect indexing of GS inputs is unimplemented. */ + if (shader == PIPE_SHADER_GEOMETRY) + return 0; + + if (shader == PIPE_SHADER_VERTEX && !sscreen->llvm_has_working_vgpr_indexing) + return 0; + + /* TCS and TES load inputs directly from LDS or offchip + * memory, so indirect indexing is always supported. + * PS has to support indirect indexing, because we can't + * lower that to TEMPs for INTERP instructions. + */ + return 1; + + case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: + return sscreen->llvm_has_working_vgpr_indexing || + /* TCS stores outputs directly to memory. */ + shader == PIPE_SHADER_TESS_CTRL; + + /* Unsupported boolean features. */ + case PIPE_SHADER_CAP_FP16: + case PIPE_SHADER_CAP_SUBROUTINES: + case PIPE_SHADER_CAP_SUPPORTED_IRS: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: + return 0; + } + return 0; } static const struct nir_shader_compiler_options nir_options = { - .lower_scmp = true, - .lower_flrp32 = true, - .lower_flrp64 = true, - .lower_fsat = true, - .lower_fdiv = true, - .lower_bitfield_insert_to_bitfield_select = true, - .lower_bitfield_extract = true, - .lower_sub = true, - .fuse_ffma = true, - .lower_fmod = true, - .lower_pack_snorm_4x8 = true, - .lower_pack_unorm_4x8 = true, - .lower_unpack_snorm_2x16 = true, - .lower_unpack_snorm_4x8 = true, - .lower_unpack_unorm_2x16 = true, - .lower_unpack_unorm_4x8 = true, - .lower_extract_byte = true, - .lower_extract_word = true, - .lower_rotate = true, - .lower_to_scalar = true, - .optimize_sample_mask_in = true, - .max_unroll_iterations = 32, - .use_interpolated_input_intrinsics = true, + .lower_scmp = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_fsat = true, + .lower_fdiv = true, + .lower_bitfield_insert_to_bitfield_select = true, + .lower_bitfield_extract = true, + .lower_sub = true, + .fuse_ffma = true, + .lower_fmod = true, + .lower_pack_snorm_4x8 = true, + .lower_pack_unorm_4x8 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_snorm_4x8 = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, + .lower_extract_byte = true, + .lower_extract_word = true, + .lower_rotate = true, + .lower_to_scalar = true, + .optimize_sample_mask_in = true, + .max_unroll_iterations = 32, + .use_interpolated_input_intrinsics = true, }; -static const void * -si_get_compiler_options(struct pipe_screen *screen, - enum pipe_shader_ir ir, - enum pipe_shader_type shader) +static const void *si_get_compiler_options(struct pipe_screen *screen, enum pipe_shader_ir ir, + enum pipe_shader_type shader) { - assert(ir == PIPE_SHADER_IR_NIR); - return &nir_options; + assert(ir == PIPE_SHADER_IR_NIR); + return &nir_options; } static void si_get_driver_uuid(struct pipe_screen *pscreen, char *uuid) { - ac_compute_driver_uuid(uuid, PIPE_UUID_SIZE); + ac_compute_driver_uuid(uuid, PIPE_UUID_SIZE); } static void si_get_device_uuid(struct pipe_screen *pscreen, 
char *uuid) { - struct si_screen *sscreen = (struct si_screen *)pscreen; + struct si_screen *sscreen = (struct si_screen *)pscreen; - ac_compute_device_uuid(&sscreen->info, uuid, PIPE_UUID_SIZE); + ac_compute_device_uuid(&sscreen->info, uuid, PIPE_UUID_SIZE); } -static const char* si_get_name(struct pipe_screen *pscreen) +static const char *si_get_name(struct pipe_screen *pscreen) { - struct si_screen *sscreen = (struct si_screen*)pscreen; + struct si_screen *sscreen = (struct si_screen *)pscreen; - return sscreen->renderer_string; + return sscreen->renderer_string; } -static int si_get_video_param_no_decode(struct pipe_screen *screen, - enum pipe_video_profile profile, - enum pipe_video_entrypoint entrypoint, - enum pipe_video_cap param) +static int si_get_video_param_no_decode(struct pipe_screen *screen, enum pipe_video_profile profile, + enum pipe_video_entrypoint entrypoint, + enum pipe_video_cap param) { - switch (param) { - case PIPE_VIDEO_CAP_SUPPORTED: - return vl_profile_supported(screen, profile, entrypoint); - case PIPE_VIDEO_CAP_NPOT_TEXTURES: - return 1; - case PIPE_VIDEO_CAP_MAX_WIDTH: - case PIPE_VIDEO_CAP_MAX_HEIGHT: - return vl_video_buffer_max_size(screen); - case PIPE_VIDEO_CAP_PREFERED_FORMAT: - return PIPE_FORMAT_NV12; - case PIPE_VIDEO_CAP_PREFERS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: - return true; - case PIPE_VIDEO_CAP_MAX_LEVEL: - return vl_level_supported(screen, profile); - default: - return 0; - } + switch (param) { + case PIPE_VIDEO_CAP_SUPPORTED: + return vl_profile_supported(screen, profile, entrypoint); + case PIPE_VIDEO_CAP_NPOT_TEXTURES: + return 1; + case PIPE_VIDEO_CAP_MAX_WIDTH: + case PIPE_VIDEO_CAP_MAX_HEIGHT: + return vl_video_buffer_max_size(screen); + case PIPE_VIDEO_CAP_PREFERED_FORMAT: + return PIPE_FORMAT_NV12; + case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: + return true; + case PIPE_VIDEO_CAP_MAX_LEVEL: + return vl_level_supported(screen, profile); + default: + return 0; + } } -static int si_get_video_param(struct pipe_screen *screen, - enum pipe_video_profile profile, - enum pipe_video_entrypoint entrypoint, - enum pipe_video_cap param) +static int si_get_video_param(struct pipe_screen *screen, enum pipe_video_profile profile, + enum pipe_video_entrypoint entrypoint, enum pipe_video_cap param) { - struct si_screen *sscreen = (struct si_screen *)screen; - enum pipe_video_format codec = u_reduce_video_profile(profile); - - if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) { - switch (param) { - case PIPE_VIDEO_CAP_SUPPORTED: - return ((codec == PIPE_VIDEO_FORMAT_MPEG4_AVC && - (sscreen->info.family >= CHIP_RAVEN || - si_vce_is_fw_version_supported(sscreen))) || - (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN && - (sscreen->info.family >= CHIP_RAVEN || - si_radeon_uvd_enc_supported(sscreen))) || - (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10 && - sscreen->info.family >= CHIP_RENOIR)); - case PIPE_VIDEO_CAP_NPOT_TEXTURES: - return 1; - case PIPE_VIDEO_CAP_MAX_WIDTH: - return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096; - case PIPE_VIDEO_CAP_MAX_HEIGHT: - return (sscreen->info.family < CHIP_TONGA) ? 
1152 : 2304; - case PIPE_VIDEO_CAP_PREFERED_FORMAT: - return PIPE_FORMAT_NV12; - case PIPE_VIDEO_CAP_PREFERS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: - return true; - case PIPE_VIDEO_CAP_STACKED_FRAMES: - return (sscreen->info.family < CHIP_TONGA) ? 1 : 2; - default: - return 0; - } - } - - switch (param) { - case PIPE_VIDEO_CAP_SUPPORTED: - switch (codec) { - case PIPE_VIDEO_FORMAT_MPEG12: - return profile != PIPE_VIDEO_PROFILE_MPEG1; - case PIPE_VIDEO_FORMAT_MPEG4: - return 1; - case PIPE_VIDEO_FORMAT_MPEG4_AVC: - if ((sscreen->info.family == CHIP_POLARIS10 || - sscreen->info.family == CHIP_POLARIS11) && - sscreen->info.uvd_fw_version < UVD_FW_1_66_16 ) { - RVID_ERR("POLARIS10/11 firmware version need to be updated.\n"); - return false; - } - return true; - case PIPE_VIDEO_FORMAT_VC1: - return true; - case PIPE_VIDEO_FORMAT_HEVC: - /* Carrizo only supports HEVC Main */ - if (sscreen->info.family >= CHIP_STONEY) - return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN || - profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10); - else if (sscreen->info.family >= CHIP_CARRIZO) - return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; - return false; - case PIPE_VIDEO_FORMAT_JPEG: - if (sscreen->info.family >= CHIP_RAVEN) - return true; - if (sscreen->info.family < CHIP_CARRIZO || sscreen->info.family >= CHIP_VEGA10) - return false; - if (!(sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 19)) { - RVID_ERR("No MJPEG support for the kernel version\n"); - return false; - } - return true; - case PIPE_VIDEO_FORMAT_VP9: - if (sscreen->info.family < CHIP_RAVEN) - return false; - return true; - default: - return false; - } - case PIPE_VIDEO_CAP_NPOT_TEXTURES: - return 1; - case PIPE_VIDEO_CAP_MAX_WIDTH: - switch (codec) { - case PIPE_VIDEO_FORMAT_HEVC: - case PIPE_VIDEO_FORMAT_VP9: - return (sscreen->info.family < CHIP_RENOIR) ? - ((sscreen->info.family < CHIP_TONGA) ? 2048 : 4096) : - 8192; - default: - return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096; - } - case PIPE_VIDEO_CAP_MAX_HEIGHT: - switch (codec) { - case PIPE_VIDEO_FORMAT_HEVC: - case PIPE_VIDEO_FORMAT_VP9: - return (sscreen->info.family < CHIP_RENOIR) ? - ((sscreen->info.family < CHIP_TONGA) ? 1152 : 4096) : - 4352; - default: - return (sscreen->info.family < CHIP_TONGA) ? 1152 : 4096; - } - case PIPE_VIDEO_CAP_PREFERED_FORMAT: - if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) - return PIPE_FORMAT_P010; - else if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2) - return PIPE_FORMAT_P016; - else - return PIPE_FORMAT_NV12; - - case PIPE_VIDEO_CAP_PREFERS_INTERLACED: - case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: { - enum pipe_video_format format = u_reduce_video_profile(profile); - - if (format == PIPE_VIDEO_FORMAT_HEVC) - return false; //The firmware doesn't support interlaced HEVC. 
- else if (format == PIPE_VIDEO_FORMAT_JPEG) - return false; - else if (format == PIPE_VIDEO_FORMAT_VP9) - return false; - return true; - } - case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: - return true; - case PIPE_VIDEO_CAP_MAX_LEVEL: - switch (profile) { - case PIPE_VIDEO_PROFILE_MPEG1: - return 0; - case PIPE_VIDEO_PROFILE_MPEG2_SIMPLE: - case PIPE_VIDEO_PROFILE_MPEG2_MAIN: - return 3; - case PIPE_VIDEO_PROFILE_MPEG4_SIMPLE: - return 3; - case PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE: - return 5; - case PIPE_VIDEO_PROFILE_VC1_SIMPLE: - return 1; - case PIPE_VIDEO_PROFILE_VC1_MAIN: - return 2; - case PIPE_VIDEO_PROFILE_VC1_ADVANCED: - return 4; - case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE: - case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN: - case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH: - return (sscreen->info.family < CHIP_TONGA) ? 41 : 52; - case PIPE_VIDEO_PROFILE_HEVC_MAIN: - case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: - return 186; - default: - return 0; - } - default: - return 0; - } + struct si_screen *sscreen = (struct si_screen *)screen; + enum pipe_video_format codec = u_reduce_video_profile(profile); + + if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) { + switch (param) { + case PIPE_VIDEO_CAP_SUPPORTED: + return ( + (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC && + (sscreen->info.family >= CHIP_RAVEN || si_vce_is_fw_version_supported(sscreen))) || + (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN && + (sscreen->info.family >= CHIP_RAVEN || si_radeon_uvd_enc_supported(sscreen))) || + (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10 && sscreen->info.family >= CHIP_RENOIR)); + case PIPE_VIDEO_CAP_NPOT_TEXTURES: + return 1; + case PIPE_VIDEO_CAP_MAX_WIDTH: + return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096; + case PIPE_VIDEO_CAP_MAX_HEIGHT: + return (sscreen->info.family < CHIP_TONGA) ? 1152 : 2304; + case PIPE_VIDEO_CAP_PREFERED_FORMAT: + return PIPE_FORMAT_NV12; + case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: + return true; + case PIPE_VIDEO_CAP_STACKED_FRAMES: + return (sscreen->info.family < CHIP_TONGA) ? 
1 : 2; + default: + return 0; + } + } + + switch (param) { + case PIPE_VIDEO_CAP_SUPPORTED: + switch (codec) { + case PIPE_VIDEO_FORMAT_MPEG12: + return profile != PIPE_VIDEO_PROFILE_MPEG1; + case PIPE_VIDEO_FORMAT_MPEG4: + return 1; + case PIPE_VIDEO_FORMAT_MPEG4_AVC: + if ((sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11) && + sscreen->info.uvd_fw_version < UVD_FW_1_66_16) { + RVID_ERR("POLARIS10/11 firmware version need to be updated.\n"); + return false; + } + return true; + case PIPE_VIDEO_FORMAT_VC1: + return true; + case PIPE_VIDEO_FORMAT_HEVC: + /* Carrizo only supports HEVC Main */ + if (sscreen->info.family >= CHIP_STONEY) + return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN || + profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10); + else if (sscreen->info.family >= CHIP_CARRIZO) + return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; + return false; + case PIPE_VIDEO_FORMAT_JPEG: + if (sscreen->info.family >= CHIP_RAVEN) + return true; + if (sscreen->info.family < CHIP_CARRIZO || sscreen->info.family >= CHIP_VEGA10) + return false; + if (!(sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 19)) { + RVID_ERR("No MJPEG support for the kernel version\n"); + return false; + } + return true; + case PIPE_VIDEO_FORMAT_VP9: + if (sscreen->info.family < CHIP_RAVEN) + return false; + return true; + default: + return false; + } + case PIPE_VIDEO_CAP_NPOT_TEXTURES: + return 1; + case PIPE_VIDEO_CAP_MAX_WIDTH: + switch (codec) { + case PIPE_VIDEO_FORMAT_HEVC: + case PIPE_VIDEO_FORMAT_VP9: + return (sscreen->info.family < CHIP_RENOIR) + ? ((sscreen->info.family < CHIP_TONGA) ? 2048 : 4096) + : 8192; + default: + return (sscreen->info.family < CHIP_TONGA) ? 2048 : 4096; + } + case PIPE_VIDEO_CAP_MAX_HEIGHT: + switch (codec) { + case PIPE_VIDEO_FORMAT_HEVC: + case PIPE_VIDEO_FORMAT_VP9: + return (sscreen->info.family < CHIP_RENOIR) + ? ((sscreen->info.family < CHIP_TONGA) ? 1152 : 4096) + : 4352; + default: + return (sscreen->info.family < CHIP_TONGA) ? 1152 : 4096; + } + case PIPE_VIDEO_CAP_PREFERED_FORMAT: + if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + return PIPE_FORMAT_P010; + else if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2) + return PIPE_FORMAT_P016; + else + return PIPE_FORMAT_NV12; + + case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: { + enum pipe_video_format format = u_reduce_video_profile(profile); + + if (format == PIPE_VIDEO_FORMAT_HEVC) + return false; // The firmware doesn't support interlaced HEVC. + else if (format == PIPE_VIDEO_FORMAT_JPEG) + return false; + else if (format == PIPE_VIDEO_FORMAT_VP9) + return false; + return true; + } + case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: + return true; + case PIPE_VIDEO_CAP_MAX_LEVEL: + switch (profile) { + case PIPE_VIDEO_PROFILE_MPEG1: + return 0; + case PIPE_VIDEO_PROFILE_MPEG2_SIMPLE: + case PIPE_VIDEO_PROFILE_MPEG2_MAIN: + return 3; + case PIPE_VIDEO_PROFILE_MPEG4_SIMPLE: + return 3; + case PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE: + return 5; + case PIPE_VIDEO_PROFILE_VC1_SIMPLE: + return 1; + case PIPE_VIDEO_PROFILE_VC1_MAIN: + return 2; + case PIPE_VIDEO_PROFILE_VC1_ADVANCED: + return 4; + case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE: + case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN: + case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH: + return (sscreen->info.family < CHIP_TONGA) ? 
41 : 52; + case PIPE_VIDEO_PROFILE_HEVC_MAIN: + case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: + return 186; + default: + return 0; + } + default: + return 0; + } } -static bool si_vid_is_format_supported(struct pipe_screen *screen, - enum pipe_format format, - enum pipe_video_profile profile, - enum pipe_video_entrypoint entrypoint) +static bool si_vid_is_format_supported(struct pipe_screen *screen, enum pipe_format format, + enum pipe_video_profile profile, + enum pipe_video_entrypoint entrypoint) { - /* HEVC 10 bit decoding should use P010 instead of NV12 if possible */ - if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) - return (format == PIPE_FORMAT_NV12) || - (format == PIPE_FORMAT_P010) || - (format == PIPE_FORMAT_P016); - - /* Vp9 profile 2 supports 10 bit decoding using P016 */ - if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2) - return format == PIPE_FORMAT_P016; + /* HEVC 10 bit decoding should use P010 instead of NV12 if possible */ + if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + return (format == PIPE_FORMAT_NV12) || (format == PIPE_FORMAT_P010) || + (format == PIPE_FORMAT_P016); + /* Vp9 profile 2 supports 10 bit decoding using P016 */ + if (profile == PIPE_VIDEO_PROFILE_VP9_PROFILE2) + return format == PIPE_FORMAT_P016; - /* we can only handle this one with UVD */ - if (profile != PIPE_VIDEO_PROFILE_UNKNOWN) - return format == PIPE_FORMAT_NV12; + /* we can only handle this one with UVD */ + if (profile != PIPE_VIDEO_PROFILE_UNKNOWN) + return format == PIPE_FORMAT_NV12; - return vl_video_buffer_is_format_supported(screen, format, profile, entrypoint); + return vl_video_buffer_is_format_supported(screen, format, profile, entrypoint); } -static unsigned get_max_threads_per_block(struct si_screen *screen, - enum pipe_shader_ir ir_type) +static unsigned get_max_threads_per_block(struct si_screen *screen, enum pipe_shader_ir ir_type) { - if (ir_type == PIPE_SHADER_IR_NATIVE) - return 256; + if (ir_type == PIPE_SHADER_IR_NATIVE) + return 256; - /* LLVM 10 only supports 1024 threads per block. */ - return 1024; + /* LLVM 10 only supports 1024 threads per block. 
*/ + return 1024; } -static int si_get_compute_param(struct pipe_screen *screen, - enum pipe_shader_ir ir_type, - enum pipe_compute_cap param, - void *ret) +static int si_get_compute_param(struct pipe_screen *screen, enum pipe_shader_ir ir_type, + enum pipe_compute_cap param, void *ret) { - struct si_screen *sscreen = (struct si_screen *)screen; - - //TODO: select these params by asic - switch (param) { - case PIPE_COMPUTE_CAP_IR_TARGET: { - const char *gpu, *triple; - - triple = "amdgcn-mesa-mesa3d"; - gpu = ac_get_llvm_processor_name(sscreen->info.family); - if (ret) { - sprintf(ret, "%s-%s", gpu, triple); - } - /* +2 for dash and terminating NIL byte */ - return (strlen(triple) + strlen(gpu) + 2) * sizeof(char); - } - case PIPE_COMPUTE_CAP_GRID_DIMENSION: - if (ret) { - uint64_t *grid_dimension = ret; - grid_dimension[0] = 3; - } - return 1 * sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: - if (ret) { - uint64_t *grid_size = ret; - grid_size[0] = 65535; - grid_size[1] = 65535; - grid_size[2] = 65535; - } - return 3 * sizeof(uint64_t) ; - - case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: - if (ret) { - uint64_t *block_size = ret; - unsigned threads_per_block = get_max_threads_per_block(sscreen, ir_type); - block_size[0] = threads_per_block; - block_size[1] = threads_per_block; - block_size[2] = threads_per_block; - } - return 3 * sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: - if (ret) { - uint64_t *max_threads_per_block = ret; - *max_threads_per_block = get_max_threads_per_block(sscreen, ir_type); - } - return sizeof(uint64_t); - case PIPE_COMPUTE_CAP_ADDRESS_BITS: - if (ret) { - uint32_t *address_bits = ret; - address_bits[0] = 64; - } - return 1 * sizeof(uint32_t); - - case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: - if (ret) { - uint64_t *max_global_size = ret; - uint64_t max_mem_alloc_size; - - si_get_compute_param(screen, ir_type, - PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, - &max_mem_alloc_size); - - /* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least - * 1/4 of the MAX_GLOBAL_SIZE. Since the - * MAX_MEM_ALLOC_SIZE is fixed for older kernels, - * make sure we never report more than - * 4 * MAX_MEM_ALLOC_SIZE. - */ - *max_global_size = MIN2(4 * max_mem_alloc_size, - MAX2(sscreen->info.gart_size, - sscreen->info.vram_size)); - } - return sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: - if (ret) { - uint64_t *max_local_size = ret; - /* Value reported by the closed source driver. */ - *max_local_size = 32768; - } - return sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: - if (ret) { - uint64_t *max_input_size = ret; - /* Value reported by the closed source driver. 
*/ - *max_input_size = 1024; - } - return sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: - if (ret) { - uint64_t *max_mem_alloc_size = ret; - - *max_mem_alloc_size = sscreen->info.max_alloc_size; - } - return sizeof(uint64_t); - - case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: - if (ret) { - uint32_t *max_clock_frequency = ret; - *max_clock_frequency = sscreen->info.max_shader_clock; - } - return sizeof(uint32_t); - - case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: - if (ret) { - uint32_t *max_compute_units = ret; - *max_compute_units = sscreen->info.num_good_compute_units; - } - return sizeof(uint32_t); - - case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: - if (ret) { - uint32_t *images_supported = ret; - *images_supported = 0; - } - return sizeof(uint32_t); - case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: - break; /* unused */ - case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: - if (ret) { - uint32_t *subgroup_size = ret; - *subgroup_size = sscreen->compute_wave_size; - } - return sizeof(uint32_t); - case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: - if (ret) { - uint64_t *max_variable_threads_per_block = ret; - if (ir_type == PIPE_SHADER_IR_NATIVE) - *max_variable_threads_per_block = 0; - else - *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK; - } - return sizeof(uint64_t); - } - - fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param); - return 0; + struct si_screen *sscreen = (struct si_screen *)screen; + + // TODO: select these params by asic + switch (param) { + case PIPE_COMPUTE_CAP_IR_TARGET: { + const char *gpu, *triple; + + triple = "amdgcn-mesa-mesa3d"; + gpu = ac_get_llvm_processor_name(sscreen->info.family); + if (ret) { + sprintf(ret, "%s-%s", gpu, triple); + } + /* +2 for dash and terminating NIL byte */ + return (strlen(triple) + strlen(gpu) + 2) * sizeof(char); + } + case PIPE_COMPUTE_CAP_GRID_DIMENSION: + if (ret) { + uint64_t *grid_dimension = ret; + grid_dimension[0] = 3; + } + return 1 * sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: + if (ret) { + uint64_t *grid_size = ret; + grid_size[0] = 65535; + grid_size[1] = 65535; + grid_size[2] = 65535; + } + return 3 * sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: + if (ret) { + uint64_t *block_size = ret; + unsigned threads_per_block = get_max_threads_per_block(sscreen, ir_type); + block_size[0] = threads_per_block; + block_size[1] = threads_per_block; + block_size[2] = threads_per_block; + } + return 3 * sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: + if (ret) { + uint64_t *max_threads_per_block = ret; + *max_threads_per_block = get_max_threads_per_block(sscreen, ir_type); + } + return sizeof(uint64_t); + case PIPE_COMPUTE_CAP_ADDRESS_BITS: + if (ret) { + uint32_t *address_bits = ret; + address_bits[0] = 64; + } + return 1 * sizeof(uint32_t); + + case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: + if (ret) { + uint64_t *max_global_size = ret; + uint64_t max_mem_alloc_size; + + si_get_compute_param(screen, ir_type, PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, + &max_mem_alloc_size); + + /* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least + * 1/4 of the MAX_GLOBAL_SIZE. Since the + * MAX_MEM_ALLOC_SIZE is fixed for older kernels, + * make sure we never report more than + * 4 * MAX_MEM_ALLOC_SIZE. + */ + *max_global_size = + MIN2(4 * max_mem_alloc_size, MAX2(sscreen->info.gart_size, sscreen->info.vram_size)); + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: + if (ret) { + uint64_t *max_local_size = ret; + /* Value reported by the closed source driver. 
*/ + *max_local_size = 32768; + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: + if (ret) { + uint64_t *max_input_size = ret; + /* Value reported by the closed source driver. */ + *max_input_size = 1024; + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: + if (ret) { + uint64_t *max_mem_alloc_size = ret; + + *max_mem_alloc_size = sscreen->info.max_alloc_size; + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: + if (ret) { + uint32_t *max_clock_frequency = ret; + *max_clock_frequency = sscreen->info.max_shader_clock; + } + return sizeof(uint32_t); + + case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: + if (ret) { + uint32_t *max_compute_units = ret; + *max_compute_units = sscreen->info.num_good_compute_units; + } + return sizeof(uint32_t); + + case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: + if (ret) { + uint32_t *images_supported = ret; + *images_supported = 0; + } + return sizeof(uint32_t); + case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: + break; /* unused */ + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: + if (ret) { + uint32_t *subgroup_size = ret; + *subgroup_size = sscreen->compute_wave_size; + } + return sizeof(uint32_t); + case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: + if (ret) { + uint64_t *max_variable_threads_per_block = ret; + if (ir_type == PIPE_SHADER_IR_NATIVE) + *max_variable_threads_per_block = 0; + else + *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK; + } + return sizeof(uint64_t); + } + + fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param); + return 0; } static uint64_t si_get_timestamp(struct pipe_screen *screen) { - struct si_screen *sscreen = (struct si_screen*)screen; + struct si_screen *sscreen = (struct si_screen *)screen; - return 1000000 * sscreen->ws->query_value(sscreen->ws, RADEON_TIMESTAMP) / - sscreen->info.clock_crystal_freq; + return 1000000 * sscreen->ws->query_value(sscreen->ws, RADEON_TIMESTAMP) / + sscreen->info.clock_crystal_freq; } -static void si_query_memory_info(struct pipe_screen *screen, - struct pipe_memory_info *info) +static void si_query_memory_info(struct pipe_screen *screen, struct pipe_memory_info *info) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct radeon_winsys *ws = sscreen->ws; - unsigned vram_usage, gtt_usage; - - info->total_device_memory = sscreen->info.vram_size / 1024; - info->total_staging_memory = sscreen->info.gart_size / 1024; - - /* The real TTM memory usage is somewhat random, because: - * - * 1) TTM delays freeing memory, because it can only free it after - * fences expire. - * - * 2) The memory usage can be really low if big VRAM evictions are - * taking place, but the real usage is well above the size of VRAM. - * - * Instead, return statistics of this process. - */ - vram_usage = ws->query_value(ws, RADEON_VRAM_USAGE) / 1024; - gtt_usage = ws->query_value(ws, RADEON_GTT_USAGE) / 1024; - - info->avail_device_memory = - vram_usage <= info->total_device_memory ? - info->total_device_memory - vram_usage : 0; - info->avail_staging_memory = - gtt_usage <= info->total_staging_memory ? - info->total_staging_memory - gtt_usage : 0; - - info->device_memory_evicted = - ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024; - - if (sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 4) - info->nr_device_memory_evictions = - ws->query_value(ws, RADEON_NUM_EVICTIONS); - else - /* Just return the number of evicted 64KB pages. 
*/ - info->nr_device_memory_evictions = info->device_memory_evicted / 64; + struct si_screen *sscreen = (struct si_screen *)screen; + struct radeon_winsys *ws = sscreen->ws; + unsigned vram_usage, gtt_usage; + + info->total_device_memory = sscreen->info.vram_size / 1024; + info->total_staging_memory = sscreen->info.gart_size / 1024; + + /* The real TTM memory usage is somewhat random, because: + * + * 1) TTM delays freeing memory, because it can only free it after + * fences expire. + * + * 2) The memory usage can be really low if big VRAM evictions are + * taking place, but the real usage is well above the size of VRAM. + * + * Instead, return statistics of this process. + */ + vram_usage = ws->query_value(ws, RADEON_VRAM_USAGE) / 1024; + gtt_usage = ws->query_value(ws, RADEON_GTT_USAGE) / 1024; + + info->avail_device_memory = + vram_usage <= info->total_device_memory ? info->total_device_memory - vram_usage : 0; + info->avail_staging_memory = + gtt_usage <= info->total_staging_memory ? info->total_staging_memory - gtt_usage : 0; + + info->device_memory_evicted = ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024; + + if (sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 4) + info->nr_device_memory_evictions = ws->query_value(ws, RADEON_NUM_EVICTIONS); + else + /* Just return the number of evicted 64KB pages. */ + info->nr_device_memory_evictions = info->device_memory_evicted / 64; } static struct disk_cache *si_get_disk_shader_cache(struct pipe_screen *pscreen) { - struct si_screen *sscreen = (struct si_screen*)pscreen; + struct si_screen *sscreen = (struct si_screen *)pscreen; - return sscreen->disk_shader_cache; + return sscreen->disk_shader_cache; } static void si_init_renderer_string(struct si_screen *sscreen) { - char first_name[256], second_name[32] = {}, kernel_version[128] = {}; - struct utsname uname_data; - - if (sscreen->info.marketing_name) { - snprintf(first_name, sizeof(first_name), "%s", - sscreen->info.marketing_name); - snprintf(second_name, sizeof(second_name), "%s, ", - sscreen->info.name); - } else { - snprintf(first_name, sizeof(first_name), "AMD %s", - sscreen->info.name); - } - - if (uname(&uname_data) == 0) - snprintf(kernel_version, sizeof(kernel_version), - ", %s", uname_data.release); - - snprintf(sscreen->renderer_string, sizeof(sscreen->renderer_string), - "%s (%sDRM %i.%i.%i%s, LLVM " MESA_LLVM_VERSION_STRING ")", - first_name, second_name, sscreen->info.drm_major, - sscreen->info.drm_minor, sscreen->info.drm_patchlevel, - kernel_version); + char first_name[256], second_name[32] = {}, kernel_version[128] = {}; + struct utsname uname_data; + + if (sscreen->info.marketing_name) { + snprintf(first_name, sizeof(first_name), "%s", sscreen->info.marketing_name); + snprintf(second_name, sizeof(second_name), "%s, ", sscreen->info.name); + } else { + snprintf(first_name, sizeof(first_name), "AMD %s", sscreen->info.name); + } + + if (uname(&uname_data) == 0) + snprintf(kernel_version, sizeof(kernel_version), ", %s", uname_data.release); + + snprintf(sscreen->renderer_string, sizeof(sscreen->renderer_string), + "%s (%sDRM %i.%i.%i%s, LLVM " MESA_LLVM_VERSION_STRING ")", first_name, second_name, + sscreen->info.drm_major, sscreen->info.drm_minor, sscreen->info.drm_patchlevel, + kernel_version); } void si_init_screen_get_functions(struct si_screen *sscreen) { - sscreen->b.get_name = si_get_name; - sscreen->b.get_vendor = si_get_vendor; - sscreen->b.get_device_vendor = si_get_device_vendor; - sscreen->b.get_param = si_get_param; - sscreen->b.get_paramf = 
si_get_paramf; - sscreen->b.get_compute_param = si_get_compute_param; - sscreen->b.get_timestamp = si_get_timestamp; - sscreen->b.get_shader_param = si_get_shader_param; - sscreen->b.get_compiler_options = si_get_compiler_options; - sscreen->b.get_device_uuid = si_get_device_uuid; - sscreen->b.get_driver_uuid = si_get_driver_uuid; - sscreen->b.query_memory_info = si_query_memory_info; - sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache; - - if (sscreen->info.has_hw_decode) { - sscreen->b.get_video_param = si_get_video_param; - sscreen->b.is_video_format_supported = si_vid_is_format_supported; - } else { - sscreen->b.get_video_param = si_get_video_param_no_decode; - sscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported; - } - - si_init_renderer_string(sscreen); + sscreen->b.get_name = si_get_name; + sscreen->b.get_vendor = si_get_vendor; + sscreen->b.get_device_vendor = si_get_device_vendor; + sscreen->b.get_param = si_get_param; + sscreen->b.get_paramf = si_get_paramf; + sscreen->b.get_compute_param = si_get_compute_param; + sscreen->b.get_timestamp = si_get_timestamp; + sscreen->b.get_shader_param = si_get_shader_param; + sscreen->b.get_compiler_options = si_get_compiler_options; + sscreen->b.get_device_uuid = si_get_device_uuid; + sscreen->b.get_driver_uuid = si_get_driver_uuid; + sscreen->b.query_memory_info = si_query_memory_info; + sscreen->b.get_disk_shader_cache = si_get_disk_shader_cache; + + if (sscreen->info.has_hw_decode) { + sscreen->b.get_video_param = si_get_video_param; + sscreen->b.is_video_format_supported = si_vid_is_format_supported; + } else { + sscreen->b.get_video_param = si_get_video_param_no_decode; + sscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported; + } + + si_init_renderer_string(sscreen); } diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 9311b6e6386..30ba6b02f87 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -23,516 +23,499 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_pipe.h" #include "si_build_pm4.h" +#include "si_pipe.h" #include "sid.h" - #include "util/os_time.h" #include "util/u_upload_mgr.h" /* initialize */ void si_need_gfx_cs_space(struct si_context *ctx) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - - /* There is no need to flush the DMA IB here, because - * si_need_dma_space always flushes the GFX IB if there is - * a conflict, which means any unflushed DMA commands automatically - * precede the GFX IB (= they had no dependency on the GFX IB when - * they were submitted). - */ - - /* There are two memory usage counters in the winsys for all buffers - * that have been added (cs_add_buffer) and two counters in the pipe - * driver for those that haven't been added yet. 
- */ - if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs, - ctx->vram, ctx->gtt))) { - ctx->gtt = 0; - ctx->vram = 0; - si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - return; - } - ctx->gtt = 0; - ctx->vram = 0; - - unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx); - if (!ctx->ws->cs_check_space(cs, need_dwords, false)) - si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + struct radeon_cmdbuf *cs = ctx->gfx_cs; + + /* There is no need to flush the DMA IB here, because + * si_need_dma_space always flushes the GFX IB if there is + * a conflict, which means any unflushed DMA commands automatically + * precede the GFX IB (= they had no dependency on the GFX IB when + * they were submitted). + */ + + /* There are two memory usage counters in the winsys for all buffers + * that have been added (cs_add_buffer) and two counters in the pipe + * driver for those that haven't been added yet. + */ + if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs, ctx->vram, ctx->gtt))) { + ctx->gtt = 0; + ctx->vram = 0; + si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + return; + } + ctx->gtt = 0; + ctx->vram = 0; + + unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx); + if (!ctx->ws->cs_check_space(cs, need_dwords, false)) + si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); } void si_unref_sdma_uploads(struct si_context *sctx) { - for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) { - si_resource_reference(&sctx->sdma_uploads[i].dst, NULL); - si_resource_reference(&sctx->sdma_uploads[i].src, NULL); - } - sctx->num_sdma_uploads = 0; + for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) { + si_resource_reference(&sctx->sdma_uploads[i].dst, NULL); + si_resource_reference(&sctx->sdma_uploads[i].src, NULL); + } + sctx->num_sdma_uploads = 0; } -void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, - struct pipe_fence_handle **fence) +void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - struct radeon_winsys *ws = ctx->ws; - const unsigned wait_ps_cs = SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH; - unsigned wait_flags = 0; - - if (ctx->gfx_flush_in_progress) - return; - - if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) { - wait_flags |= wait_ps_cs | - SI_CONTEXT_INV_L2; - } else if (ctx->chip_class == GFX6) { - /* The kernel flushes L2 before shaders are finished. */ - wait_flags |= wait_ps_cs; - } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { - wait_flags |= wait_ps_cs; - } - - /* Drop this flush if it's a no-op. */ - if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) && - (!wait_flags || !ctx->gfx_last_ib_is_busy)) - return; - - if (ctx->b.get_device_reset_status(&ctx->b) != PIPE_NO_RESET) - return; - - if (ctx->screen->debug_flags & DBG(CHECK_VM)) - flags &= ~PIPE_FLUSH_ASYNC; - - ctx->gfx_flush_in_progress = true; - - /* If the state tracker is flushing the GFX IB, si_flush_from_st is - * responsible for flushing the DMA IB and merging the fences from both. - * If the driver flushes the GFX IB internally, and it should never ask - * for a fence handle. - */ - assert(!radeon_emitted(ctx->sdma_cs, 0) || fence == NULL); - - /* Update the sdma_uploads list by flushing the uploader. */ - u_upload_unmap(ctx->b.const_uploader); - - /* Execute SDMA uploads. 
*/ - ctx->sdma_uploads_in_progress = true; - for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) { - struct si_sdma_upload *up = &ctx->sdma_uploads[i]; - - assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 && - up->size % 4 == 0); - - si_sdma_copy_buffer(ctx, &up->dst->b.b, &up->src->b.b, - up->dst_offset, up->src_offset, up->size); - } - ctx->sdma_uploads_in_progress = false; - si_unref_sdma_uploads(ctx); - - /* Flush SDMA (preamble IB). */ - if (radeon_emitted(ctx->sdma_cs, 0)) - si_flush_dma_cs(ctx, flags, NULL); - - if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) { - struct radeon_cmdbuf *compute_cs = ctx->prim_discard_compute_cs; - si_compute_signal_gfx(ctx); - - /* Make sure compute shaders are idle before leaving the IB, so that - * the next IB doesn't overwrite GDS that might be in use. */ - radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | - EVENT_INDEX(4)); - - /* Save the GDS prim restart counter if needed. */ - if (ctx->preserve_prim_restart_gds_at_flush) { - si_cp_copy_data(ctx, compute_cs, - COPY_DATA_DST_MEM, ctx->wait_mem_scratch, 4, - COPY_DATA_GDS, NULL, 4); - } - } - - if (ctx->has_graphics) { - if (!list_is_empty(&ctx->active_queries)) - si_suspend_queries(ctx); - - ctx->streamout.suspended = false; - if (ctx->streamout.begin_emitted) { - si_emit_streamout_end(ctx); - ctx->streamout.suspended = true; - - /* Since NGG streamout uses GDS, we need to make GDS - * idle when we leave the IB, otherwise another process - * might overwrite it while our shaders are busy. - */ - if (ctx->screen->use_ngg_streamout) - wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; - } - } - - /* Make sure CP DMA is idle at the end of IBs after L2 prefetches - * because the kernel doesn't wait for it. */ - if (ctx->chip_class >= GFX7) - si_cp_dma_wait_for_idle(ctx); - - /* Wait for draw calls to finish if needed. */ - if (wait_flags) { - ctx->flags |= wait_flags; - ctx->emit_cache_flush(ctx); - } - ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs; - - if (ctx->current_saved_cs) { - si_trace_emit(ctx); - - /* Save the IB for debug contexts. */ - si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true); - ctx->current_saved_cs->flushed = true; - ctx->current_saved_cs->time_flush = os_time_get_nano(); - - si_log_hw_flush(ctx); - } - - if (si_compute_prim_discard_enabled(ctx)) { - /* The compute IB can start after the previous gfx IB starts. */ - if (radeon_emitted(ctx->prim_discard_compute_cs, 0) && - ctx->last_gfx_fence) { - ctx->ws->cs_add_fence_dependency(ctx->gfx_cs, - ctx->last_gfx_fence, - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | - RADEON_DEPENDENCY_START_FENCE); - } - - /* Remember the last execution barrier. It's in the IB. - * It will signal the start of the next compute IB. - */ - if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && - ctx->last_pkt3_write_data) { - *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0); - ctx->last_pkt3_write_data = NULL; - - si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf); - ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset; - si_resource_reference(&ctx->barrier_buf, NULL); - - ws->fence_reference(&ctx->last_ib_barrier_fence, NULL); - } - } - - /* Flush the CS. */ - ws->cs_flush(cs, flags, &ctx->last_gfx_fence); - if (fence) - ws->fence_reference(fence, ctx->last_gfx_fence); - - ctx->num_gfx_cs_flushes++; - - if (si_compute_prim_discard_enabled(ctx)) { - /* Remember the last execution barrier, which is the last fence - * in this case. 
- */ - if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { - ctx->last_pkt3_write_data = NULL; - si_resource_reference(&ctx->last_ib_barrier_buf, NULL); - ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence); - } - } - - /* Check VM faults if needed. */ - if (ctx->screen->debug_flags & DBG(CHECK_VM)) { - /* Use conservative timeout 800ms, after which we won't wait any - * longer and assume the GPU is hung. - */ - ctx->ws->fence_wait(ctx->ws, ctx->last_gfx_fence, 800*1000*1000); - - si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, RING_GFX); - } - - if (ctx->current_saved_cs) - si_saved_cs_reference(&ctx->current_saved_cs, NULL); - - si_begin_new_gfx_cs(ctx); - ctx->gfx_flush_in_progress = false; + struct radeon_cmdbuf *cs = ctx->gfx_cs; + struct radeon_winsys *ws = ctx->ws; + const unsigned wait_ps_cs = SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; + unsigned wait_flags = 0; + + if (ctx->gfx_flush_in_progress) + return; + + if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) { + wait_flags |= wait_ps_cs | SI_CONTEXT_INV_L2; + } else if (ctx->chip_class == GFX6) { + /* The kernel flushes L2 before shaders are finished. */ + wait_flags |= wait_ps_cs; + } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { + wait_flags |= wait_ps_cs; + } + + /* Drop this flush if it's a no-op. */ + if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) && (!wait_flags || !ctx->gfx_last_ib_is_busy)) + return; + + if (ctx->b.get_device_reset_status(&ctx->b) != PIPE_NO_RESET) + return; + + if (ctx->screen->debug_flags & DBG(CHECK_VM)) + flags &= ~PIPE_FLUSH_ASYNC; + + ctx->gfx_flush_in_progress = true; + + /* If the state tracker is flushing the GFX IB, si_flush_from_st is + * responsible for flushing the DMA IB and merging the fences from both. + * If the driver flushes the GFX IB internally, and it should never ask + * for a fence handle. + */ + assert(!radeon_emitted(ctx->sdma_cs, 0) || fence == NULL); + + /* Update the sdma_uploads list by flushing the uploader. */ + u_upload_unmap(ctx->b.const_uploader); + + /* Execute SDMA uploads. */ + ctx->sdma_uploads_in_progress = true; + for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) { + struct si_sdma_upload *up = &ctx->sdma_uploads[i]; + + assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 && up->size % 4 == 0); + + si_sdma_copy_buffer(ctx, &up->dst->b.b, &up->src->b.b, up->dst_offset, up->src_offset, + up->size); + } + ctx->sdma_uploads_in_progress = false; + si_unref_sdma_uploads(ctx); + + /* Flush SDMA (preamble IB). */ + if (radeon_emitted(ctx->sdma_cs, 0)) + si_flush_dma_cs(ctx, flags, NULL); + + if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) { + struct radeon_cmdbuf *compute_cs = ctx->prim_discard_compute_cs; + si_compute_signal_gfx(ctx); + + /* Make sure compute shaders are idle before leaving the IB, so that + * the next IB doesn't overwrite GDS that might be in use. */ + radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + + /* Save the GDS prim restart counter if needed. 
*/ + if (ctx->preserve_prim_restart_gds_at_flush) { + si_cp_copy_data(ctx, compute_cs, COPY_DATA_DST_MEM, ctx->wait_mem_scratch, 4, + COPY_DATA_GDS, NULL, 4); + } + } + + if (ctx->has_graphics) { + if (!list_is_empty(&ctx->active_queries)) + si_suspend_queries(ctx); + + ctx->streamout.suspended = false; + if (ctx->streamout.begin_emitted) { + si_emit_streamout_end(ctx); + ctx->streamout.suspended = true; + + /* Since NGG streamout uses GDS, we need to make GDS + * idle when we leave the IB, otherwise another process + * might overwrite it while our shaders are busy. + */ + if (ctx->screen->use_ngg_streamout) + wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; + } + } + + /* Make sure CP DMA is idle at the end of IBs after L2 prefetches + * because the kernel doesn't wait for it. */ + if (ctx->chip_class >= GFX7) + si_cp_dma_wait_for_idle(ctx); + + /* Wait for draw calls to finish if needed. */ + if (wait_flags) { + ctx->flags |= wait_flags; + ctx->emit_cache_flush(ctx); + } + ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs; + + if (ctx->current_saved_cs) { + si_trace_emit(ctx); + + /* Save the IB for debug contexts. */ + si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true); + ctx->current_saved_cs->flushed = true; + ctx->current_saved_cs->time_flush = os_time_get_nano(); + + si_log_hw_flush(ctx); + } + + if (si_compute_prim_discard_enabled(ctx)) { + /* The compute IB can start after the previous gfx IB starts. */ + if (radeon_emitted(ctx->prim_discard_compute_cs, 0) && ctx->last_gfx_fence) { + ctx->ws->cs_add_fence_dependency( + ctx->gfx_cs, ctx->last_gfx_fence, + RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | RADEON_DEPENDENCY_START_FENCE); + } + + /* Remember the last execution barrier. It's in the IB. + * It will signal the start of the next compute IB. + */ + if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && ctx->last_pkt3_write_data) { + *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0); + ctx->last_pkt3_write_data = NULL; + + si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf); + ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset; + si_resource_reference(&ctx->barrier_buf, NULL); + + ws->fence_reference(&ctx->last_ib_barrier_fence, NULL); + } + } + + /* Flush the CS. */ + ws->cs_flush(cs, flags, &ctx->last_gfx_fence); + if (fence) + ws->fence_reference(fence, ctx->last_gfx_fence); + + ctx->num_gfx_cs_flushes++; + + if (si_compute_prim_discard_enabled(ctx)) { + /* Remember the last execution barrier, which is the last fence + * in this case. + */ + if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { + ctx->last_pkt3_write_data = NULL; + si_resource_reference(&ctx->last_ib_barrier_buf, NULL); + ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence); + } + } + + /* Check VM faults if needed. */ + if (ctx->screen->debug_flags & DBG(CHECK_VM)) { + /* Use conservative timeout 800ms, after which we won't wait any + * longer and assume the GPU is hung. 
+ */ + ctx->ws->fence_wait(ctx->ws, ctx->last_gfx_fence, 800 * 1000 * 1000); + + si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, RING_GFX); + } + + if (ctx->current_saved_cs) + si_saved_cs_reference(&ctx->current_saved_cs, NULL); + + si_begin_new_gfx_cs(ctx); + ctx->gfx_flush_in_progress = false; } static void si_begin_gfx_cs_debug(struct si_context *ctx) { - static const uint32_t zeros[1]; - assert(!ctx->current_saved_cs); + static const uint32_t zeros[1]; + assert(!ctx->current_saved_cs); - ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs)); - if (!ctx->current_saved_cs) - return; + ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs)); + if (!ctx->current_saved_cs) + return; - pipe_reference_init(&ctx->current_saved_cs->reference, 1); + pipe_reference_init(&ctx->current_saved_cs->reference, 1); - ctx->current_saved_cs->trace_buf = si_resource( - pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8)); - if (!ctx->current_saved_cs->trace_buf) { - free(ctx->current_saved_cs); - ctx->current_saved_cs = NULL; - return; - } + ctx->current_saved_cs->trace_buf = + si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8)); + if (!ctx->current_saved_cs->trace_buf) { + free(ctx->current_saved_cs); + ctx->current_saved_cs = NULL; + return; + } - pipe_buffer_write_nooverlap(&ctx->b, &ctx->current_saved_cs->trace_buf->b.b, - 0, sizeof(zeros), zeros); - ctx->current_saved_cs->trace_id = 0; + pipe_buffer_write_nooverlap(&ctx->b, &ctx->current_saved_cs->trace_buf->b.b, 0, sizeof(zeros), + zeros); + ctx->current_saved_cs->trace_id = 0; - si_trace_emit(ctx); + si_trace_emit(ctx); - radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf, - RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE); + radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf, + RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE); } static void si_add_gds_to_buffer_list(struct si_context *sctx) { - if (sctx->gds) { - sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, - RADEON_USAGE_READWRITE, 0, 0); - if (sctx->gds_oa) { - sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, - RADEON_USAGE_READWRITE, 0, 0); - } - } + if (sctx->gds) { + sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0); + if (sctx->gds_oa) { + sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0); + } + } } void si_allocate_gds(struct si_context *sctx) { - struct radeon_winsys *ws = sctx->ws; + struct radeon_winsys *ws = sctx->ws; - if (sctx->gds) - return; + if (sctx->gds) + return; - assert(sctx->screen->use_ngg_streamout); + assert(sctx->screen->use_ngg_streamout); - /* 4 streamout GDS counters. - * We need 256B (64 dw) of GDS, otherwise streamout hangs. - */ - sctx->gds = ws->buffer_create(ws, 256, 4, RADEON_DOMAIN_GDS, 0); - sctx->gds_oa = ws->buffer_create(ws, 4, 1, RADEON_DOMAIN_OA, 0); + /* 4 streamout GDS counters. + * We need 256B (64 dw) of GDS, otherwise streamout hangs. + */ + sctx->gds = ws->buffer_create(ws, 256, 4, RADEON_DOMAIN_GDS, 0); + sctx->gds_oa = ws->buffer_create(ws, 4, 1, RADEON_DOMAIN_OA, 0); - assert(sctx->gds && sctx->gds_oa); - si_add_gds_to_buffer_list(sctx); + assert(sctx->gds && sctx->gds_oa); + si_add_gds_to_buffer_list(sctx); } void si_begin_new_gfx_cs(struct si_context *ctx) { - if (ctx->is_debug) - si_begin_gfx_cs_debug(ctx); - - si_add_gds_to_buffer_list(ctx); - - /* Always invalidate caches at the beginning of IBs, because external - * users (e.g. 
BO evictions and SDMA/UVD/VCE IBs) can modify our - * buffers. - * - * Note that the cache flush done by the kernel at the end of GFX IBs - * isn't useful here, because that flush can finish after the following - * IB starts drawing. - * - * TODO: Do we also need to invalidate CB & DB caches? - */ - ctx->flags |= SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_INV_SCACHE | - SI_CONTEXT_INV_VCACHE | - SI_CONTEXT_INV_L2 | - SI_CONTEXT_START_PIPELINE_STATS; - - ctx->cs_shader_state.initialized = false; - si_all_descriptors_begin_new_cs(ctx); - - if (!ctx->has_graphics) { - ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw; - return; - } - - /* set all valid group as dirty so they get reemited on - * next draw command - */ - si_pm4_reset_emitted(ctx); - - /* The CS initialization should be emitted before everything else. */ - si_pm4_emit(ctx, ctx->init_config); - if (ctx->init_config_gs_rings) - si_pm4_emit(ctx, ctx->init_config_gs_rings); - - if (ctx->queued.named.ls) - ctx->prefetch_L2_mask |= SI_PREFETCH_LS; - if (ctx->queued.named.hs) - ctx->prefetch_L2_mask |= SI_PREFETCH_HS; - if (ctx->queued.named.es) - ctx->prefetch_L2_mask |= SI_PREFETCH_ES; - if (ctx->queued.named.gs) - ctx->prefetch_L2_mask |= SI_PREFETCH_GS; - if (ctx->queued.named.vs) - ctx->prefetch_L2_mask |= SI_PREFETCH_VS; - if (ctx->queued.named.ps) - ctx->prefetch_L2_mask |= SI_PREFETCH_PS; - if (ctx->vb_descriptors_buffer && ctx->vertex_elements) - ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; - - /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */ - bool has_clear_state = ctx->screen->info.has_clear_state; - if (has_clear_state) { - ctx->framebuffer.dirty_cbufs = - u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs); - /* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */ - ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL; - } else { - ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8); - ctx->framebuffer.dirty_zsbuf = true; - } - /* This should always be marked as dirty to set the framebuffer scissor - * at least. */ - si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer); - - si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs); - /* CLEAR_STATE sets zeros. */ - if (!has_clear_state || ctx->clip_state.any_nonzeros) - si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state); - ctx->sample_locs_num_samples = 0; - si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs); - si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config); - /* CLEAR_STATE sets 0xffff. */ - if (!has_clear_state || ctx->sample_mask != 0xffff) - si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask); - si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state); - /* CLEAR_STATE sets zeros. */ - if (!has_clear_state || ctx->blend_color.any_nonzeros) - si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color); - si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state); - if (ctx->chip_class >= GFX9) - si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state); - si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref); - si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map); - if (!ctx->screen->use_ngg_streamout) - si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable); - si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond); - /* CLEAR_STATE disables all window rectangles. 
*/ - if (!has_clear_state || ctx->num_window_rectangles > 0) - si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles); - - si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); - si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); - si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); - - si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state); - if (ctx->scratch_buffer) { - si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b); - } - - if (ctx->streamout.suspended) { - ctx->streamout.append_bitmask = ctx->streamout.enabled_mask; - si_streamout_buffers_dirty(ctx); - } - - if (!list_is_empty(&ctx->active_queries)) - si_resume_queries(ctx); - - assert(!ctx->gfx_cs->prev_dw); - ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw; - - /* Invalidate various draw states so that they are emitted before - * the first draw call. */ - si_invalidate_draw_sh_constants(ctx); - ctx->last_index_size = -1; - ctx->last_primitive_restart_en = -1; - ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN; - ctx->last_prim = -1; - ctx->last_multi_vgt_param = -1; - ctx->last_vs_state = ~0; - ctx->last_ls = NULL; - ctx->last_tcs = NULL; - ctx->last_tes_sh_base = -1; - ctx->last_num_tcs_input_cp = -1; - ctx->last_ls_hs_config = -1; /* impossible value */ - ctx->last_binning_enabled = -1; - ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL; - - ctx->prim_discard_compute_ib_initialized = false; - - /* Compute-based primitive discard: - * The index ring is divided into 2 halves. Switch between the halves - * in the same fashion as doublebuffering. - */ - if (ctx->index_ring_base) - ctx->index_ring_base = 0; - else - ctx->index_ring_base = ctx->index_ring_size_per_ib; - - ctx->index_ring_offset = 0; - - STATIC_ASSERT(SI_NUM_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved) * 8); - - if (has_clear_state) { - ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff; - ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__VS] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__CL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003; - ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000; - 
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_IDX_FORMAT] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x00000002; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x0000001e; /* From GFX8 */ - - /* Set all cleared context registers to saved. */ - ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */ - ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */ - } else { - /* Set all register values to unknown. */ - ctx->tracked_regs.reg_saved = 0; - ctx->last_gs_out_prim = -1; /* unknown */ - } - - /* 0xffffffff is a impossible value to register SPI_PS_INPUT_CNTL_n */ - memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32); + if (ctx->is_debug) + si_begin_gfx_cs_debug(ctx); + + si_add_gds_to_buffer_list(ctx); + + /* Always invalidate caches at the beginning of IBs, because external + * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our + * buffers. 
+ * + * Note that the cache flush done by the kernel at the end of GFX IBs + * isn't useful here, because that flush can finish after the following + * IB starts drawing. + * + * TODO: Do we also need to invalidate CB & DB caches? + */ + ctx->flags |= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_INV_L2 | SI_CONTEXT_START_PIPELINE_STATS; + + ctx->cs_shader_state.initialized = false; + si_all_descriptors_begin_new_cs(ctx); + + if (!ctx->has_graphics) { + ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw; + return; + } + + /* set all valid group as dirty so they get reemited on + * next draw command + */ + si_pm4_reset_emitted(ctx); + + /* The CS initialization should be emitted before everything else. */ + si_pm4_emit(ctx, ctx->init_config); + if (ctx->init_config_gs_rings) + si_pm4_emit(ctx, ctx->init_config_gs_rings); + + if (ctx->queued.named.ls) + ctx->prefetch_L2_mask |= SI_PREFETCH_LS; + if (ctx->queued.named.hs) + ctx->prefetch_L2_mask |= SI_PREFETCH_HS; + if (ctx->queued.named.es) + ctx->prefetch_L2_mask |= SI_PREFETCH_ES; + if (ctx->queued.named.gs) + ctx->prefetch_L2_mask |= SI_PREFETCH_GS; + if (ctx->queued.named.vs) + ctx->prefetch_L2_mask |= SI_PREFETCH_VS; + if (ctx->queued.named.ps) + ctx->prefetch_L2_mask |= SI_PREFETCH_PS; + if (ctx->vb_descriptors_buffer && ctx->vertex_elements) + ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; + + /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */ + bool has_clear_state = ctx->screen->info.has_clear_state; + if (has_clear_state) { + ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs); + /* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */ + ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL; + } else { + ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8); + ctx->framebuffer.dirty_zsbuf = true; + } + /* This should always be marked as dirty to set the framebuffer scissor + * at least. */ + si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer); + + si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs); + /* CLEAR_STATE sets zeros. */ + if (!has_clear_state || ctx->clip_state.any_nonzeros) + si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state); + ctx->sample_locs_num_samples = 0; + si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs); + si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config); + /* CLEAR_STATE sets 0xffff. */ + if (!has_clear_state || ctx->sample_mask != 0xffff) + si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask); + si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state); + /* CLEAR_STATE sets zeros. */ + if (!has_clear_state || ctx->blend_color.any_nonzeros) + si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color); + si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state); + if (ctx->chip_class >= GFX9) + si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state); + si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref); + si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map); + if (!ctx->screen->use_ngg_streamout) + si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable); + si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond); + /* CLEAR_STATE disables all window rectangles. 
*/ + if (!has_clear_state || ctx->num_window_rectangles > 0) + si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles); + + si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); + si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); + si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); + + si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state); + if (ctx->scratch_buffer) { + si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b); + } + + if (ctx->streamout.suspended) { + ctx->streamout.append_bitmask = ctx->streamout.enabled_mask; + si_streamout_buffers_dirty(ctx); + } + + if (!list_is_empty(&ctx->active_queries)) + si_resume_queries(ctx); + + assert(!ctx->gfx_cs->prev_dw); + ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw; + + /* Invalidate various draw states so that they are emitted before + * the first draw call. */ + si_invalidate_draw_sh_constants(ctx); + ctx->last_index_size = -1; + ctx->last_primitive_restart_en = -1; + ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN; + ctx->last_prim = -1; + ctx->last_multi_vgt_param = -1; + ctx->last_vs_state = ~0; + ctx->last_ls = NULL; + ctx->last_tcs = NULL; + ctx->last_tes_sh_base = -1; + ctx->last_num_tcs_input_cp = -1; + ctx->last_ls_hs_config = -1; /* impossible value */ + ctx->last_binning_enabled = -1; + ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL; + + ctx->prim_discard_compute_ib_initialized = false; + + /* Compute-based primitive discard: + * The index ring is divided into 2 halves. Switch between the halves + * in the same fashion as doublebuffering. + */ + if (ctx->index_ring_base) + ctx->index_ring_base = 0; + else + ctx->index_ring_base = ctx->index_ring_size_per_ib; + + ctx->index_ring_offset = 0; + + STATIC_ASSERT(SI_NUM_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved) * 8); + + if (has_clear_state) { + ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff; + ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__VS] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__CL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003; + ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000; + 
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_IDX_FORMAT] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x00000002; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = + 0x0000001e; /* From GFX8 */ + + /* Set all cleared context registers to saved. */ + ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */ + ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */ + } else { + /* Set all register values to unknown. 
*/ + ctx->tracked_regs.reg_saved = 0; + ctx->last_gs_out_prim = -1; /* unknown */ + } + + /* 0xffffffff is a impossible value to register SPI_PS_INPUT_CNTL_n */ + memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32); } diff --git a/src/gallium/drivers/radeonsi/si_gpu_load.c b/src/gallium/drivers/radeonsi/si_gpu_load.c index 33cd5642230..806f98ad520 100644 --- a/src/gallium/drivers/radeonsi/si_gpu_load.c +++ b/src/gallium/drivers/radeonsi/si_gpu_load.c @@ -40,242 +40,234 @@ * fps (there are too few samples per frame). */ #define SAMPLES_PER_SEC 10000 -#define GRBM_STATUS 0x8010 -#define TA_BUSY(x) (((x) >> 14) & 0x1) -#define GDS_BUSY(x) (((x) >> 15) & 0x1) -#define VGT_BUSY(x) (((x) >> 17) & 0x1) -#define IA_BUSY(x) (((x) >> 19) & 0x1) -#define SX_BUSY(x) (((x) >> 20) & 0x1) -#define WD_BUSY(x) (((x) >> 21) & 0x1) -#define SPI_BUSY(x) (((x) >> 22) & 0x1) -#define BCI_BUSY(x) (((x) >> 23) & 0x1) -#define SC_BUSY(x) (((x) >> 24) & 0x1) -#define PA_BUSY(x) (((x) >> 25) & 0x1) -#define DB_BUSY(x) (((x) >> 26) & 0x1) -#define CP_BUSY(x) (((x) >> 29) & 0x1) -#define CB_BUSY(x) (((x) >> 30) & 0x1) -#define GUI_ACTIVE(x) (((x) >> 31) & 0x1) - -#define SRBM_STATUS2 0x0e4c -#define SDMA_BUSY(x) (((x) >> 5) & 0x1) - -#define CP_STAT 0x8680 -#define PFP_BUSY(x) (((x) >> 15) & 0x1) -#define MEQ_BUSY(x) (((x) >> 16) & 0x1) -#define ME_BUSY(x) (((x) >> 17) & 0x1) -#define SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1) -#define DMA_BUSY(x) (((x) >> 22) & 0x1) -#define SCRATCH_RAM_BUSY(x) (((x) >> 24) & 0x1) +#define GRBM_STATUS 0x8010 +#define TA_BUSY(x) (((x) >> 14) & 0x1) +#define GDS_BUSY(x) (((x) >> 15) & 0x1) +#define VGT_BUSY(x) (((x) >> 17) & 0x1) +#define IA_BUSY(x) (((x) >> 19) & 0x1) +#define SX_BUSY(x) (((x) >> 20) & 0x1) +#define WD_BUSY(x) (((x) >> 21) & 0x1) +#define SPI_BUSY(x) (((x) >> 22) & 0x1) +#define BCI_BUSY(x) (((x) >> 23) & 0x1) +#define SC_BUSY(x) (((x) >> 24) & 0x1) +#define PA_BUSY(x) (((x) >> 25) & 0x1) +#define DB_BUSY(x) (((x) >> 26) & 0x1) +#define CP_BUSY(x) (((x) >> 29) & 0x1) +#define CB_BUSY(x) (((x) >> 30) & 0x1) +#define GUI_ACTIVE(x) (((x) >> 31) & 0x1) + +#define SRBM_STATUS2 0x0e4c +#define SDMA_BUSY(x) (((x) >> 5) & 0x1) + +#define CP_STAT 0x8680 +#define PFP_BUSY(x) (((x) >> 15) & 0x1) +#define MEQ_BUSY(x) (((x) >> 16) & 0x1) +#define ME_BUSY(x) (((x) >> 17) & 0x1) +#define SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1) +#define DMA_BUSY(x) (((x) >> 22) & 0x1) +#define SCRATCH_RAM_BUSY(x) (((x) >> 24) & 0x1) #define IDENTITY(x) x -#define UPDATE_COUNTER(field, mask) \ - do { \ - if (mask(value)) \ - p_atomic_inc(&counters->named.field.busy); \ - else \ - p_atomic_inc(&counters->named.field.idle); \ - } while (0) +#define UPDATE_COUNTER(field, mask) \ + do { \ + if (mask(value)) \ + p_atomic_inc(&counters->named.field.busy); \ + else \ + p_atomic_inc(&counters->named.field.idle); \ + } while (0) -static void si_update_mmio_counters(struct si_screen *sscreen, - union si_mmio_counters *counters) +static void si_update_mmio_counters(struct si_screen *sscreen, union si_mmio_counters *counters) { - uint32_t value = 0; - bool gui_busy, sdma_busy = false; - - /* GRBM_STATUS */ - sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &value); - - UPDATE_COUNTER(ta, TA_BUSY); - UPDATE_COUNTER(gds, GDS_BUSY); - UPDATE_COUNTER(vgt, VGT_BUSY); - UPDATE_COUNTER(ia, IA_BUSY); - UPDATE_COUNTER(sx, SX_BUSY); - UPDATE_COUNTER(wd, WD_BUSY); - UPDATE_COUNTER(spi, SPI_BUSY); - UPDATE_COUNTER(bci, BCI_BUSY); - UPDATE_COUNTER(sc, SC_BUSY); - UPDATE_COUNTER(pa, PA_BUSY); - 
UPDATE_COUNTER(db, DB_BUSY); - UPDATE_COUNTER(cp, CP_BUSY); - UPDATE_COUNTER(cb, CB_BUSY); - UPDATE_COUNTER(gui, GUI_ACTIVE); - gui_busy = GUI_ACTIVE(value); - - if (sscreen->info.chip_class == GFX7 || sscreen->info.chip_class == GFX8) { - /* SRBM_STATUS2 */ - sscreen->ws->read_registers(sscreen->ws, SRBM_STATUS2, 1, &value); - - UPDATE_COUNTER(sdma, SDMA_BUSY); - sdma_busy = SDMA_BUSY(value); - } - - if (sscreen->info.chip_class >= GFX8) { - /* CP_STAT */ - sscreen->ws->read_registers(sscreen->ws, CP_STAT, 1, &value); - - UPDATE_COUNTER(pfp, PFP_BUSY); - UPDATE_COUNTER(meq, MEQ_BUSY); - UPDATE_COUNTER(me, ME_BUSY); - UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY); - UPDATE_COUNTER(cp_dma, DMA_BUSY); - UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY); - } - - value = gui_busy || sdma_busy; - UPDATE_COUNTER(gpu, IDENTITY); + uint32_t value = 0; + bool gui_busy, sdma_busy = false; + + /* GRBM_STATUS */ + sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &value); + + UPDATE_COUNTER(ta, TA_BUSY); + UPDATE_COUNTER(gds, GDS_BUSY); + UPDATE_COUNTER(vgt, VGT_BUSY); + UPDATE_COUNTER(ia, IA_BUSY); + UPDATE_COUNTER(sx, SX_BUSY); + UPDATE_COUNTER(wd, WD_BUSY); + UPDATE_COUNTER(spi, SPI_BUSY); + UPDATE_COUNTER(bci, BCI_BUSY); + UPDATE_COUNTER(sc, SC_BUSY); + UPDATE_COUNTER(pa, PA_BUSY); + UPDATE_COUNTER(db, DB_BUSY); + UPDATE_COUNTER(cp, CP_BUSY); + UPDATE_COUNTER(cb, CB_BUSY); + UPDATE_COUNTER(gui, GUI_ACTIVE); + gui_busy = GUI_ACTIVE(value); + + if (sscreen->info.chip_class == GFX7 || sscreen->info.chip_class == GFX8) { + /* SRBM_STATUS2 */ + sscreen->ws->read_registers(sscreen->ws, SRBM_STATUS2, 1, &value); + + UPDATE_COUNTER(sdma, SDMA_BUSY); + sdma_busy = SDMA_BUSY(value); + } + + if (sscreen->info.chip_class >= GFX8) { + /* CP_STAT */ + sscreen->ws->read_registers(sscreen->ws, CP_STAT, 1, &value); + + UPDATE_COUNTER(pfp, PFP_BUSY); + UPDATE_COUNTER(meq, MEQ_BUSY); + UPDATE_COUNTER(me, ME_BUSY); + UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY); + UPDATE_COUNTER(cp_dma, DMA_BUSY); + UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY); + } + + value = gui_busy || sdma_busy; + UPDATE_COUNTER(gpu, IDENTITY); } #undef UPDATE_COUNTER -static int -si_gpu_load_thread(void *param) +static int si_gpu_load_thread(void *param) { - struct si_screen *sscreen = (struct si_screen*)param; - const int period_us = 1000000 / SAMPLES_PER_SEC; - int sleep_us = period_us; - int64_t cur_time, last_time = os_time_get(); - - while (!p_atomic_read(&sscreen->gpu_load_stop_thread)) { - if (sleep_us) - os_time_sleep(sleep_us); - - /* Make sure we sleep the ideal amount of time to match - * the expected frequency. */ - cur_time = os_time_get(); - - if (os_time_timeout(last_time, last_time + period_us, - cur_time)) - sleep_us = MAX2(sleep_us - 1, 1); - else - sleep_us += 1; - - /*printf("Hz: %.1f\n", 1000000.0 / (cur_time - last_time));*/ - last_time = cur_time; - - /* Update the counters. */ - si_update_mmio_counters(sscreen, &sscreen->mmio_counters); - } - p_atomic_dec(&sscreen->gpu_load_stop_thread); - return 0; + struct si_screen *sscreen = (struct si_screen *)param; + const int period_us = 1000000 / SAMPLES_PER_SEC; + int sleep_us = period_us; + int64_t cur_time, last_time = os_time_get(); + + while (!p_atomic_read(&sscreen->gpu_load_stop_thread)) { + if (sleep_us) + os_time_sleep(sleep_us); + + /* Make sure we sleep the ideal amount of time to match + * the expected frequency. 
*/ + cur_time = os_time_get(); + + if (os_time_timeout(last_time, last_time + period_us, cur_time)) + sleep_us = MAX2(sleep_us - 1, 1); + else + sleep_us += 1; + + /*printf("Hz: %.1f\n", 1000000.0 / (cur_time - last_time));*/ + last_time = cur_time; + + /* Update the counters. */ + si_update_mmio_counters(sscreen, &sscreen->mmio_counters); + } + p_atomic_dec(&sscreen->gpu_load_stop_thread); + return 0; } void si_gpu_load_kill_thread(struct si_screen *sscreen) { - if (!sscreen->gpu_load_thread) - return; + if (!sscreen->gpu_load_thread) + return; - p_atomic_inc(&sscreen->gpu_load_stop_thread); - thrd_join(sscreen->gpu_load_thread, NULL); - sscreen->gpu_load_thread = 0; + p_atomic_inc(&sscreen->gpu_load_stop_thread); + thrd_join(sscreen->gpu_load_thread, NULL); + sscreen->gpu_load_thread = 0; } -static uint64_t si_read_mmio_counter(struct si_screen *sscreen, - unsigned busy_index) +static uint64_t si_read_mmio_counter(struct si_screen *sscreen, unsigned busy_index) { - /* Start the thread if needed. */ - if (!sscreen->gpu_load_thread) { - simple_mtx_lock(&sscreen->gpu_load_mutex); - /* Check again inside the mutex. */ - if (!sscreen->gpu_load_thread) - sscreen->gpu_load_thread = - u_thread_create(si_gpu_load_thread, sscreen); - simple_mtx_unlock(&sscreen->gpu_load_mutex); - } - - unsigned busy = p_atomic_read(&sscreen->mmio_counters.array[busy_index]); - unsigned idle = p_atomic_read(&sscreen->mmio_counters.array[busy_index + 1]); - - return busy | ((uint64_t)idle << 32); + /* Start the thread if needed. */ + if (!sscreen->gpu_load_thread) { + simple_mtx_lock(&sscreen->gpu_load_mutex); + /* Check again inside the mutex. */ + if (!sscreen->gpu_load_thread) + sscreen->gpu_load_thread = u_thread_create(si_gpu_load_thread, sscreen); + simple_mtx_unlock(&sscreen->gpu_load_mutex); + } + + unsigned busy = p_atomic_read(&sscreen->mmio_counters.array[busy_index]); + unsigned idle = p_atomic_read(&sscreen->mmio_counters.array[busy_index + 1]); + + return busy | ((uint64_t)idle << 32); } -static unsigned si_end_mmio_counter(struct si_screen *sscreen, - uint64_t begin, unsigned busy_index) +static unsigned si_end_mmio_counter(struct si_screen *sscreen, uint64_t begin, unsigned busy_index) { - uint64_t end = si_read_mmio_counter(sscreen, busy_index); - unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff); - unsigned idle = (end >> 32) - (begin >> 32); - - /* Calculate the % of time the busy counter was being incremented. - * - * If no counters were incremented, return the current counter status. - * It's for the case when the load is queried faster than - * the counters are updated. - */ - if (idle || busy) { - return busy*100 / (busy + idle); - } else { - union si_mmio_counters counters; - - memset(&counters, 0, sizeof(counters)); - si_update_mmio_counters(sscreen, &counters); - return counters.array[busy_index] ? 100 : 0; - } + uint64_t end = si_read_mmio_counter(sscreen, busy_index); + unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff); + unsigned idle = (end >> 32) - (begin >> 32); + + /* Calculate the % of time the busy counter was being incremented. + * + * If no counters were incremented, return the current counter status. + * It's for the case when the load is queried faster than + * the counters are updated. + */ + if (idle || busy) { + return busy * 100 / (busy + idle); + } else { + union si_mmio_counters counters; + + memset(&counters, 0, sizeof(counters)); + si_update_mmio_counters(sscreen, &counters); + return counters.array[busy_index] ? 
100 : 0; + } } -#define BUSY_INDEX(sscreen, field) (&sscreen->mmio_counters.named.field.busy - \ - sscreen->mmio_counters.array) +#define BUSY_INDEX(sscreen, field) \ + (&sscreen->mmio_counters.named.field.busy - sscreen->mmio_counters.array) -static unsigned busy_index_from_type(struct si_screen *sscreen, - unsigned type) +static unsigned busy_index_from_type(struct si_screen *sscreen, unsigned type) { - switch (type) { - case SI_QUERY_GPU_LOAD: - return BUSY_INDEX(sscreen, gpu); - case SI_QUERY_GPU_SHADERS_BUSY: - return BUSY_INDEX(sscreen, spi); - case SI_QUERY_GPU_TA_BUSY: - return BUSY_INDEX(sscreen, ta); - case SI_QUERY_GPU_GDS_BUSY: - return BUSY_INDEX(sscreen, gds); - case SI_QUERY_GPU_VGT_BUSY: - return BUSY_INDEX(sscreen, vgt); - case SI_QUERY_GPU_IA_BUSY: - return BUSY_INDEX(sscreen, ia); - case SI_QUERY_GPU_SX_BUSY: - return BUSY_INDEX(sscreen, sx); - case SI_QUERY_GPU_WD_BUSY: - return BUSY_INDEX(sscreen, wd); - case SI_QUERY_GPU_BCI_BUSY: - return BUSY_INDEX(sscreen, bci); - case SI_QUERY_GPU_SC_BUSY: - return BUSY_INDEX(sscreen, sc); - case SI_QUERY_GPU_PA_BUSY: - return BUSY_INDEX(sscreen, pa); - case SI_QUERY_GPU_DB_BUSY: - return BUSY_INDEX(sscreen, db); - case SI_QUERY_GPU_CP_BUSY: - return BUSY_INDEX(sscreen, cp); - case SI_QUERY_GPU_CB_BUSY: - return BUSY_INDEX(sscreen, cb); - case SI_QUERY_GPU_SDMA_BUSY: - return BUSY_INDEX(sscreen, sdma); - case SI_QUERY_GPU_PFP_BUSY: - return BUSY_INDEX(sscreen, pfp); - case SI_QUERY_GPU_MEQ_BUSY: - return BUSY_INDEX(sscreen, meq); - case SI_QUERY_GPU_ME_BUSY: - return BUSY_INDEX(sscreen, me); - case SI_QUERY_GPU_SURF_SYNC_BUSY: - return BUSY_INDEX(sscreen, surf_sync); - case SI_QUERY_GPU_CP_DMA_BUSY: - return BUSY_INDEX(sscreen, cp_dma); - case SI_QUERY_GPU_SCRATCH_RAM_BUSY: - return BUSY_INDEX(sscreen, scratch_ram); - default: - unreachable("invalid query type"); - } + switch (type) { + case SI_QUERY_GPU_LOAD: + return BUSY_INDEX(sscreen, gpu); + case SI_QUERY_GPU_SHADERS_BUSY: + return BUSY_INDEX(sscreen, spi); + case SI_QUERY_GPU_TA_BUSY: + return BUSY_INDEX(sscreen, ta); + case SI_QUERY_GPU_GDS_BUSY: + return BUSY_INDEX(sscreen, gds); + case SI_QUERY_GPU_VGT_BUSY: + return BUSY_INDEX(sscreen, vgt); + case SI_QUERY_GPU_IA_BUSY: + return BUSY_INDEX(sscreen, ia); + case SI_QUERY_GPU_SX_BUSY: + return BUSY_INDEX(sscreen, sx); + case SI_QUERY_GPU_WD_BUSY: + return BUSY_INDEX(sscreen, wd); + case SI_QUERY_GPU_BCI_BUSY: + return BUSY_INDEX(sscreen, bci); + case SI_QUERY_GPU_SC_BUSY: + return BUSY_INDEX(sscreen, sc); + case SI_QUERY_GPU_PA_BUSY: + return BUSY_INDEX(sscreen, pa); + case SI_QUERY_GPU_DB_BUSY: + return BUSY_INDEX(sscreen, db); + case SI_QUERY_GPU_CP_BUSY: + return BUSY_INDEX(sscreen, cp); + case SI_QUERY_GPU_CB_BUSY: + return BUSY_INDEX(sscreen, cb); + case SI_QUERY_GPU_SDMA_BUSY: + return BUSY_INDEX(sscreen, sdma); + case SI_QUERY_GPU_PFP_BUSY: + return BUSY_INDEX(sscreen, pfp); + case SI_QUERY_GPU_MEQ_BUSY: + return BUSY_INDEX(sscreen, meq); + case SI_QUERY_GPU_ME_BUSY: + return BUSY_INDEX(sscreen, me); + case SI_QUERY_GPU_SURF_SYNC_BUSY: + return BUSY_INDEX(sscreen, surf_sync); + case SI_QUERY_GPU_CP_DMA_BUSY: + return BUSY_INDEX(sscreen, cp_dma); + case SI_QUERY_GPU_SCRATCH_RAM_BUSY: + return BUSY_INDEX(sscreen, scratch_ram); + default: + unreachable("invalid query type"); + } } uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type) { - unsigned busy_index = busy_index_from_type(sscreen, type); - return si_read_mmio_counter(sscreen, busy_index); + unsigned busy_index = busy_index_from_type(sscreen, type); 
+ return si_read_mmio_counter(sscreen, busy_index); } -unsigned si_end_counter(struct si_screen *sscreen, unsigned type, - uint64_t begin) +unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin) { - unsigned busy_index = busy_index_from_type(sscreen, type); - return si_end_mmio_counter(sscreen, begin, busy_index); + unsigned busy_index = busy_index_from_type(sscreen, type); + return si_end_mmio_counter(sscreen, begin, busy_index); } diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c index 974ac430c53..ca13ca8a639 100644 --- a/src/gallium/drivers/radeonsi/si_perfcounter.c +++ b/src/gallium/drivers/radeonsi/si_perfcounter.c @@ -26,101 +26,101 @@ #include "si_query.h" #include "util/u_memory.h" +enum si_pc_block_flags +{ + /* This block is part of the shader engine */ + SI_PC_BLOCK_SE = (1 << 0), -enum si_pc_block_flags { - /* This block is part of the shader engine */ - SI_PC_BLOCK_SE = (1 << 0), - - /* Expose per-instance groups instead of summing all instances (within - * an SE). */ - SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1), + /* Expose per-instance groups instead of summing all instances (within + * an SE). */ + SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1), - /* Expose per-SE groups instead of summing instances across SEs. */ - SI_PC_BLOCK_SE_GROUPS = (1 << 2), + /* Expose per-SE groups instead of summing instances across SEs. */ + SI_PC_BLOCK_SE_GROUPS = (1 << 2), - /* Shader block */ - SI_PC_BLOCK_SHADER = (1 << 3), + /* Shader block */ + SI_PC_BLOCK_SHADER = (1 << 3), - /* Non-shader block with perfcounters windowed by shaders. */ - SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4), + /* Non-shader block with perfcounters windowed by shaders. */ + SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4), }; -enum si_pc_reg_layout { - /* All secondary selector dwords follow as one block after the primary - * selector dwords for the counters that have secondary selectors. - */ - SI_PC_MULTI_BLOCK = 0, +enum si_pc_reg_layout +{ + /* All secondary selector dwords follow as one block after the primary + * selector dwords for the counters that have secondary selectors. + */ + SI_PC_MULTI_BLOCK = 0, - /* Each secondary selector dword follows immediately afters the - * corresponding primary. - */ - SI_PC_MULTI_ALTERNATE = 1, + /* Each secondary selector dword follows immediately afters the + * corresponding primary. + */ + SI_PC_MULTI_ALTERNATE = 1, - /* All secondary selector dwords follow as one block after all primary - * selector dwords. - */ - SI_PC_MULTI_TAIL = 2, + /* All secondary selector dwords follow as one block after all primary + * selector dwords. + */ + SI_PC_MULTI_TAIL = 2, - /* Free-form arrangement of selector registers. */ - SI_PC_MULTI_CUSTOM = 3, + /* Free-form arrangement of selector registers. */ + SI_PC_MULTI_CUSTOM = 3, - SI_PC_MULTI_MASK = 3, + SI_PC_MULTI_MASK = 3, - /* Registers are laid out in decreasing rather than increasing order. */ - SI_PC_REG_REVERSE = 4, + /* Registers are laid out in decreasing rather than increasing order. 
*/ + SI_PC_REG_REVERSE = 4, - SI_PC_FAKE = 8, + SI_PC_FAKE = 8, }; struct si_pc_block_base { - const char *name; - unsigned num_counters; - unsigned flags; - - unsigned select_or; - unsigned select0; - unsigned counter0_lo; - unsigned *select; - unsigned *counters; - unsigned num_multi; - unsigned num_prelude; - unsigned layout; + const char *name; + unsigned num_counters; + unsigned flags; + + unsigned select_or; + unsigned select0; + unsigned counter0_lo; + unsigned *select; + unsigned *counters; + unsigned num_multi; + unsigned num_prelude; + unsigned layout; }; struct si_pc_block_gfxdescr { - struct si_pc_block_base *b; - unsigned selectors; - unsigned instances; + struct si_pc_block_base *b; + unsigned selectors; + unsigned instances; }; struct si_pc_block { - const struct si_pc_block_gfxdescr *b; - unsigned num_instances; + const struct si_pc_block_gfxdescr *b; + unsigned num_instances; - unsigned num_groups; - char *group_names; - unsigned group_name_stride; + unsigned num_groups; + char *group_names; + unsigned group_name_stride; - char *selector_names; - unsigned selector_name_stride; + char *selector_names; + unsigned selector_name_stride; }; /* The order is chosen to be compatible with GPUPerfStudio's hardcoding of * performance counter group IDs. */ -static const char * const si_pc_shader_type_suffixes[] = { - "", "_ES", "_GS", "_VS", "_PS", "_LS", "_HS", "_CS" -}; +static const char *const si_pc_shader_type_suffixes[] = {"", "_ES", "_GS", "_VS", + "_PS", "_LS", "_HS", "_CS"}; static const unsigned si_pc_shader_type_bits[] = { - 0x7f, - S_036780_ES_EN(1), - S_036780_GS_EN(1), - S_036780_VS_EN(1), - S_036780_PS_EN(1), - S_036780_LS_EN(1), - S_036780_HS_EN(1), - S_036780_CS_EN(1), + 0x7f, + S_036780_ES_EN(1), + S_036780_GS_EN(1), + S_036780_VS_EN(1), + S_036780_PS_EN(1), + S_036780_LS_EN(1), + S_036780_HS_EN(1), + S_036780_CS_EN(1), }; /* Max counters per HW block */ @@ -129,277 +129,274 @@ static const unsigned si_pc_shader_type_bits[] = { #define SI_PC_SHADERS_WINDOWING (1u << 31) struct si_query_group { - struct si_query_group *next; - struct si_pc_block *block; - unsigned sub_gid; /* only used during init */ - unsigned result_base; /* only used during init */ - int se; - int instance; - unsigned num_counters; - unsigned selectors[SI_QUERY_MAX_COUNTERS]; + struct si_query_group *next; + struct si_pc_block *block; + unsigned sub_gid; /* only used during init */ + unsigned result_base; /* only used during init */ + int se; + int instance; + unsigned num_counters; + unsigned selectors[SI_QUERY_MAX_COUNTERS]; }; struct si_query_counter { - unsigned base; - unsigned qwords; - unsigned stride; /* in uint64s */ + unsigned base; + unsigned qwords; + unsigned stride; /* in uint64s */ }; struct si_query_pc { - struct si_query b; - struct si_query_buffer buffer; + struct si_query b; + struct si_query_buffer buffer; - /* Size of the results in memory, in bytes. */ - unsigned result_size; + /* Size of the results in memory, in bytes. 
*/ + unsigned result_size; - unsigned shaders; - unsigned num_counters; - struct si_query_counter *counters; - struct si_query_group *groups; + unsigned shaders; + unsigned num_counters; + struct si_query_counter *counters; + struct si_query_group *groups; }; - static struct si_pc_block_base cik_CB = { - .name = "CB", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_037000_CB_PERFCOUNTER_FILTER, - .counter0_lo = R_035018_CB_PERFCOUNTER0_LO, - .num_multi = 1, - .num_prelude = 1, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "CB", + .num_counters = 4, + .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, + + .select0 = R_037000_CB_PERFCOUNTER_FILTER, + .counter0_lo = R_035018_CB_PERFCOUNTER0_LO, + .num_multi = 1, + .num_prelude = 1, + .layout = SI_PC_MULTI_ALTERNATE, }; static unsigned cik_CPC_select[] = { - R_036024_CPC_PERFCOUNTER0_SELECT, - R_036010_CPC_PERFCOUNTER0_SELECT1, - R_03600C_CPC_PERFCOUNTER1_SELECT, + R_036024_CPC_PERFCOUNTER0_SELECT, + R_036010_CPC_PERFCOUNTER0_SELECT1, + R_03600C_CPC_PERFCOUNTER1_SELECT, }; static struct si_pc_block_base cik_CPC = { - .name = "CPC", - .num_counters = 2, + .name = "CPC", + .num_counters = 2, - .select = cik_CPC_select, - .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE, + .select = cik_CPC_select, + .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE, }; static struct si_pc_block_base cik_CPF = { - .name = "CPF", - .num_counters = 2, + .name = "CPF", + .num_counters = 2, - .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT, - .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE, + .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT, + .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE, }; static struct si_pc_block_base cik_CPG = { - .name = "CPG", - .num_counters = 2, + .name = "CPG", + .num_counters = 2, - .select0 = R_036008_CPG_PERFCOUNTER0_SELECT, - .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE, + .select0 = R_036008_CPG_PERFCOUNTER0_SELECT, + .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE, }; static struct si_pc_block_base cik_DB = { - .name = "DB", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_037100_DB_PERFCOUNTER0_SELECT, - .counter0_lo = R_035100_DB_PERFCOUNTER0_LO, - .num_multi = 3, // really only 2, but there's a gap between registers - .layout = SI_PC_MULTI_ALTERNATE, + .name = "DB", + .num_counters = 4, + .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, + + .select0 = R_037100_DB_PERFCOUNTER0_SELECT, + .counter0_lo = R_035100_DB_PERFCOUNTER0_LO, + .num_multi = 3, // really only 2, but there's a gap between registers + .layout = SI_PC_MULTI_ALTERNATE, }; static struct si_pc_block_base cik_GDS = { - .name = "GDS", - .num_counters = 4, + .name = "GDS", + .num_counters = 4, - .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT, - .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_TAIL, + .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT, + .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_TAIL, }; static unsigned cik_GRBM_counters[] = { - R_034100_GRBM_PERFCOUNTER0_LO, - 
R_03410C_GRBM_PERFCOUNTER1_LO, + R_034100_GRBM_PERFCOUNTER0_LO, + R_03410C_GRBM_PERFCOUNTER1_LO, }; static struct si_pc_block_base cik_GRBM = { - .name = "GRBM", - .num_counters = 2, + .name = "GRBM", + .num_counters = 2, - .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT, - .counters = cik_GRBM_counters, + .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT, + .counters = cik_GRBM_counters, }; static struct si_pc_block_base cik_GRBMSE = { - .name = "GRBMSE", - .num_counters = 4, + .name = "GRBMSE", + .num_counters = 4, - .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT, - .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO, + .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT, + .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO, }; static struct si_pc_block_base cik_IA = { - .name = "IA", - .num_counters = 4, + .name = "IA", + .num_counters = 4, - .select0 = R_036210_IA_PERFCOUNTER0_SELECT, - .counter0_lo = R_034220_IA_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_TAIL, + .select0 = R_036210_IA_PERFCOUNTER0_SELECT, + .counter0_lo = R_034220_IA_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_TAIL, }; static struct si_pc_block_base cik_PA_SC = { - .name = "PA_SC", - .num_counters = 8, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT, - .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "PA_SC", + .num_counters = 8, + .flags = SI_PC_BLOCK_SE, + + .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT, + .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_ALTERNATE, }; /* According to docs, PA_SU counters are only 48 bits wide. */ static struct si_pc_block_base cik_PA_SU = { - .name = "PA_SU", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT, - .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "PA_SU", + .num_counters = 4, + .flags = SI_PC_BLOCK_SE, + + .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT, + .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO, + .num_multi = 2, + .layout = SI_PC_MULTI_ALTERNATE, }; static struct si_pc_block_base cik_SPI = { - .name = "SPI", - .num_counters = 6, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036600_SPI_PERFCOUNTER0_SELECT, - .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO, - .num_multi = 4, - .layout = SI_PC_MULTI_BLOCK, + .name = "SPI", + .num_counters = 6, + .flags = SI_PC_BLOCK_SE, + + .select0 = R_036600_SPI_PERFCOUNTER0_SELECT, + .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO, + .num_multi = 4, + .layout = SI_PC_MULTI_BLOCK, }; static struct si_pc_block_base cik_SQ = { - .name = "SQ", - .num_counters = 16, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER, - - .select0 = R_036700_SQ_PERFCOUNTER0_SELECT, - .select_or = S_036700_SQC_BANK_MASK(15) | - S_036700_SQC_CLIENT_MASK(15) | - S_036700_SIMD_MASK(15), - .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO, + .name = "SQ", + .num_counters = 16, + .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER, + + .select0 = R_036700_SQ_PERFCOUNTER0_SELECT, + .select_or = S_036700_SQC_BANK_MASK(15) | S_036700_SQC_CLIENT_MASK(15) | S_036700_SIMD_MASK(15), + .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO, }; static struct si_pc_block_base cik_SX = { - .name = "SX", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036900_SX_PERFCOUNTER0_SELECT, - .counter0_lo = R_034900_SX_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_TAIL, + .name = "SX", + .num_counters = 4, + .flags = SI_PC_BLOCK_SE, + + 
.select0 = R_036900_SX_PERFCOUNTER0_SELECT, + .counter0_lo = R_034900_SX_PERFCOUNTER0_LO, + .num_multi = 2, + .layout = SI_PC_MULTI_TAIL, }; static struct si_pc_block_base cik_TA = { - .name = "TA", - .num_counters = 2, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_036B00_TA_PERFCOUNTER0_SELECT, - .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "TA", + .num_counters = 2, + .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, + + .select0 = R_036B00_TA_PERFCOUNTER0_SELECT, + .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_ALTERNATE, }; static struct si_pc_block_base cik_TD = { - .name = "TD", - .num_counters = 2, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_036C00_TD_PERFCOUNTER0_SELECT, - .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "TD", + .num_counters = 2, + .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, + + .select0 = R_036C00_TD_PERFCOUNTER0_SELECT, + .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_ALTERNATE, }; static struct si_pc_block_base cik_TCA = { - .name = "TCA", - .num_counters = 4, - .flags = SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT, - .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "TCA", + .num_counters = 4, + .flags = SI_PC_BLOCK_INSTANCE_GROUPS, + + .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT, + .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO, + .num_multi = 2, + .layout = SI_PC_MULTI_ALTERNATE, }; static struct si_pc_block_base cik_TCC = { - .name = "TCC", - .num_counters = 4, - .flags = SI_PC_BLOCK_INSTANCE_GROUPS, - - .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT, - .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "TCC", + .num_counters = 4, + .flags = SI_PC_BLOCK_INSTANCE_GROUPS, + + .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT, + .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO, + .num_multi = 2, + .layout = SI_PC_MULTI_ALTERNATE, }; static struct si_pc_block_base cik_TCP = { - .name = "TCP", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, - - .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT, - .counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO, - .num_multi = 2, - .layout = SI_PC_MULTI_ALTERNATE, + .name = "TCP", + .num_counters = 4, + .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, + + .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT, + .counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO, + .num_multi = 2, + .layout = SI_PC_MULTI_ALTERNATE, }; static struct si_pc_block_base cik_VGT = { - .name = "VGT", - .num_counters = 4, - .flags = SI_PC_BLOCK_SE, - - .select0 = R_036230_VGT_PERFCOUNTER0_SELECT, - .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO, - .num_multi = 1, - .layout = SI_PC_MULTI_TAIL, + .name = "VGT", + .num_counters = 4, + .flags = SI_PC_BLOCK_SE, + + .select0 = R_036230_VGT_PERFCOUNTER0_SELECT, + .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO, + .num_multi = 1, + .layout = SI_PC_MULTI_TAIL, }; static struct si_pc_block_base cik_WD = { - .name = "WD", - .num_counters = 4, + .name = "WD", + .num_counters = 4, - .select0 = R_036200_WD_PERFCOUNTER0_SELECT, - .counter0_lo = 
R_034200_WD_PERFCOUNTER0_LO, + .select0 = R_036200_WD_PERFCOUNTER0_SELECT, + .counter0_lo = R_034200_WD_PERFCOUNTER0_LO, }; static struct si_pc_block_base cik_MC = { - .name = "MC", - .num_counters = 4, + .name = "MC", + .num_counters = 4, - .layout = SI_PC_FAKE, + .layout = SI_PC_FAKE, }; static struct si_pc_block_base cik_SRBM = { - .name = "SRBM", - .num_counters = 2, + .name = "SRBM", + .num_counters = 2, - .layout = SI_PC_FAKE, + .layout = SI_PC_FAKE, }; /* Both the number of instances and selectors varies between chips of the same @@ -411,947 +408,868 @@ static struct si_pc_block_base cik_SRBM = { * blocks here matters. */ static struct si_pc_block_gfxdescr groups_CIK[] = { - { &cik_CB, 226}, - { &cik_CPF, 17 }, - { &cik_DB, 257}, - { &cik_GRBM, 34 }, - { &cik_GRBMSE, 15 }, - { &cik_PA_SU, 153 }, - { &cik_PA_SC, 395 }, - { &cik_SPI, 186 }, - { &cik_SQ, 252 }, - { &cik_SX, 32 }, - { &cik_TA, 111, 11 }, - { &cik_TCA, 39, 2 }, - { &cik_TCC, 160}, - { &cik_TD, 55, 11 }, - { &cik_TCP, 154, 11 }, - { &cik_GDS, 121 }, - { &cik_VGT, 140 }, - { &cik_IA, 22 }, - { &cik_MC, 22 }, - { &cik_SRBM, 19 }, - { &cik_WD, 22 }, - { &cik_CPG, 46 }, - { &cik_CPC, 22 }, + {&cik_CB, 226}, {&cik_CPF, 17}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15}, + {&cik_PA_SU, 153}, {&cik_PA_SC, 395}, {&cik_SPI, 186}, {&cik_SQ, 252}, {&cik_SX, 32}, + {&cik_TA, 111, 11}, {&cik_TCA, 39, 2}, {&cik_TCC, 160}, {&cik_TD, 55, 11}, {&cik_TCP, 154, 11}, + {&cik_GDS, 121}, {&cik_VGT, 140}, {&cik_IA, 22}, {&cik_MC, 22}, {&cik_SRBM, 19}, + {&cik_WD, 22}, {&cik_CPG, 46}, {&cik_CPC, 22}, }; static struct si_pc_block_gfxdescr groups_VI[] = { - { &cik_CB, 405}, - { &cik_CPF, 19 }, - { &cik_DB, 257}, - { &cik_GRBM, 34 }, - { &cik_GRBMSE, 15 }, - { &cik_PA_SU, 154 }, - { &cik_PA_SC, 397 }, - { &cik_SPI, 197 }, - { &cik_SQ, 273 }, - { &cik_SX, 34 }, - { &cik_TA, 119, 16 }, - { &cik_TCA, 35, 2 }, - { &cik_TCC, 192}, - { &cik_TD, 55, 16 }, - { &cik_TCP, 180, 16 }, - { &cik_GDS, 121 }, - { &cik_VGT, 147 }, - { &cik_IA, 24 }, - { &cik_MC, 22 }, - { &cik_SRBM, 27 }, - { &cik_WD, 37 }, - { &cik_CPG, 48 }, - { &cik_CPC, 24 }, + {&cik_CB, 405}, {&cik_CPF, 19}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15}, + {&cik_PA_SU, 154}, {&cik_PA_SC, 397}, {&cik_SPI, 197}, {&cik_SQ, 273}, {&cik_SX, 34}, + {&cik_TA, 119, 16}, {&cik_TCA, 35, 2}, {&cik_TCC, 192}, {&cik_TD, 55, 16}, {&cik_TCP, 180, 16}, + {&cik_GDS, 121}, {&cik_VGT, 147}, {&cik_IA, 24}, {&cik_MC, 22}, {&cik_SRBM, 27}, + {&cik_WD, 37}, {&cik_CPG, 48}, {&cik_CPC, 24}, }; static struct si_pc_block_gfxdescr groups_gfx9[] = { - { &cik_CB, 438}, - { &cik_CPF, 32 }, - { &cik_DB, 328}, - { &cik_GRBM, 38 }, - { &cik_GRBMSE, 16 }, - { &cik_PA_SU, 292 }, - { &cik_PA_SC, 491 }, - { &cik_SPI, 196 }, - { &cik_SQ, 374 }, - { &cik_SX, 208 }, - { &cik_TA, 119, 16 }, - { &cik_TCA, 35, 2 }, - { &cik_TCC, 256}, - { &cik_TD, 57, 16 }, - { &cik_TCP, 85, 16 }, - { &cik_GDS, 121 }, - { &cik_VGT, 148 }, - { &cik_IA, 32 }, - { &cik_WD, 58 }, - { &cik_CPG, 59 }, - { &cik_CPC, 35 }, + {&cik_CB, 438}, {&cik_CPF, 32}, {&cik_DB, 328}, {&cik_GRBM, 38}, {&cik_GRBMSE, 16}, + {&cik_PA_SU, 292}, {&cik_PA_SC, 491}, {&cik_SPI, 196}, {&cik_SQ, 374}, {&cik_SX, 208}, + {&cik_TA, 119, 16}, {&cik_TCA, 35, 2}, {&cik_TCC, 256}, {&cik_TD, 57, 16}, {&cik_TCP, 85, 16}, + {&cik_GDS, 121}, {&cik_VGT, 148}, {&cik_IA, 32}, {&cik_WD, 58}, {&cik_CPG, 59}, + {&cik_CPC, 35}, }; static bool si_pc_block_has_per_se_groups(const struct si_perfcounters *pc, - const struct si_pc_block *block) + const struct si_pc_block *block) { - return 
block->b->b->flags & SI_PC_BLOCK_SE_GROUPS || - (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se); + return block->b->b->flags & SI_PC_BLOCK_SE_GROUPS || + (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se); } static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters *pc, - const struct si_pc_block *block) + const struct si_pc_block *block) { - return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS || - (block->num_instances > 1 && pc->separate_instance); + return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS || + (block->num_instances > 1 && pc->separate_instance); } -static struct si_pc_block * -lookup_counter(struct si_perfcounters *pc, unsigned index, - unsigned *base_gid, unsigned *sub_index) +static struct si_pc_block *lookup_counter(struct si_perfcounters *pc, unsigned index, + unsigned *base_gid, unsigned *sub_index) { - struct si_pc_block *block = pc->blocks; - unsigned bid; + struct si_pc_block *block = pc->blocks; + unsigned bid; - *base_gid = 0; - for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { - unsigned total = block->num_groups * block->b->selectors; + *base_gid = 0; + for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { + unsigned total = block->num_groups * block->b->selectors; - if (index < total) { - *sub_index = index; - return block; - } + if (index < total) { + *sub_index = index; + return block; + } - index -= total; - *base_gid += block->num_groups; - } + index -= total; + *base_gid += block->num_groups; + } - return NULL; + return NULL; } -static struct si_pc_block * -lookup_group(struct si_perfcounters *pc, unsigned *index) +static struct si_pc_block *lookup_group(struct si_perfcounters *pc, unsigned *index) { - unsigned bid; - struct si_pc_block *block = pc->blocks; + unsigned bid; + struct si_pc_block *block = pc->blocks; - for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { - if (*index < block->num_groups) - return block; - *index -= block->num_groups; - } + for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { + if (*index < block->num_groups) + return block; + *index -= block->num_groups; + } - return NULL; + return NULL; } -static void si_pc_emit_instance(struct si_context *sctx, - int se, int instance) +static void si_pc_emit_instance(struct si_context *sctx, int se, int instance) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned value = S_030800_SH_BROADCAST_WRITES(1); - - if (se >= 0) { - value |= S_030800_SE_INDEX(se); - } else { - value |= S_030800_SE_BROADCAST_WRITES(1); - } - - if (instance >= 0) { - value |= S_030800_INSTANCE_INDEX(instance); - } else { - value |= S_030800_INSTANCE_BROADCAST_WRITES(1); - } - - radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned value = S_030800_SH_BROADCAST_WRITES(1); + + if (se >= 0) { + value |= S_030800_SE_INDEX(se); + } else { + value |= S_030800_SE_BROADCAST_WRITES(1); + } + + if (instance >= 0) { + value |= S_030800_INSTANCE_INDEX(instance); + } else { + value |= S_030800_INSTANCE_BROADCAST_WRITES(1); + } + + radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value); } -static void si_pc_emit_shaders(struct si_context *sctx, - unsigned shaders) +static void si_pc_emit_shaders(struct si_context *sctx, unsigned shaders) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2); - radeon_emit(cs, shaders & 0x7f); - radeon_emit(cs, 0xffffffff); + radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 
2); + radeon_emit(cs, shaders & 0x7f); + radeon_emit(cs, 0xffffffff); } -static void si_pc_emit_select(struct si_context *sctx, - struct si_pc_block *block, - unsigned count, unsigned *selectors) +static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block, unsigned count, + unsigned *selectors) { - struct si_pc_block_base *regs = block->b->b; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned idx; - unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK; - unsigned dw; - - assert(count <= regs->num_counters); - - if (regs->layout & SI_PC_FAKE) - return; - - if (layout_multi == SI_PC_MULTI_BLOCK) { - assert(!(regs->layout & SI_PC_REG_REVERSE)); - - dw = count + regs->num_prelude; - if (count >= regs->num_multi) - dw += regs->num_multi; - radeon_set_uconfig_reg_seq(cs, regs->select0, dw); - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx) - radeon_emit(cs, selectors[idx] | regs->select_or); - - if (count < regs->num_multi) { - unsigned select1 = - regs->select0 + 4 * regs->num_multi; - radeon_set_uconfig_reg_seq(cs, select1, count); - } - - for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx) - radeon_emit(cs, 0); - - if (count > regs->num_multi) { - for (idx = regs->num_multi; idx < count; ++idx) - radeon_emit(cs, selectors[idx] | regs->select_or); - } - } else if (layout_multi == SI_PC_MULTI_TAIL) { - unsigned select1, select1_count; - - assert(!(regs->layout & SI_PC_REG_REVERSE)); - - radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude); - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - for (idx = 0; idx < count; ++idx) - radeon_emit(cs, selectors[idx] | regs->select_or); - - select1 = regs->select0 + 4 * regs->num_counters; - select1_count = MIN2(count, regs->num_multi); - radeon_set_uconfig_reg_seq(cs, select1, select1_count); - for (idx = 0; idx < select1_count; ++idx) - radeon_emit(cs, 0); - } else if (layout_multi == SI_PC_MULTI_CUSTOM) { - unsigned *reg = regs->select; - for (idx = 0; idx < count; ++idx) { - radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or); - if (idx < regs->num_multi) - radeon_set_uconfig_reg(cs, *reg++, 0); - } - } else { - assert(layout_multi == SI_PC_MULTI_ALTERNATE); - - unsigned reg_base = regs->select0; - unsigned reg_count = count + MIN2(count, regs->num_multi); - reg_count += regs->num_prelude; - - if (!(regs->layout & SI_PC_REG_REVERSE)) { - radeon_set_uconfig_reg_seq(cs, reg_base, reg_count); - - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - for (idx = 0; idx < count; ++idx) { - radeon_emit(cs, selectors[idx] | regs->select_or); - if (idx < regs->num_multi) - radeon_emit(cs, 0); - } - } else { - reg_base -= (reg_count - 1) * 4; - radeon_set_uconfig_reg_seq(cs, reg_base, reg_count); - - for (idx = count; idx > 0; --idx) { - if (idx <= regs->num_multi) - radeon_emit(cs, 0); - radeon_emit(cs, selectors[idx - 1] | regs->select_or); - } - for (idx = 0; idx < regs->num_prelude; ++idx) - radeon_emit(cs, 0); - } - } + struct si_pc_block_base *regs = block->b->b; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned idx; + unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK; + unsigned dw; + + assert(count <= regs->num_counters); + + if (regs->layout & SI_PC_FAKE) + return; + + if (layout_multi == SI_PC_MULTI_BLOCK) { + assert(!(regs->layout & SI_PC_REG_REVERSE)); + + dw = count + regs->num_prelude; + if (count >= regs->num_multi) + dw += regs->num_multi; + 
radeon_set_uconfig_reg_seq(cs, regs->select0, dw); + for (idx = 0; idx < regs->num_prelude; ++idx) + radeon_emit(cs, 0); + for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx) + radeon_emit(cs, selectors[idx] | regs->select_or); + + if (count < regs->num_multi) { + unsigned select1 = regs->select0 + 4 * regs->num_multi; + radeon_set_uconfig_reg_seq(cs, select1, count); + } + + for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx) + radeon_emit(cs, 0); + + if (count > regs->num_multi) { + for (idx = regs->num_multi; idx < count; ++idx) + radeon_emit(cs, selectors[idx] | regs->select_or); + } + } else if (layout_multi == SI_PC_MULTI_TAIL) { + unsigned select1, select1_count; + + assert(!(regs->layout & SI_PC_REG_REVERSE)); + + radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude); + for (idx = 0; idx < regs->num_prelude; ++idx) + radeon_emit(cs, 0); + for (idx = 0; idx < count; ++idx) + radeon_emit(cs, selectors[idx] | regs->select_or); + + select1 = regs->select0 + 4 * regs->num_counters; + select1_count = MIN2(count, regs->num_multi); + radeon_set_uconfig_reg_seq(cs, select1, select1_count); + for (idx = 0; idx < select1_count; ++idx) + radeon_emit(cs, 0); + } else if (layout_multi == SI_PC_MULTI_CUSTOM) { + unsigned *reg = regs->select; + for (idx = 0; idx < count; ++idx) { + radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or); + if (idx < regs->num_multi) + radeon_set_uconfig_reg(cs, *reg++, 0); + } + } else { + assert(layout_multi == SI_PC_MULTI_ALTERNATE); + + unsigned reg_base = regs->select0; + unsigned reg_count = count + MIN2(count, regs->num_multi); + reg_count += regs->num_prelude; + + if (!(regs->layout & SI_PC_REG_REVERSE)) { + radeon_set_uconfig_reg_seq(cs, reg_base, reg_count); + + for (idx = 0; idx < regs->num_prelude; ++idx) + radeon_emit(cs, 0); + for (idx = 0; idx < count; ++idx) { + radeon_emit(cs, selectors[idx] | regs->select_or); + if (idx < regs->num_multi) + radeon_emit(cs, 0); + } + } else { + reg_base -= (reg_count - 1) * 4; + radeon_set_uconfig_reg_seq(cs, reg_base, reg_count); + + for (idx = count; idx > 0; --idx) { + if (idx <= regs->num_multi) + radeon_emit(cs, 0); + radeon_emit(cs, selectors[idx - 1] | regs->select_or); + } + for (idx = 0; idx < regs->num_prelude; ++idx) + radeon_emit(cs, 0); + } + } } -static void si_pc_emit_start(struct si_context *sctx, - struct si_resource *buffer, uint64_t va) +static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - si_cp_copy_data(sctx, sctx->gfx_cs, - COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address, - COPY_DATA_IMM, NULL, 1); - - radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, - S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET)); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0)); - radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, - S_036020_PERFMON_STATE(V_036020_START_COUNTING)); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + si_cp_copy_data(sctx, sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address, + COPY_DATA_IMM, NULL, 1); + + radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, + S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET)); + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0)); + radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, + S_036020_PERFMON_STATE(V_036020_START_COUNTING)); } /* Note: The buffer was 
already added in si_pc_emit_start, so we don't have to * do it again in here. */ -static void si_pc_emit_stop(struct si_context *sctx, - struct si_resource *buffer, uint64_t va) +static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, uint64_t va) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, - EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, - buffer, va, 0, SI_NOT_QUERY); - si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL); - - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0)); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0)); - radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, - S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) | - S_036020_PERFMON_SAMPLE_ENABLE(1)); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, + EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY); + si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL); + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0)); + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0)); + radeon_set_uconfig_reg( + cs, R_036020_CP_PERFMON_CNTL, + S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1)); } -static void si_pc_emit_read(struct si_context *sctx, - struct si_pc_block *block, - unsigned count, uint64_t va) +static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, unsigned count, + uint64_t va) { - struct si_pc_block_base *regs = block->b->b; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned idx; - unsigned reg = regs->counter0_lo; - unsigned reg_delta = 8; - - if (!(regs->layout & SI_PC_FAKE)) { - if (regs->layout & SI_PC_REG_REVERSE) - reg_delta = -reg_delta; - - for (idx = 0; idx < count; ++idx) { - if (regs->counters) - reg = regs->counters[idx]; - - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | - COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | - COPY_DATA_COUNT_SEL); /* 64 bits */ - radeon_emit(cs, reg >> 2); - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - va += sizeof(uint64_t); - reg += reg_delta; - } - } else { - for (idx = 0; idx < count; ++idx) { - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | - COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | - COPY_DATA_COUNT_SEL); - radeon_emit(cs, 0); /* immediate */ - radeon_emit(cs, 0); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - va += sizeof(uint64_t); - } - } + struct si_pc_block_base *regs = block->b->b; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned idx; + unsigned reg = regs->counter0_lo; + unsigned reg_delta = 8; + + if (!(regs->layout & SI_PC_FAKE)) { + if (regs->layout & SI_PC_REG_REVERSE) + reg_delta = -reg_delta; + + for (idx = 0; idx < count; ++idx) { + if (regs->counters) + reg = regs->counters[idx]; + + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | + COPY_DATA_COUNT_SEL); /* 64 bits */ + radeon_emit(cs, reg >> 2); + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, va); + 
radeon_emit(cs, va >> 32); + va += sizeof(uint64_t); + reg += reg_delta; + } + } else { + for (idx = 0; idx < count; ++idx) { + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | + COPY_DATA_COUNT_SEL); + radeon_emit(cs, 0); /* immediate */ + radeon_emit(cs, 0); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + va += sizeof(uint64_t); + } + } } -static void si_pc_query_destroy(struct si_context *sctx, - struct si_query *squery) +static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery) { - struct si_query_pc *query = (struct si_query_pc *)squery; + struct si_query_pc *query = (struct si_query_pc *)squery; - while (query->groups) { - struct si_query_group *group = query->groups; - query->groups = group->next; - FREE(group); - } + while (query->groups) { + struct si_query_group *group = query->groups; + query->groups = group->next; + FREE(group); + } - FREE(query->counters); + FREE(query->counters); - si_query_buffer_destroy(sctx->screen, &query->buffer); - FREE(query); + si_query_buffer_destroy(sctx->screen, &query->buffer); + FREE(query); } static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery) /* - struct si_query_hw *hwquery, - struct si_resource *buffer, uint64_t va)*/ + struct si_query_hw *hwquery, + struct si_resource *buffer, uint64_t va)*/ { - struct si_query_pc *query = (struct si_query_pc *)squery; - int current_se = -1; - int current_instance = -1; + struct si_query_pc *query = (struct si_query_pc *)squery; + int current_se = -1; + int current_instance = -1; - if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size)) - return; - si_need_gfx_cs_space(sctx); + if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size)) + return; + si_need_gfx_cs_space(sctx); - if (query->shaders) - si_pc_emit_shaders(sctx, query->shaders); + if (query->shaders) + si_pc_emit_shaders(sctx, query->shaders); - for (struct si_query_group *group = query->groups; group; group = group->next) { - struct si_pc_block *block = group->block; + for (struct si_query_group *group = query->groups; group; group = group->next) { + struct si_pc_block *block = group->block; - if (group->se != current_se || group->instance != current_instance) { - current_se = group->se; - current_instance = group->instance; - si_pc_emit_instance(sctx, group->se, group->instance); - } + if (group->se != current_se || group->instance != current_instance) { + current_se = group->se; + current_instance = group->instance; + si_pc_emit_instance(sctx, group->se, group->instance); + } - si_pc_emit_select(sctx, block, group->num_counters, group->selectors); - } + si_pc_emit_select(sctx, block, group->num_counters, group->selectors); + } - if (current_se != -1 || current_instance != -1) - si_pc_emit_instance(sctx, -1, -1); + if (current_se != -1 || current_instance != -1) + si_pc_emit_instance(sctx, -1, -1); - uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end; - si_pc_emit_start(sctx, query->buffer.buf, va); + uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end; + si_pc_emit_start(sctx, query->buffer.buf, va); } static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery) { - struct si_query_pc *query = (struct si_query_pc *)squery; + struct si_query_pc *query = (struct si_query_pc *)squery; - if (!query->buffer.buf) - return; + if (!query->buffer.buf) + return; - uint64_t va = query->buffer.buf->gpu_address + 
query->buffer.results_end; - query->buffer.results_end += query->result_size; + uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end; + query->buffer.results_end += query->result_size; - si_pc_emit_stop(sctx, query->buffer.buf, va); + si_pc_emit_stop(sctx, query->buffer.buf, va); - for (struct si_query_group *group = query->groups; group; group = group->next) { - struct si_pc_block *block = group->block; - unsigned se = group->se >= 0 ? group->se : 0; - unsigned se_end = se + 1; + for (struct si_query_group *group = query->groups; group; group = group->next) { + struct si_pc_block *block = group->block; + unsigned se = group->se >= 0 ? group->se : 0; + unsigned se_end = se + 1; - if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0)) - se_end = sctx->screen->info.max_se; + if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0)) + se_end = sctx->screen->info.max_se; - do { - unsigned instance = group->instance >= 0 ? group->instance : 0; + do { + unsigned instance = group->instance >= 0 ? group->instance : 0; - do { - si_pc_emit_instance(sctx, se, instance); - si_pc_emit_read(sctx, block, group->num_counters, va); - va += sizeof(uint64_t) * group->num_counters; - } while (group->instance < 0 && ++instance < block->num_instances); - } while (++se < se_end); - } + do { + si_pc_emit_instance(sctx, se, instance); + si_pc_emit_read(sctx, block, group->num_counters, va); + va += sizeof(uint64_t) * group->num_counters; + } while (group->instance < 0 && ++instance < block->num_instances); + } while (++se < se_end); + } - si_pc_emit_instance(sctx, -1, -1); + si_pc_emit_instance(sctx, -1, -1); } static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery) { - struct si_query_pc *query = (struct si_query_pc *)squery; + struct si_query_pc *query = (struct si_query_pc *)squery; - si_query_buffer_reset(ctx, &query->buffer); + si_query_buffer_reset(ctx, &query->buffer); - list_addtail(&query->b.active_list, &ctx->active_queries); - ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend; + list_addtail(&query->b.active_list, &ctx->active_queries); + ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend; - si_pc_query_resume(ctx, squery); + si_pc_query_resume(ctx, squery); - return true; + return true; } static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery) { - struct si_query_pc *query = (struct si_query_pc *)squery; + struct si_query_pc *query = (struct si_query_pc *)squery; - si_pc_query_suspend(ctx, squery); + si_pc_query_suspend(ctx, squery); - list_del(&squery->active_list); - ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend; + list_del(&squery->active_list); + ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend; - return query->buffer.buf != NULL; + return query->buffer.buf != NULL; } -static void si_pc_query_add_result(struct si_query_pc *query, - void *buffer, - union pipe_query_result *result) +static void si_pc_query_add_result(struct si_query_pc *query, void *buffer, + union pipe_query_result *result) { - uint64_t *results = buffer; - unsigned i, j; + uint64_t *results = buffer; + unsigned i, j; - for (i = 0; i < query->num_counters; ++i) { - struct si_query_counter *counter = &query->counters[i]; + for (i = 0; i < query->num_counters; ++i) { + struct si_query_counter *counter = &query->counters[i]; - for (j = 0; j < counter->qwords; ++j) { - uint32_t value = results[counter->base + j * counter->stride]; - result->batch[i].u64 += value; - } - } + for (j = 0; j < counter->qwords; ++j) { + 
uint32_t value = results[counter->base + j * counter->stride]; + result->batch[i].u64 += value; + } + } } -static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, - bool wait, union pipe_query_result *result) +static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, bool wait, + union pipe_query_result *result) { - struct si_query_pc *query = (struct si_query_pc *)squery; + struct si_query_pc *query = (struct si_query_pc *)squery; - memset(result, 0, sizeof(result->batch[0]) * query->num_counters); + memset(result, 0, sizeof(result->batch[0]) * query->num_counters); - for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { - unsigned usage = PIPE_TRANSFER_READ | - (wait ? 0 : PIPE_TRANSFER_DONTBLOCK); - unsigned results_base = 0; - void *map; + for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { + unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK); + unsigned results_base = 0; + void *map; - if (squery->b.flushed) - map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage); - else - map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage); + if (squery->b.flushed) + map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage); + else + map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage); - if (!map) - return false; + if (!map) + return false; - while (results_base != qbuf->results_end) { - si_pc_query_add_result(query, map + results_base, result); - results_base += query->result_size; - } - } + while (results_base != qbuf->results_end) { + si_pc_query_add_result(query, map + results_base, result); + results_base += query->result_size; + } + } - return true; + return true; } static const struct si_query_ops batch_query_ops = { - .destroy = si_pc_query_destroy, - .begin = si_pc_query_begin, - .end = si_pc_query_end, - .get_result = si_pc_query_get_result, + .destroy = si_pc_query_destroy, + .begin = si_pc_query_begin, + .end = si_pc_query_end, + .get_result = si_pc_query_get_result, - .suspend = si_pc_query_suspend, - .resume = si_pc_query_resume, + .suspend = si_pc_query_suspend, + .resume = si_pc_query_resume, }; -static struct si_query_group *get_group_state(struct si_screen *screen, - struct si_query_pc *query, - struct si_pc_block *block, - unsigned sub_gid) +static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query, + struct si_pc_block *block, unsigned sub_gid) { - struct si_query_group *group = query->groups; - - while (group) { - if (group->block == block && group->sub_gid == sub_gid) - return group; - group = group->next; - } - - group = CALLOC_STRUCT(si_query_group); - if (!group) - return NULL; - - group->block = block; - group->sub_gid = sub_gid; - - if (block->b->b->flags & SI_PC_BLOCK_SHADER) { - unsigned sub_gids = block->num_instances; - unsigned shader_id; - unsigned shaders; - unsigned query_shaders; - - if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) - sub_gids = sub_gids * screen->info.max_se; - shader_id = sub_gid / sub_gids; - sub_gid = sub_gid % sub_gids; - - shaders = si_pc_shader_type_bits[shader_id]; - - query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING; - if (query_shaders && query_shaders != shaders) { - fprintf(stderr, "si_perfcounter: incompatible shader groups\n"); - FREE(group); - return NULL; - } - query->shaders = shaders; - } - - if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) { - // A non-zero value in query->shaders ensures 
that the shader - // masking is reset unless the user explicitly requests one. - query->shaders = SI_PC_SHADERS_WINDOWING; - } - - if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) { - group->se = sub_gid / block->num_instances; - sub_gid = sub_gid % block->num_instances; - } else { - group->se = -1; - } - - if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) { - group->instance = sub_gid; - } else { - group->instance = -1; - } - - group->next = query->groups; - query->groups = group; - - return group; + struct si_query_group *group = query->groups; + + while (group) { + if (group->block == block && group->sub_gid == sub_gid) + return group; + group = group->next; + } + + group = CALLOC_STRUCT(si_query_group); + if (!group) + return NULL; + + group->block = block; + group->sub_gid = sub_gid; + + if (block->b->b->flags & SI_PC_BLOCK_SHADER) { + unsigned sub_gids = block->num_instances; + unsigned shader_id; + unsigned shaders; + unsigned query_shaders; + + if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) + sub_gids = sub_gids * screen->info.max_se; + shader_id = sub_gid / sub_gids; + sub_gid = sub_gid % sub_gids; + + shaders = si_pc_shader_type_bits[shader_id]; + + query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING; + if (query_shaders && query_shaders != shaders) { + fprintf(stderr, "si_perfcounter: incompatible shader groups\n"); + FREE(group); + return NULL; + } + query->shaders = shaders; + } + + if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) { + // A non-zero value in query->shaders ensures that the shader + // masking is reset unless the user explicitly requests one. + query->shaders = SI_PC_SHADERS_WINDOWING; + } + + if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) { + group->se = sub_gid / block->num_instances; + sub_gid = sub_gid % block->num_instances; + } else { + group->se = -1; + } + + if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) { + group->instance = sub_gid; + } else { + group->instance = -1; + } + + group->next = query->groups; + query->groups = group; + + return group; } -struct pipe_query *si_create_batch_query(struct pipe_context *ctx, - unsigned num_queries, - unsigned *query_types) +struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries, + unsigned *query_types) { - struct si_screen *screen = - (struct si_screen *)ctx->screen; - struct si_perfcounters *pc = screen->perfcounters; - struct si_pc_block *block; - struct si_query_group *group; - struct si_query_pc *query; - unsigned base_gid, sub_gid, sub_index; - unsigned i, j; - - if (!pc) - return NULL; - - query = CALLOC_STRUCT(si_query_pc); - if (!query) - return NULL; - - query->b.ops = &batch_query_ops; - - query->num_counters = num_queries; - - /* Collect selectors per group */ - for (i = 0; i < num_queries; ++i) { - unsigned sub_gid; - - if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER) - goto error; - - block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, - &base_gid, &sub_index); - if (!block) - goto error; - - sub_gid = sub_index / block->b->selectors; - sub_index = sub_index % block->b->selectors; - - group = get_group_state(screen, query, block, sub_gid); - if (!group) - goto error; - - if (group->num_counters >= block->b->b->num_counters) { - fprintf(stderr, - "perfcounter group %s: too many selected\n", - block->b->b->name); - goto error; - } - group->selectors[group->num_counters] = sub_index; - ++group->num_counters; - } - - /* Compute 
result bases and CS size per group */ - query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords; - query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords; - - i = 0; - for (group = query->groups; group; group = group->next) { - struct si_pc_block *block = group->block; - unsigned read_dw; - unsigned instances = 1; - - if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0) - instances = screen->info.max_se; - if (group->instance < 0) - instances *= block->num_instances; - - group->result_base = i; - query->result_size += sizeof(uint64_t) * instances * group->num_counters; - i += instances * group->num_counters; - - read_dw = 6 * group->num_counters; - query->b.num_cs_dw_suspend += instances * read_dw; - query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords; - } - - if (query->shaders) { - if (query->shaders == SI_PC_SHADERS_WINDOWING) - query->shaders = 0xffffffff; - } - - /* Map user-supplied query array to result indices */ - query->counters = CALLOC(num_queries, sizeof(*query->counters)); - for (i = 0; i < num_queries; ++i) { - struct si_query_counter *counter = &query->counters[i]; - struct si_pc_block *block; - - block = lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, - &base_gid, &sub_index); - - sub_gid = sub_index / block->b->selectors; - sub_index = sub_index % block->b->selectors; - - group = get_group_state(screen, query, block, sub_gid); - assert(group != NULL); - - for (j = 0; j < group->num_counters; ++j) { - if (group->selectors[j] == sub_index) - break; - } - - counter->base = group->result_base + j; - counter->stride = group->num_counters; - - counter->qwords = 1; - if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0) - counter->qwords = screen->info.max_se; - if (group->instance < 0) - counter->qwords *= block->num_instances; - } + struct si_screen *screen = (struct si_screen *)ctx->screen; + struct si_perfcounters *pc = screen->perfcounters; + struct si_pc_block *block; + struct si_query_group *group; + struct si_query_pc *query; + unsigned base_gid, sub_gid, sub_index; + unsigned i, j; + + if (!pc) + return NULL; + + query = CALLOC_STRUCT(si_query_pc); + if (!query) + return NULL; + + query->b.ops = &batch_query_ops; + + query->num_counters = num_queries; + + /* Collect selectors per group */ + for (i = 0; i < num_queries; ++i) { + unsigned sub_gid; + + if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER) + goto error; + + block = + lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index); + if (!block) + goto error; + + sub_gid = sub_index / block->b->selectors; + sub_index = sub_index % block->b->selectors; + + group = get_group_state(screen, query, block, sub_gid); + if (!group) + goto error; + + if (group->num_counters >= block->b->b->num_counters) { + fprintf(stderr, "perfcounter group %s: too many selected\n", block->b->b->name); + goto error; + } + group->selectors[group->num_counters] = sub_index; + ++group->num_counters; + } + + /* Compute result bases and CS size per group */ + query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords; + query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords; + + i = 0; + for (group = query->groups; group; group = group->next) { + struct si_pc_block *block = group->block; + unsigned read_dw; + unsigned instances = 1; + + if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0) + instances = screen->info.max_se; + if (group->instance < 0) + instances *= block->num_instances; + + group->result_base = i; + query->result_size += sizeof(uint64_t) * instances * 
group->num_counters; + i += instances * group->num_counters; + + read_dw = 6 * group->num_counters; + query->b.num_cs_dw_suspend += instances * read_dw; + query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords; + } + + if (query->shaders) { + if (query->shaders == SI_PC_SHADERS_WINDOWING) + query->shaders = 0xffffffff; + } + + /* Map user-supplied query array to result indices */ + query->counters = CALLOC(num_queries, sizeof(*query->counters)); + for (i = 0; i < num_queries; ++i) { + struct si_query_counter *counter = &query->counters[i]; + struct si_pc_block *block; + + block = + lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index); + + sub_gid = sub_index / block->b->selectors; + sub_index = sub_index % block->b->selectors; + + group = get_group_state(screen, query, block, sub_gid); + assert(group != NULL); + + for (j = 0; j < group->num_counters; ++j) { + if (group->selectors[j] == sub_index) + break; + } + + counter->base = group->result_base + j; + counter->stride = group->num_counters; + + counter->qwords = 1; + if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0) + counter->qwords = screen->info.max_se; + if (group->instance < 0) + counter->qwords *= block->num_instances; + } - return (struct pipe_query *)query; + return (struct pipe_query *)query; error: - si_pc_query_destroy((struct si_context *)ctx, &query->b); - return NULL; + si_pc_query_destroy((struct si_context *)ctx, &query->b); + return NULL; } -static bool si_init_block_names(struct si_screen *screen, - struct si_pc_block *block) +static bool si_init_block_names(struct si_screen *screen, struct si_pc_block *block) { - bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block); - bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block); - unsigned i, j, k; - unsigned groups_shader = 1, groups_se = 1, groups_instance = 1; - unsigned namelen; - char *groupname; - char *p; - - if (per_instance_groups) - groups_instance = block->num_instances; - if (per_se_groups) - groups_se = screen->info.max_se; - if (block->b->b->flags & SI_PC_BLOCK_SHADER) - groups_shader = ARRAY_SIZE(si_pc_shader_type_bits); - - namelen = strlen(block->b->b->name); - block->group_name_stride = namelen + 1; - if (block->b->b->flags & SI_PC_BLOCK_SHADER) - block->group_name_stride += 3; - if (per_se_groups) { - assert(groups_se <= 10); - block->group_name_stride += 1; - - if (per_instance_groups) - block->group_name_stride += 1; - } - if (per_instance_groups) { - assert(groups_instance <= 100); - block->group_name_stride += 2; - } - - block->group_names = MALLOC(block->num_groups * block->group_name_stride); - if (!block->group_names) - return false; - - groupname = block->group_names; - for (i = 0; i < groups_shader; ++i) { - const char *shader_suffix = si_pc_shader_type_suffixes[i]; - unsigned shaderlen = strlen(shader_suffix); - for (j = 0; j < groups_se; ++j) { - for (k = 0; k < groups_instance; ++k) { - strcpy(groupname, block->b->b->name); - p = groupname + namelen; - - if (block->b->b->flags & SI_PC_BLOCK_SHADER) { - strcpy(p, shader_suffix); - p += shaderlen; - } - - if (per_se_groups) { - p += sprintf(p, "%d", j); - if (per_instance_groups) - *p++ = '_'; - } - - if (per_instance_groups) - p += sprintf(p, "%d", k); - - groupname += block->group_name_stride; - } - } - } - - assert(block->b->selectors <= 1000); - block->selector_name_stride = block->group_name_stride + 4; - block->selector_names = MALLOC(block->num_groups * 
block->b->selectors * - block->selector_name_stride); - if (!block->selector_names) - return false; - - groupname = block->group_names; - p = block->selector_names; - for (i = 0; i < block->num_groups; ++i) { - for (j = 0; j < block->b->selectors; ++j) { - sprintf(p, "%s_%03d", groupname, j); - p += block->selector_name_stride; - } - groupname += block->group_name_stride; - } - - return true; + bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block); + bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block); + unsigned i, j, k; + unsigned groups_shader = 1, groups_se = 1, groups_instance = 1; + unsigned namelen; + char *groupname; + char *p; + + if (per_instance_groups) + groups_instance = block->num_instances; + if (per_se_groups) + groups_se = screen->info.max_se; + if (block->b->b->flags & SI_PC_BLOCK_SHADER) + groups_shader = ARRAY_SIZE(si_pc_shader_type_bits); + + namelen = strlen(block->b->b->name); + block->group_name_stride = namelen + 1; + if (block->b->b->flags & SI_PC_BLOCK_SHADER) + block->group_name_stride += 3; + if (per_se_groups) { + assert(groups_se <= 10); + block->group_name_stride += 1; + + if (per_instance_groups) + block->group_name_stride += 1; + } + if (per_instance_groups) { + assert(groups_instance <= 100); + block->group_name_stride += 2; + } + + block->group_names = MALLOC(block->num_groups * block->group_name_stride); + if (!block->group_names) + return false; + + groupname = block->group_names; + for (i = 0; i < groups_shader; ++i) { + const char *shader_suffix = si_pc_shader_type_suffixes[i]; + unsigned shaderlen = strlen(shader_suffix); + for (j = 0; j < groups_se; ++j) { + for (k = 0; k < groups_instance; ++k) { + strcpy(groupname, block->b->b->name); + p = groupname + namelen; + + if (block->b->b->flags & SI_PC_BLOCK_SHADER) { + strcpy(p, shader_suffix); + p += shaderlen; + } + + if (per_se_groups) { + p += sprintf(p, "%d", j); + if (per_instance_groups) + *p++ = '_'; + } + + if (per_instance_groups) + p += sprintf(p, "%d", k); + + groupname += block->group_name_stride; + } + } + } + + assert(block->b->selectors <= 1000); + block->selector_name_stride = block->group_name_stride + 4; + block->selector_names = + MALLOC(block->num_groups * block->b->selectors * block->selector_name_stride); + if (!block->selector_names) + return false; + + groupname = block->group_names; + p = block->selector_names; + for (i = 0; i < block->num_groups; ++i) { + for (j = 0; j < block->b->selectors; ++j) { + sprintf(p, "%s_%03d", groupname, j); + p += block->selector_name_stride; + } + groupname += block->group_name_stride; + } + + return true; } -int si_get_perfcounter_info(struct si_screen *screen, - unsigned index, - struct pipe_driver_query_info *info) +int si_get_perfcounter_info(struct si_screen *screen, unsigned index, + struct pipe_driver_query_info *info) { - struct si_perfcounters *pc = screen->perfcounters; - struct si_pc_block *block; - unsigned base_gid, sub; - - if (!pc) - return 0; - - if (!info) { - unsigned bid, num_queries = 0; - - for (bid = 0; bid < pc->num_blocks; ++bid) { - num_queries += pc->blocks[bid].b->selectors * - pc->blocks[bid].num_groups; - } - - return num_queries; - } - - block = lookup_counter(pc, index, &base_gid, &sub); - if (!block) - return 0; - - if (!block->selector_names) { - if (!si_init_block_names(screen, block)) - return 0; - } - info->name = block->selector_names + sub * block->selector_name_stride; - info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index; - 
info->max_value.u64 = 0; - info->type = PIPE_DRIVER_QUERY_TYPE_UINT64; - info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE; - info->group_id = base_gid + sub / block->b->selectors; - info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH; - if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups) - info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST; - return 1; + struct si_perfcounters *pc = screen->perfcounters; + struct si_pc_block *block; + unsigned base_gid, sub; + + if (!pc) + return 0; + + if (!info) { + unsigned bid, num_queries = 0; + + for (bid = 0; bid < pc->num_blocks; ++bid) { + num_queries += pc->blocks[bid].b->selectors * pc->blocks[bid].num_groups; + } + + return num_queries; + } + + block = lookup_counter(pc, index, &base_gid, &sub); + if (!block) + return 0; + + if (!block->selector_names) { + if (!si_init_block_names(screen, block)) + return 0; + } + info->name = block->selector_names + sub * block->selector_name_stride; + info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index; + info->max_value.u64 = 0; + info->type = PIPE_DRIVER_QUERY_TYPE_UINT64; + info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE; + info->group_id = base_gid + sub / block->b->selectors; + info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH; + if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups) + info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST; + return 1; } -int si_get_perfcounter_group_info(struct si_screen *screen, - unsigned index, - struct pipe_driver_query_group_info *info) +int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index, + struct pipe_driver_query_group_info *info) { - struct si_perfcounters *pc = screen->perfcounters; - struct si_pc_block *block; - - if (!pc) - return 0; - - if (!info) - return pc->num_groups; - - block = lookup_group(pc, &index); - if (!block) - return 0; - - if (!block->group_names) { - if (!si_init_block_names(screen, block)) - return 0; - } - info->name = block->group_names + index * block->group_name_stride; - info->num_queries = block->b->selectors; - info->max_active_queries = block->b->b->num_counters; - return 1; + struct si_perfcounters *pc = screen->perfcounters; + struct si_pc_block *block; + + if (!pc) + return 0; + + if (!info) + return pc->num_groups; + + block = lookup_group(pc, &index); + if (!block) + return 0; + + if (!block->group_names) { + if (!si_init_block_names(screen, block)) + return 0; + } + info->name = block->group_names + index * block->group_name_stride; + info->num_queries = block->b->selectors; + info->max_active_queries = block->b->b->num_counters; + return 1; } void si_destroy_perfcounters(struct si_screen *screen) { - struct si_perfcounters *pc = screen->perfcounters; - unsigned i; - - if (!pc) - return; - - for (i = 0; i < pc->num_blocks; ++i) { - FREE(pc->blocks[i].group_names); - FREE(pc->blocks[i].selector_names); - } - FREE(pc->blocks); - FREE(pc); - screen->perfcounters = NULL; + struct si_perfcounters *pc = screen->perfcounters; + unsigned i; + + if (!pc) + return; + + for (i = 0; i < pc->num_blocks; ++i) { + FREE(pc->blocks[i].group_names); + FREE(pc->blocks[i].selector_names); + } + FREE(pc->blocks); + FREE(pc); + screen->perfcounters = NULL; } void si_init_perfcounters(struct si_screen *screen) { - struct si_perfcounters *pc; - const struct si_pc_block_gfxdescr *blocks; - unsigned num_blocks; - unsigned i; - - switch (screen->info.chip_class) { - case GFX7: - blocks = groups_CIK; - num_blocks = ARRAY_SIZE(groups_CIK); - break; - case GFX8: - blocks = groups_VI; - num_blocks = 
ARRAY_SIZE(groups_VI); - break; - case GFX9: - blocks = groups_gfx9; - num_blocks = ARRAY_SIZE(groups_gfx9); - break; - case GFX6: - default: - return; /* not implemented */ - } - - if (screen->info.max_sh_per_se != 1) { - /* This should not happen on non-GFX6 chips. */ - fprintf(stderr, "si_init_perfcounters: max_sh_per_se = %d not " - "supported (inaccurate performance counters)\n", - screen->info.max_sh_per_se); - } - - screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters); - if (!pc) - return; - - pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen); - pc->num_instance_cs_dwords = 3; - - pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false); - pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false); - - pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block)); - if (!pc->blocks) - goto error; - pc->num_blocks = num_blocks; - - for (i = 0; i < num_blocks; ++i) { - struct si_pc_block *block = &pc->blocks[i]; - block->b = &blocks[i]; - block->num_instances = MAX2(1, block->b->instances); - - if (!strcmp(block->b->b->name, "CB") || - !strcmp(block->b->b->name, "DB")) - block->num_instances = screen->info.max_se; - else if (!strcmp(block->b->b->name, "TCC")) - block->num_instances = screen->info.num_tcc_blocks; - else if (!strcmp(block->b->b->name, "IA")) - block->num_instances = MAX2(1, screen->info.max_se / 2); - - if (si_pc_block_has_per_instance_groups(pc, block)) { - block->num_groups = block->num_instances; - } else { - block->num_groups = 1; - } - - if (si_pc_block_has_per_se_groups(pc, block)) - block->num_groups *= screen->info.max_se; - if (block->b->b->flags & SI_PC_BLOCK_SHADER) - block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits); - - pc->num_groups += block->num_groups; - } - - return; + struct si_perfcounters *pc; + const struct si_pc_block_gfxdescr *blocks; + unsigned num_blocks; + unsigned i; + + switch (screen->info.chip_class) { + case GFX7: + blocks = groups_CIK; + num_blocks = ARRAY_SIZE(groups_CIK); + break; + case GFX8: + blocks = groups_VI; + num_blocks = ARRAY_SIZE(groups_VI); + break; + case GFX9: + blocks = groups_gfx9; + num_blocks = ARRAY_SIZE(groups_gfx9); + break; + case GFX6: + default: + return; /* not implemented */ + } + + if (screen->info.max_sh_per_se != 1) { + /* This should not happen on non-GFX6 chips. 
*/ + fprintf(stderr, + "si_init_perfcounters: max_sh_per_se = %d not " + "supported (inaccurate performance counters)\n", + screen->info.max_sh_per_se); + } + + screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters); + if (!pc) + return; + + pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen); + pc->num_instance_cs_dwords = 3; + + pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false); + pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false); + + pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block)); + if (!pc->blocks) + goto error; + pc->num_blocks = num_blocks; + + for (i = 0; i < num_blocks; ++i) { + struct si_pc_block *block = &pc->blocks[i]; + block->b = &blocks[i]; + block->num_instances = MAX2(1, block->b->instances); + + if (!strcmp(block->b->b->name, "CB") || !strcmp(block->b->b->name, "DB")) + block->num_instances = screen->info.max_se; + else if (!strcmp(block->b->b->name, "TCC")) + block->num_instances = screen->info.num_tcc_blocks; + else if (!strcmp(block->b->b->name, "IA")) + block->num_instances = MAX2(1, screen->info.max_se / 2); + + if (si_pc_block_has_per_instance_groups(pc, block)) { + block->num_groups = block->num_instances; + } else { + block->num_groups = 1; + } + + if (si_pc_block_has_per_se_groups(pc, block)) + block->num_groups *= screen->info.max_se; + if (block->b->b->flags & SI_PC_BLOCK_SHADER) + block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits); + + pc->num_groups += block->num_groups; + } + + return; error: - si_destroy_perfcounters(screen); + si_destroy_perfcounters(screen); } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index d900467964b..816015d1f82 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -24,12 +24,15 @@ */ #include "si_pipe.h" + +#include "driver_ddebug/dd_util.h" +#include "gallium/winsys/amdgpu/drm/amdgpu_public.h" +#include "gallium/winsys/radeon/drm/radeon_drm_public.h" +#include "radeon/radeon_uvd.h" +#include "si_compute.h" #include "si_public.h" #include "si_shader_internal.h" -#include "si_compute.h" #include "sid.h" - -#include "radeon/radeon_uvd.h" #include "util/disk_cache.h" #include "util/u_log.h" #include "util/u_memory.h" @@ -38,128 +41,124 @@ #include "util/u_upload_mgr.h" #include "util/xmlconfig.h" #include "vl/vl_decoder.h" -#include "driver_ddebug/dd_util.h" -#include "gallium/winsys/radeon/drm/radeon_drm_public.h" -#include "gallium/winsys/amdgpu/drm/amdgpu_public.h" #include -static struct pipe_context *si_create_context(struct pipe_screen *screen, - unsigned flags); +static struct pipe_context *si_create_context(struct pipe_screen *screen, unsigned flags); static const struct debug_named_value debug_options[] = { - /* Shader logging options: */ - { "vs", DBG(VS), "Print vertex shaders" }, - { "ps", DBG(PS), "Print pixel shaders" }, - { "gs", DBG(GS), "Print geometry shaders" }, - { "tcs", DBG(TCS), "Print tessellation control shaders" }, - { "tes", DBG(TES), "Print tessellation evaluation shaders" }, - { "cs", DBG(CS), "Print compute shaders" }, - { "noir", DBG(NO_IR), "Don't print the LLVM IR"}, - { "nonir", DBG(NO_NIR), "Don't print NIR when printing shaders"}, - { "noasm", DBG(NO_ASM), "Don't print disassembled shaders"}, - { "preoptir", DBG(PREOPT_IR), "Print the LLVM IR before initial optimizations" }, - - /* Shader compiler options the shader cache should be aware of: */ - { "gisel", DBG(GISEL), "Enable LLVM global instruction selector." 
}, - { "w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders." }, - { "w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders." }, - { "w32cs", DBG(W32_CS), "Use Wave32 for computes shaders." }, - { "w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders." }, - { "w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders." }, - { "w64cs", DBG(W64_CS), "Use Wave64 for computes shaders." }, - - /* Shader compiler options (with no effect on the shader cache): */ - { "checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR" }, - { "mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand" }, - { "nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants." }, - - /* Information logging options: */ - { "info", DBG(INFO), "Print driver information" }, - { "tex", DBG(TEX), "Print texture info" }, - { "compute", DBG(COMPUTE), "Print compute info" }, - { "vm", DBG(VM), "Print virtual addresses when creating resources" }, - { "cache_stats", DBG(CACHE_STATS), "Print shader cache statistics." }, - - /* Driver options: */ - { "forcedma", DBG(FORCE_SDMA), "Use SDMA for all operations when possible." }, - { "nodma", DBG(NO_SDMA), "Disable SDMA" }, - { "nodmaclear", DBG(NO_SDMA_CLEARS), "Disable SDMA clears" }, - { "nodmacopyimage", DBG(NO_SDMA_COPY_IMAGE), "Disable SDMA image copies" }, - { "nowc", DBG(NO_WC), "Disable GTT write combining" }, - { "check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info." }, - { "reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context." }, - { "zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations." }, - - /* 3D engine options: */ - { "nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used." }, - { "nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline." }, - { "nggc", DBG(ALWAYS_NGG_CULLING), "Always use NGG culling even when it can hurt." }, - { "nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling." }, - { "alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader." }, - { "pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls." }, - { "nopd", DBG(NO_PD), "Disable the primitive discard compute shader." }, - { "switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet." }, - { "nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization" }, - { "nodpbb", DBG(NO_DPBB), "Disable DPBB." }, - { "nodfsm", DBG(NO_DFSM), "Disable DFSM." }, - { "dpbb", DBG(DPBB), "Enable DPBB." }, - { "dfsm", DBG(DFSM), "Enable DFSM." }, - { "nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z" }, - { "norbplus", DBG(NO_RB_PLUS), "Disable RB+." }, - { "no2d", DBG(NO_2D_TILING), "Disable 2D tiling" }, - { "notiling", DBG(NO_TILING), "Disable tiling" }, - { "nodcc", DBG(NO_DCC), "Disable DCC." }, - { "nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear." 
}, - { "nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer" }, - { "nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA" }, - { "nofmask", DBG(NO_FMASK), "Disable MSAA compression" }, - - DEBUG_NAMED_VALUE_END /* must be last */ + /* Shader logging options: */ + {"vs", DBG(VS), "Print vertex shaders"}, + {"ps", DBG(PS), "Print pixel shaders"}, + {"gs", DBG(GS), "Print geometry shaders"}, + {"tcs", DBG(TCS), "Print tessellation control shaders"}, + {"tes", DBG(TES), "Print tessellation evaluation shaders"}, + {"cs", DBG(CS), "Print compute shaders"}, + {"noir", DBG(NO_IR), "Don't print the LLVM IR"}, + {"nonir", DBG(NO_NIR), "Don't print NIR when printing shaders"}, + {"noasm", DBG(NO_ASM), "Don't print disassembled shaders"}, + {"preoptir", DBG(PREOPT_IR), "Print the LLVM IR before initial optimizations"}, + + /* Shader compiler options the shader cache should be aware of: */ + {"gisel", DBG(GISEL), "Enable LLVM global instruction selector."}, + {"w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders."}, + {"w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders."}, + {"w32cs", DBG(W32_CS), "Use Wave32 for computes shaders."}, + {"w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders."}, + {"w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders."}, + {"w64cs", DBG(W64_CS), "Use Wave64 for computes shaders."}, + + /* Shader compiler options (with no effect on the shader cache): */ + {"checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR"}, + {"mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand"}, + {"nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants."}, + + /* Information logging options: */ + {"info", DBG(INFO), "Print driver information"}, + {"tex", DBG(TEX), "Print texture info"}, + {"compute", DBG(COMPUTE), "Print compute info"}, + {"vm", DBG(VM), "Print virtual addresses when creating resources"}, + {"cache_stats", DBG(CACHE_STATS), "Print shader cache statistics."}, + + /* Driver options: */ + {"forcedma", DBG(FORCE_SDMA), "Use SDMA for all operations when possible."}, + {"nodma", DBG(NO_SDMA), "Disable SDMA"}, + {"nodmaclear", DBG(NO_SDMA_CLEARS), "Disable SDMA clears"}, + {"nodmacopyimage", DBG(NO_SDMA_COPY_IMAGE), "Disable SDMA image copies"}, + {"nowc", DBG(NO_WC), "Disable GTT write combining"}, + {"check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info."}, + {"reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context."}, + {"zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations."}, + + /* 3D engine options: */ + {"nogfx", DBG(NO_GFX), "Disable graphics. 
Only multimedia compute paths can be used."}, + {"nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline."}, + {"nggc", DBG(ALWAYS_NGG_CULLING), "Always use NGG culling even when it can hurt."}, + {"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."}, + {"alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader."}, + {"pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls."}, + {"nopd", DBG(NO_PD), "Disable the primitive discard compute shader."}, + {"switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet."}, + {"nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization"}, + {"nodpbb", DBG(NO_DPBB), "Disable DPBB."}, + {"nodfsm", DBG(NO_DFSM), "Disable DFSM."}, + {"dpbb", DBG(DPBB), "Enable DPBB."}, + {"dfsm", DBG(DFSM), "Enable DFSM."}, + {"nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z"}, + {"norbplus", DBG(NO_RB_PLUS), "Disable RB+."}, + {"no2d", DBG(NO_2D_TILING), "Disable 2D tiling"}, + {"notiling", DBG(NO_TILING), "Disable tiling"}, + {"nodcc", DBG(NO_DCC), "Disable DCC."}, + {"nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear."}, + {"nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer"}, + {"nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA"}, + {"nofmask", DBG(NO_FMASK), "Disable MSAA compression"}, + + DEBUG_NAMED_VALUE_END /* must be last */ }; static const struct debug_named_value test_options[] = { - /* Tests: */ - { "testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit." }, - { "testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit." }, - { "testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit." }, - { "testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit." }, - { "testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance" }, - { "testgds", DBG(TEST_GDS), "Test GDS." }, - { "testgdsmm", DBG(TEST_GDS_MM), "Test GDS memory management." }, - { "testgdsoamm", DBG(TEST_GDS_OA_MM), "Test GDS OA memory management." }, - - DEBUG_NAMED_VALUE_END /* must be last */ + /* Tests: */ + {"testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit."}, + {"testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit."}, + {"testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit."}, + {"testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit."}, + {"testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance"}, + {"testgds", DBG(TEST_GDS), "Test GDS."}, + {"testgdsmm", DBG(TEST_GDS_MM), "Test GDS memory management."}, + {"testgdsoamm", DBG(TEST_GDS_OA_MM), "Test GDS OA memory management."}, + + DEBUG_NAMED_VALUE_END /* must be last */ }; void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler) { - /* Only create the less-optimizing version of the compiler on APUs - * predating Ryzen (Raven). */ - bool create_low_opt_compiler = !sscreen->info.has_dedicated_vram && - sscreen->info.chip_class <= GFX8; - - enum ac_target_machine_options tm_options = - (sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) | - (sscreen->info.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 0) | - (sscreen->info.chip_class < GFX9 ? AC_TM_FORCE_DISABLE_XNACK : 0) | - (!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) | - (sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) | - (create_low_opt_compiler ? 
AC_TM_CREATE_LOW_OPT : 0); - - ac_init_llvm_once(); - ac_init_llvm_compiler(compiler, sscreen->info.family, tm_options); - compiler->passes = ac_create_llvm_passes(compiler->tm); - - if (compiler->tm_wave32) - compiler->passes_wave32 = ac_create_llvm_passes(compiler->tm_wave32); - if (compiler->low_opt_tm) - compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm); + /* Only create the less-optimizing version of the compiler on APUs + * predating Ryzen (Raven). */ + bool create_low_opt_compiler = + !sscreen->info.has_dedicated_vram && sscreen->info.chip_class <= GFX8; + + enum ac_target_machine_options tm_options = + (sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) | + (sscreen->info.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 0) | + (sscreen->info.chip_class < GFX9 ? AC_TM_FORCE_DISABLE_XNACK : 0) | + (!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) | + (sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) | + (create_low_opt_compiler ? AC_TM_CREATE_LOW_OPT : 0); + + ac_init_llvm_once(); + ac_init_llvm_compiler(compiler, sscreen->info.family, tm_options); + compiler->passes = ac_create_llvm_passes(compiler->tm); + + if (compiler->tm_wave32) + compiler->passes_wave32 = ac_create_llvm_passes(compiler->tm_wave32); + if (compiler->low_opt_tm) + compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm); } static void si_destroy_compiler(struct ac_llvm_compiler *compiler) { - ac_destroy_llvm_compiler(compiler); + ac_destroy_llvm_compiler(compiler); } /* @@ -167,195 +166,191 @@ static void si_destroy_compiler(struct ac_llvm_compiler *compiler) */ static void si_destroy_context(struct pipe_context *context) { - struct si_context *sctx = (struct si_context *)context; - int i; - - /* Unreference the framebuffer normally to disable related logic - * properly. 
- */ - struct pipe_framebuffer_state fb = {}; - if (context->set_framebuffer_state) - context->set_framebuffer_state(context, &fb); - - si_release_all_descriptors(sctx); - - if (sctx->chip_class >= GFX10 && sctx->has_graphics) - gfx10_destroy_query(sctx); - - pipe_resource_reference(&sctx->esgs_ring, NULL); - pipe_resource_reference(&sctx->gsvs_ring, NULL); - pipe_resource_reference(&sctx->tess_rings, NULL); - pipe_resource_reference(&sctx->null_const_buf.buffer, NULL); - pipe_resource_reference(&sctx->sample_pos_buffer, NULL); - si_resource_reference(&sctx->border_color_buffer, NULL); - free(sctx->border_color_table); - si_resource_reference(&sctx->scratch_buffer, NULL); - si_resource_reference(&sctx->compute_scratch_buffer, NULL); - si_resource_reference(&sctx->wait_mem_scratch, NULL); - si_resource_reference(&sctx->small_prim_cull_info_buf, NULL); - - si_pm4_free_state(sctx, sctx->init_config, ~0); - if (sctx->init_config_gs_rings) - si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0); - for (i = 0; i < ARRAY_SIZE(sctx->vgt_shader_config); i++) - si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]); - - if (sctx->fixed_func_tcs_shader.cso) - sctx->b.delete_tcs_state(&sctx->b, sctx->fixed_func_tcs_shader.cso); - if (sctx->custom_dsa_flush) - sctx->b.delete_depth_stencil_alpha_state(&sctx->b, sctx->custom_dsa_flush); - if (sctx->custom_blend_resolve) - sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_resolve); - if (sctx->custom_blend_fmask_decompress) - sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_fmask_decompress); - if (sctx->custom_blend_eliminate_fastclear) - sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_eliminate_fastclear); - if (sctx->custom_blend_dcc_decompress) - sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_dcc_decompress); - if (sctx->vs_blit_pos) - sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos); - if (sctx->vs_blit_pos_layered) - sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos_layered); - if (sctx->vs_blit_color) - sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color); - if (sctx->vs_blit_color_layered) - sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color_layered); - if (sctx->vs_blit_texcoord) - sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord); - if (sctx->cs_clear_buffer) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer); - if (sctx->cs_copy_buffer) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer); - if (sctx->cs_copy_image) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image); - if (sctx->cs_copy_image_1d_array) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1d_array); - if (sctx->cs_clear_render_target) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target); - if (sctx->cs_clear_render_target_1d_array) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array); - if (sctx->cs_clear_12bytes_buffer) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer); - if (sctx->cs_dcc_retile) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile); - - for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_fmask_expand); i++) { - for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_fmask_expand[i]); j++) { - if (sctx->cs_fmask_expand[i][j]) { - sctx->b.delete_compute_state(&sctx->b, - sctx->cs_fmask_expand[i][j]); - } - } - } - - if (sctx->blitter) - util_blitter_destroy(sctx->blitter); - - /* Release DCC stats. 
*/ - for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) { - assert(!sctx->dcc_stats[i].query_active); - - for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++) - if (sctx->dcc_stats[i].ps_stats[j]) - sctx->b.destroy_query(&sctx->b, - sctx->dcc_stats[i].ps_stats[j]); - - si_texture_reference(&sctx->dcc_stats[i].tex, NULL); - } - - if (sctx->query_result_shader) - sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader); - if (sctx->sh_query_result_shader) - sctx->b.delete_compute_state(&sctx->b, sctx->sh_query_result_shader); - - if (sctx->gfx_cs) - sctx->ws->cs_destroy(sctx->gfx_cs); - if (sctx->sdma_cs) - sctx->ws->cs_destroy(sctx->sdma_cs); - if (sctx->ctx) - sctx->ws->ctx_destroy(sctx->ctx); - - if (sctx->b.stream_uploader) - u_upload_destroy(sctx->b.stream_uploader); - if (sctx->b.const_uploader) - u_upload_destroy(sctx->b.const_uploader); - if (sctx->cached_gtt_allocator) - u_upload_destroy(sctx->cached_gtt_allocator); - - slab_destroy_child(&sctx->pool_transfers); - slab_destroy_child(&sctx->pool_transfers_unsync); - - if (sctx->allocator_zeroed_memory) - u_suballocator_destroy(sctx->allocator_zeroed_memory); - - sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL); - sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL); - sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL); - si_resource_reference(&sctx->eop_bug_scratch, NULL); - si_resource_reference(&sctx->index_ring, NULL); - si_resource_reference(&sctx->barrier_buf, NULL); - si_resource_reference(&sctx->last_ib_barrier_buf, NULL); - pb_reference(&sctx->gds, NULL); - pb_reference(&sctx->gds_oa, NULL); - - si_destroy_compiler(&sctx->compiler); - - si_saved_cs_reference(&sctx->current_saved_cs, NULL); - - _mesa_hash_table_destroy(sctx->tex_handles, NULL); - _mesa_hash_table_destroy(sctx->img_handles, NULL); - - util_dynarray_fini(&sctx->resident_tex_handles); - util_dynarray_fini(&sctx->resident_img_handles); - util_dynarray_fini(&sctx->resident_tex_needs_color_decompress); - util_dynarray_fini(&sctx->resident_img_needs_color_decompress); - util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress); - si_unref_sdma_uploads(sctx); - free(sctx->sdma_uploads); - FREE(sctx); + struct si_context *sctx = (struct si_context *)context; + int i; + + /* Unreference the framebuffer normally to disable related logic + * properly. 
+ */ + struct pipe_framebuffer_state fb = {}; + if (context->set_framebuffer_state) + context->set_framebuffer_state(context, &fb); + + si_release_all_descriptors(sctx); + + if (sctx->chip_class >= GFX10 && sctx->has_graphics) + gfx10_destroy_query(sctx); + + pipe_resource_reference(&sctx->esgs_ring, NULL); + pipe_resource_reference(&sctx->gsvs_ring, NULL); + pipe_resource_reference(&sctx->tess_rings, NULL); + pipe_resource_reference(&sctx->null_const_buf.buffer, NULL); + pipe_resource_reference(&sctx->sample_pos_buffer, NULL); + si_resource_reference(&sctx->border_color_buffer, NULL); + free(sctx->border_color_table); + si_resource_reference(&sctx->scratch_buffer, NULL); + si_resource_reference(&sctx->compute_scratch_buffer, NULL); + si_resource_reference(&sctx->wait_mem_scratch, NULL); + si_resource_reference(&sctx->small_prim_cull_info_buf, NULL); + + si_pm4_free_state(sctx, sctx->init_config, ~0); + if (sctx->init_config_gs_rings) + si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0); + for (i = 0; i < ARRAY_SIZE(sctx->vgt_shader_config); i++) + si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]); + + if (sctx->fixed_func_tcs_shader.cso) + sctx->b.delete_tcs_state(&sctx->b, sctx->fixed_func_tcs_shader.cso); + if (sctx->custom_dsa_flush) + sctx->b.delete_depth_stencil_alpha_state(&sctx->b, sctx->custom_dsa_flush); + if (sctx->custom_blend_resolve) + sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_resolve); + if (sctx->custom_blend_fmask_decompress) + sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_fmask_decompress); + if (sctx->custom_blend_eliminate_fastclear) + sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_eliminate_fastclear); + if (sctx->custom_blend_dcc_decompress) + sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_dcc_decompress); + if (sctx->vs_blit_pos) + sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos); + if (sctx->vs_blit_pos_layered) + sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos_layered); + if (sctx->vs_blit_color) + sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color); + if (sctx->vs_blit_color_layered) + sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color_layered); + if (sctx->vs_blit_texcoord) + sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord); + if (sctx->cs_clear_buffer) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer); + if (sctx->cs_copy_buffer) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer); + if (sctx->cs_copy_image) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image); + if (sctx->cs_copy_image_1d_array) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1d_array); + if (sctx->cs_clear_render_target) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target); + if (sctx->cs_clear_render_target_1d_array) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array); + if (sctx->cs_clear_12bytes_buffer) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer); + if (sctx->cs_dcc_retile) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile); + + for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_fmask_expand); i++) { + for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_fmask_expand[i]); j++) { + if (sctx->cs_fmask_expand[i][j]) { + sctx->b.delete_compute_state(&sctx->b, sctx->cs_fmask_expand[i][j]); + } + } + } + + if (sctx->blitter) + util_blitter_destroy(sctx->blitter); + + /* Release DCC stats. 
*/ + for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) { + assert(!sctx->dcc_stats[i].query_active); + + for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++) + if (sctx->dcc_stats[i].ps_stats[j]) + sctx->b.destroy_query(&sctx->b, sctx->dcc_stats[i].ps_stats[j]); + + si_texture_reference(&sctx->dcc_stats[i].tex, NULL); + } + + if (sctx->query_result_shader) + sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader); + if (sctx->sh_query_result_shader) + sctx->b.delete_compute_state(&sctx->b, sctx->sh_query_result_shader); + + if (sctx->gfx_cs) + sctx->ws->cs_destroy(sctx->gfx_cs); + if (sctx->sdma_cs) + sctx->ws->cs_destroy(sctx->sdma_cs); + if (sctx->ctx) + sctx->ws->ctx_destroy(sctx->ctx); + + if (sctx->b.stream_uploader) + u_upload_destroy(sctx->b.stream_uploader); + if (sctx->b.const_uploader) + u_upload_destroy(sctx->b.const_uploader); + if (sctx->cached_gtt_allocator) + u_upload_destroy(sctx->cached_gtt_allocator); + + slab_destroy_child(&sctx->pool_transfers); + slab_destroy_child(&sctx->pool_transfers_unsync); + + if (sctx->allocator_zeroed_memory) + u_suballocator_destroy(sctx->allocator_zeroed_memory); + + sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL); + sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL); + sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL); + si_resource_reference(&sctx->eop_bug_scratch, NULL); + si_resource_reference(&sctx->index_ring, NULL); + si_resource_reference(&sctx->barrier_buf, NULL); + si_resource_reference(&sctx->last_ib_barrier_buf, NULL); + pb_reference(&sctx->gds, NULL); + pb_reference(&sctx->gds_oa, NULL); + + si_destroy_compiler(&sctx->compiler); + + si_saved_cs_reference(&sctx->current_saved_cs, NULL); + + _mesa_hash_table_destroy(sctx->tex_handles, NULL); + _mesa_hash_table_destroy(sctx->img_handles, NULL); + + util_dynarray_fini(&sctx->resident_tex_handles); + util_dynarray_fini(&sctx->resident_img_handles); + util_dynarray_fini(&sctx->resident_tex_needs_color_decompress); + util_dynarray_fini(&sctx->resident_img_needs_color_decompress); + util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress); + si_unref_sdma_uploads(sctx); + free(sctx->sdma_uploads); + FREE(sctx); } static enum pipe_reset_status si_get_reset_status(struct pipe_context *ctx) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_screen *sscreen = sctx->screen; - enum pipe_reset_status status = sctx->ws->ctx_query_reset_status(sctx->ctx); - - if (status != PIPE_NO_RESET) { - /* Call the state tracker to set a no-op API dispatch. */ - if (sctx->device_reset_callback.reset) { - sctx->device_reset_callback.reset(sctx->device_reset_callback.data, - status); - } - - /* Re-create the auxiliary context, because it won't submit - * any new IBs due to a GPU reset. - */ - simple_mtx_lock(&sscreen->aux_context_lock); - - struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log; - sscreen->aux_context->set_log_context(sscreen->aux_context, NULL); - sscreen->aux_context->destroy(sscreen->aux_context); - - sscreen->aux_context = si_create_context(&sscreen->b, - (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) | - (sscreen->info.has_graphics ? 
0 : PIPE_CONTEXT_COMPUTE_ONLY)); - sscreen->aux_context->set_log_context(sscreen->aux_context, aux_log); - simple_mtx_unlock(&sscreen->aux_context_lock); - } - return status; + struct si_context *sctx = (struct si_context *)ctx; + struct si_screen *sscreen = sctx->screen; + enum pipe_reset_status status = sctx->ws->ctx_query_reset_status(sctx->ctx); + + if (status != PIPE_NO_RESET) { + /* Call the state tracker to set a no-op API dispatch. */ + if (sctx->device_reset_callback.reset) { + sctx->device_reset_callback.reset(sctx->device_reset_callback.data, status); + } + + /* Re-create the auxiliary context, because it won't submit + * any new IBs due to a GPU reset. + */ + simple_mtx_lock(&sscreen->aux_context_lock); + + struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log; + sscreen->aux_context->set_log_context(sscreen->aux_context, NULL); + sscreen->aux_context->destroy(sscreen->aux_context); + + sscreen->aux_context = si_create_context( + &sscreen->b, (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) | + (sscreen->info.has_graphics ? 0 : PIPE_CONTEXT_COMPUTE_ONLY)); + sscreen->aux_context->set_log_context(sscreen->aux_context, aux_log); + simple_mtx_unlock(&sscreen->aux_context_lock); + } + return status; } static void si_set_device_reset_callback(struct pipe_context *ctx, - const struct pipe_device_reset_callback *cb) + const struct pipe_device_reset_callback *cb) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (cb) - sctx->device_reset_callback = *cb; - else - memset(&sctx->device_reset_callback, 0, - sizeof(sctx->device_reset_callback)); + if (cb) + sctx->device_reset_callback = *cb; + else + memset(&sctx->device_reset_callback, 0, sizeof(sctx->device_reset_callback)); } /* Apitrace profiling: @@ -366,989 +361,895 @@ static void si_set_device_reset_callback(struct pipe_context *ctx, * call and print the results. * 4) glretrace --benchmark --markers .. 
*/ -static void si_emit_string_marker(struct pipe_context *ctx, - const char *string, int len) +static void si_emit_string_marker(struct pipe_context *ctx, const char *string, int len) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - dd_parse_apitrace_marker(string, len, &sctx->apitrace_call_number); + dd_parse_apitrace_marker(string, len, &sctx->apitrace_call_number); - if (sctx->log) - u_log_printf(sctx->log, "\nString marker: %*s\n", len, string); + if (sctx->log) + u_log_printf(sctx->log, "\nString marker: %*s\n", len, string); } -static void si_set_debug_callback(struct pipe_context *ctx, - const struct pipe_debug_callback *cb) +static void si_set_debug_callback(struct pipe_context *ctx, const struct pipe_debug_callback *cb) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_screen *screen = sctx->screen; + struct si_context *sctx = (struct si_context *)ctx; + struct si_screen *screen = sctx->screen; - util_queue_finish(&screen->shader_compiler_queue); - util_queue_finish(&screen->shader_compiler_queue_low_priority); + util_queue_finish(&screen->shader_compiler_queue); + util_queue_finish(&screen->shader_compiler_queue_low_priority); - if (cb) - sctx->debug = *cb; - else - memset(&sctx->debug, 0, sizeof(sctx->debug)); + if (cb) + sctx->debug = *cb; + else + memset(&sctx->debug, 0, sizeof(sctx->debug)); } -static void si_set_log_context(struct pipe_context *ctx, - struct u_log_context *log) +static void si_set_log_context(struct pipe_context *ctx, struct u_log_context *log) { - struct si_context *sctx = (struct si_context *)ctx; - sctx->log = log; + struct si_context *sctx = (struct si_context *)ctx; + sctx->log = log; - if (log) - u_log_add_auto_logger(log, si_auto_log_cs, sctx); + if (log) + u_log_add_auto_logger(log, si_auto_log_cs, sctx); } -static void si_set_context_param(struct pipe_context *ctx, - enum pipe_context_param param, - unsigned value) +static void si_set_context_param(struct pipe_context *ctx, enum pipe_context_param param, + unsigned value) { - struct radeon_winsys *ws = ((struct si_context *)ctx)->ws; - - switch (param) { - case PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE: - ws->pin_threads_to_L3_cache(ws, value); - break; - default:; - } + struct radeon_winsys *ws = ((struct si_context *)ctx)->ws; + + switch (param) { + case PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE: + ws->pin_threads_to_L3_cache(ws, value); + break; + default:; + } } -static struct pipe_context *si_create_context(struct pipe_screen *screen, - unsigned flags) +static struct pipe_context *si_create_context(struct pipe_screen *screen, unsigned flags) { - struct si_screen* sscreen = (struct si_screen *)screen; - STATIC_ASSERT(DBG_COUNT <= 64); - - /* Don't create a context if it's not compute-only and hw is compute-only. */ - if (!sscreen->info.has_graphics && - !(flags & PIPE_CONTEXT_COMPUTE_ONLY)) - return NULL; - - struct si_context *sctx = CALLOC_STRUCT(si_context); - struct radeon_winsys *ws = sscreen->ws; - int shader, i; - bool stop_exec_on_failure = (flags & PIPE_CONTEXT_LOSE_CONTEXT_ON_RESET) != 0; - - if (!sctx) - return NULL; - - sctx->has_graphics = sscreen->info.chip_class == GFX6 || - !(flags & PIPE_CONTEXT_COMPUTE_ONLY); - - if (flags & PIPE_CONTEXT_DEBUG) - sscreen->record_llvm_ir = true; /* racy but not critical */ - - sctx->b.screen = screen; /* this must be set first */ - sctx->b.priv = NULL; - sctx->b.destroy = si_destroy_context; - sctx->screen = sscreen; /* Easy accessing of screen/winsys. 
*/ - sctx->is_debug = (flags & PIPE_CONTEXT_DEBUG) != 0; - - slab_create_child(&sctx->pool_transfers, &sscreen->pool_transfers); - slab_create_child(&sctx->pool_transfers_unsync, &sscreen->pool_transfers); - - sctx->ws = sscreen->ws; - sctx->family = sscreen->info.family; - sctx->chip_class = sscreen->info.chip_class; - - if (sctx->chip_class == GFX7 || - sctx->chip_class == GFX8 || - sctx->chip_class == GFX9) { - sctx->eop_bug_scratch = si_resource( - pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT, - 16 * sscreen->info.num_render_backends)); - if (!sctx->eop_bug_scratch) - goto fail; - } - - /* Initialize context allocators. */ - sctx->allocator_zeroed_memory = - u_suballocator_create(&sctx->b, 128 * 1024, - 0, PIPE_USAGE_DEFAULT, - SI_RESOURCE_FLAG_UNMAPPABLE | - SI_RESOURCE_FLAG_CLEAR, false); - if (!sctx->allocator_zeroed_memory) - goto fail; - - sctx->b.stream_uploader = u_upload_create(&sctx->b, 1024 * 1024, - 0, PIPE_USAGE_STREAM, - SI_RESOURCE_FLAG_READ_ONLY); - if (!sctx->b.stream_uploader) - goto fail; - - sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024, - 0, PIPE_USAGE_STAGING, 0); - if (!sctx->cached_gtt_allocator) - goto fail; - - sctx->ctx = sctx->ws->ctx_create(sctx->ws); - if (!sctx->ctx) - goto fail; - - if (sscreen->info.num_rings[RING_DMA] && - !(sscreen->debug_flags & DBG(NO_SDMA)) && - /* SDMA causes corruption on RX 580: - * https://gitlab.freedesktop.org/mesa/mesa/issues/1399 - * https://gitlab.freedesktop.org/mesa/mesa/issues/1889 - */ - (sctx->chip_class != GFX8 || sscreen->debug_flags & DBG(FORCE_SDMA)) && - /* SDMA timeouts sometimes on gfx10 so disable it for now. See: - * https://bugs.freedesktop.org/show_bug.cgi?id=111481 - * https://gitlab.freedesktop.org/mesa/mesa/issues/1907 - */ - (sctx->chip_class != GFX10 || sscreen->debug_flags & DBG(FORCE_SDMA))) { - sctx->sdma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA, - (void*)si_flush_dma_cs, - sctx, stop_exec_on_failure); - } - - bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->sdma_cs; - sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024, - 0, PIPE_USAGE_DEFAULT, - SI_RESOURCE_FLAG_32BIT | - (use_sdma_upload ? - SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA : 0)); - if (!sctx->b.const_uploader) - goto fail; - - if (use_sdma_upload) - u_upload_enable_flush_explicit(sctx->b.const_uploader); - - sctx->gfx_cs = ws->cs_create(sctx->ctx, - sctx->has_graphics ? RING_GFX : RING_COMPUTE, - (void*)si_flush_gfx_cs, sctx, stop_exec_on_failure); - - /* Border colors. */ - sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS * - sizeof(*sctx->border_color_table)); - if (!sctx->border_color_table) - goto fail; - - sctx->border_color_buffer = si_resource( - pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, - SI_MAX_BORDER_COLORS * - sizeof(*sctx->border_color_table))); - if (!sctx->border_color_buffer) - goto fail; - - sctx->border_color_map = - ws->buffer_map(sctx->border_color_buffer->buf, - NULL, PIPE_TRANSFER_WRITE); - if (!sctx->border_color_map) - goto fail; - - sctx->ngg = sscreen->use_ngg; - - /* Initialize context functions used by graphics and compute. 
*/ - if (sctx->chip_class >= GFX10) - sctx->emit_cache_flush = gfx10_emit_cache_flush; - else - sctx->emit_cache_flush = si_emit_cache_flush; - - sctx->b.emit_string_marker = si_emit_string_marker; - sctx->b.set_debug_callback = si_set_debug_callback; - sctx->b.set_log_context = si_set_log_context; - sctx->b.set_context_param = si_set_context_param; - sctx->b.get_device_reset_status = si_get_reset_status; - sctx->b.set_device_reset_callback = si_set_device_reset_callback; - - si_init_all_descriptors(sctx); - si_init_buffer_functions(sctx); - si_init_clear_functions(sctx); - si_init_blit_functions(sctx); - si_init_compute_functions(sctx); - si_init_compute_blit_functions(sctx); - si_init_debug_functions(sctx); - si_init_fence_functions(sctx); - si_init_query_functions(sctx); - si_init_state_compute_functions(sctx); - si_init_context_texture_functions(sctx); - - /* Initialize graphics-only context functions. */ - if (sctx->has_graphics) { - if (sctx->chip_class >= GFX10) - gfx10_init_query(sctx); - si_init_msaa_functions(sctx); - si_init_shader_functions(sctx); - si_init_state_functions(sctx); - si_init_streamout_functions(sctx); - si_init_viewport_functions(sctx); - - sctx->blitter = util_blitter_create(&sctx->b); - if (sctx->blitter == NULL) - goto fail; - sctx->blitter->skip_viewport_restore = true; - - /* Some states are expected to be always non-NULL. */ - sctx->noop_blend = util_blitter_get_noop_blend_state(sctx->blitter); - sctx->queued.named.blend = sctx->noop_blend; - - sctx->noop_dsa = util_blitter_get_noop_dsa_state(sctx->blitter); - sctx->queued.named.dsa = sctx->noop_dsa; - - sctx->discard_rasterizer_state = - util_blitter_get_discard_rasterizer_state(sctx->blitter); - sctx->queued.named.rasterizer = sctx->discard_rasterizer_state; - - si_init_draw_functions(sctx); - - /* If aux_context == NULL, we are initializing aux_context right now. */ - bool is_aux_context = !sscreen->aux_context; - si_initialize_prim_discard_tunables(sscreen, is_aux_context, - &sctx->prim_discard_vertex_count_threshold, - &sctx->index_ring_size_per_ib); - } - - /* Initialize SDMA functions. */ - if (sctx->chip_class >= GFX7) - cik_init_sdma_functions(sctx); - else - sctx->dma_copy = si_resource_copy_region; - - if (sscreen->debug_flags & DBG(FORCE_SDMA)) - sctx->b.resource_copy_region = sctx->dma_copy; - - sctx->sample_mask = 0xffff; - - /* Initialize multimedia functions. */ - if (sscreen->info.has_hw_decode) { - sctx->b.create_video_codec = si_uvd_create_decoder; - sctx->b.create_video_buffer = si_video_buffer_create; - } else { - sctx->b.create_video_codec = vl_create_decoder; - sctx->b.create_video_buffer = vl_video_buffer_create; - } - - if (sctx->chip_class >= GFX9 || - si_compute_prim_discard_enabled(sctx)) { - sctx->wait_mem_scratch = si_resource( - pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 8)); - if (!sctx->wait_mem_scratch) - goto fail; - - /* Initialize the memory. */ - si_cp_write_data(sctx, sctx->wait_mem_scratch, 0, 4, - V_370_MEM, V_370_ME, &sctx->wait_mem_number); - } - - /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD doesn't skip loads - * if NUM_RECORDS == 0). We need to use a dummy buffer instead. */ - if (sctx->chip_class == GFX7) { - sctx->null_const_buf.buffer = - pipe_aligned_buffer_create(screen, - SI_RESOURCE_FLAG_32BIT, - PIPE_USAGE_DEFAULT, 16, - sctx->screen->info.tcc_cache_line_size); - if (!sctx->null_const_buf.buffer) - goto fail; - sctx->null_const_buf.buffer_size = sctx->null_const_buf.buffer->width0; - - unsigned start_shader = sctx->has_graphics ? 
0 : PIPE_SHADER_COMPUTE; - for (shader = start_shader; shader < SI_NUM_SHADERS; shader++) { - for (i = 0; i < SI_NUM_CONST_BUFFERS; i++) { - sctx->b.set_constant_buffer(&sctx->b, shader, i, - &sctx->null_const_buf); - } - } - - si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, - &sctx->null_const_buf); - si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, - &sctx->null_const_buf); - si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, - &sctx->null_const_buf); - si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, - &sctx->null_const_buf); - si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, - &sctx->null_const_buf); - } - - uint64_t max_threads_per_block; - screen->get_compute_param(screen, PIPE_SHADER_IR_NIR, - PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK, - &max_threads_per_block); - - /* The maximum number of scratch waves. Scratch space isn't divided - * evenly between CUs. The number is only a function of the number of CUs. - * We can decrease the constant to decrease the scratch buffer size. - * - * sctx->scratch_waves must be >= the maximum posible size of - * 1 threadgroup, so that the hw doesn't hang from being unable - * to start any. - * - * The recommended value is 4 per CU at most. Higher numbers don't - * bring much benefit, but they still occupy chip resources (think - * async compute). I've seen ~2% performance difference between 4 and 32. - */ - sctx->scratch_waves = MAX2(32 * sscreen->info.num_good_compute_units, - max_threads_per_block / 64); - - /* Bindless handles. */ - sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); - sctx->img_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); - - util_dynarray_init(&sctx->resident_tex_handles, NULL); - util_dynarray_init(&sctx->resident_img_handles, NULL); - util_dynarray_init(&sctx->resident_tex_needs_color_decompress, NULL); - util_dynarray_init(&sctx->resident_img_needs_color_decompress, NULL); - util_dynarray_init(&sctx->resident_tex_needs_depth_decompress, NULL); - - sctx->sample_pos_buffer = - pipe_buffer_create(sctx->b.screen, 0, PIPE_USAGE_DEFAULT, - sizeof(sctx->sample_positions)); - pipe_buffer_write(&sctx->b, sctx->sample_pos_buffer, 0, - sizeof(sctx->sample_positions), &sctx->sample_positions); - - /* this must be last */ - si_begin_new_gfx_cs(sctx); - - if (sctx->chip_class == GFX7) { - /* Clear the NULL constant buffer, because loads should return zeros. - * Note that this forces CP DMA to be used, because clover deadlocks - * for some reason when the compute codepath is used. - */ - uint32_t clear_value = 0; - si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0, - sctx->null_const_buf.buffer->width0, - &clear_value, 4, SI_COHERENCY_SHADER, true); - } - return &sctx->b; + struct si_screen *sscreen = (struct si_screen *)screen; + STATIC_ASSERT(DBG_COUNT <= 64); + + /* Don't create a context if it's not compute-only and hw is compute-only. 
*/ + if (!sscreen->info.has_graphics && !(flags & PIPE_CONTEXT_COMPUTE_ONLY)) + return NULL; + + struct si_context *sctx = CALLOC_STRUCT(si_context); + struct radeon_winsys *ws = sscreen->ws; + int shader, i; + bool stop_exec_on_failure = (flags & PIPE_CONTEXT_LOSE_CONTEXT_ON_RESET) != 0; + + if (!sctx) + return NULL; + + sctx->has_graphics = sscreen->info.chip_class == GFX6 || !(flags & PIPE_CONTEXT_COMPUTE_ONLY); + + if (flags & PIPE_CONTEXT_DEBUG) + sscreen->record_llvm_ir = true; /* racy but not critical */ + + sctx->b.screen = screen; /* this must be set first */ + sctx->b.priv = NULL; + sctx->b.destroy = si_destroy_context; + sctx->screen = sscreen; /* Easy accessing of screen/winsys. */ + sctx->is_debug = (flags & PIPE_CONTEXT_DEBUG) != 0; + + slab_create_child(&sctx->pool_transfers, &sscreen->pool_transfers); + slab_create_child(&sctx->pool_transfers_unsync, &sscreen->pool_transfers); + + sctx->ws = sscreen->ws; + sctx->family = sscreen->info.family; + sctx->chip_class = sscreen->info.chip_class; + + if (sctx->chip_class == GFX7 || sctx->chip_class == GFX8 || sctx->chip_class == GFX9) { + sctx->eop_bug_scratch = si_resource(pipe_buffer_create( + &sscreen->b, 0, PIPE_USAGE_DEFAULT, 16 * sscreen->info.num_render_backends)); + if (!sctx->eop_bug_scratch) + goto fail; + } + + /* Initialize context allocators. */ + sctx->allocator_zeroed_memory = + u_suballocator_create(&sctx->b, 128 * 1024, 0, PIPE_USAGE_DEFAULT, + SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_CLEAR, false); + if (!sctx->allocator_zeroed_memory) + goto fail; + + sctx->b.stream_uploader = + u_upload_create(&sctx->b, 1024 * 1024, 0, PIPE_USAGE_STREAM, SI_RESOURCE_FLAG_READ_ONLY); + if (!sctx->b.stream_uploader) + goto fail; + + sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024, 0, PIPE_USAGE_STAGING, 0); + if (!sctx->cached_gtt_allocator) + goto fail; + + sctx->ctx = sctx->ws->ctx_create(sctx->ws); + if (!sctx->ctx) + goto fail; + + if (sscreen->info.num_rings[RING_DMA] && !(sscreen->debug_flags & DBG(NO_SDMA)) && + /* SDMA causes corruption on RX 580: + * https://gitlab.freedesktop.org/mesa/mesa/issues/1399 + * https://gitlab.freedesktop.org/mesa/mesa/issues/1889 + */ + (sctx->chip_class != GFX8 || sscreen->debug_flags & DBG(FORCE_SDMA)) && + /* SDMA timeouts sometimes on gfx10 so disable it for now. See: + * https://bugs.freedesktop.org/show_bug.cgi?id=111481 + * https://gitlab.freedesktop.org/mesa/mesa/issues/1907 + */ + (sctx->chip_class != GFX10 || sscreen->debug_flags & DBG(FORCE_SDMA))) { + sctx->sdma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA, (void *)si_flush_dma_cs, sctx, + stop_exec_on_failure); + } + + bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->sdma_cs; + sctx->b.const_uploader = + u_upload_create(&sctx->b, 256 * 1024, 0, PIPE_USAGE_DEFAULT, + SI_RESOURCE_FLAG_32BIT | + (use_sdma_upload ? SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA : 0)); + if (!sctx->b.const_uploader) + goto fail; + + if (use_sdma_upload) + u_upload_enable_flush_explicit(sctx->b.const_uploader); + + sctx->gfx_cs = ws->cs_create(sctx->ctx, sctx->has_graphics ? RING_GFX : RING_COMPUTE, + (void *)si_flush_gfx_cs, sctx, stop_exec_on_failure); + + /* Border colors. 
*/ + sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS * sizeof(*sctx->border_color_table)); + if (!sctx->border_color_table) + goto fail; + + sctx->border_color_buffer = si_resource(pipe_buffer_create( + screen, 0, PIPE_USAGE_DEFAULT, SI_MAX_BORDER_COLORS * sizeof(*sctx->border_color_table))); + if (!sctx->border_color_buffer) + goto fail; + + sctx->border_color_map = + ws->buffer_map(sctx->border_color_buffer->buf, NULL, PIPE_TRANSFER_WRITE); + if (!sctx->border_color_map) + goto fail; + + sctx->ngg = sscreen->use_ngg; + + /* Initialize context functions used by graphics and compute. */ + if (sctx->chip_class >= GFX10) + sctx->emit_cache_flush = gfx10_emit_cache_flush; + else + sctx->emit_cache_flush = si_emit_cache_flush; + + sctx->b.emit_string_marker = si_emit_string_marker; + sctx->b.set_debug_callback = si_set_debug_callback; + sctx->b.set_log_context = si_set_log_context; + sctx->b.set_context_param = si_set_context_param; + sctx->b.get_device_reset_status = si_get_reset_status; + sctx->b.set_device_reset_callback = si_set_device_reset_callback; + + si_init_all_descriptors(sctx); + si_init_buffer_functions(sctx); + si_init_clear_functions(sctx); + si_init_blit_functions(sctx); + si_init_compute_functions(sctx); + si_init_compute_blit_functions(sctx); + si_init_debug_functions(sctx); + si_init_fence_functions(sctx); + si_init_query_functions(sctx); + si_init_state_compute_functions(sctx); + si_init_context_texture_functions(sctx); + + /* Initialize graphics-only context functions. */ + if (sctx->has_graphics) { + if (sctx->chip_class >= GFX10) + gfx10_init_query(sctx); + si_init_msaa_functions(sctx); + si_init_shader_functions(sctx); + si_init_state_functions(sctx); + si_init_streamout_functions(sctx); + si_init_viewport_functions(sctx); + + sctx->blitter = util_blitter_create(&sctx->b); + if (sctx->blitter == NULL) + goto fail; + sctx->blitter->skip_viewport_restore = true; + + /* Some states are expected to be always non-NULL. */ + sctx->noop_blend = util_blitter_get_noop_blend_state(sctx->blitter); + sctx->queued.named.blend = sctx->noop_blend; + + sctx->noop_dsa = util_blitter_get_noop_dsa_state(sctx->blitter); + sctx->queued.named.dsa = sctx->noop_dsa; + + sctx->discard_rasterizer_state = util_blitter_get_discard_rasterizer_state(sctx->blitter); + sctx->queued.named.rasterizer = sctx->discard_rasterizer_state; + + si_init_draw_functions(sctx); + + /* If aux_context == NULL, we are initializing aux_context right now. */ + bool is_aux_context = !sscreen->aux_context; + si_initialize_prim_discard_tunables(sscreen, is_aux_context, + &sctx->prim_discard_vertex_count_threshold, + &sctx->index_ring_size_per_ib); + } + + /* Initialize SDMA functions. */ + if (sctx->chip_class >= GFX7) + cik_init_sdma_functions(sctx); + else + sctx->dma_copy = si_resource_copy_region; + + if (sscreen->debug_flags & DBG(FORCE_SDMA)) + sctx->b.resource_copy_region = sctx->dma_copy; + + sctx->sample_mask = 0xffff; + + /* Initialize multimedia functions. */ + if (sscreen->info.has_hw_decode) { + sctx->b.create_video_codec = si_uvd_create_decoder; + sctx->b.create_video_buffer = si_video_buffer_create; + } else { + sctx->b.create_video_codec = vl_create_decoder; + sctx->b.create_video_buffer = vl_video_buffer_create; + } + + if (sctx->chip_class >= GFX9 || si_compute_prim_discard_enabled(sctx)) { + sctx->wait_mem_scratch = si_resource(pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 8)); + if (!sctx->wait_mem_scratch) + goto fail; + + /* Initialize the memory. 
*/ + si_cp_write_data(sctx, sctx->wait_mem_scratch, 0, 4, V_370_MEM, V_370_ME, + &sctx->wait_mem_number); + } + + /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD doesn't skip loads + * if NUM_RECORDS == 0). We need to use a dummy buffer instead. */ + if (sctx->chip_class == GFX7) { + sctx->null_const_buf.buffer = + pipe_aligned_buffer_create(screen, SI_RESOURCE_FLAG_32BIT, PIPE_USAGE_DEFAULT, 16, + sctx->screen->info.tcc_cache_line_size); + if (!sctx->null_const_buf.buffer) + goto fail; + sctx->null_const_buf.buffer_size = sctx->null_const_buf.buffer->width0; + + unsigned start_shader = sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE; + for (shader = start_shader; shader < SI_NUM_SHADERS; shader++) { + for (i = 0; i < SI_NUM_CONST_BUFFERS; i++) { + sctx->b.set_constant_buffer(&sctx->b, shader, i, &sctx->null_const_buf); + } + } + + si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &sctx->null_const_buf); + si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &sctx->null_const_buf); + si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &sctx->null_const_buf); + si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &sctx->null_const_buf); + si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &sctx->null_const_buf); + } + + uint64_t max_threads_per_block; + screen->get_compute_param(screen, PIPE_SHADER_IR_NIR, PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK, + &max_threads_per_block); + + /* The maximum number of scratch waves. Scratch space isn't divided + * evenly between CUs. The number is only a function of the number of CUs. + * We can decrease the constant to decrease the scratch buffer size. + * + * sctx->scratch_waves must be >= the maximum posible size of + * 1 threadgroup, so that the hw doesn't hang from being unable + * to start any. + * + * The recommended value is 4 per CU at most. Higher numbers don't + * bring much benefit, but they still occupy chip resources (think + * async compute). I've seen ~2% performance difference between 4 and 32. + */ + sctx->scratch_waves = + MAX2(32 * sscreen->info.num_good_compute_units, max_threads_per_block / 64); + + /* Bindless handles. */ + sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); + sctx->img_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); + + util_dynarray_init(&sctx->resident_tex_handles, NULL); + util_dynarray_init(&sctx->resident_img_handles, NULL); + util_dynarray_init(&sctx->resident_tex_needs_color_decompress, NULL); + util_dynarray_init(&sctx->resident_img_needs_color_decompress, NULL); + util_dynarray_init(&sctx->resident_tex_needs_depth_decompress, NULL); + + sctx->sample_pos_buffer = + pipe_buffer_create(sctx->b.screen, 0, PIPE_USAGE_DEFAULT, sizeof(sctx->sample_positions)); + pipe_buffer_write(&sctx->b, sctx->sample_pos_buffer, 0, sizeof(sctx->sample_positions), + &sctx->sample_positions); + + /* this must be last */ + si_begin_new_gfx_cs(sctx); + + if (sctx->chip_class == GFX7) { + /* Clear the NULL constant buffer, because loads should return zeros. + * Note that this forces CP DMA to be used, because clover deadlocks + * for some reason when the compute codepath is used. 
+ */ + uint32_t clear_value = 0; + si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0, sctx->null_const_buf.buffer->width0, + &clear_value, 4, SI_COHERENCY_SHADER, true); + } + return &sctx->b; fail: - fprintf(stderr, "radeonsi: Failed to create a context.\n"); - si_destroy_context(&sctx->b); - return NULL; + fprintf(stderr, "radeonsi: Failed to create a context.\n"); + si_destroy_context(&sctx->b); + return NULL; } -static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, - void *priv, unsigned flags) +static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, void *priv, + unsigned flags) { - struct si_screen *sscreen = (struct si_screen *)screen; - struct pipe_context *ctx; + struct si_screen *sscreen = (struct si_screen *)screen; + struct pipe_context *ctx; - if (sscreen->debug_flags & DBG(CHECK_VM)) - flags |= PIPE_CONTEXT_DEBUG; + if (sscreen->debug_flags & DBG(CHECK_VM)) + flags |= PIPE_CONTEXT_DEBUG; - ctx = si_create_context(screen, flags); + ctx = si_create_context(screen, flags); - if (!(flags & PIPE_CONTEXT_PREFER_THREADED)) - return ctx; + if (!(flags & PIPE_CONTEXT_PREFER_THREADED)) + return ctx; - /* Clover (compute-only) is unsupported. */ - if (flags & PIPE_CONTEXT_COMPUTE_ONLY) - return ctx; + /* Clover (compute-only) is unsupported. */ + if (flags & PIPE_CONTEXT_COMPUTE_ONLY) + return ctx; - /* When shaders are logged to stderr, asynchronous compilation is - * disabled too. */ - if (sscreen->debug_flags & DBG_ALL_SHADERS) - return ctx; + /* When shaders are logged to stderr, asynchronous compilation is + * disabled too. */ + if (sscreen->debug_flags & DBG_ALL_SHADERS) + return ctx; - /* Use asynchronous flushes only on amdgpu, since the radeon - * implementation for fence_server_sync is incomplete. */ - return threaded_context_create(ctx, &sscreen->pool_transfers, - si_replace_buffer_storage, - sscreen->info.is_amdgpu ? si_create_fence : NULL, - &((struct si_context*)ctx)->tc); + /* Use asynchronous flushes only on amdgpu, since the radeon + * implementation for fence_server_sync is incomplete. */ + return threaded_context_create(ctx, &sscreen->pool_transfers, si_replace_buffer_storage, + sscreen->info.is_amdgpu ? 
si_create_fence : NULL, + &((struct si_context *)ctx)->tc); } /* * pipe_screen */ -static void si_destroy_screen(struct pipe_screen* pscreen) +static void si_destroy_screen(struct pipe_screen *pscreen) { - struct si_screen *sscreen = (struct si_screen *)pscreen; - struct si_shader_part *parts[] = { - sscreen->vs_prologs, - sscreen->tcs_epilogs, - sscreen->gs_prologs, - sscreen->ps_prologs, - sscreen->ps_epilogs - }; - unsigned i; - - if (!sscreen->ws->unref(sscreen->ws)) - return; - - if (sscreen->debug_flags & DBG(CACHE_STATS)) { - printf("live shader cache: hits = %u, misses = %u\n", - sscreen->live_shader_cache.hits, - sscreen->live_shader_cache.misses); - printf("memory shader cache: hits = %u, misses = %u\n", - sscreen->num_memory_shader_cache_hits, - sscreen->num_memory_shader_cache_misses); - printf("disk shader cache: hits = %u, misses = %u\n", - sscreen->num_disk_shader_cache_hits, - sscreen->num_disk_shader_cache_misses); - } - - simple_mtx_destroy(&sscreen->aux_context_lock); - - struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log; - if (aux_log) { - sscreen->aux_context->set_log_context(sscreen->aux_context, NULL); - u_log_context_destroy(aux_log); - FREE(aux_log); - } - - sscreen->aux_context->destroy(sscreen->aux_context); - - util_queue_destroy(&sscreen->shader_compiler_queue); - util_queue_destroy(&sscreen->shader_compiler_queue_low_priority); - - /* Release the reference on glsl types of the compiler threads. */ - glsl_type_singleton_decref(); - - for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++) - si_destroy_compiler(&sscreen->compiler[i]); - - for (i = 0; i < ARRAY_SIZE(sscreen->compiler_lowp); i++) - si_destroy_compiler(&sscreen->compiler_lowp[i]); - - /* Free shader parts. */ - for (i = 0; i < ARRAY_SIZE(parts); i++) { - while (parts[i]) { - struct si_shader_part *part = parts[i]; - - parts[i] = part->next; - si_shader_binary_clean(&part->binary); - FREE(part); - } - } - simple_mtx_destroy(&sscreen->shader_parts_mutex); - si_destroy_shader_cache(sscreen); - - si_destroy_perfcounters(sscreen); - si_gpu_load_kill_thread(sscreen); - - simple_mtx_destroy(&sscreen->gpu_load_mutex); - - slab_destroy_parent(&sscreen->pool_transfers); - - disk_cache_destroy(sscreen->disk_shader_cache); - util_live_shader_cache_deinit(&sscreen->live_shader_cache); - sscreen->ws->destroy(sscreen->ws); - FREE(sscreen); + struct si_screen *sscreen = (struct si_screen *)pscreen; + struct si_shader_part *parts[] = {sscreen->vs_prologs, sscreen->tcs_epilogs, sscreen->gs_prologs, + sscreen->ps_prologs, sscreen->ps_epilogs}; + unsigned i; + + if (!sscreen->ws->unref(sscreen->ws)) + return; + + if (sscreen->debug_flags & DBG(CACHE_STATS)) { + printf("live shader cache: hits = %u, misses = %u\n", sscreen->live_shader_cache.hits, + sscreen->live_shader_cache.misses); + printf("memory shader cache: hits = %u, misses = %u\n", sscreen->num_memory_shader_cache_hits, + sscreen->num_memory_shader_cache_misses); + printf("disk shader cache: hits = %u, misses = %u\n", sscreen->num_disk_shader_cache_hits, + sscreen->num_disk_shader_cache_misses); + } + + simple_mtx_destroy(&sscreen->aux_context_lock); + + struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log; + if (aux_log) { + sscreen->aux_context->set_log_context(sscreen->aux_context, NULL); + u_log_context_destroy(aux_log); + FREE(aux_log); + } + + sscreen->aux_context->destroy(sscreen->aux_context); + + util_queue_destroy(&sscreen->shader_compiler_queue); + 
util_queue_destroy(&sscreen->shader_compiler_queue_low_priority); + + /* Release the reference on glsl types of the compiler threads. */ + glsl_type_singleton_decref(); + + for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++) + si_destroy_compiler(&sscreen->compiler[i]); + + for (i = 0; i < ARRAY_SIZE(sscreen->compiler_lowp); i++) + si_destroy_compiler(&sscreen->compiler_lowp[i]); + + /* Free shader parts. */ + for (i = 0; i < ARRAY_SIZE(parts); i++) { + while (parts[i]) { + struct si_shader_part *part = parts[i]; + + parts[i] = part->next; + si_shader_binary_clean(&part->binary); + FREE(part); + } + } + simple_mtx_destroy(&sscreen->shader_parts_mutex); + si_destroy_shader_cache(sscreen); + + si_destroy_perfcounters(sscreen); + si_gpu_load_kill_thread(sscreen); + + simple_mtx_destroy(&sscreen->gpu_load_mutex); + + slab_destroy_parent(&sscreen->pool_transfers); + + disk_cache_destroy(sscreen->disk_shader_cache); + util_live_shader_cache_deinit(&sscreen->live_shader_cache); + sscreen->ws->destroy(sscreen->ws); + FREE(sscreen); } static void si_init_gs_info(struct si_screen *sscreen) { - sscreen->gs_table_depth = ac_get_gs_table_depth(sscreen->info.chip_class, - sscreen->info.family); + sscreen->gs_table_depth = ac_get_gs_table_depth(sscreen->info.chip_class, sscreen->info.family); } static void si_test_vmfault(struct si_screen *sscreen, uint64_t test_flags) { - struct pipe_context *ctx = sscreen->aux_context; - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_resource *buf = - pipe_buffer_create_const0(&sscreen->b, 0, PIPE_USAGE_DEFAULT, 64); - - if (!buf) { - puts("Buffer allocation failed."); - exit(1); - } - - si_resource(buf)->gpu_address = 0; /* cause a VM fault */ - - if (test_flags & DBG(TEST_VMFAULT_CP)) { - si_cp_dma_copy_buffer(sctx, buf, buf, 0, 4, 4, 0, - SI_COHERENCY_NONE, L2_BYPASS); - ctx->flush(ctx, NULL, 0); - puts("VM fault test: CP - done."); - } - if (test_flags & DBG(TEST_VMFAULT_SDMA)) { - si_sdma_clear_buffer(sctx, buf, 0, 4, 0); - ctx->flush(ctx, NULL, 0); - puts("VM fault test: SDMA - done."); - } - if (test_flags & DBG(TEST_VMFAULT_SHADER)) { - util_test_constant_buffer(ctx, buf); - puts("VM fault test: Shader - done."); - } - exit(0); + struct pipe_context *ctx = sscreen->aux_context; + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_resource *buf = pipe_buffer_create_const0(&sscreen->b, 0, PIPE_USAGE_DEFAULT, 64); + + if (!buf) { + puts("Buffer allocation failed."); + exit(1); + } + + si_resource(buf)->gpu_address = 0; /* cause a VM fault */ + + if (test_flags & DBG(TEST_VMFAULT_CP)) { + si_cp_dma_copy_buffer(sctx, buf, buf, 0, 4, 4, 0, SI_COHERENCY_NONE, L2_BYPASS); + ctx->flush(ctx, NULL, 0); + puts("VM fault test: CP - done."); + } + if (test_flags & DBG(TEST_VMFAULT_SDMA)) { + si_sdma_clear_buffer(sctx, buf, 0, 4, 0); + ctx->flush(ctx, NULL, 0); + puts("VM fault test: SDMA - done."); + } + if (test_flags & DBG(TEST_VMFAULT_SHADER)) { + util_test_constant_buffer(ctx, buf); + puts("VM fault test: Shader - done."); + } + exit(0); } -static void si_test_gds_memory_management(struct si_context *sctx, - unsigned alloc_size, unsigned alignment, - enum radeon_bo_domain domain) +static void si_test_gds_memory_management(struct si_context *sctx, unsigned alloc_size, + unsigned alignment, enum radeon_bo_domain domain) { - struct radeon_winsys *ws = sctx->ws; - struct radeon_cmdbuf *cs[8]; - struct pb_buffer *gds_bo[ARRAY_SIZE(cs)]; - - for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) { - cs[i] = ws->cs_create(sctx->ctx, RING_COMPUTE, - 
NULL, NULL, false); - gds_bo[i] = ws->buffer_create(ws, alloc_size, alignment, domain, 0); - assert(gds_bo[i]); - } - - for (unsigned iterations = 0; iterations < 20000; iterations++) { - for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) { - /* This clears GDS with CP DMA. - * - * We don't care if GDS is present. Just add some packet - * to make the GPU busy for a moment. - */ - si_cp_dma_clear_buffer(sctx, cs[i], NULL, 0, alloc_size, 0, - SI_CPDMA_SKIP_BO_LIST_UPDATE | - SI_CPDMA_SKIP_CHECK_CS_SPACE | - SI_CPDMA_SKIP_GFX_SYNC, 0, 0); - - ws->cs_add_buffer(cs[i], gds_bo[i], domain, - RADEON_USAGE_READWRITE, 0); - ws->cs_flush(cs[i], PIPE_FLUSH_ASYNC, NULL); - } - } - exit(0); + struct radeon_winsys *ws = sctx->ws; + struct radeon_cmdbuf *cs[8]; + struct pb_buffer *gds_bo[ARRAY_SIZE(cs)]; + + for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) { + cs[i] = ws->cs_create(sctx->ctx, RING_COMPUTE, NULL, NULL, false); + gds_bo[i] = ws->buffer_create(ws, alloc_size, alignment, domain, 0); + assert(gds_bo[i]); + } + + for (unsigned iterations = 0; iterations < 20000; iterations++) { + for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) { + /* This clears GDS with CP DMA. + * + * We don't care if GDS is present. Just add some packet + * to make the GPU busy for a moment. + */ + si_cp_dma_clear_buffer( + sctx, cs[i], NULL, 0, alloc_size, 0, + SI_CPDMA_SKIP_BO_LIST_UPDATE | SI_CPDMA_SKIP_CHECK_CS_SPACE | SI_CPDMA_SKIP_GFX_SYNC, 0, + 0); + + ws->cs_add_buffer(cs[i], gds_bo[i], domain, RADEON_USAGE_READWRITE, 0); + ws->cs_flush(cs[i], PIPE_FLUSH_ASYNC, NULL); + } + } + exit(0); } static void si_disk_cache_create(struct si_screen *sscreen) { - /* Don't use the cache if shader dumping is enabled. */ - if (sscreen->debug_flags & DBG_ALL_SHADERS) - return; - - struct mesa_sha1 ctx; - unsigned char sha1[20]; - char cache_id[20 * 2 + 1]; - - _mesa_sha1_init(&ctx); - - if (!disk_cache_get_function_identifier(si_disk_cache_create, &ctx) || - !disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo, - &ctx)) - return; - - _mesa_sha1_final(&ctx, sha1); - disk_cache_format_hex_id(cache_id, sha1, 20 * 2); - - /* These flags affect shader compilation. */ - #define ALL_FLAGS (DBG(GISEL)) - uint64_t shader_debug_flags = sscreen->debug_flags & ALL_FLAGS; - - /* Add the high bits of 32-bit addresses, which affects - * how 32-bit addresses are expanded to 64 bits. - */ - STATIC_ASSERT(ALL_FLAGS <= UINT_MAX); - assert((int16_t)sscreen->info.address32_hi == (int32_t)sscreen->info.address32_hi); - shader_debug_flags |= (uint64_t)(sscreen->info.address32_hi & 0xffff) << 32; - - sscreen->disk_shader_cache = - disk_cache_create(sscreen->info.name, - cache_id, - shader_debug_flags); + /* Don't use the cache if shader dumping is enabled. */ + if (sscreen->debug_flags & DBG_ALL_SHADERS) + return; + + struct mesa_sha1 ctx; + unsigned char sha1[20]; + char cache_id[20 * 2 + 1]; + + _mesa_sha1_init(&ctx); + + if (!disk_cache_get_function_identifier(si_disk_cache_create, &ctx) || + !disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo, &ctx)) + return; + + _mesa_sha1_final(&ctx, sha1); + disk_cache_format_hex_id(cache_id, sha1, 20 * 2); + +/* These flags affect shader compilation. */ +#define ALL_FLAGS (DBG(GISEL)) + uint64_t shader_debug_flags = sscreen->debug_flags & ALL_FLAGS; + + /* Add the high bits of 32-bit addresses, which affects + * how 32-bit addresses are expanded to 64 bits. 
+ */ + STATIC_ASSERT(ALL_FLAGS <= UINT_MAX); + assert((int16_t)sscreen->info.address32_hi == (int32_t)sscreen->info.address32_hi); + shader_debug_flags |= (uint64_t)(sscreen->info.address32_hi & 0xffff) << 32; + + sscreen->disk_shader_cache = disk_cache_create(sscreen->info.name, cache_id, shader_debug_flags); } -static void si_set_max_shader_compiler_threads(struct pipe_screen *screen, - unsigned max_threads) +static void si_set_max_shader_compiler_threads(struct pipe_screen *screen, unsigned max_threads) { - struct si_screen *sscreen = (struct si_screen *)screen; + struct si_screen *sscreen = (struct si_screen *)screen; - /* This function doesn't allow a greater number of threads than - * the queue had at its creation. */ - util_queue_adjust_num_threads(&sscreen->shader_compiler_queue, - max_threads); - /* Don't change the number of threads on the low priority queue. */ + /* This function doesn't allow a greater number of threads than + * the queue had at its creation. */ + util_queue_adjust_num_threads(&sscreen->shader_compiler_queue, max_threads); + /* Don't change the number of threads on the low priority queue. */ } -static bool si_is_parallel_shader_compilation_finished(struct pipe_screen *screen, - void *shader, - enum pipe_shader_type shader_type) +static bool si_is_parallel_shader_compilation_finished(struct pipe_screen *screen, void *shader, + enum pipe_shader_type shader_type) { - struct si_shader_selector *sel = (struct si_shader_selector *)shader; + struct si_shader_selector *sel = (struct si_shader_selector *)shader; - return util_queue_fence_is_signalled(&sel->ready); + return util_queue_fence_is_signalled(&sel->ready); } -static struct pipe_screen * -radeonsi_screen_create_impl(struct radeon_winsys *ws, - const struct pipe_screen_config *config) +static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, + const struct pipe_screen_config *config) { - struct si_screen *sscreen = CALLOC_STRUCT(si_screen); - unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads; - uint64_t test_flags; - - if (!sscreen) { - return NULL; - } - - sscreen->ws = ws; - ws->query_info(ws, &sscreen->info); - - if (sscreen->info.chip_class == GFX10 && LLVM_VERSION_MAJOR < 9) { - fprintf(stderr, "radeonsi: Navi family support requires LLVM 9 or higher\n"); - FREE(sscreen); - return NULL; - } - - if (sscreen->info.chip_class >= GFX9) { - sscreen->se_tile_repeat = 32 * sscreen->info.max_se; - } else { - ac_get_raster_config(&sscreen->info, - &sscreen->pa_sc_raster_config, - &sscreen->pa_sc_raster_config_1, - &sscreen->se_tile_repeat); - } - - sscreen->debug_flags = debug_get_flags_option("R600_DEBUG", - debug_options, 0); - sscreen->debug_flags |= debug_get_flags_option("AMD_DEBUG", - debug_options, 0); - test_flags = debug_get_flags_option("AMD_TEST", - test_options, 0); - - if (sscreen->debug_flags & DBG(NO_GFX)) - sscreen->info.has_graphics = false; - - /* Set functions first. 
*/ - sscreen->b.context_create = si_pipe_create_context; - sscreen->b.destroy = si_destroy_screen; - sscreen->b.set_max_shader_compiler_threads = - si_set_max_shader_compiler_threads; - sscreen->b.is_parallel_shader_compilation_finished = - si_is_parallel_shader_compilation_finished; - sscreen->b.finalize_nir = si_finalize_nir; - - si_init_screen_get_functions(sscreen); - si_init_screen_buffer_functions(sscreen); - si_init_screen_fence_functions(sscreen); - si_init_screen_state_functions(sscreen); - si_init_screen_texture_functions(sscreen); - si_init_screen_query_functions(sscreen); - si_init_screen_live_shader_cache(sscreen); - - /* Set these flags in debug_flags early, so that the shader cache takes - * them into account. - */ - if (driQueryOptionb(config->options, - "glsl_correct_derivatives_after_discard")) - sscreen->debug_flags |= DBG(FS_CORRECT_DERIVS_AFTER_KILL); - - if (sscreen->debug_flags & DBG(INFO)) - ac_print_gpu_info(&sscreen->info); - - slab_create_parent(&sscreen->pool_transfers, - sizeof(struct si_transfer), 64); - - sscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1)); - if (sscreen->force_aniso == -1) { - sscreen->force_aniso = MIN2(16, debug_get_num_option("AMD_TEX_ANISO", -1)); - } - - if (sscreen->force_aniso >= 0) { - printf("radeonsi: Forcing anisotropy filter to %ix\n", - /* round down to a power of two */ - 1 << util_logbase2(sscreen->force_aniso)); - } - - (void) simple_mtx_init(&sscreen->aux_context_lock, mtx_plain); - (void) simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain); - - si_init_gs_info(sscreen); - if (!si_init_shader_cache(sscreen)) { - FREE(sscreen); - return NULL; - } - - { -#define OPT_BOOL(name, dflt, description) \ - sscreen->options.name = \ - driQueryOptionb(config->options, "radeonsi_"#name); + struct si_screen *sscreen = CALLOC_STRUCT(si_screen); + unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads; + uint64_t test_flags; + + if (!sscreen) { + return NULL; + } + + sscreen->ws = ws; + ws->query_info(ws, &sscreen->info); + + if (sscreen->info.chip_class == GFX10 && LLVM_VERSION_MAJOR < 9) { + fprintf(stderr, "radeonsi: Navi family support requires LLVM 9 or higher\n"); + FREE(sscreen); + return NULL; + } + + if (sscreen->info.chip_class >= GFX9) { + sscreen->se_tile_repeat = 32 * sscreen->info.max_se; + } else { + ac_get_raster_config(&sscreen->info, &sscreen->pa_sc_raster_config, + &sscreen->pa_sc_raster_config_1, &sscreen->se_tile_repeat); + } + + sscreen->debug_flags = debug_get_flags_option("R600_DEBUG", debug_options, 0); + sscreen->debug_flags |= debug_get_flags_option("AMD_DEBUG", debug_options, 0); + test_flags = debug_get_flags_option("AMD_TEST", test_options, 0); + + if (sscreen->debug_flags & DBG(NO_GFX)) + sscreen->info.has_graphics = false; + + /* Set functions first. */ + sscreen->b.context_create = si_pipe_create_context; + sscreen->b.destroy = si_destroy_screen; + sscreen->b.set_max_shader_compiler_threads = si_set_max_shader_compiler_threads; + sscreen->b.is_parallel_shader_compilation_finished = si_is_parallel_shader_compilation_finished; + sscreen->b.finalize_nir = si_finalize_nir; + + si_init_screen_get_functions(sscreen); + si_init_screen_buffer_functions(sscreen); + si_init_screen_fence_functions(sscreen); + si_init_screen_state_functions(sscreen); + si_init_screen_texture_functions(sscreen); + si_init_screen_query_functions(sscreen); + si_init_screen_live_shader_cache(sscreen); + + /* Set these flags in debug_flags early, so that the shader cache takes + * them into account. 
+ */ + if (driQueryOptionb(config->options, "glsl_correct_derivatives_after_discard")) + sscreen->debug_flags |= DBG(FS_CORRECT_DERIVS_AFTER_KILL); + + if (sscreen->debug_flags & DBG(INFO)) + ac_print_gpu_info(&sscreen->info); + + slab_create_parent(&sscreen->pool_transfers, sizeof(struct si_transfer), 64); + + sscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1)); + if (sscreen->force_aniso == -1) { + sscreen->force_aniso = MIN2(16, debug_get_num_option("AMD_TEX_ANISO", -1)); + } + + if (sscreen->force_aniso >= 0) { + printf("radeonsi: Forcing anisotropy filter to %ix\n", + /* round down to a power of two */ + 1 << util_logbase2(sscreen->force_aniso)); + } + + (void)simple_mtx_init(&sscreen->aux_context_lock, mtx_plain); + (void)simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain); + + si_init_gs_info(sscreen); + if (!si_init_shader_cache(sscreen)) { + FREE(sscreen); + return NULL; + } + + { +#define OPT_BOOL(name, dflt, description) \ + sscreen->options.name = driQueryOptionb(config->options, "radeonsi_" #name); #include "si_debug_options.h" - } - - si_disk_cache_create(sscreen); - - /* Determine the number of shader compiler threads. */ - hw_threads = sysconf(_SC_NPROCESSORS_ONLN); - - if (hw_threads >= 12) { - num_comp_hi_threads = hw_threads * 3 / 4; - num_comp_lo_threads = hw_threads / 3; - } else if (hw_threads >= 6) { - num_comp_hi_threads = hw_threads - 2; - num_comp_lo_threads = hw_threads / 2; - } else if (hw_threads >= 2) { - num_comp_hi_threads = hw_threads - 1; - num_comp_lo_threads = hw_threads / 2; - } else { - num_comp_hi_threads = 1; - num_comp_lo_threads = 1; - } - - num_comp_hi_threads = MIN2(num_comp_hi_threads, - ARRAY_SIZE(sscreen->compiler)); - num_comp_lo_threads = MIN2(num_comp_lo_threads, - ARRAY_SIZE(sscreen->compiler_lowp)); - - /* Take a reference on the glsl types for the compiler threads. */ - glsl_type_singleton_init_or_ref(); - - if (!util_queue_init(&sscreen->shader_compiler_queue, "sh", - 64, num_comp_hi_threads, - UTIL_QUEUE_INIT_RESIZE_IF_FULL | - UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY)) { - si_destroy_shader_cache(sscreen); - FREE(sscreen); - glsl_type_singleton_decref(); - return NULL; - } - - if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority, - "shlo", - 64, num_comp_lo_threads, - UTIL_QUEUE_INIT_RESIZE_IF_FULL | - UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY | - UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) { - si_destroy_shader_cache(sscreen); - FREE(sscreen); - glsl_type_singleton_decref(); - return NULL; - } - - if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false)) - si_init_perfcounters(sscreen); - - unsigned prim_discard_vertex_count_threshold, tmp; - si_initialize_prim_discard_tunables(sscreen, false, - &prim_discard_vertex_count_threshold, - &tmp); - /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */ - if (prim_discard_vertex_count_threshold == UINT_MAX) - sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1; - - /* Determine tessellation ring info. */ - bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 && - sscreen->info.family != CHIP_CARRIZO && - sscreen->info.family != CHIP_STONEY; - /* This must be one less than the maximum number due to a hw limitation. - * Various hardware bugs need this. - */ - unsigned max_offchip_buffers_per_se; - - if (sscreen->info.chip_class >= GFX10) - max_offchip_buffers_per_se = 256; - /* Only certain chips can use the maximum value. 
*/ - else if (sscreen->info.family == CHIP_VEGA12 || - sscreen->info.family == CHIP_VEGA20) - max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64; - else - max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63; - - unsigned max_offchip_buffers = max_offchip_buffers_per_se * - sscreen->info.max_se; - unsigned offchip_granularity; - - /* Hawaii has a bug with offchip buffers > 256 that can be worked - * around by setting 4K granularity. - */ - if (sscreen->info.family == CHIP_HAWAII) { - sscreen->tess_offchip_block_dw_size = 4096; - offchip_granularity = V_03093C_X_4K_DWORDS; - } else { - sscreen->tess_offchip_block_dw_size = 8192; - offchip_granularity = V_03093C_X_8K_DWORDS; - } - - sscreen->tess_factor_ring_size = 32768 * sscreen->info.max_se; - sscreen->tess_offchip_ring_size = max_offchip_buffers * - sscreen->tess_offchip_block_dw_size * 4; - - if (sscreen->info.chip_class >= GFX7) { - if (sscreen->info.chip_class >= GFX8) - --max_offchip_buffers; - sscreen->vgt_hs_offchip_param = - S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) | - S_03093C_OFFCHIP_GRANULARITY(offchip_granularity); - } else { - assert(offchip_granularity == V_03093C_X_8K_DWORDS); - sscreen->vgt_hs_offchip_param = - S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers); - } - - sscreen->has_draw_indirect_multi = - (sscreen->info.family >= CHIP_POLARIS10) || - (sscreen->info.chip_class == GFX8 && - sscreen->info.pfp_fw_version >= 121 && - sscreen->info.me_fw_version >= 87) || - (sscreen->info.chip_class == GFX7 && - sscreen->info.pfp_fw_version >= 211 && - sscreen->info.me_fw_version >= 173) || - (sscreen->info.chip_class == GFX6 && - sscreen->info.pfp_fw_version >= 79 && - sscreen->info.me_fw_version >= 142); - - sscreen->has_out_of_order_rast = sscreen->info.has_out_of_order_rast && - !(sscreen->debug_flags & DBG(NO_OUT_OF_ORDER)); - sscreen->assume_no_z_fights = - driQueryOptionb(config->options, "radeonsi_assume_no_z_fights") || - driQueryOptionb(config->options, "allow_draw_out_of_order"); - sscreen->commutative_blend_add = - driQueryOptionb(config->options, "radeonsi_commutative_blend_add") || - driQueryOptionb(config->options, "allow_draw_out_of_order"); - - sscreen->use_ngg = sscreen->info.chip_class >= GFX10 && - sscreen->info.family != CHIP_NAVI14 && - !(sscreen->debug_flags & DBG(NO_NGG)); - sscreen->use_ngg_culling = sscreen->use_ngg && - !(sscreen->debug_flags & DBG(NO_NGG_CULLING)); - sscreen->always_use_ngg_culling = sscreen->use_ngg_culling && - sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING); - sscreen->use_ngg_streamout = false; - - /* Only enable primitive binning on APUs by default. */ - if (sscreen->info.chip_class >= GFX10) { - sscreen->dpbb_allowed = true; - sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram; - } else if (sscreen->info.chip_class == GFX9) { - sscreen->dpbb_allowed = !sscreen->info.has_dedicated_vram; - sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram; - } - - /* Process DPBB enable flags. */ - if (sscreen->debug_flags & DBG(DPBB)) { - sscreen->dpbb_allowed = true; - if (sscreen->debug_flags & DBG(DFSM)) - sscreen->dfsm_allowed = true; - } - - /* Process DPBB disable flags. */ - if (sscreen->debug_flags & DBG(NO_DPBB)) { - sscreen->dpbb_allowed = false; - sscreen->dfsm_allowed = false; - } else if (sscreen->debug_flags & DBG(NO_DFSM)) { - sscreen->dfsm_allowed = false; - } - - /* While it would be nice not to have this flag, we are constrained - * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9. 
- */ - sscreen->llvm_has_working_vgpr_indexing = sscreen->info.chip_class != GFX9; - - sscreen->dcc_msaa_allowed = - !(sscreen->debug_flags & DBG(NO_DCC_MSAA)); - - (void) simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain); - sscreen->use_monolithic_shaders = - (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0; - - sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SCACHE | - SI_CONTEXT_INV_VCACHE; - if (sscreen->info.chip_class <= GFX8) { - sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_L2; - sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WB_L2; - } - - if (debug_get_bool_option("RADEON_DUMP_SHADERS", false)) - sscreen->debug_flags |= DBG_ALL_SHADERS; - - /* Syntax: - * EQAA=s,z,c - * Example: - * EQAA=8,4,2 - - * That means 8 coverage samples, 4 Z/S samples, and 2 color samples. - * Constraints: - * s >= z >= c (ignoring this only wastes memory) - * s = [2..16] - * z = [2..8] - * c = [2..8] - * - * Only MSAA color and depth buffers are overriden. - */ - if (sscreen->info.has_eqaa_surface_allocator) { - const char *eqaa = debug_get_option("EQAA", NULL); - unsigned s,z,f; - - if (eqaa && sscanf(eqaa, "%u,%u,%u", &s, &z, &f) == 3 && s && z && f) { - sscreen->eqaa_force_coverage_samples = s; - sscreen->eqaa_force_z_samples = z; - sscreen->eqaa_force_color_samples = f; - } - } - - sscreen->ge_wave_size = 64; - sscreen->ps_wave_size = 64; - sscreen->compute_wave_size = 64; - - if (sscreen->info.chip_class >= GFX10) { - /* Pixels shaders: Wave64 is recommended. - * Compute shaders: There are piglit failures with Wave32. - */ - sscreen->ge_wave_size = 32; - - if (sscreen->debug_flags & DBG(W32_GE)) - sscreen->ge_wave_size = 32; - if (sscreen->debug_flags & DBG(W32_PS)) - sscreen->ps_wave_size = 32; - if (sscreen->debug_flags & DBG(W32_CS)) - sscreen->compute_wave_size = 32; - - if (sscreen->debug_flags & DBG(W64_GE)) - sscreen->ge_wave_size = 64; - if (sscreen->debug_flags & DBG(W64_PS)) - sscreen->ps_wave_size = 64; - if (sscreen->debug_flags & DBG(W64_CS)) - sscreen->compute_wave_size = 64; - } - - /* Create the auxiliary context. This must be done last. */ - sscreen->aux_context = si_create_context(&sscreen->b, - (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) | - (sscreen->info.has_graphics ? 0 : PIPE_CONTEXT_COMPUTE_ONLY)); - if (sscreen->options.aux_debug) { - struct u_log_context *log = CALLOC_STRUCT(u_log_context); - u_log_context_init(log); - sscreen->aux_context->set_log_context(sscreen->aux_context, log); - } - - if (test_flags & DBG(TEST_DMA)) - si_test_dma(sscreen); - - if (test_flags & DBG(TEST_DMA_PERF)) { - si_test_dma_perf(sscreen); - } - - if (test_flags & (DBG(TEST_VMFAULT_CP) | - DBG(TEST_VMFAULT_SDMA) | - DBG(TEST_VMFAULT_SHADER))) - si_test_vmfault(sscreen, test_flags); - - if (test_flags & DBG(TEST_GDS)) - si_test_gds((struct si_context*)sscreen->aux_context); - - if (test_flags & DBG(TEST_GDS_MM)) { - si_test_gds_memory_management((struct si_context*)sscreen->aux_context, - 32 * 1024, 4, RADEON_DOMAIN_GDS); - } - if (test_flags & DBG(TEST_GDS_OA_MM)) { - si_test_gds_memory_management((struct si_context*)sscreen->aux_context, - 4, 1, RADEON_DOMAIN_OA); - } - - STATIC_ASSERT(sizeof(union si_vgt_stages_key) == 4); - return &sscreen->b; + } + + si_disk_cache_create(sscreen); + + /* Determine the number of shader compiler threads. 
*/ + hw_threads = sysconf(_SC_NPROCESSORS_ONLN); + + if (hw_threads >= 12) { + num_comp_hi_threads = hw_threads * 3 / 4; + num_comp_lo_threads = hw_threads / 3; + } else if (hw_threads >= 6) { + num_comp_hi_threads = hw_threads - 2; + num_comp_lo_threads = hw_threads / 2; + } else if (hw_threads >= 2) { + num_comp_hi_threads = hw_threads - 1; + num_comp_lo_threads = hw_threads / 2; + } else { + num_comp_hi_threads = 1; + num_comp_lo_threads = 1; + } + + num_comp_hi_threads = MIN2(num_comp_hi_threads, ARRAY_SIZE(sscreen->compiler)); + num_comp_lo_threads = MIN2(num_comp_lo_threads, ARRAY_SIZE(sscreen->compiler_lowp)); + + /* Take a reference on the glsl types for the compiler threads. */ + glsl_type_singleton_init_or_ref(); + + if (!util_queue_init( + &sscreen->shader_compiler_queue, "sh", 64, num_comp_hi_threads, + UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY)) { + si_destroy_shader_cache(sscreen); + FREE(sscreen); + glsl_type_singleton_decref(); + return NULL; + } + + if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority, "shlo", 64, + num_comp_lo_threads, + UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY | + UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) { + si_destroy_shader_cache(sscreen); + FREE(sscreen); + glsl_type_singleton_decref(); + return NULL; + } + + if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false)) + si_init_perfcounters(sscreen); + + unsigned prim_discard_vertex_count_threshold, tmp; + si_initialize_prim_discard_tunables(sscreen, false, &prim_discard_vertex_count_threshold, &tmp); + /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */ + if (prim_discard_vertex_count_threshold == UINT_MAX) + sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1; + + /* Determine tessellation ring info. */ + bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 && + sscreen->info.family != CHIP_CARRIZO && + sscreen->info.family != CHIP_STONEY; + /* This must be one less than the maximum number due to a hw limitation. + * Various hardware bugs need this. + */ + unsigned max_offchip_buffers_per_se; + + if (sscreen->info.chip_class >= GFX10) + max_offchip_buffers_per_se = 256; + /* Only certain chips can use the maximum value. */ + else if (sscreen->info.family == CHIP_VEGA12 || sscreen->info.family == CHIP_VEGA20) + max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64; + else + max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63; + + unsigned max_offchip_buffers = max_offchip_buffers_per_se * sscreen->info.max_se; + unsigned offchip_granularity; + + /* Hawaii has a bug with offchip buffers > 256 that can be worked + * around by setting 4K granularity. 
+ */ + if (sscreen->info.family == CHIP_HAWAII) { + sscreen->tess_offchip_block_dw_size = 4096; + offchip_granularity = V_03093C_X_4K_DWORDS; + } else { + sscreen->tess_offchip_block_dw_size = 8192; + offchip_granularity = V_03093C_X_8K_DWORDS; + } + + sscreen->tess_factor_ring_size = 32768 * sscreen->info.max_se; + sscreen->tess_offchip_ring_size = max_offchip_buffers * sscreen->tess_offchip_block_dw_size * 4; + + if (sscreen->info.chip_class >= GFX7) { + if (sscreen->info.chip_class >= GFX8) + --max_offchip_buffers; + sscreen->vgt_hs_offchip_param = S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) | + S_03093C_OFFCHIP_GRANULARITY(offchip_granularity); + } else { + assert(offchip_granularity == V_03093C_X_8K_DWORDS); + sscreen->vgt_hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers); + } + + sscreen->has_draw_indirect_multi = + (sscreen->info.family >= CHIP_POLARIS10) || + (sscreen->info.chip_class == GFX8 && sscreen->info.pfp_fw_version >= 121 && + sscreen->info.me_fw_version >= 87) || + (sscreen->info.chip_class == GFX7 && sscreen->info.pfp_fw_version >= 211 && + sscreen->info.me_fw_version >= 173) || + (sscreen->info.chip_class == GFX6 && sscreen->info.pfp_fw_version >= 79 && + sscreen->info.me_fw_version >= 142); + + sscreen->has_out_of_order_rast = + sscreen->info.has_out_of_order_rast && !(sscreen->debug_flags & DBG(NO_OUT_OF_ORDER)); + sscreen->assume_no_z_fights = driQueryOptionb(config->options, "radeonsi_assume_no_z_fights") || + driQueryOptionb(config->options, "allow_draw_out_of_order"); + sscreen->commutative_blend_add = + driQueryOptionb(config->options, "radeonsi_commutative_blend_add") || + driQueryOptionb(config->options, "allow_draw_out_of_order"); + + sscreen->use_ngg = sscreen->info.chip_class >= GFX10 && sscreen->info.family != CHIP_NAVI14 && + !(sscreen->debug_flags & DBG(NO_NGG)); + sscreen->use_ngg_culling = sscreen->use_ngg && !(sscreen->debug_flags & DBG(NO_NGG_CULLING)); + sscreen->always_use_ngg_culling = + sscreen->use_ngg_culling && sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING); + sscreen->use_ngg_streamout = false; + + /* Only enable primitive binning on APUs by default. */ + if (sscreen->info.chip_class >= GFX10) { + sscreen->dpbb_allowed = true; + sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram; + } else if (sscreen->info.chip_class == GFX9) { + sscreen->dpbb_allowed = !sscreen->info.has_dedicated_vram; + sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram; + } + + /* Process DPBB enable flags. */ + if (sscreen->debug_flags & DBG(DPBB)) { + sscreen->dpbb_allowed = true; + if (sscreen->debug_flags & DBG(DFSM)) + sscreen->dfsm_allowed = true; + } + + /* Process DPBB disable flags. */ + if (sscreen->debug_flags & DBG(NO_DPBB)) { + sscreen->dpbb_allowed = false; + sscreen->dfsm_allowed = false; + } else if (sscreen->debug_flags & DBG(NO_DFSM)) { + sscreen->dfsm_allowed = false; + } + + /* While it would be nice not to have this flag, we are constrained + * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9. 
+ */ + sscreen->llvm_has_working_vgpr_indexing = sscreen->info.chip_class != GFX9; + + sscreen->dcc_msaa_allowed = !(sscreen->debug_flags & DBG(NO_DCC_MSAA)); + + (void)simple_mtx_init(&sscreen->shader_parts_mutex, mtx_plain); + sscreen->use_monolithic_shaders = (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0; + + sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE; + if (sscreen->info.chip_class <= GFX8) { + sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_L2; + sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WB_L2; + } + + if (debug_get_bool_option("RADEON_DUMP_SHADERS", false)) + sscreen->debug_flags |= DBG_ALL_SHADERS; + + /* Syntax: + * EQAA=s,z,c + * Example: + * EQAA=8,4,2 + + * That means 8 coverage samples, 4 Z/S samples, and 2 color samples. + * Constraints: + * s >= z >= c (ignoring this only wastes memory) + * s = [2..16] + * z = [2..8] + * c = [2..8] + * + * Only MSAA color and depth buffers are overriden. + */ + if (sscreen->info.has_eqaa_surface_allocator) { + const char *eqaa = debug_get_option("EQAA", NULL); + unsigned s, z, f; + + if (eqaa && sscanf(eqaa, "%u,%u,%u", &s, &z, &f) == 3 && s && z && f) { + sscreen->eqaa_force_coverage_samples = s; + sscreen->eqaa_force_z_samples = z; + sscreen->eqaa_force_color_samples = f; + } + } + + sscreen->ge_wave_size = 64; + sscreen->ps_wave_size = 64; + sscreen->compute_wave_size = 64; + + if (sscreen->info.chip_class >= GFX10) { + /* Pixels shaders: Wave64 is recommended. + * Compute shaders: There are piglit failures with Wave32. + */ + sscreen->ge_wave_size = 32; + + if (sscreen->debug_flags & DBG(W32_GE)) + sscreen->ge_wave_size = 32; + if (sscreen->debug_flags & DBG(W32_PS)) + sscreen->ps_wave_size = 32; + if (sscreen->debug_flags & DBG(W32_CS)) + sscreen->compute_wave_size = 32; + + if (sscreen->debug_flags & DBG(W64_GE)) + sscreen->ge_wave_size = 64; + if (sscreen->debug_flags & DBG(W64_PS)) + sscreen->ps_wave_size = 64; + if (sscreen->debug_flags & DBG(W64_CS)) + sscreen->compute_wave_size = 64; + } + + /* Create the auxiliary context. This must be done last. */ + sscreen->aux_context = si_create_context( + &sscreen->b, (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) | + (sscreen->info.has_graphics ? 
0 : PIPE_CONTEXT_COMPUTE_ONLY)); + if (sscreen->options.aux_debug) { + struct u_log_context *log = CALLOC_STRUCT(u_log_context); + u_log_context_init(log); + sscreen->aux_context->set_log_context(sscreen->aux_context, log); + } + + if (test_flags & DBG(TEST_DMA)) + si_test_dma(sscreen); + + if (test_flags & DBG(TEST_DMA_PERF)) { + si_test_dma_perf(sscreen); + } + + if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SDMA) | DBG(TEST_VMFAULT_SHADER))) + si_test_vmfault(sscreen, test_flags); + + if (test_flags & DBG(TEST_GDS)) + si_test_gds((struct si_context *)sscreen->aux_context); + + if (test_flags & DBG(TEST_GDS_MM)) { + si_test_gds_memory_management((struct si_context *)sscreen->aux_context, 32 * 1024, 4, + RADEON_DOMAIN_GDS); + } + if (test_flags & DBG(TEST_GDS_OA_MM)) { + si_test_gds_memory_management((struct si_context *)sscreen->aux_context, 4, 1, + RADEON_DOMAIN_OA); + } + + STATIC_ASSERT(sizeof(union si_vgt_stages_key) == 4); + return &sscreen->b; } struct pipe_screen *radeonsi_screen_create(int fd, const struct pipe_screen_config *config) { - drmVersionPtr version = drmGetVersion(fd); - struct radeon_winsys *rw = NULL; - - switch (version->version_major) { - case 2: - rw = radeon_drm_winsys_create(fd, config, radeonsi_screen_create_impl); - break; - case 3: - rw = amdgpu_winsys_create(fd, config, radeonsi_screen_create_impl); - break; - } - - drmFreeVersion(version); - return rw ? rw->screen : NULL; + drmVersionPtr version = drmGetVersion(fd); + struct radeon_winsys *rw = NULL; + + switch (version->version_major) { + case 2: + rw = radeon_drm_winsys_create(fd, config, radeonsi_screen_create_impl); + break; + case 3: + rw = amdgpu_winsys_create(fd, config, radeonsi_screen_create_impl); + break; + } + + drmFreeVersion(version); + return rw ? rw->screen : NULL; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 400f2152243..30f7832f71c 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -27,7 +27,6 @@ #include "si_shader.h" #include "si_state.h" - #include "util/u_dynarray.h" #include "util/u_idalloc.h" #include "util/u_threaded_context.h" @@ -38,201 +37,207 @@ #define SI_BIG_ENDIAN 0 #endif -#define ATI_VENDOR_ID 0x1002 -#define SI_PRIM_DISCARD_DEBUG 0 -#define SI_NOT_QUERY 0xffffffff +#define ATI_VENDOR_ID 0x1002 +#define SI_PRIM_DISCARD_DEBUG 0 +#define SI_NOT_QUERY 0xffffffff /* The base vertex and primitive restart can be any number, but we must pick * one which will mean "unknown" for the purpose of state tracking and * the number shouldn't be a commonly-used one. */ -#define SI_BASE_VERTEX_UNKNOWN INT_MIN -#define SI_RESTART_INDEX_UNKNOWN INT_MIN -#define SI_INSTANCE_COUNT_UNKNOWN INT_MIN -#define SI_NUM_SMOOTH_AA_SAMPLES 8 -#define SI_MAX_POINT_SIZE 2048 -#define SI_GS_PER_ES 128 +#define SI_BASE_VERTEX_UNKNOWN INT_MIN +#define SI_RESTART_INDEX_UNKNOWN INT_MIN +#define SI_INSTANCE_COUNT_UNKNOWN INT_MIN +#define SI_NUM_SMOOTH_AA_SAMPLES 8 +#define SI_MAX_POINT_SIZE 2048 +#define SI_GS_PER_ES 128 /* Alignment for optimal CP DMA performance. */ -#define SI_CPDMA_ALIGNMENT 32 +#define SI_CPDMA_ALIGNMENT 32 /* Tunables for compute-based clear_buffer and copy_buffer: */ -#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4 -#define SI_COMPUTE_COPY_DW_PER_THREAD 4 -#define SI_COMPUTE_DST_CACHE_POLICY L2_STREAM +#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4 +#define SI_COMPUTE_COPY_DW_PER_THREAD 4 +#define SI_COMPUTE_DST_CACHE_POLICY L2_STREAM /* Pipeline & streamout query controls. 
*/ -#define SI_CONTEXT_START_PIPELINE_STATS (1 << 0) -#define SI_CONTEXT_STOP_PIPELINE_STATS (1 << 1) +#define SI_CONTEXT_START_PIPELINE_STATS (1 << 0) +#define SI_CONTEXT_STOP_PIPELINE_STATS (1 << 1) #define SI_CONTEXT_FLUSH_FOR_RENDER_COND (1 << 2) /* Instruction cache. */ -#define SI_CONTEXT_INV_ICACHE (1 << 3) +#define SI_CONTEXT_INV_ICACHE (1 << 3) /* Scalar cache. (GFX6-9: scalar L1; GFX10: scalar L0) * GFX10: This also invalidates the L1 shader array cache. */ -#define SI_CONTEXT_INV_SCACHE (1 << 4) +#define SI_CONTEXT_INV_SCACHE (1 << 4) /* Vector cache. (GFX6-9: vector L1; GFX10: vector L0) * GFX10: This also invalidates the L1 shader array cache. */ -#define SI_CONTEXT_INV_VCACHE (1 << 5) +#define SI_CONTEXT_INV_VCACHE (1 << 5) /* L2 cache + L2 metadata cache writeback & invalidate. * GFX6-8: Used by shaders only. GFX9-10: Used by everything. */ -#define SI_CONTEXT_INV_L2 (1 << 6) +#define SI_CONTEXT_INV_L2 (1 << 6) /* L2 writeback (write dirty L2 lines to memory for non-L2 clients). * Only used for coherency with non-L2 clients like CB, DB, CP on GFX6-8. * GFX6-7 will do complete invalidation, because the writeback is unsupported. */ -#define SI_CONTEXT_WB_L2 (1 << 7) +#define SI_CONTEXT_WB_L2 (1 << 7) /* Writeback & invalidate the L2 metadata cache only. It can only be coupled with * a CB or DB flush. */ -#define SI_CONTEXT_INV_L2_METADATA (1 << 8) +#define SI_CONTEXT_INV_L2_METADATA (1 << 8) /* Framebuffer caches. */ -#define SI_CONTEXT_FLUSH_AND_INV_DB (1 << 9) +#define SI_CONTEXT_FLUSH_AND_INV_DB (1 << 9) #define SI_CONTEXT_FLUSH_AND_INV_DB_META (1 << 10) -#define SI_CONTEXT_FLUSH_AND_INV_CB (1 << 11) +#define SI_CONTEXT_FLUSH_AND_INV_CB (1 << 11) /* Engine synchronization. */ -#define SI_CONTEXT_VS_PARTIAL_FLUSH (1 << 12) -#define SI_CONTEXT_PS_PARTIAL_FLUSH (1 << 13) -#define SI_CONTEXT_CS_PARTIAL_FLUSH (1 << 14) -#define SI_CONTEXT_VGT_FLUSH (1 << 15) -#define SI_CONTEXT_VGT_STREAMOUT_SYNC (1 << 16) - -#define SI_PREFETCH_VBO_DESCRIPTORS (1 << 0) -#define SI_PREFETCH_LS (1 << 1) -#define SI_PREFETCH_HS (1 << 2) -#define SI_PREFETCH_ES (1 << 3) -#define SI_PREFETCH_GS (1 << 4) -#define SI_PREFETCH_VS (1 << 5) -#define SI_PREFETCH_PS (1 << 6) - -#define SI_MAX_BORDER_COLORS 4096 -#define SI_MAX_VIEWPORTS 16 -#define SIX_BITS 0x3F -#define SI_MAP_BUFFER_ALIGNMENT 64 +#define SI_CONTEXT_VS_PARTIAL_FLUSH (1 << 12) +#define SI_CONTEXT_PS_PARTIAL_FLUSH (1 << 13) +#define SI_CONTEXT_CS_PARTIAL_FLUSH (1 << 14) +#define SI_CONTEXT_VGT_FLUSH (1 << 15) +#define SI_CONTEXT_VGT_STREAMOUT_SYNC (1 << 16) + +#define SI_PREFETCH_VBO_DESCRIPTORS (1 << 0) +#define SI_PREFETCH_LS (1 << 1) +#define SI_PREFETCH_HS (1 << 2) +#define SI_PREFETCH_ES (1 << 3) +#define SI_PREFETCH_GS (1 << 4) +#define SI_PREFETCH_VS (1 << 5) +#define SI_PREFETCH_PS (1 << 6) + +#define SI_MAX_BORDER_COLORS 4096 +#define SI_MAX_VIEWPORTS 16 +#define SIX_BITS 0x3F +#define SI_MAP_BUFFER_ALIGNMENT 64 #define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024 -#define SI_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) -#define SI_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) +#define SI_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) +#define SI_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) #define SI_RESOURCE_FLAG_FORCE_MSAA_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2) -#define SI_RESOURCE_FLAG_DISABLE_DCC (PIPE_RESOURCE_FLAG_DRV_PRIV << 3) -#define SI_RESOURCE_FLAG_UNMAPPABLE (PIPE_RESOURCE_FLAG_DRV_PRIV << 4) -#define SI_RESOURCE_FLAG_READ_ONLY (PIPE_RESOURCE_FLAG_DRV_PRIV << 5) 
-#define SI_RESOURCE_FLAG_32BIT (PIPE_RESOURCE_FLAG_DRV_PRIV << 6) -#define SI_RESOURCE_FLAG_CLEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 7) +#define SI_RESOURCE_FLAG_DISABLE_DCC (PIPE_RESOURCE_FLAG_DRV_PRIV << 3) +#define SI_RESOURCE_FLAG_UNMAPPABLE (PIPE_RESOURCE_FLAG_DRV_PRIV << 4) +#define SI_RESOURCE_FLAG_READ_ONLY (PIPE_RESOURCE_FLAG_DRV_PRIV << 5) +#define SI_RESOURCE_FLAG_32BIT (PIPE_RESOURCE_FLAG_DRV_PRIV << 6) +#define SI_RESOURCE_FLAG_CLEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 7) /* For const_uploader, upload data via GTT and copy to VRAM on context flush via SDMA. */ -#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA (PIPE_RESOURCE_FLAG_DRV_PRIV << 8) +#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA (PIPE_RESOURCE_FLAG_DRV_PRIV << 8) /* Set a micro tile mode: */ -#define SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE (PIPE_RESOURCE_FLAG_DRV_PRIV << 9) -#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT (util_logbase2(PIPE_RESOURCE_FLAG_DRV_PRIV) + 10) -#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(x) (((x) & 0x3) << SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) -#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(x) (((x) >> SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) & 0x3) +#define SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE (PIPE_RESOURCE_FLAG_DRV_PRIV << 9) +#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT (util_logbase2(PIPE_RESOURCE_FLAG_DRV_PRIV) + 10) +#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SET(x) \ + (((x)&0x3) << SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) +#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(x) \ + (((x) >> SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT) & 0x3) enum si_clear_code { - DCC_CLEAR_COLOR_0000 = 0x00000000, - DCC_CLEAR_COLOR_0001 = 0x40404040, - DCC_CLEAR_COLOR_1110 = 0x80808080, - DCC_CLEAR_COLOR_1111 = 0xC0C0C0C0, - DCC_CLEAR_COLOR_REG = 0x20202020, - DCC_UNCOMPRESSED = 0xFFFFFFFF, + DCC_CLEAR_COLOR_0000 = 0x00000000, + DCC_CLEAR_COLOR_0001 = 0x40404040, + DCC_CLEAR_COLOR_1110 = 0x80808080, + DCC_CLEAR_COLOR_1111 = 0xC0C0C0C0, + DCC_CLEAR_COLOR_REG = 0x20202020, + DCC_UNCOMPRESSED = 0xFFFFFFFF, }; -#define SI_IMAGE_ACCESS_AS_BUFFER (1 << 7) +#define SI_IMAGE_ACCESS_AS_BUFFER (1 << 7) /* Debug flags. 
*/ -enum { - /* Shader logging options: */ - DBG_VS = PIPE_SHADER_VERTEX, - DBG_PS = PIPE_SHADER_FRAGMENT, - DBG_GS = PIPE_SHADER_GEOMETRY, - DBG_TCS = PIPE_SHADER_TESS_CTRL, - DBG_TES = PIPE_SHADER_TESS_EVAL, - DBG_CS = PIPE_SHADER_COMPUTE, - DBG_NO_IR, - DBG_NO_NIR, - DBG_NO_ASM, - DBG_PREOPT_IR, - - /* Shader compiler options the shader cache should be aware of: */ - DBG_FS_CORRECT_DERIVS_AFTER_KILL, - DBG_GISEL, - DBG_W32_GE, - DBG_W32_PS, - DBG_W32_CS, - DBG_W64_GE, - DBG_W64_PS, - DBG_W64_CS, - - /* Shader compiler options (with no effect on the shader cache): */ - DBG_CHECK_IR, - DBG_MONOLITHIC_SHADERS, - DBG_NO_OPT_VARIANT, - - /* Information logging options: */ - DBG_INFO, - DBG_TEX, - DBG_COMPUTE, - DBG_VM, - DBG_CACHE_STATS, - - /* Driver options: */ - DBG_FORCE_SDMA, - DBG_NO_SDMA, - DBG_NO_SDMA_CLEARS, - DBG_NO_SDMA_COPY_IMAGE, - DBG_NO_WC, - DBG_CHECK_VM, - DBG_RESERVE_VMID, - DBG_ZERO_VRAM, - - /* 3D engine options: */ - DBG_NO_GFX, - DBG_NO_NGG, - DBG_ALWAYS_NGG_CULLING, - DBG_NO_NGG_CULLING, - DBG_ALWAYS_PD, - DBG_PD, - DBG_NO_PD, - DBG_SWITCH_ON_EOP, - DBG_NO_OUT_OF_ORDER, - DBG_NO_DPBB, - DBG_NO_DFSM, - DBG_DPBB, - DBG_DFSM, - DBG_NO_HYPERZ, - DBG_NO_RB_PLUS, - DBG_NO_2D_TILING, - DBG_NO_TILING, - DBG_NO_DCC, - DBG_NO_DCC_CLEAR, - DBG_NO_DCC_FB, - DBG_NO_DCC_MSAA, - DBG_NO_FMASK, - - DBG_COUNT +enum +{ + /* Shader logging options: */ + DBG_VS = PIPE_SHADER_VERTEX, + DBG_PS = PIPE_SHADER_FRAGMENT, + DBG_GS = PIPE_SHADER_GEOMETRY, + DBG_TCS = PIPE_SHADER_TESS_CTRL, + DBG_TES = PIPE_SHADER_TESS_EVAL, + DBG_CS = PIPE_SHADER_COMPUTE, + DBG_NO_IR, + DBG_NO_NIR, + DBG_NO_ASM, + DBG_PREOPT_IR, + + /* Shader compiler options the shader cache should be aware of: */ + DBG_FS_CORRECT_DERIVS_AFTER_KILL, + DBG_GISEL, + DBG_W32_GE, + DBG_W32_PS, + DBG_W32_CS, + DBG_W64_GE, + DBG_W64_PS, + DBG_W64_CS, + + /* Shader compiler options (with no effect on the shader cache): */ + DBG_CHECK_IR, + DBG_MONOLITHIC_SHADERS, + DBG_NO_OPT_VARIANT, + + /* Information logging options: */ + DBG_INFO, + DBG_TEX, + DBG_COMPUTE, + DBG_VM, + DBG_CACHE_STATS, + + /* Driver options: */ + DBG_FORCE_SDMA, + DBG_NO_SDMA, + DBG_NO_SDMA_CLEARS, + DBG_NO_SDMA_COPY_IMAGE, + DBG_NO_WC, + DBG_CHECK_VM, + DBG_RESERVE_VMID, + DBG_ZERO_VRAM, + + /* 3D engine options: */ + DBG_NO_GFX, + DBG_NO_NGG, + DBG_ALWAYS_NGG_CULLING, + DBG_NO_NGG_CULLING, + DBG_ALWAYS_PD, + DBG_PD, + DBG_NO_PD, + DBG_SWITCH_ON_EOP, + DBG_NO_OUT_OF_ORDER, + DBG_NO_DPBB, + DBG_NO_DFSM, + DBG_DPBB, + DBG_DFSM, + DBG_NO_HYPERZ, + DBG_NO_RB_PLUS, + DBG_NO_2D_TILING, + DBG_NO_TILING, + DBG_NO_DCC, + DBG_NO_DCC_CLEAR, + DBG_NO_DCC_FB, + DBG_NO_DCC_MSAA, + DBG_NO_FMASK, + + DBG_COUNT }; -enum { - /* Tests: */ - DBG_TEST_DMA, - DBG_TEST_VMFAULT_CP, - DBG_TEST_VMFAULT_SDMA, - DBG_TEST_VMFAULT_SHADER, - DBG_TEST_DMA_PERF, - DBG_TEST_GDS, - DBG_TEST_GDS_MM, - DBG_TEST_GDS_OA_MM, +enum +{ + /* Tests: */ + DBG_TEST_DMA, + DBG_TEST_VMFAULT_CP, + DBG_TEST_VMFAULT_SDMA, + DBG_TEST_VMFAULT_SHADER, + DBG_TEST_DMA_PERF, + DBG_TEST_GDS, + DBG_TEST_GDS_MM, + DBG_TEST_GDS_OA_MM, }; -#define DBG_ALL_SHADERS (((1 << (DBG_CS + 1)) - 1)) -#define DBG(name) (1ull << DBG_##name) +#define DBG_ALL_SHADERS (((1 << (DBG_CS + 1)) - 1)) +#define DBG(name) (1ull << DBG_##name) -enum si_cache_policy { - L2_BYPASS, - L2_STREAM, /* same as SLC=1 */ - L2_LRU, /* same as SLC=0 */ +enum si_cache_policy +{ + L2_BYPASS, + L2_STREAM, /* same as SLC=1 */ + L2_LRU, /* same as SLC=0 */ }; -enum si_coherency { - SI_COHERENCY_NONE, /* no cache flushes needed */ - SI_COHERENCY_SHADER, - 
SI_COHERENCY_CB_META, - SI_COHERENCY_CP, +enum si_coherency +{ + SI_COHERENCY_NONE, /* no cache flushes needed */ + SI_COHERENCY_SHADER, + SI_COHERENCY_CB_META, + SI_COHERENCY_CP, }; struct si_compute; @@ -244,528 +249,523 @@ struct u_suballocator; * at the moment. */ struct si_resource { - struct threaded_resource b; - - /* Winsys objects. */ - struct pb_buffer *buf; - uint64_t gpu_address; - /* Memory usage if the buffer placement is optimal. */ - uint64_t vram_usage; - uint64_t gart_usage; - - /* Resource properties. */ - uint64_t bo_size; - unsigned bo_alignment; - enum radeon_bo_domain domains; - enum radeon_bo_flag flags; - unsigned bind_history; - int max_forced_staging_uploads; - - /* The buffer range which is initialized (with a write transfer, - * streamout, DMA, or as a random access target). The rest of - * the buffer is considered invalid and can be mapped unsynchronized. - * - * This allows unsychronized mapping of a buffer range which hasn't - * been used yet. It's for applications which forget to use - * the unsynchronized map flag and expect the driver to figure it out. - */ - struct util_range valid_buffer_range; - - /* For buffers only. This indicates that a write operation has been - * performed by TC L2, but the cache hasn't been flushed. - * Any hw block which doesn't use or bypasses TC L2 should check this - * flag and flush the cache before using the buffer. - * - * For example, TC L2 must be flushed if a buffer which has been - * modified by a shader store instruction is about to be used as - * an index buffer. The reason is that VGT DMA index fetching doesn't - * use TC L2. - */ - bool TC_L2_dirty; - - /* Whether this resource is referenced by bindless handles. */ - bool texture_handle_allocated; - bool image_handle_allocated; - - /* Whether the resource has been exported via resource_get_handle. */ - unsigned external_usage; /* PIPE_HANDLE_USAGE_* */ + struct threaded_resource b; + + /* Winsys objects. */ + struct pb_buffer *buf; + uint64_t gpu_address; + /* Memory usage if the buffer placement is optimal. */ + uint64_t vram_usage; + uint64_t gart_usage; + + /* Resource properties. */ + uint64_t bo_size; + unsigned bo_alignment; + enum radeon_bo_domain domains; + enum radeon_bo_flag flags; + unsigned bind_history; + int max_forced_staging_uploads; + + /* The buffer range which is initialized (with a write transfer, + * streamout, DMA, or as a random access target). The rest of + * the buffer is considered invalid and can be mapped unsynchronized. + * + * This allows unsychronized mapping of a buffer range which hasn't + * been used yet. It's for applications which forget to use + * the unsynchronized map flag and expect the driver to figure it out. + */ + struct util_range valid_buffer_range; + + /* For buffers only. This indicates that a write operation has been + * performed by TC L2, but the cache hasn't been flushed. + * Any hw block which doesn't use or bypasses TC L2 should check this + * flag and flush the cache before using the buffer. + * + * For example, TC L2 must be flushed if a buffer which has been + * modified by a shader store instruction is about to be used as + * an index buffer. The reason is that VGT DMA index fetching doesn't + * use TC L2. + */ + bool TC_L2_dirty; + + /* Whether this resource is referenced by bindless handles. */ + bool texture_handle_allocated; + bool image_handle_allocated; + + /* Whether the resource has been exported via resource_get_handle. 
*/ + unsigned external_usage; /* PIPE_HANDLE_USAGE_* */ }; struct si_transfer { - struct threaded_transfer b; - struct si_resource *staging; - unsigned offset; + struct threaded_transfer b; + struct si_resource *staging; + unsigned offset; }; struct si_texture { - struct si_resource buffer; - - struct radeon_surf surface; - struct si_texture *flushed_depth_texture; - - /* One texture allocation can contain these buffers: - * - image (pixel data) - * - FMASK buffer (MSAA compression) - * - CMASK buffer (MSAA compression and/or legacy fast color clear) - * - HTILE buffer (Z/S compression and fast Z/S clear) - * - DCC buffer (color compression and new fast color clear) - * - displayable DCC buffer (if the DCC buffer is not displayable) - * - DCC retile mapping buffer (if the DCC buffer is not displayable) - */ - uint64_t cmask_base_address_reg; - struct si_resource *cmask_buffer; - unsigned cb_color_info; /* fast clear enable bit */ - unsigned color_clear_value[2]; - unsigned last_msaa_resolve_target_micro_mode; - unsigned num_level0_transfers; - unsigned plane_index; /* other planes are different pipe_resources */ - unsigned num_planes; - - /* Depth buffer compression and fast clear. */ - float depth_clear_value; - uint16_t dirty_level_mask; /* each bit says if that mipmap is compressed */ - uint16_t stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */ - enum pipe_format db_render_format:16; - uint8_t stencil_clear_value; - bool fmask_is_identity:1; - bool tc_compatible_htile:1; - bool htile_stencil_disabled:1; - bool depth_cleared:1; /* if it was cleared at least once */ - bool stencil_cleared:1; /* if it was cleared at least once */ - bool upgraded_depth:1; /* upgraded from unorm to Z32_FLOAT */ - bool is_depth:1; - bool db_compatible:1; - bool can_sample_z:1; - bool can_sample_s:1; - - /* We need to track DCC dirtiness, because st/dri usually calls - * flush_resource twice per frame (not a bug) and we don't wanna - * decompress DCC twice. Also, the dirty tracking must be done even - * if DCC isn't used, because it's required by the DCC usage analysis - * for a possible future enablement. - */ - bool separate_dcc_dirty:1; - bool displayable_dcc_dirty:1; - - /* Statistics gathering for the DCC enablement heuristic. */ - bool dcc_gather_statistics:1; - /* Counter that should be non-zero if the texture is bound to a - * framebuffer. - */ - unsigned framebuffers_bound; - /* Whether the texture is a displayable back buffer and needs DCC - * decompression, which is expensive. Therefore, it's enabled only - * if statistics suggest that it will pay off and it's allocated - * separately. It can't be bound as a sampler by apps. Limited to - * target == 2D and last_level == 0. If enabled, dcc_offset contains - * the absolute GPUVM address, not the relative one. - */ - struct si_resource *dcc_separate_buffer; - /* When DCC is temporarily disabled, the separate buffer is here. */ - struct si_resource *last_dcc_separate_buffer; - /* Estimate of how much this color buffer is written to in units of - * full-screen draws: ps_invocations / (width * height) - * Shader kills, late Z, and blending with trivial discards make it - * inaccurate (we need to count CB updates, not PS invocations). - */ - unsigned ps_draw_ratio; - /* The number of clears since the last DCC usage analysis. 
*/ - unsigned num_slow_clears; + struct si_resource buffer; + + struct radeon_surf surface; + struct si_texture *flushed_depth_texture; + + /* One texture allocation can contain these buffers: + * - image (pixel data) + * - FMASK buffer (MSAA compression) + * - CMASK buffer (MSAA compression and/or legacy fast color clear) + * - HTILE buffer (Z/S compression and fast Z/S clear) + * - DCC buffer (color compression and new fast color clear) + * - displayable DCC buffer (if the DCC buffer is not displayable) + * - DCC retile mapping buffer (if the DCC buffer is not displayable) + */ + uint64_t cmask_base_address_reg; + struct si_resource *cmask_buffer; + unsigned cb_color_info; /* fast clear enable bit */ + unsigned color_clear_value[2]; + unsigned last_msaa_resolve_target_micro_mode; + unsigned num_level0_transfers; + unsigned plane_index; /* other planes are different pipe_resources */ + unsigned num_planes; + + /* Depth buffer compression and fast clear. */ + float depth_clear_value; + uint16_t dirty_level_mask; /* each bit says if that mipmap is compressed */ + uint16_t stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */ + enum pipe_format db_render_format : 16; + uint8_t stencil_clear_value; + bool fmask_is_identity : 1; + bool tc_compatible_htile : 1; + bool htile_stencil_disabled : 1; + bool depth_cleared : 1; /* if it was cleared at least once */ + bool stencil_cleared : 1; /* if it was cleared at least once */ + bool upgraded_depth : 1; /* upgraded from unorm to Z32_FLOAT */ + bool is_depth : 1; + bool db_compatible : 1; + bool can_sample_z : 1; + bool can_sample_s : 1; + + /* We need to track DCC dirtiness, because st/dri usually calls + * flush_resource twice per frame (not a bug) and we don't wanna + * decompress DCC twice. Also, the dirty tracking must be done even + * if DCC isn't used, because it's required by the DCC usage analysis + * for a possible future enablement. + */ + bool separate_dcc_dirty : 1; + bool displayable_dcc_dirty : 1; + + /* Statistics gathering for the DCC enablement heuristic. */ + bool dcc_gather_statistics : 1; + /* Counter that should be non-zero if the texture is bound to a + * framebuffer. + */ + unsigned framebuffers_bound; + /* Whether the texture is a displayable back buffer and needs DCC + * decompression, which is expensive. Therefore, it's enabled only + * if statistics suggest that it will pay off and it's allocated + * separately. It can't be bound as a sampler by apps. Limited to + * target == 2D and last_level == 0. If enabled, dcc_offset contains + * the absolute GPUVM address, not the relative one. + */ + struct si_resource *dcc_separate_buffer; + /* When DCC is temporarily disabled, the separate buffer is here. */ + struct si_resource *last_dcc_separate_buffer; + /* Estimate of how much this color buffer is written to in units of + * full-screen draws: ps_invocations / (width * height) + * Shader kills, late Z, and blending with trivial discards make it + * inaccurate (we need to count CB updates, not PS invocations). + */ + unsigned ps_draw_ratio; + /* The number of clears since the last DCC usage analysis. */ + unsigned num_slow_clears; }; struct si_surface { - struct pipe_surface base; - - /* These can vary with block-compressed textures. */ - uint16_t width0; - uint16_t height0; - - bool color_initialized:1; - bool depth_initialized:1; - - /* Misc. color flags. */ - bool color_is_int8:1; - bool color_is_int10:1; - bool dcc_incompatible:1; - - /* Color registers. 
*/ - unsigned cb_color_info; - unsigned cb_color_view; - unsigned cb_color_attrib; - unsigned cb_color_attrib2; /* GFX9 and later */ - unsigned cb_color_attrib3; /* GFX10 and later */ - unsigned cb_dcc_control; /* GFX8 and later */ - unsigned spi_shader_col_format:8; /* no blending, no alpha-to-coverage. */ - unsigned spi_shader_col_format_alpha:8; /* alpha-to-coverage */ - unsigned spi_shader_col_format_blend:8; /* blending without alpha. */ - unsigned spi_shader_col_format_blend_alpha:8; /* blending with alpha. */ - - /* DB registers. */ - uint64_t db_depth_base; /* DB_Z_READ/WRITE_BASE */ - uint64_t db_stencil_base; - uint64_t db_htile_data_base; - unsigned db_depth_info; - unsigned db_z_info; - unsigned db_z_info2; /* GFX9 only */ - unsigned db_depth_view; - unsigned db_depth_size; - unsigned db_depth_slice; - unsigned db_stencil_info; - unsigned db_stencil_info2; /* GFX9 only */ - unsigned db_htile_surface; + struct pipe_surface base; + + /* These can vary with block-compressed textures. */ + uint16_t width0; + uint16_t height0; + + bool color_initialized : 1; + bool depth_initialized : 1; + + /* Misc. color flags. */ + bool color_is_int8 : 1; + bool color_is_int10 : 1; + bool dcc_incompatible : 1; + + /* Color registers. */ + unsigned cb_color_info; + unsigned cb_color_view; + unsigned cb_color_attrib; + unsigned cb_color_attrib2; /* GFX9 and later */ + unsigned cb_color_attrib3; /* GFX10 and later */ + unsigned cb_dcc_control; /* GFX8 and later */ + unsigned spi_shader_col_format : 8; /* no blending, no alpha-to-coverage. */ + unsigned spi_shader_col_format_alpha : 8; /* alpha-to-coverage */ + unsigned spi_shader_col_format_blend : 8; /* blending without alpha. */ + unsigned spi_shader_col_format_blend_alpha : 8; /* blending with alpha. */ + + /* DB registers. */ + uint64_t db_depth_base; /* DB_Z_READ/WRITE_BASE */ + uint64_t db_stencil_base; + uint64_t db_htile_data_base; + unsigned db_depth_info; + unsigned db_z_info; + unsigned db_z_info2; /* GFX9 only */ + unsigned db_depth_view; + unsigned db_depth_size; + unsigned db_depth_slice; + unsigned db_stencil_info; + unsigned db_stencil_info2; /* GFX9 only */ + unsigned db_htile_surface; }; struct si_mmio_counter { - unsigned busy; - unsigned idle; + unsigned busy; + unsigned idle; }; union si_mmio_counters { - struct { - /* For global GPU load including SDMA. */ - struct si_mmio_counter gpu; - - /* GRBM_STATUS */ - struct si_mmio_counter spi; - struct si_mmio_counter gui; - struct si_mmio_counter ta; - struct si_mmio_counter gds; - struct si_mmio_counter vgt; - struct si_mmio_counter ia; - struct si_mmio_counter sx; - struct si_mmio_counter wd; - struct si_mmio_counter bci; - struct si_mmio_counter sc; - struct si_mmio_counter pa; - struct si_mmio_counter db; - struct si_mmio_counter cp; - struct si_mmio_counter cb; - - /* SRBM_STATUS2 */ - struct si_mmio_counter sdma; - - /* CP_STAT */ - struct si_mmio_counter pfp; - struct si_mmio_counter meq; - struct si_mmio_counter me; - struct si_mmio_counter surf_sync; - struct si_mmio_counter cp_dma; - struct si_mmio_counter scratch_ram; - } named; - unsigned array[0]; + struct { + /* For global GPU load including SDMA. 
*/ + struct si_mmio_counter gpu; + + /* GRBM_STATUS */ + struct si_mmio_counter spi; + struct si_mmio_counter gui; + struct si_mmio_counter ta; + struct si_mmio_counter gds; + struct si_mmio_counter vgt; + struct si_mmio_counter ia; + struct si_mmio_counter sx; + struct si_mmio_counter wd; + struct si_mmio_counter bci; + struct si_mmio_counter sc; + struct si_mmio_counter pa; + struct si_mmio_counter db; + struct si_mmio_counter cp; + struct si_mmio_counter cb; + + /* SRBM_STATUS2 */ + struct si_mmio_counter sdma; + + /* CP_STAT */ + struct si_mmio_counter pfp; + struct si_mmio_counter meq; + struct si_mmio_counter me; + struct si_mmio_counter surf_sync; + struct si_mmio_counter cp_dma; + struct si_mmio_counter scratch_ram; + } named; + unsigned array[0]; }; struct si_memory_object { - struct pipe_memory_object b; - struct pb_buffer *buf; - uint32_t stride; + struct pipe_memory_object b; + struct pb_buffer *buf; + uint32_t stride; }; /* Saved CS data for debugging features. */ struct radeon_saved_cs { - uint32_t *ib; - unsigned num_dw; + uint32_t *ib; + unsigned num_dw; - struct radeon_bo_list_item *bo_list; - unsigned bo_count; + struct radeon_bo_list_item *bo_list; + unsigned bo_count; }; struct si_screen { - struct pipe_screen b; - struct radeon_winsys *ws; - struct disk_cache *disk_shader_cache; - - struct radeon_info info; - uint64_t debug_flags; - char renderer_string[183]; - - void (*make_texture_descriptor)( - struct si_screen *screen, - struct si_texture *tex, - bool sampler, - enum pipe_texture_target target, - enum pipe_format pipe_format, - const unsigned char state_swizzle[4], - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned width, unsigned height, unsigned depth, - uint32_t *state, - uint32_t *fmask_state); - - unsigned num_vbos_in_user_sgprs; - unsigned pa_sc_raster_config; - unsigned pa_sc_raster_config_1; - unsigned se_tile_repeat; - unsigned gs_table_depth; - unsigned tess_offchip_block_dw_size; - unsigned tess_offchip_ring_size; - unsigned tess_factor_ring_size; - unsigned vgt_hs_offchip_param; - unsigned eqaa_force_coverage_samples; - unsigned eqaa_force_z_samples; - unsigned eqaa_force_color_samples; - bool has_draw_indirect_multi; - bool has_out_of_order_rast; - bool assume_no_z_fights; - bool commutative_blend_add; - bool dpbb_allowed; - bool dfsm_allowed; - bool llvm_has_working_vgpr_indexing; - bool use_ngg; - bool use_ngg_culling; - bool always_use_ngg_culling; - bool use_ngg_streamout; - - struct { -#define OPT_BOOL(name, dflt, description) bool name:1; + struct pipe_screen b; + struct radeon_winsys *ws; + struct disk_cache *disk_shader_cache; + + struct radeon_info info; + uint64_t debug_flags; + char renderer_string[183]; + + void (*make_texture_descriptor)(struct si_screen *screen, struct si_texture *tex, bool sampler, + enum pipe_texture_target target, enum pipe_format pipe_format, + const unsigned char state_swizzle[4], unsigned first_level, + unsigned last_level, unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, uint32_t *state, + uint32_t *fmask_state); + + unsigned num_vbos_in_user_sgprs; + unsigned pa_sc_raster_config; + unsigned pa_sc_raster_config_1; + unsigned se_tile_repeat; + unsigned gs_table_depth; + unsigned tess_offchip_block_dw_size; + unsigned tess_offchip_ring_size; + unsigned tess_factor_ring_size; + unsigned vgt_hs_offchip_param; + unsigned eqaa_force_coverage_samples; + unsigned eqaa_force_z_samples; + unsigned eqaa_force_color_samples; + bool 
has_draw_indirect_multi; + bool has_out_of_order_rast; + bool assume_no_z_fights; + bool commutative_blend_add; + bool dpbb_allowed; + bool dfsm_allowed; + bool llvm_has_working_vgpr_indexing; + bool use_ngg; + bool use_ngg_culling; + bool always_use_ngg_culling; + bool use_ngg_streamout; + + struct { +#define OPT_BOOL(name, dflt, description) bool name : 1; #include "si_debug_options.h" - } options; - - /* Whether shaders are monolithic (1-part) or separate (3-part). */ - bool use_monolithic_shaders; - bool record_llvm_ir; - bool dcc_msaa_allowed; - - struct slab_parent_pool pool_transfers; - - /* Texture filter settings. */ - int force_aniso; /* -1 = disabled */ - - /* Auxiliary context. Mainly used to initialize resources. - * It must be locked prior to using and flushed before unlocking. */ - struct pipe_context *aux_context; - simple_mtx_t aux_context_lock; - - /* This must be in the screen, because UE4 uses one context for - * compilation and another one for rendering. - */ - unsigned num_compilations; - /* Along with ST_DEBUG=precompile, this should show if applications - * are loading shaders on demand. This is a monotonic counter. - */ - unsigned num_shaders_created; - unsigned num_memory_shader_cache_hits; - unsigned num_memory_shader_cache_misses; - unsigned num_disk_shader_cache_hits; - unsigned num_disk_shader_cache_misses; - - /* GPU load thread. */ - simple_mtx_t gpu_load_mutex; - thrd_t gpu_load_thread; - union si_mmio_counters mmio_counters; - volatile unsigned gpu_load_stop_thread; /* bool */ - - /* Performance counters. */ - struct si_perfcounters *perfcounters; - - /* If pipe_screen wants to recompute and re-emit the framebuffer, - * sampler, and image states of all contexts, it should atomically - * increment this. - * - * Each context will compare this with its own last known value of - * the counter before drawing and re-emit the states accordingly. - */ - unsigned dirty_tex_counter; - unsigned dirty_buf_counter; - - /* Atomically increment this counter when an existing texture's - * metadata is enabled or disabled in a way that requires changing - * contexts' compressed texture binding masks. - */ - unsigned compressed_colortex_counter; - - struct { - /* Context flags to set so that all writes from earlier jobs - * in the CP are seen by L2 clients. - */ - unsigned cp_to_L2; - - /* Context flags to set so that all writes from earlier jobs - * that end in L2 are seen by CP. - */ - unsigned L2_to_cp; - } barrier_flags; - - simple_mtx_t shader_parts_mutex; - struct si_shader_part *vs_prologs; - struct si_shader_part *tcs_epilogs; - struct si_shader_part *gs_prologs; - struct si_shader_part *ps_prologs; - struct si_shader_part *ps_epilogs; - - /* Shader cache in memory. - * - * Design & limitations: - * - The shader cache is per screen (= per process), never saved to - * disk, and skips redundant shader compilations from NIR to bytecode. - * - It can only be used with one-variant-per-shader support, in which - * case only the main (typically middle) part of shaders is cached. - * - Only VS, TCS, TES, PS are cached, out of which only the hw VS - * variants of VS and TES are cached, so LS and ES aren't. - * - GS and CS aren't cached, but it's certainly possible to cache - * those as well. - */ - simple_mtx_t shader_cache_mutex; - struct hash_table *shader_cache; - - /* Shader cache of live shaders. */ - struct util_live_shader_cache live_shader_cache; - - /* Shader compiler queue for multithreaded compilation. 
*/ - struct util_queue shader_compiler_queue; - /* Use at most 3 normal compiler threads on quadcore and better. - * Hyperthreaded CPUs report the number of threads, but we want - * the number of cores. We only need this many threads for shader-db. */ - struct ac_llvm_compiler compiler[24]; /* used by the queue only */ - - struct util_queue shader_compiler_queue_low_priority; - /* Use at most 2 low priority threads on quadcore and better. - * We want to minimize the impact on multithreaded Mesa. */ - struct ac_llvm_compiler compiler_lowp[10]; - - unsigned compute_wave_size; - unsigned ps_wave_size; - unsigned ge_wave_size; + } options; + + /* Whether shaders are monolithic (1-part) or separate (3-part). */ + bool use_monolithic_shaders; + bool record_llvm_ir; + bool dcc_msaa_allowed; + + struct slab_parent_pool pool_transfers; + + /* Texture filter settings. */ + int force_aniso; /* -1 = disabled */ + + /* Auxiliary context. Mainly used to initialize resources. + * It must be locked prior to using and flushed before unlocking. */ + struct pipe_context *aux_context; + simple_mtx_t aux_context_lock; + + /* This must be in the screen, because UE4 uses one context for + * compilation and another one for rendering. + */ + unsigned num_compilations; + /* Along with ST_DEBUG=precompile, this should show if applications + * are loading shaders on demand. This is a monotonic counter. + */ + unsigned num_shaders_created; + unsigned num_memory_shader_cache_hits; + unsigned num_memory_shader_cache_misses; + unsigned num_disk_shader_cache_hits; + unsigned num_disk_shader_cache_misses; + + /* GPU load thread. */ + simple_mtx_t gpu_load_mutex; + thrd_t gpu_load_thread; + union si_mmio_counters mmio_counters; + volatile unsigned gpu_load_stop_thread; /* bool */ + + /* Performance counters. */ + struct si_perfcounters *perfcounters; + + /* If pipe_screen wants to recompute and re-emit the framebuffer, + * sampler, and image states of all contexts, it should atomically + * increment this. + * + * Each context will compare this with its own last known value of + * the counter before drawing and re-emit the states accordingly. + */ + unsigned dirty_tex_counter; + unsigned dirty_buf_counter; + + /* Atomically increment this counter when an existing texture's + * metadata is enabled or disabled in a way that requires changing + * contexts' compressed texture binding masks. + */ + unsigned compressed_colortex_counter; + + struct { + /* Context flags to set so that all writes from earlier jobs + * in the CP are seen by L2 clients. + */ + unsigned cp_to_L2; + + /* Context flags to set so that all writes from earlier jobs + * that end in L2 are seen by CP. + */ + unsigned L2_to_cp; + } barrier_flags; + + simple_mtx_t shader_parts_mutex; + struct si_shader_part *vs_prologs; + struct si_shader_part *tcs_epilogs; + struct si_shader_part *gs_prologs; + struct si_shader_part *ps_prologs; + struct si_shader_part *ps_epilogs; + + /* Shader cache in memory. + * + * Design & limitations: + * - The shader cache is per screen (= per process), never saved to + * disk, and skips redundant shader compilations from NIR to bytecode. + * - It can only be used with one-variant-per-shader support, in which + * case only the main (typically middle) part of shaders is cached. + * - Only VS, TCS, TES, PS are cached, out of which only the hw VS + * variants of VS and TES are cached, so LS and ES aren't. + * - GS and CS aren't cached, but it's certainly possible to cache + * those as well. 
+ */ + simple_mtx_t shader_cache_mutex; + struct hash_table *shader_cache; + + /* Shader cache of live shaders. */ + struct util_live_shader_cache live_shader_cache; + + /* Shader compiler queue for multithreaded compilation. */ + struct util_queue shader_compiler_queue; + /* Use at most 3 normal compiler threads on quadcore and better. + * Hyperthreaded CPUs report the number of threads, but we want + * the number of cores. We only need this many threads for shader-db. */ + struct ac_llvm_compiler compiler[24]; /* used by the queue only */ + + struct util_queue shader_compiler_queue_low_priority; + /* Use at most 2 low priority threads on quadcore and better. + * We want to minimize the impact on multithreaded Mesa. */ + struct ac_llvm_compiler compiler_lowp[10]; + + unsigned compute_wave_size; + unsigned ps_wave_size; + unsigned ge_wave_size; }; struct si_blend_color { - struct pipe_blend_color state; - bool any_nonzeros; + struct pipe_blend_color state; + bool any_nonzeros; }; struct si_sampler_view { - struct pipe_sampler_view base; - /* [0..7] = image descriptor - * [4..7] = buffer descriptor */ - uint32_t state[8]; - uint32_t fmask_state[8]; - const struct legacy_surf_level *base_level_info; - ubyte base_level; - ubyte block_width; - bool is_stencil_sampler; - bool is_integer; - bool dcc_incompatible; + struct pipe_sampler_view base; + /* [0..7] = image descriptor + * [4..7] = buffer descriptor */ + uint32_t state[8]; + uint32_t fmask_state[8]; + const struct legacy_surf_level *base_level_info; + ubyte base_level; + ubyte block_width; + bool is_stencil_sampler; + bool is_integer; + bool dcc_incompatible; }; #define SI_SAMPLER_STATE_MAGIC 0x34f1c35a struct si_sampler_state { #ifndef NDEBUG - unsigned magic; + unsigned magic; #endif - uint32_t val[4]; - uint32_t integer_val[4]; - uint32_t upgraded_depth_val[4]; + uint32_t val[4]; + uint32_t integer_val[4]; + uint32_t upgraded_depth_val[4]; }; struct si_cs_shader_state { - struct si_compute *program; - struct si_compute *emitted_program; - unsigned offset; - bool initialized; - bool uses_scratch; + struct si_compute *program; + struct si_compute *emitted_program; + unsigned offset; + bool initialized; + bool uses_scratch; }; struct si_samplers { - struct pipe_sampler_view *views[SI_NUM_SAMPLERS]; - struct si_sampler_state *sampler_states[SI_NUM_SAMPLERS]; + struct pipe_sampler_view *views[SI_NUM_SAMPLERS]; + struct si_sampler_state *sampler_states[SI_NUM_SAMPLERS]; - /* The i-th bit is set if that element is enabled (non-NULL resource). */ - unsigned enabled_mask; - uint32_t needs_depth_decompress_mask; - uint32_t needs_color_decompress_mask; + /* The i-th bit is set if that element is enabled (non-NULL resource). 
*/ + unsigned enabled_mask; + uint32_t needs_depth_decompress_mask; + uint32_t needs_color_decompress_mask; }; struct si_images { - struct pipe_image_view views[SI_NUM_IMAGES]; - uint32_t needs_color_decompress_mask; - unsigned enabled_mask; + struct pipe_image_view views[SI_NUM_IMAGES]; + uint32_t needs_color_decompress_mask; + unsigned enabled_mask; }; struct si_framebuffer { - struct pipe_framebuffer_state state; - unsigned colorbuf_enabled_4bit; - unsigned spi_shader_col_format; - unsigned spi_shader_col_format_alpha; - unsigned spi_shader_col_format_blend; - unsigned spi_shader_col_format_blend_alpha; - ubyte nr_samples:5; /* at most 16xAA */ - ubyte log_samples:3; /* at most 4 = 16xAA */ - ubyte nr_color_samples; /* at most 8xAA */ - ubyte compressed_cb_mask; - ubyte uncompressed_cb_mask; - ubyte displayable_dcc_cb_mask; - ubyte color_is_int8; - ubyte color_is_int10; - ubyte dirty_cbufs; - ubyte dcc_overwrite_combiner_watermark; - ubyte min_bytes_per_pixel; - bool dirty_zsbuf; - bool any_dst_linear; - bool CB_has_shader_readable_metadata; - bool DB_has_shader_readable_metadata; - bool all_DCC_pipe_aligned; + struct pipe_framebuffer_state state; + unsigned colorbuf_enabled_4bit; + unsigned spi_shader_col_format; + unsigned spi_shader_col_format_alpha; + unsigned spi_shader_col_format_blend; + unsigned spi_shader_col_format_blend_alpha; + ubyte nr_samples : 5; /* at most 16xAA */ + ubyte log_samples : 3; /* at most 4 = 16xAA */ + ubyte nr_color_samples; /* at most 8xAA */ + ubyte compressed_cb_mask; + ubyte uncompressed_cb_mask; + ubyte displayable_dcc_cb_mask; + ubyte color_is_int8; + ubyte color_is_int10; + ubyte dirty_cbufs; + ubyte dcc_overwrite_combiner_watermark; + ubyte min_bytes_per_pixel; + bool dirty_zsbuf; + bool any_dst_linear; + bool CB_has_shader_readable_metadata; + bool DB_has_shader_readable_metadata; + bool all_DCC_pipe_aligned; }; -enum si_quant_mode { - /* This is the list we want to support. */ - SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH, - SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH, - SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH, +enum si_quant_mode +{ + /* This is the list we want to support. */ + SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH, + SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH, + SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH, }; struct si_signed_scissor { - int minx; - int miny; - int maxx; - int maxy; - enum si_quant_mode quant_mode; + int minx; + int miny; + int maxx; + int maxy; + enum si_quant_mode quant_mode; }; struct si_viewports { - struct pipe_viewport_state states[SI_MAX_VIEWPORTS]; - struct si_signed_scissor as_scissor[SI_MAX_VIEWPORTS]; - bool y_inverted; + struct pipe_viewport_state states[SI_MAX_VIEWPORTS]; + struct si_signed_scissor as_scissor[SI_MAX_VIEWPORTS]; + bool y_inverted; }; struct si_clip_state { - struct pipe_clip_state state; - bool any_nonzeros; + struct pipe_clip_state state; + bool any_nonzeros; }; struct si_streamout_target { - struct pipe_stream_output_target b; + struct pipe_stream_output_target b; - /* The buffer where BUFFER_FILLED_SIZE is stored. */ - struct si_resource *buf_filled_size; - unsigned buf_filled_size_offset; - bool buf_filled_size_valid; + /* The buffer where BUFFER_FILLED_SIZE is stored. 
*/ + struct si_resource *buf_filled_size; + unsigned buf_filled_size_offset; + bool buf_filled_size_valid; - unsigned stride_in_dw; + unsigned stride_in_dw; }; struct si_streamout { - bool begin_emitted; + bool begin_emitted; - unsigned enabled_mask; - unsigned num_targets; - struct si_streamout_target *targets[PIPE_MAX_SO_BUFFERS]; + unsigned enabled_mask; + unsigned num_targets; + struct si_streamout_target *targets[PIPE_MAX_SO_BUFFERS]; - unsigned append_bitmask; - bool suspended; + unsigned append_bitmask; + bool suspended; - /* External state which comes from the vertex shader, - * it must be set explicitly when binding a shader. */ - uint16_t *stride_in_dw; - unsigned enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */ + /* External state which comes from the vertex shader, + * it must be set explicitly when binding a shader. */ + uint16_t *stride_in_dw; + unsigned enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */ - /* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */ - unsigned hw_enabled_mask; + /* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */ + unsigned hw_enabled_mask; - /* The state of VGT_STRMOUT_(CONFIG|EN). */ - bool streamout_enabled; - bool prims_gen_query_enabled; - int num_prims_gen_queries; + /* The state of VGT_STRMOUT_(CONFIG|EN). */ + bool streamout_enabled; + bool prims_gen_query_enabled; + int num_prims_gen_queries; }; /* A shader state consists of the shader selector, which is a constant state @@ -773,494 +773,488 @@ struct si_streamout { * the current shader variant selected for this context. */ struct si_shader_ctx_state { - struct si_shader_selector *cso; - struct si_shader *current; + struct si_shader_selector *cso; + struct si_shader *current; }; #define SI_NUM_VGT_PARAM_KEY_BITS 12 -#define SI_NUM_VGT_PARAM_STATES (1 << SI_NUM_VGT_PARAM_KEY_BITS) +#define SI_NUM_VGT_PARAM_STATES (1 << SI_NUM_VGT_PARAM_KEY_BITS) /* The IA_MULTI_VGT_PARAM key used to index the table of precomputed values. * Some fields are set by state-change calls, most are set by draw_vbo. 
*/ union si_vgt_param_key { - struct { + struct { #if UTIL_ARCH_LITTLE_ENDIAN - unsigned prim:4; - unsigned uses_instancing:1; - unsigned multi_instances_smaller_than_primgroup:1; - unsigned primitive_restart:1; - unsigned count_from_stream_output:1; - unsigned line_stipple_enabled:1; - unsigned uses_tess:1; - unsigned tess_uses_prim_id:1; - unsigned uses_gs:1; - unsigned _pad:32 - SI_NUM_VGT_PARAM_KEY_BITS; + unsigned prim : 4; + unsigned uses_instancing : 1; + unsigned multi_instances_smaller_than_primgroup : 1; + unsigned primitive_restart : 1; + unsigned count_from_stream_output : 1; + unsigned line_stipple_enabled : 1; + unsigned uses_tess : 1; + unsigned tess_uses_prim_id : 1; + unsigned uses_gs : 1; + unsigned _pad : 32 - SI_NUM_VGT_PARAM_KEY_BITS; #else /* UTIL_ARCH_BIG_ENDIAN */ - unsigned _pad:32 - SI_NUM_VGT_PARAM_KEY_BITS; - unsigned uses_gs:1; - unsigned tess_uses_prim_id:1; - unsigned uses_tess:1; - unsigned line_stipple_enabled:1; - unsigned count_from_stream_output:1; - unsigned primitive_restart:1; - unsigned multi_instances_smaller_than_primgroup:1; - unsigned uses_instancing:1; - unsigned prim:4; + unsigned _pad : 32 - SI_NUM_VGT_PARAM_KEY_BITS; + unsigned uses_gs : 1; + unsigned tess_uses_prim_id : 1; + unsigned uses_tess : 1; + unsigned line_stipple_enabled : 1; + unsigned count_from_stream_output : 1; + unsigned primitive_restart : 1; + unsigned multi_instances_smaller_than_primgroup : 1; + unsigned uses_instancing : 1; + unsigned prim : 4; #endif - } u; - uint32_t index; + } u; + uint32_t index; }; #define SI_NUM_VGT_STAGES_KEY_BITS 6 -#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) +#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) /* The VGT_SHADER_STAGES key used to index the table of precomputed values. * Some fields are set by state-change calls, most are set by draw_vbo. 
*/ union si_vgt_stages_key { - struct { + struct { #if UTIL_ARCH_LITTLE_ENDIAN - unsigned tess:1; - unsigned gs:1; - unsigned ngg_gs_fast_launch:1; - unsigned ngg_passthrough:1; - unsigned ngg:1; /* gfx10+ */ - unsigned streamout:1; /* only used with NGG */ - unsigned _pad:32 - SI_NUM_VGT_STAGES_KEY_BITS; + unsigned tess : 1; + unsigned gs : 1; + unsigned ngg_gs_fast_launch : 1; + unsigned ngg_passthrough : 1; + unsigned ngg : 1; /* gfx10+ */ + unsigned streamout : 1; /* only used with NGG */ + unsigned _pad : 32 - SI_NUM_VGT_STAGES_KEY_BITS; #else /* UTIL_ARCH_BIG_ENDIAN */ - unsigned _pad:32 - SI_NUM_VGT_STAGES_KEY_BITS; - unsigned streamout:1; - unsigned ngg:1; - unsigned ngg_passthrough:1; - unsigned ngg_gs_fast_launch:1; - unsigned gs:1; - unsigned tess:1; + unsigned _pad : 32 - SI_NUM_VGT_STAGES_KEY_BITS; + unsigned streamout : 1; + unsigned ngg : 1; + unsigned ngg_passthrough : 1; + unsigned ngg_gs_fast_launch : 1; + unsigned gs : 1; + unsigned tess : 1; #endif - } u; - uint32_t index; + } u; + uint32_t index; }; -struct si_texture_handle -{ - unsigned desc_slot; - bool desc_dirty; - struct pipe_sampler_view *view; - struct si_sampler_state sstate; +struct si_texture_handle { + unsigned desc_slot; + bool desc_dirty; + struct pipe_sampler_view *view; + struct si_sampler_state sstate; }; -struct si_image_handle -{ - unsigned desc_slot; - bool desc_dirty; - struct pipe_image_view view; +struct si_image_handle { + unsigned desc_slot; + bool desc_dirty; + struct pipe_image_view view; }; struct si_saved_cs { - struct pipe_reference reference; - struct si_context *ctx; - struct radeon_saved_cs gfx; - struct radeon_saved_cs compute; - struct si_resource *trace_buf; - unsigned trace_id; - - unsigned gfx_last_dw; - unsigned compute_last_dw; - bool flushed; - int64_t time_flush; + struct pipe_reference reference; + struct si_context *ctx; + struct radeon_saved_cs gfx; + struct radeon_saved_cs compute; + struct si_resource *trace_buf; + unsigned trace_id; + + unsigned gfx_last_dw; + unsigned compute_last_dw; + bool flushed; + int64_t time_flush; }; struct si_sdma_upload { - struct si_resource *dst; - struct si_resource *src; - unsigned src_offset; - unsigned dst_offset; - unsigned size; + struct si_resource *dst; + struct si_resource *src; + unsigned src_offset; + unsigned dst_offset; + unsigned size; }; struct si_small_prim_cull_info { - float scale[2], translate[2]; + float scale[2], translate[2]; }; struct si_context { - struct pipe_context b; /* base class */ - - enum radeon_family family; - enum chip_class chip_class; - - struct radeon_winsys *ws; - struct radeon_winsys_ctx *ctx; - struct radeon_cmdbuf *gfx_cs; /* compute IB if graphics is disabled */ - struct radeon_cmdbuf *sdma_cs; - struct pipe_fence_handle *last_gfx_fence; - struct pipe_fence_handle *last_sdma_fence; - struct si_resource *eop_bug_scratch; - struct u_upload_mgr *cached_gtt_allocator; - struct threaded_context *tc; - struct u_suballocator *allocator_zeroed_memory; - struct slab_child_pool pool_transfers; - struct slab_child_pool pool_transfers_unsync; /* for threaded_context */ - struct pipe_device_reset_callback device_reset_callback; - struct u_log_context *log; - void *query_result_shader; - void *sh_query_result_shader; - - void (*emit_cache_flush)(struct si_context *ctx); - - struct blitter_context *blitter; - void *noop_blend; - void *noop_dsa; - void *discard_rasterizer_state; - void *custom_dsa_flush; - void *custom_blend_resolve; - void *custom_blend_fmask_decompress; - void *custom_blend_eliminate_fastclear; - 
void *custom_blend_dcc_decompress; - void *vs_blit_pos; - void *vs_blit_pos_layered; - void *vs_blit_color; - void *vs_blit_color_layered; - void *vs_blit_texcoord; - void *cs_clear_buffer; - void *cs_copy_buffer; - void *cs_copy_image; - void *cs_copy_image_1d_array; - void *cs_clear_render_target; - void *cs_clear_render_target_1d_array; - void *cs_clear_12bytes_buffer; - void *cs_dcc_retile; - void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */ - struct si_screen *screen; - struct pipe_debug_callback debug; - struct ac_llvm_compiler compiler; /* only non-threaded compilation */ - struct si_shader_ctx_state fixed_func_tcs_shader; - /* Offset 0: EOP flush number; Offset 4: GDS prim restart counter */ - struct si_resource *wait_mem_scratch; - unsigned wait_mem_number; - uint16_t prefetch_L2_mask; - - bool has_graphics; - bool gfx_flush_in_progress:1; - bool gfx_last_ib_is_busy:1; - bool compute_is_busy:1; - - unsigned num_gfx_cs_flushes; - unsigned initial_gfx_cs_size; - unsigned last_dirty_tex_counter; - unsigned last_dirty_buf_counter; - unsigned last_compressed_colortex_counter; - unsigned last_num_draw_calls; - unsigned flags; /* flush flags */ - /* Current unaccounted memory usage. */ - uint64_t vram; - uint64_t gtt; - - /* Compute-based primitive discard. */ - unsigned prim_discard_vertex_count_threshold; - struct pb_buffer *gds; - struct pb_buffer *gds_oa; - struct radeon_cmdbuf *prim_discard_compute_cs; - unsigned compute_gds_offset; - struct si_shader *compute_ib_last_shader; - uint32_t compute_rewind_va; - unsigned compute_num_prims_in_batch; - bool preserve_prim_restart_gds_at_flush; - /* index_ring is divided into 2 halves for doublebuffering. */ - struct si_resource *index_ring; - unsigned index_ring_base; /* offset of a per-IB portion */ - unsigned index_ring_offset; /* offset within a per-IB portion */ - unsigned index_ring_size_per_ib; /* max available size per IB */ - bool prim_discard_compute_ib_initialized; - /* For tracking the last execution barrier - it can be either - * a WRITE_DATA packet or a fence. */ - uint32_t *last_pkt3_write_data; - struct si_resource *barrier_buf; - unsigned barrier_buf_offset; - struct pipe_fence_handle *last_ib_barrier_fence; - struct si_resource *last_ib_barrier_buf; - unsigned last_ib_barrier_buf_offset; - - /* Atoms (direct states). */ - union si_state_atoms atoms; - unsigned dirty_atoms; /* mask */ - /* PM4 states (precomputed immutable states) */ - unsigned dirty_states; - union si_state queued; - union si_state emitted; - - /* Atom declarations. */ - struct si_framebuffer framebuffer; - unsigned sample_locs_num_samples; - uint16_t sample_mask; - unsigned last_cb_target_mask; - struct si_blend_color blend_color; - struct si_clip_state clip_state; - struct si_shader_data shader_pointers; - struct si_stencil_ref stencil_ref; - struct pipe_scissor_state scissors[SI_MAX_VIEWPORTS]; - struct si_streamout streamout; - struct si_viewports viewports; - unsigned num_window_rectangles; - bool window_rectangles_include; - struct pipe_scissor_state window_rectangles[4]; - - /* Precomputed states. 
*/ - struct si_pm4_state *init_config; - struct si_pm4_state *init_config_gs_rings; - bool init_config_has_vgt_flush; - struct si_pm4_state *vgt_shader_config[SI_NUM_VGT_STAGES_STATES]; - - /* shaders */ - struct si_shader_ctx_state ps_shader; - struct si_shader_ctx_state gs_shader; - struct si_shader_ctx_state vs_shader; - struct si_shader_ctx_state tcs_shader; - struct si_shader_ctx_state tes_shader; - struct si_shader_ctx_state cs_prim_discard_state; - struct si_cs_shader_state cs_shader_state; - - /* shader information */ - struct si_vertex_elements *vertex_elements; - unsigned num_vertex_elements; - unsigned sprite_coord_enable; - unsigned cs_max_waves_per_sh; - bool flatshade; - bool do_update_shaders; - - /* shader descriptors */ - struct si_descriptors descriptors[SI_NUM_DESCS]; - unsigned descriptors_dirty; - unsigned shader_pointers_dirty; - unsigned shader_needs_decompress_mask; - struct si_buffer_resources rw_buffers; - struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS]; - struct si_samplers samplers[SI_NUM_SHADERS]; - struct si_images images[SI_NUM_SHADERS]; - bool bo_list_add_all_resident_resources; - bool bo_list_add_all_gfx_resources; - bool bo_list_add_all_compute_resources; - - /* other shader resources */ - struct pipe_constant_buffer null_const_buf; /* used for set_constant_buffer(NULL) on GFX7 */ - struct pipe_resource *esgs_ring; - struct pipe_resource *gsvs_ring; - struct pipe_resource *tess_rings; - union pipe_color_union *border_color_table; /* in CPU memory, any endian */ - struct si_resource *border_color_buffer; - union pipe_color_union *border_color_map; /* in VRAM (slow access), little endian */ - unsigned border_color_count; - unsigned num_vs_blit_sgprs; - uint32_t vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD]; - uint32_t cs_user_data[4]; - - /* Vertex buffers. */ - bool vertex_buffers_dirty; - bool vertex_buffer_pointer_dirty; - bool vertex_buffer_user_sgprs_dirty; - struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS]; - uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */ - uint32_t *vb_descriptors_gpu_list; - struct si_resource *vb_descriptors_buffer; - unsigned vb_descriptors_offset; - unsigned vb_descriptor_user_sgprs[5*4]; - - /* MSAA config state. */ - int ps_iter_samples; - bool ps_uses_fbfetch; - bool smoothing_enabled; - - /* DB render state. */ - unsigned ps_db_shader_control; - unsigned dbcb_copy_sample; - bool dbcb_depth_copy_enabled:1; - bool dbcb_stencil_copy_enabled:1; - bool db_flush_depth_inplace:1; - bool db_flush_stencil_inplace:1; - bool db_depth_clear:1; - bool db_depth_disable_expclear:1; - bool db_stencil_clear:1; - bool db_stencil_disable_expclear:1; - bool occlusion_queries_disabled:1; - bool generate_mipmap_for_depth:1; - - /* Emitted draw state. 
*/ - bool gs_tri_strip_adj_fix:1; - bool ls_vgpr_fix:1; - bool prim_discard_cs_instancing:1; - bool ngg:1; - uint8_t ngg_culling; - int last_index_size; - int last_base_vertex; - int last_start_instance; - int last_instance_count; - int last_drawid; - int last_sh_base_reg; - int last_primitive_restart_en; - int last_restart_index; - int last_prim; - int last_multi_vgt_param; - int last_gs_out_prim; - int last_binning_enabled; - unsigned current_vs_state; - unsigned last_vs_state; - enum pipe_prim_type current_rast_prim; /* primitive type after TES, GS */ - - struct si_small_prim_cull_info last_small_prim_cull_info; - struct si_resource *small_prim_cull_info_buf; - uint64_t small_prim_cull_info_address; - bool small_prim_cull_info_dirty; - - /* Scratch buffer */ - struct si_resource *scratch_buffer; - unsigned scratch_waves; - unsigned spi_tmpring_size; - unsigned max_seen_scratch_bytes_per_wave; - unsigned max_seen_compute_scratch_bytes_per_wave; - - struct si_resource *compute_scratch_buffer; - - /* Emitted derived tessellation state. */ - /* Local shader (VS), or HS if LS-HS are merged. */ - struct si_shader *last_ls; - struct si_shader_selector *last_tcs; - int last_num_tcs_input_cp; - int last_tes_sh_base; - bool last_tess_uses_primid; - unsigned last_num_patches; - int last_ls_hs_config; - - /* Debug state. */ - bool is_debug; - struct si_saved_cs *current_saved_cs; - uint64_t dmesg_timestamp; - unsigned apitrace_call_number; - - /* Other state */ - bool need_check_render_feedback; - bool decompression_enabled; - bool dpbb_force_off; - bool vs_writes_viewport_index; - bool vs_disables_clipping_viewport; - - /* Precomputed IA_MULTI_VGT_PARAM */ - union si_vgt_param_key ia_multi_vgt_param_key; - unsigned ia_multi_vgt_param[SI_NUM_VGT_PARAM_STATES]; - - /* Bindless descriptors. */ - struct si_descriptors bindless_descriptors; - struct util_idalloc bindless_used_slots; - unsigned num_bindless_descriptors; - bool bindless_descriptors_dirty; - bool graphics_bindless_pointer_dirty; - bool compute_bindless_pointer_dirty; - - /* Allocated bindless handles */ - struct hash_table *tex_handles; - struct hash_table *img_handles; - - /* Resident bindless handles */ - struct util_dynarray resident_tex_handles; - struct util_dynarray resident_img_handles; - - /* Resident bindless handles which need decompression */ - struct util_dynarray resident_tex_needs_color_decompress; - struct util_dynarray resident_img_needs_color_decompress; - struct util_dynarray resident_tex_needs_depth_decompress; - - /* Bindless state */ - bool uses_bindless_samplers; - bool uses_bindless_images; - - /* MSAA sample locations. - * The first index is the sample index. - * The second index is the coordinate: X, Y. */ - struct { - float x1[1][2]; - float x2[2][2]; - float x4[4][2]; - float x8[8][2]; - float x16[16][2]; - } sample_positions; - struct pipe_resource *sample_pos_buffer; - - /* Misc stats. 
*/ - unsigned num_draw_calls; - unsigned num_decompress_calls; - unsigned num_mrt_draw_calls; - unsigned num_prim_restart_calls; - unsigned num_spill_draw_calls; - unsigned num_compute_calls; - unsigned num_spill_compute_calls; - unsigned num_dma_calls; - unsigned num_cp_dma_calls; - unsigned num_vs_flushes; - unsigned num_ps_flushes; - unsigned num_cs_flushes; - unsigned num_cb_cache_flushes; - unsigned num_db_cache_flushes; - unsigned num_L2_invalidates; - unsigned num_L2_writebacks; - unsigned num_resident_handles; - uint64_t num_alloc_tex_transfer_bytes; - unsigned last_tex_ps_draw_ratio; /* for query */ - unsigned compute_num_verts_accepted; - unsigned compute_num_verts_rejected; - unsigned compute_num_verts_ineligible; /* due to low vertex count */ - unsigned context_roll; - - /* Queries. */ - /* Maintain the list of active queries for pausing between IBs. */ - int num_occlusion_queries; - int num_perfect_occlusion_queries; - int num_pipeline_stat_queries; - struct list_head active_queries; - unsigned num_cs_dw_queries_suspend; - - /* Render condition. */ - struct pipe_query *render_cond; - unsigned render_cond_mode; - bool render_cond_invert; - bool render_cond_force_off; /* for u_blitter */ - - /* For uploading data via GTT and copy to VRAM on context flush via SDMA. */ - bool sdma_uploads_in_progress; - struct si_sdma_upload *sdma_uploads; - unsigned num_sdma_uploads; - unsigned max_sdma_uploads; - - /* Shader-based queries. */ - struct list_head shader_query_buffers; - unsigned num_active_shader_queries; - - /* Statistics gathering for the DCC enablement heuristic. It can't be - * in si_texture because si_texture can be shared by multiple - * contexts. This is for back buffers only. We shouldn't get too many - * of those. - * - * X11 DRI3 rotates among a finite set of back buffers. They should - * all fit in this array. If they don't, separate DCC might never be - * enabled by DCC stat gathering. - */ - struct { - struct si_texture *tex; - /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */ - struct pipe_query *ps_stats[3]; - /* If all slots are used and another slot is needed, - * the least recently used slot is evicted based on this. */ - int64_t last_use_timestamp; - bool query_active; - } dcc_stats[5]; - - /* Copy one resource to another using async DMA. 
*/ - void (*dma_copy)(struct pipe_context *ctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dst_x, unsigned dst_y, unsigned dst_z, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box); - - struct si_tracked_regs tracked_regs; + struct pipe_context b; /* base class */ + + enum radeon_family family; + enum chip_class chip_class; + + struct radeon_winsys *ws; + struct radeon_winsys_ctx *ctx; + struct radeon_cmdbuf *gfx_cs; /* compute IB if graphics is disabled */ + struct radeon_cmdbuf *sdma_cs; + struct pipe_fence_handle *last_gfx_fence; + struct pipe_fence_handle *last_sdma_fence; + struct si_resource *eop_bug_scratch; + struct u_upload_mgr *cached_gtt_allocator; + struct threaded_context *tc; + struct u_suballocator *allocator_zeroed_memory; + struct slab_child_pool pool_transfers; + struct slab_child_pool pool_transfers_unsync; /* for threaded_context */ + struct pipe_device_reset_callback device_reset_callback; + struct u_log_context *log; + void *query_result_shader; + void *sh_query_result_shader; + + void (*emit_cache_flush)(struct si_context *ctx); + + struct blitter_context *blitter; + void *noop_blend; + void *noop_dsa; + void *discard_rasterizer_state; + void *custom_dsa_flush; + void *custom_blend_resolve; + void *custom_blend_fmask_decompress; + void *custom_blend_eliminate_fastclear; + void *custom_blend_dcc_decompress; + void *vs_blit_pos; + void *vs_blit_pos_layered; + void *vs_blit_color; + void *vs_blit_color_layered; + void *vs_blit_texcoord; + void *cs_clear_buffer; + void *cs_copy_buffer; + void *cs_copy_image; + void *cs_copy_image_1d_array; + void *cs_clear_render_target; + void *cs_clear_render_target_1d_array; + void *cs_clear_12bytes_buffer; + void *cs_dcc_retile; + void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */ + struct si_screen *screen; + struct pipe_debug_callback debug; + struct ac_llvm_compiler compiler; /* only non-threaded compilation */ + struct si_shader_ctx_state fixed_func_tcs_shader; + /* Offset 0: EOP flush number; Offset 4: GDS prim restart counter */ + struct si_resource *wait_mem_scratch; + unsigned wait_mem_number; + uint16_t prefetch_L2_mask; + + bool has_graphics; + bool gfx_flush_in_progress : 1; + bool gfx_last_ib_is_busy : 1; + bool compute_is_busy : 1; + + unsigned num_gfx_cs_flushes; + unsigned initial_gfx_cs_size; + unsigned last_dirty_tex_counter; + unsigned last_dirty_buf_counter; + unsigned last_compressed_colortex_counter; + unsigned last_num_draw_calls; + unsigned flags; /* flush flags */ + /* Current unaccounted memory usage. */ + uint64_t vram; + uint64_t gtt; + + /* Compute-based primitive discard. */ + unsigned prim_discard_vertex_count_threshold; + struct pb_buffer *gds; + struct pb_buffer *gds_oa; + struct radeon_cmdbuf *prim_discard_compute_cs; + unsigned compute_gds_offset; + struct si_shader *compute_ib_last_shader; + uint32_t compute_rewind_va; + unsigned compute_num_prims_in_batch; + bool preserve_prim_restart_gds_at_flush; + /* index_ring is divided into 2 halves for doublebuffering. */ + struct si_resource *index_ring; + unsigned index_ring_base; /* offset of a per-IB portion */ + unsigned index_ring_offset; /* offset within a per-IB portion */ + unsigned index_ring_size_per_ib; /* max available size per IB */ + bool prim_discard_compute_ib_initialized; + /* For tracking the last execution barrier - it can be either + * a WRITE_DATA packet or a fence. 
*/ + uint32_t *last_pkt3_write_data; + struct si_resource *barrier_buf; + unsigned barrier_buf_offset; + struct pipe_fence_handle *last_ib_barrier_fence; + struct si_resource *last_ib_barrier_buf; + unsigned last_ib_barrier_buf_offset; + + /* Atoms (direct states). */ + union si_state_atoms atoms; + unsigned dirty_atoms; /* mask */ + /* PM4 states (precomputed immutable states) */ + unsigned dirty_states; + union si_state queued; + union si_state emitted; + + /* Atom declarations. */ + struct si_framebuffer framebuffer; + unsigned sample_locs_num_samples; + uint16_t sample_mask; + unsigned last_cb_target_mask; + struct si_blend_color blend_color; + struct si_clip_state clip_state; + struct si_shader_data shader_pointers; + struct si_stencil_ref stencil_ref; + struct pipe_scissor_state scissors[SI_MAX_VIEWPORTS]; + struct si_streamout streamout; + struct si_viewports viewports; + unsigned num_window_rectangles; + bool window_rectangles_include; + struct pipe_scissor_state window_rectangles[4]; + + /* Precomputed states. */ + struct si_pm4_state *init_config; + struct si_pm4_state *init_config_gs_rings; + bool init_config_has_vgt_flush; + struct si_pm4_state *vgt_shader_config[SI_NUM_VGT_STAGES_STATES]; + + /* shaders */ + struct si_shader_ctx_state ps_shader; + struct si_shader_ctx_state gs_shader; + struct si_shader_ctx_state vs_shader; + struct si_shader_ctx_state tcs_shader; + struct si_shader_ctx_state tes_shader; + struct si_shader_ctx_state cs_prim_discard_state; + struct si_cs_shader_state cs_shader_state; + + /* shader information */ + struct si_vertex_elements *vertex_elements; + unsigned num_vertex_elements; + unsigned sprite_coord_enable; + unsigned cs_max_waves_per_sh; + bool flatshade; + bool do_update_shaders; + + /* shader descriptors */ + struct si_descriptors descriptors[SI_NUM_DESCS]; + unsigned descriptors_dirty; + unsigned shader_pointers_dirty; + unsigned shader_needs_decompress_mask; + struct si_buffer_resources rw_buffers; + struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS]; + struct si_samplers samplers[SI_NUM_SHADERS]; + struct si_images images[SI_NUM_SHADERS]; + bool bo_list_add_all_resident_resources; + bool bo_list_add_all_gfx_resources; + bool bo_list_add_all_compute_resources; + + /* other shader resources */ + struct pipe_constant_buffer null_const_buf; /* used for set_constant_buffer(NULL) on GFX7 */ + struct pipe_resource *esgs_ring; + struct pipe_resource *gsvs_ring; + struct pipe_resource *tess_rings; + union pipe_color_union *border_color_table; /* in CPU memory, any endian */ + struct si_resource *border_color_buffer; + union pipe_color_union *border_color_map; /* in VRAM (slow access), little endian */ + unsigned border_color_count; + unsigned num_vs_blit_sgprs; + uint32_t vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD]; + uint32_t cs_user_data[4]; + + /* Vertex buffers. */ + bool vertex_buffers_dirty; + bool vertex_buffer_pointer_dirty; + bool vertex_buffer_user_sgprs_dirty; + struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS]; + uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */ + uint32_t *vb_descriptors_gpu_list; + struct si_resource *vb_descriptors_buffer; + unsigned vb_descriptors_offset; + unsigned vb_descriptor_user_sgprs[5 * 4]; + + /* MSAA config state. */ + int ps_iter_samples; + bool ps_uses_fbfetch; + bool smoothing_enabled; + + /* DB render state. 
*/ + unsigned ps_db_shader_control; + unsigned dbcb_copy_sample; + bool dbcb_depth_copy_enabled : 1; + bool dbcb_stencil_copy_enabled : 1; + bool db_flush_depth_inplace : 1; + bool db_flush_stencil_inplace : 1; + bool db_depth_clear : 1; + bool db_depth_disable_expclear : 1; + bool db_stencil_clear : 1; + bool db_stencil_disable_expclear : 1; + bool occlusion_queries_disabled : 1; + bool generate_mipmap_for_depth : 1; + + /* Emitted draw state. */ + bool gs_tri_strip_adj_fix : 1; + bool ls_vgpr_fix : 1; + bool prim_discard_cs_instancing : 1; + bool ngg : 1; + uint8_t ngg_culling; + int last_index_size; + int last_base_vertex; + int last_start_instance; + int last_instance_count; + int last_drawid; + int last_sh_base_reg; + int last_primitive_restart_en; + int last_restart_index; + int last_prim; + int last_multi_vgt_param; + int last_gs_out_prim; + int last_binning_enabled; + unsigned current_vs_state; + unsigned last_vs_state; + enum pipe_prim_type current_rast_prim; /* primitive type after TES, GS */ + + struct si_small_prim_cull_info last_small_prim_cull_info; + struct si_resource *small_prim_cull_info_buf; + uint64_t small_prim_cull_info_address; + bool small_prim_cull_info_dirty; + + /* Scratch buffer */ + struct si_resource *scratch_buffer; + unsigned scratch_waves; + unsigned spi_tmpring_size; + unsigned max_seen_scratch_bytes_per_wave; + unsigned max_seen_compute_scratch_bytes_per_wave; + + struct si_resource *compute_scratch_buffer; + + /* Emitted derived tessellation state. */ + /* Local shader (VS), or HS if LS-HS are merged. */ + struct si_shader *last_ls; + struct si_shader_selector *last_tcs; + int last_num_tcs_input_cp; + int last_tes_sh_base; + bool last_tess_uses_primid; + unsigned last_num_patches; + int last_ls_hs_config; + + /* Debug state. */ + bool is_debug; + struct si_saved_cs *current_saved_cs; + uint64_t dmesg_timestamp; + unsigned apitrace_call_number; + + /* Other state */ + bool need_check_render_feedback; + bool decompression_enabled; + bool dpbb_force_off; + bool vs_writes_viewport_index; + bool vs_disables_clipping_viewport; + + /* Precomputed IA_MULTI_VGT_PARAM */ + union si_vgt_param_key ia_multi_vgt_param_key; + unsigned ia_multi_vgt_param[SI_NUM_VGT_PARAM_STATES]; + + /* Bindless descriptors. */ + struct si_descriptors bindless_descriptors; + struct util_idalloc bindless_used_slots; + unsigned num_bindless_descriptors; + bool bindless_descriptors_dirty; + bool graphics_bindless_pointer_dirty; + bool compute_bindless_pointer_dirty; + + /* Allocated bindless handles */ + struct hash_table *tex_handles; + struct hash_table *img_handles; + + /* Resident bindless handles */ + struct util_dynarray resident_tex_handles; + struct util_dynarray resident_img_handles; + + /* Resident bindless handles which need decompression */ + struct util_dynarray resident_tex_needs_color_decompress; + struct util_dynarray resident_img_needs_color_decompress; + struct util_dynarray resident_tex_needs_depth_decompress; + + /* Bindless state */ + bool uses_bindless_samplers; + bool uses_bindless_images; + + /* MSAA sample locations. + * The first index is the sample index. + * The second index is the coordinate: X, Y. */ + struct { + float x1[1][2]; + float x2[2][2]; + float x4[4][2]; + float x8[8][2]; + float x16[16][2]; + } sample_positions; + struct pipe_resource *sample_pos_buffer; + + /* Misc stats. 
*/ + unsigned num_draw_calls; + unsigned num_decompress_calls; + unsigned num_mrt_draw_calls; + unsigned num_prim_restart_calls; + unsigned num_spill_draw_calls; + unsigned num_compute_calls; + unsigned num_spill_compute_calls; + unsigned num_dma_calls; + unsigned num_cp_dma_calls; + unsigned num_vs_flushes; + unsigned num_ps_flushes; + unsigned num_cs_flushes; + unsigned num_cb_cache_flushes; + unsigned num_db_cache_flushes; + unsigned num_L2_invalidates; + unsigned num_L2_writebacks; + unsigned num_resident_handles; + uint64_t num_alloc_tex_transfer_bytes; + unsigned last_tex_ps_draw_ratio; /* for query */ + unsigned compute_num_verts_accepted; + unsigned compute_num_verts_rejected; + unsigned compute_num_verts_ineligible; /* due to low vertex count */ + unsigned context_roll; + + /* Queries. */ + /* Maintain the list of active queries for pausing between IBs. */ + int num_occlusion_queries; + int num_perfect_occlusion_queries; + int num_pipeline_stat_queries; + struct list_head active_queries; + unsigned num_cs_dw_queries_suspend; + + /* Render condition. */ + struct pipe_query *render_cond; + unsigned render_cond_mode; + bool render_cond_invert; + bool render_cond_force_off; /* for u_blitter */ + + /* For uploading data via GTT and copy to VRAM on context flush via SDMA. */ + bool sdma_uploads_in_progress; + struct si_sdma_upload *sdma_uploads; + unsigned num_sdma_uploads; + unsigned max_sdma_uploads; + + /* Shader-based queries. */ + struct list_head shader_query_buffers; + unsigned num_active_shader_queries; + + /* Statistics gathering for the DCC enablement heuristic. It can't be + * in si_texture because si_texture can be shared by multiple + * contexts. This is for back buffers only. We shouldn't get too many + * of those. + * + * X11 DRI3 rotates among a finite set of back buffers. They should + * all fit in this array. If they don't, separate DCC might never be + * enabled by DCC stat gathering. + */ + struct { + struct si_texture *tex; + /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */ + struct pipe_query *ps_stats[3]; + /* If all slots are used and another slot is needed, + * the least recently used slot is evicted based on this. */ + int64_t last_use_timestamp; + bool query_active; + } dcc_stats[5]; + + /* Copy one resource to another using async DMA. 
*/ + void (*dma_copy)(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, + unsigned dst_x, unsigned dst_y, unsigned dst_z, struct pipe_resource *src, + unsigned src_level, const struct pipe_box *src_box); + + struct si_tracked_regs tracked_regs; }; /* cik_sdma.c */ @@ -1269,124 +1263,99 @@ void cik_init_sdma_functions(struct si_context *sctx); /* si_blit.c */ enum si_blitter_op /* bitmask */ { - SI_SAVE_TEXTURES = 1, - SI_SAVE_FRAMEBUFFER = 2, - SI_SAVE_FRAGMENT_STATE = 4, - SI_DISABLE_RENDER_COND = 8, + SI_SAVE_TEXTURES = 1, + SI_SAVE_FRAMEBUFFER = 2, + SI_SAVE_FRAGMENT_STATE = 4, + SI_DISABLE_RENDER_COND = 8, }; void si_blitter_begin(struct si_context *sctx, enum si_blitter_op op); void si_blitter_end(struct si_context *sctx); void si_init_blit_functions(struct si_context *sctx); void si_decompress_textures(struct si_context *sctx, unsigned shader_mask); -void si_decompress_subresource(struct pipe_context *ctx, - struct pipe_resource *tex, - unsigned planes, unsigned level, - unsigned first_layer, unsigned last_layer); -void si_resource_copy_region(struct pipe_context *ctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box); +void si_decompress_subresource(struct pipe_context *ctx, struct pipe_resource *tex, unsigned planes, + unsigned level, unsigned first_layer, unsigned last_layer); +void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst, + unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, unsigned src_level, + const struct pipe_box *src_box); void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex); /* si_buffer.c */ -bool si_rings_is_buffer_referenced(struct si_context *sctx, - struct pb_buffer *buf, - enum radeon_bo_usage usage); -void *si_buffer_map_sync_with_rings(struct si_context *sctx, - struct si_resource *resource, - unsigned usage); -void si_init_resource_fields(struct si_screen *sscreen, - struct si_resource *res, - uint64_t size, unsigned alignment); -bool si_alloc_resource(struct si_screen *sscreen, - struct si_resource *res); -struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, - unsigned flags, unsigned usage, - unsigned size, unsigned alignment); -struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, - unsigned flags, unsigned usage, - unsigned size, unsigned alignment); -void si_replace_buffer_storage(struct pipe_context *ctx, - struct pipe_resource *dst, - struct pipe_resource *src); +bool si_rings_is_buffer_referenced(struct si_context *sctx, struct pb_buffer *buf, + enum radeon_bo_usage usage); +void *si_buffer_map_sync_with_rings(struct si_context *sctx, struct si_resource *resource, + unsigned usage); +void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res, uint64_t size, + unsigned alignment); +bool si_alloc_resource(struct si_screen *sscreen, struct si_resource *res); +struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, unsigned flags, + unsigned usage, unsigned size, unsigned alignment); +struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, unsigned flags, + unsigned usage, unsigned size, unsigned alignment); +void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst, + struct pipe_resource *src); void si_init_screen_buffer_functions(struct si_screen *sscreen); void 
si_init_buffer_functions(struct si_context *sctx); /* si_clear.c */ enum pipe_format si_simplify_cb_format(enum pipe_format format); bool vi_alpha_is_on_msb(struct si_screen *sscreen, enum pipe_format format); -bool vi_dcc_clear_level(struct si_context *sctx, - struct si_texture *tex, - unsigned level, unsigned clear_value); +bool vi_dcc_clear_level(struct si_context *sctx, struct si_texture *tex, unsigned level, + unsigned clear_value); void si_init_clear_functions(struct si_context *sctx); /* si_compute_blit.c */ unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher, - enum si_cache_policy cache_policy); -void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, - uint64_t offset, uint64_t size, uint32_t *clear_value, - uint32_t clear_value_size, enum si_coherency coher, - bool force_cpdma); -void si_copy_buffer(struct si_context *sctx, - struct pipe_resource *dst, struct pipe_resource *src, - uint64_t dst_offset, uint64_t src_offset, unsigned size); -void si_compute_copy_image(struct si_context *sctx, - struct pipe_resource *dst, - unsigned dst_level, - struct pipe_resource *src, - unsigned src_level, - unsigned dstx, unsigned dsty, unsigned dstz, - const struct pipe_box *src_box); -void si_compute_clear_render_target(struct pipe_context *ctx, - struct pipe_surface *dstsurf, - const union pipe_color_union *color, - unsigned dstx, unsigned dsty, - unsigned width, unsigned height, - bool render_condition_enabled); + enum si_cache_policy cache_policy); +void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, + uint64_t size, uint32_t *clear_value, uint32_t clear_value_size, + enum si_coherency coher, bool force_cpdma); +void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, + uint64_t dst_offset, uint64_t src_offset, unsigned size); +void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level, + struct pipe_resource *src, unsigned src_level, unsigned dstx, + unsigned dsty, unsigned dstz, const struct pipe_box *src_box); +void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf, + const union pipe_color_union *color, unsigned dstx, + unsigned dsty, unsigned width, unsigned height, + bool render_condition_enabled); void si_retile_dcc(struct si_context *sctx, struct si_texture *tex); void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex); void si_init_compute_blit_functions(struct si_context *sctx); /* si_cp_dma.c */ -#define SI_CPDMA_SKIP_CHECK_CS_SPACE (1 << 0) /* don't call need_cs_space */ -#define SI_CPDMA_SKIP_SYNC_AFTER (1 << 1) /* don't wait for DMA after the copy */ -#define SI_CPDMA_SKIP_SYNC_BEFORE (1 << 2) /* don't wait for DMA before the copy (RAW hazards) */ -#define SI_CPDMA_SKIP_GFX_SYNC (1 << 3) /* don't flush caches and don't wait for PS/CS */ -#define SI_CPDMA_SKIP_BO_LIST_UPDATE (1 << 4) /* don't update the BO list */ -#define SI_CPDMA_SKIP_ALL (SI_CPDMA_SKIP_CHECK_CS_SPACE | \ - SI_CPDMA_SKIP_SYNC_AFTER | \ - SI_CPDMA_SKIP_SYNC_BEFORE | \ - SI_CPDMA_SKIP_GFX_SYNC | \ - SI_CPDMA_SKIP_BO_LIST_UPDATE) +#define SI_CPDMA_SKIP_CHECK_CS_SPACE (1 << 0) /* don't call need_cs_space */ +#define SI_CPDMA_SKIP_SYNC_AFTER (1 << 1) /* don't wait for DMA after the copy */ +#define SI_CPDMA_SKIP_SYNC_BEFORE (1 << 2) /* don't wait for DMA before the copy (RAW hazards) */ +#define SI_CPDMA_SKIP_GFX_SYNC (1 << 3) /* don't flush caches and don't wait for PS/CS */ +#define 
SI_CPDMA_SKIP_BO_LIST_UPDATE (1 << 4) /* don't update the BO list */ +#define SI_CPDMA_SKIP_ALL \ + (SI_CPDMA_SKIP_CHECK_CS_SPACE | SI_CPDMA_SKIP_SYNC_AFTER | SI_CPDMA_SKIP_SYNC_BEFORE | \ + SI_CPDMA_SKIP_GFX_SYNC | SI_CPDMA_SKIP_BO_LIST_UPDATE) void si_cp_dma_wait_for_idle(struct si_context *sctx); void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, - struct pipe_resource *dst, uint64_t offset, - uint64_t size, unsigned value, unsigned user_flags, - enum si_coherency coher, enum si_cache_policy cache_policy); -void si_cp_dma_copy_buffer(struct si_context *sctx, - struct pipe_resource *dst, struct pipe_resource *src, - uint64_t dst_offset, uint64_t src_offset, unsigned size, - unsigned user_flags, enum si_coherency coher, - enum si_cache_policy cache_policy); -void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, - uint64_t offset, unsigned size); + struct pipe_resource *dst, uint64_t offset, uint64_t size, + unsigned value, unsigned user_flags, enum si_coherency coher, + enum si_cache_policy cache_policy); +void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, + struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, + unsigned size, unsigned user_flags, enum si_coherency coher, + enum si_cache_policy cache_policy); +void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset, + unsigned size); void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only); void si_test_gds(struct si_context *sctx); -void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, - unsigned offset, unsigned size, unsigned dst_sel, - unsigned engine, const void *data); -void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, - unsigned dst_sel, struct si_resource *dst, unsigned dst_offset, - unsigned src_sel, struct si_resource *src, unsigned src_offset); +void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset, + unsigned size, unsigned dst_sel, unsigned engine, const void *data); +void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned dst_sel, + struct si_resource *dst, unsigned dst_offset, unsigned src_sel, + struct si_resource *src, unsigned src_offset); /* si_debug.c */ -void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, - struct radeon_saved_cs *saved, bool get_buffer_list); +void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, struct radeon_saved_cs *saved, + bool get_buffer_list); void si_clear_saved_cs(struct radeon_saved_cs *saved); void si_destroy_saved_cs(struct si_saved_cs *scs); void si_auto_log_cs(void *data, struct u_log_context *log); @@ -1394,45 +1363,41 @@ void si_log_hw_flush(struct si_context *sctx); void si_log_draw_state(struct si_context *sctx, struct u_log_context *log); void si_log_compute_state(struct si_context *sctx, struct u_log_context *log); void si_init_debug_functions(struct si_context *sctx); -void si_check_vm_faults(struct si_context *sctx, - struct radeon_saved_cs *saved, enum ring_type ring); +void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, + enum ring_type ring); bool si_replace_shader(unsigned num, struct si_shader_binary *binary); /* si_dma_cs.c */ -void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, - uint64_t offset); -void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, - uint64_t offset, uint64_t size, unsigned clear_value); 
+void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset); +void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, + uint64_t size, unsigned clear_value); void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, - struct pipe_resource *src, uint64_t dst_offset, - uint64_t src_offset, uint64_t size); -void si_need_dma_space(struct si_context *ctx, unsigned num_dw, - struct si_resource *dst, struct si_resource *src); -void si_flush_dma_cs(struct si_context *ctx, unsigned flags, - struct pipe_fence_handle **fence); -void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, - uint64_t offset, uint64_t size, unsigned value); + struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, + uint64_t size); +void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst, + struct si_resource *src); +void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence); +void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset, + uint64_t size, unsigned value); /* si_fence.c */ -void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, - unsigned event, unsigned event_flags, - unsigned dst_sel, unsigned int_sel, unsigned data_sel, - struct si_resource *buf, uint64_t va, - uint32_t new_fence, unsigned query_type); +void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigned event, + unsigned event_flags, unsigned dst_sel, unsigned int_sel, unsigned data_sel, + struct si_resource *buf, uint64_t va, uint32_t new_fence, + unsigned query_type); unsigned si_cp_write_fence_dwords(struct si_screen *screen); -void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, - uint64_t va, uint32_t ref, uint32_t mask, unsigned flags); +void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, uint64_t va, uint32_t ref, + uint32_t mask, unsigned flags); void si_init_fence_functions(struct si_context *ctx); void si_init_screen_fence_functions(struct si_screen *screen); struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx, - struct tc_unflushed_batch_token *tc_token); + struct tc_unflushed_batch_token *tc_token); /* si_get.c */ void si_init_screen_get_functions(struct si_screen *sscreen); /* si_gfx_cs.c */ -void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, - struct pipe_fence_handle **fence); +void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence); void si_allocate_gds(struct si_context *ctx); void si_begin_new_gfx_cs(struct si_context *ctx); void si_need_gfx_cs_space(struct si_context *ctx); @@ -1441,36 +1406,32 @@ void si_unref_sdma_uploads(struct si_context *sctx); /* si_gpu_load.c */ void si_gpu_load_kill_thread(struct si_screen *sscreen); uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type); -unsigned si_end_counter(struct si_screen *sscreen, unsigned type, - uint64_t begin); +unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin); /* si_compute.c */ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs); void si_init_compute_functions(struct si_context *sctx); /* si_compute_prim_discard.c */ -enum si_prim_discard_outcome { - SI_PRIM_DISCARD_ENABLED, - SI_PRIM_DISCARD_DISABLED, - SI_PRIM_DISCARD_DRAW_SPLIT, +enum si_prim_discard_outcome +{ + SI_PRIM_DISCARD_ENABLED, + SI_PRIM_DISCARD_DISABLED, + 
SI_PRIM_DISCARD_DRAW_SPLIT, }; void si_build_prim_discard_compute_shader(struct si_shader_context *ctx); enum si_prim_discard_outcome -si_prepare_prim_discard_or_split_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - bool primitive_restart); +si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info, + bool primitive_restart); void si_compute_signal_gfx(struct si_context *sctx); void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - unsigned index_size, - unsigned base_vertex, - uint64_t input_indexbuf_va, - unsigned input_indexbuf_max_elements); -void si_initialize_prim_discard_tunables(struct si_screen *sscreen, - bool is_aux_context, - unsigned *prim_discard_vertex_count_threshold, - unsigned *index_ring_size_per_ib); + const struct pipe_draw_info *info, unsigned index_size, + unsigned base_vertex, uint64_t input_indexbuf_va, + unsigned input_indexbuf_max_elements); +void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context, + unsigned *prim_discard_vertex_count_threshold, + unsigned *index_ring_size_per_ib); /* si_pipe.c */ void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler); @@ -1487,19 +1448,17 @@ void si_resume_queries(struct si_context *sctx); /* si_shaderlib_tgsi.c */ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, - unsigned num_layers); + unsigned num_layers); void *si_create_fixed_func_tcs(struct si_context *sctx); -void *si_create_dma_compute_shader(struct pipe_context *ctx, - unsigned num_dwords_per_thread, - bool dst_stream_cache_policy, bool is_copy); +void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread, + bool dst_stream_cache_policy, bool is_copy); void *si_create_copy_image_compute_shader(struct pipe_context *ctx); void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx); void *si_clear_render_target_shader(struct pipe_context *ctx); void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx); void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx); void *si_create_dcc_retile_cs(struct pipe_context *ctx); -void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, - bool is_array); +void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, bool is_array); void *si_create_query_result_cs(struct si_context *sctx); void *gfx10_create_sh_query_result_cs(struct si_context *sctx); @@ -1515,370 +1474,317 @@ void si_test_dma_perf(struct si_screen *sscreen); /* si_uvd.c */ struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context, - const struct pipe_video_codec *templ); + const struct pipe_video_codec *templ); struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe, - const struct pipe_video_buffer *tmpl); + const struct pipe_video_buffer *tmpl); /* si_viewport.c */ void si_update_ngg_small_prim_precision(struct si_context *ctx); -void si_get_small_prim_cull_info(struct si_context *sctx, - struct si_small_prim_cull_info *out); +void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out); void si_update_vs_viewport_state(struct si_context *ctx); void si_init_viewport_functions(struct si_context *ctx); /* si_texture.c */ -bool si_prepare_for_dma_blit(struct si_context *sctx, - struct si_texture *dst, - unsigned dst_level, unsigned dstx, - unsigned dsty, unsigned dstz, - struct si_texture 
*src, - unsigned src_level, - const struct pipe_box *src_box); -void si_eliminate_fast_color_clear(struct si_context *sctx, - struct si_texture *tex); -void si_texture_discard_cmask(struct si_screen *sscreen, - struct si_texture *tex); -bool si_init_flushed_depth_texture(struct pipe_context *ctx, - struct pipe_resource *texture); -void si_print_texture_info(struct si_screen *sscreen, - struct si_texture *tex, struct u_log_context *log); +bool si_prepare_for_dma_blit(struct si_context *sctx, struct si_texture *dst, unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, struct si_texture *src, + unsigned src_level, const struct pipe_box *src_box); +void si_eliminate_fast_color_clear(struct si_context *sctx, struct si_texture *tex); +void si_texture_discard_cmask(struct si_screen *sscreen, struct si_texture *tex); +bool si_init_flushed_depth_texture(struct pipe_context *ctx, struct pipe_resource *texture); +void si_print_texture_info(struct si_screen *sscreen, struct si_texture *tex, + struct u_log_context *log); struct pipe_resource *si_texture_create(struct pipe_screen *screen, - const struct pipe_resource *templ); -bool vi_dcc_formats_compatible(struct si_screen *sscreen, - enum pipe_format format1, - enum pipe_format format2); -bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex, - unsigned level, - enum pipe_format view_format); -void vi_disable_dcc_if_incompatible_format(struct si_context *sctx, - struct pipe_resource *tex, - unsigned level, - enum pipe_format view_format); + const struct pipe_resource *templ); +bool vi_dcc_formats_compatible(struct si_screen *sscreen, enum pipe_format format1, + enum pipe_format format2); +bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex, unsigned level, + enum pipe_format view_format); +void vi_disable_dcc_if_incompatible_format(struct si_context *sctx, struct pipe_resource *tex, + unsigned level, enum pipe_format view_format); struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe, - struct pipe_resource *texture, - const struct pipe_surface *templ, - unsigned width0, unsigned height0, - unsigned width, unsigned height); + struct pipe_resource *texture, + const struct pipe_surface *templ, unsigned width0, + unsigned height0, unsigned width, unsigned height); unsigned si_translate_colorswap(enum pipe_format format, bool do_endian_swap); -void vi_separate_dcc_try_enable(struct si_context *sctx, - struct si_texture *tex); -void vi_separate_dcc_start_query(struct si_context *sctx, - struct si_texture *tex); -void vi_separate_dcc_stop_query(struct si_context *sctx, - struct si_texture *tex); -void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, - struct si_texture *tex); -bool si_texture_disable_dcc(struct si_context *sctx, - struct si_texture *tex); +void vi_separate_dcc_try_enable(struct si_context *sctx, struct si_texture *tex); +void vi_separate_dcc_start_query(struct si_context *sctx, struct si_texture *tex); +void vi_separate_dcc_stop_query(struct si_context *sctx, struct si_texture *tex); +void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, struct si_texture *tex); +bool si_texture_disable_dcc(struct si_context *sctx, struct si_texture *tex); void si_init_screen_texture_functions(struct si_screen *sscreen); void si_init_context_texture_functions(struct si_context *sctx); - /* * common helpers */ static inline struct si_resource *si_resource(struct pipe_resource *r) { - return (struct si_resource*)r; + return (struct si_resource *)r; } -static inline void 
-si_resource_reference(struct si_resource **ptr, struct si_resource *res) +static inline void si_resource_reference(struct si_resource **ptr, struct si_resource *res) { - pipe_resource_reference((struct pipe_resource **)ptr, - (struct pipe_resource *)res); + pipe_resource_reference((struct pipe_resource **)ptr, (struct pipe_resource *)res); } -static inline void -si_texture_reference(struct si_texture **ptr, struct si_texture *res) +static inline void si_texture_reference(struct si_texture **ptr, struct si_texture *res) { - pipe_resource_reference((struct pipe_resource **)ptr, &res->buffer.b.b); + pipe_resource_reference((struct pipe_resource **)ptr, &res->buffer.b.b); } static inline void si_shader_selector_reference(struct si_context *sctx, /* sctx can optionally be NULL */ - struct si_shader_selector **dst, - struct si_shader_selector *src) + struct si_shader_selector **dst, struct si_shader_selector *src) { - if (*dst == src) - return; + if (*dst == src) + return; - struct si_screen *sscreen = src ? src->screen : (*dst)->screen; - util_shader_reference(&sctx->b, &sscreen->live_shader_cache, - (void**)dst, src); + struct si_screen *sscreen = src ? src->screen : (*dst)->screen; + util_shader_reference(&sctx->b, &sscreen->live_shader_cache, (void **)dst, src); } -static inline bool -vi_dcc_enabled(struct si_texture *tex, unsigned level) +static inline bool vi_dcc_enabled(struct si_texture *tex, unsigned level) { - return tex->surface.dcc_offset && level < tex->surface.num_dcc_levels; + return tex->surface.dcc_offset && level < tex->surface.num_dcc_levels; } -static inline unsigned -si_tile_mode_index(struct si_texture *tex, unsigned level, bool stencil) +static inline unsigned si_tile_mode_index(struct si_texture *tex, unsigned level, bool stencil) { - if (stencil) - return tex->surface.u.legacy.stencil_tiling_index[level]; - else - return tex->surface.u.legacy.tiling_index[level]; + if (stencil) + return tex->surface.u.legacy.stencil_tiling_index[level]; + else + return tex->surface.u.legacy.tiling_index[level]; } -static inline unsigned -si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx) +static inline unsigned si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx) { - /* Don't count the needed CS space exactly and just use an upper bound. - * - * Also reserve space for stopping queries at the end of IB, because - * the number of active queries is unlimited in theory. - */ - return 2048 + sctx->num_cs_dw_queries_suspend; + /* Don't count the needed CS space exactly and just use an upper bound. + * + * Also reserve space for stopping queries at the end of IB, because + * the number of active queries is unlimited in theory. 
+ */ + return 2048 + sctx->num_cs_dw_queries_suspend; } -static inline void -si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r) +static inline void si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r) { - if (r) { - /* Add memory usage for need_gfx_cs_space */ - sctx->vram += si_resource(r)->vram_usage; - sctx->gtt += si_resource(r)->gart_usage; - } + if (r) { + /* Add memory usage for need_gfx_cs_space */ + sctx->vram += si_resource(r)->vram_usage; + sctx->gtt += si_resource(r)->gart_usage; + } } -static inline void -si_invalidate_draw_sh_constants(struct si_context *sctx) +static inline void si_invalidate_draw_sh_constants(struct si_context *sctx) { - sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN; - sctx->last_instance_count = SI_INSTANCE_COUNT_UNKNOWN; + sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN; + sctx->last_instance_count = SI_INSTANCE_COUNT_UNKNOWN; } -static inline unsigned -si_get_atom_bit(struct si_context *sctx, struct si_atom *atom) +static inline unsigned si_get_atom_bit(struct si_context *sctx, struct si_atom *atom) { - return 1 << (atom - sctx->atoms.array); + return 1 << (atom - sctx->atoms.array); } -static inline void -si_set_atom_dirty(struct si_context *sctx, struct si_atom *atom, bool dirty) +static inline void si_set_atom_dirty(struct si_context *sctx, struct si_atom *atom, bool dirty) { - unsigned bit = si_get_atom_bit(sctx, atom); + unsigned bit = si_get_atom_bit(sctx, atom); - if (dirty) - sctx->dirty_atoms |= bit; - else - sctx->dirty_atoms &= ~bit; + if (dirty) + sctx->dirty_atoms |= bit; + else + sctx->dirty_atoms &= ~bit; } -static inline bool -si_is_atom_dirty(struct si_context *sctx, struct si_atom *atom) +static inline bool si_is_atom_dirty(struct si_context *sctx, struct si_atom *atom) { - return (sctx->dirty_atoms & si_get_atom_bit(sctx, atom)) != 0; + return (sctx->dirty_atoms & si_get_atom_bit(sctx, atom)) != 0; } -static inline void -si_mark_atom_dirty(struct si_context *sctx, struct si_atom *atom) +static inline void si_mark_atom_dirty(struct si_context *sctx, struct si_atom *atom) { - si_set_atom_dirty(sctx, atom, true); + si_set_atom_dirty(sctx, atom, true); } static inline struct si_shader_ctx_state *si_get_vs(struct si_context *sctx) { - if (sctx->gs_shader.cso) - return &sctx->gs_shader; - if (sctx->tes_shader.cso) - return &sctx->tes_shader; + if (sctx->gs_shader.cso) + return &sctx->gs_shader; + if (sctx->tes_shader.cso) + return &sctx->tes_shader; - return &sctx->vs_shader; + return &sctx->vs_shader; } static inline struct si_shader_info *si_get_vs_info(struct si_context *sctx) { - struct si_shader_ctx_state *vs = si_get_vs(sctx); + struct si_shader_ctx_state *vs = si_get_vs(sctx); - return vs->cso ? &vs->cso->info : NULL; + return vs->cso ? &vs->cso->info : NULL; } -static inline struct si_shader* si_get_vs_state(struct si_context *sctx) +static inline struct si_shader *si_get_vs_state(struct si_context *sctx) { - if (sctx->gs_shader.cso && - sctx->gs_shader.current && - !sctx->gs_shader.current->key.as_ngg) - return sctx->gs_shader.cso->gs_copy_shader; + if (sctx->gs_shader.cso && sctx->gs_shader.current && !sctx->gs_shader.current->key.as_ngg) + return sctx->gs_shader.cso->gs_copy_shader; - struct si_shader_ctx_state *vs = si_get_vs(sctx); - return vs->current ? vs->current : NULL; + struct si_shader_ctx_state *vs = si_get_vs(sctx); + return vs->current ? 
vs->current : NULL; } -static inline bool si_can_dump_shader(struct si_screen *sscreen, - unsigned processor) +static inline bool si_can_dump_shader(struct si_screen *sscreen, unsigned processor) { - return sscreen->debug_flags & (1 << processor); + return sscreen->debug_flags & (1 << processor); } static inline bool si_get_strmout_en(struct si_context *sctx) { - return sctx->streamout.streamout_enabled || - sctx->streamout.prims_gen_query_enabled; + return sctx->streamout.streamout_enabled || sctx->streamout.prims_gen_query_enabled; } -static inline unsigned -si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size) +static inline unsigned si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size) { - unsigned alignment, tcc_cache_line_size; - - /* If the upload size is less than the cache line size (e.g. 16, 32), - * the whole thing will fit into a cache line if we align it to its size. - * The idea is that multiple small uploads can share a cache line. - * If the upload size is greater, align it to the cache line size. - */ - alignment = util_next_power_of_two(upload_size); - tcc_cache_line_size = sctx->screen->info.tcc_cache_line_size; - return MIN2(alignment, tcc_cache_line_size); + unsigned alignment, tcc_cache_line_size; + + /* If the upload size is less than the cache line size (e.g. 16, 32), + * the whole thing will fit into a cache line if we align it to its size. + * The idea is that multiple small uploads can share a cache line. + * If the upload size is greater, align it to the cache line size. + */ + alignment = util_next_power_of_two(upload_size); + tcc_cache_line_size = sctx->screen->info.tcc_cache_line_size; + return MIN2(alignment, tcc_cache_line_size); } -static inline void -si_saved_cs_reference(struct si_saved_cs **dst, struct si_saved_cs *src) +static inline void si_saved_cs_reference(struct si_saved_cs **dst, struct si_saved_cs *src) { - if (pipe_reference(&(*dst)->reference, &src->reference)) - si_destroy_saved_cs(*dst); + if (pipe_reference(&(*dst)->reference, &src->reference)) + si_destroy_saved_cs(*dst); - *dst = src; + *dst = src; } -static inline void -si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples, - bool shaders_read_metadata, bool dcc_pipe_aligned) +static inline void si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples, + bool shaders_read_metadata, bool dcc_pipe_aligned) { - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | - SI_CONTEXT_INV_VCACHE; - - if (sctx->chip_class >= GFX10) { - if (sctx->screen->info.tcc_harvested) - sctx->flags |= SI_CONTEXT_INV_L2; - else if (shaders_read_metadata) - sctx->flags |= SI_CONTEXT_INV_L2_METADATA; - } else if (sctx->chip_class == GFX9) { - /* Single-sample color is coherent with shaders on GFX9, but - * L2 metadata must be flushed if shaders read metadata. - * (DCC, CMASK). - */ - if (num_samples >= 2 || - (shaders_read_metadata && !dcc_pipe_aligned)) - sctx->flags |= SI_CONTEXT_INV_L2; - else if (shaders_read_metadata) - sctx->flags |= SI_CONTEXT_INV_L2_METADATA; - } else { - /* GFX6-GFX8 */ - sctx->flags |= SI_CONTEXT_INV_L2; - } + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_INV_VCACHE; + + if (sctx->chip_class >= GFX10) { + if (sctx->screen->info.tcc_harvested) + sctx->flags |= SI_CONTEXT_INV_L2; + else if (shaders_read_metadata) + sctx->flags |= SI_CONTEXT_INV_L2_METADATA; + } else if (sctx->chip_class == GFX9) { + /* Single-sample color is coherent with shaders on GFX9, but + * L2 metadata must be flushed if shaders read metadata. 
+ * (DCC, CMASK). + */ + if (num_samples >= 2 || (shaders_read_metadata && !dcc_pipe_aligned)) + sctx->flags |= SI_CONTEXT_INV_L2; + else if (shaders_read_metadata) + sctx->flags |= SI_CONTEXT_INV_L2_METADATA; + } else { + /* GFX6-GFX8 */ + sctx->flags |= SI_CONTEXT_INV_L2; + } } -static inline void -si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples, - bool include_stencil, bool shaders_read_metadata) +static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples, + bool include_stencil, bool shaders_read_metadata) { - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB | - SI_CONTEXT_INV_VCACHE; - - if (sctx->chip_class >= GFX10) { - if (sctx->screen->info.tcc_harvested) - sctx->flags |= SI_CONTEXT_INV_L2; - else if (shaders_read_metadata) - sctx->flags |= SI_CONTEXT_INV_L2_METADATA; - } else if (sctx->chip_class == GFX9) { - /* Single-sample depth (not stencil) is coherent with shaders - * on GFX9, but L2 metadata must be flushed if shaders read - * metadata. - */ - if (num_samples >= 2 || include_stencil) - sctx->flags |= SI_CONTEXT_INV_L2; - else if (shaders_read_metadata) - sctx->flags |= SI_CONTEXT_INV_L2_METADATA; - } else { - /* GFX6-GFX8 */ - sctx->flags |= SI_CONTEXT_INV_L2; - } + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_INV_VCACHE; + + if (sctx->chip_class >= GFX10) { + if (sctx->screen->info.tcc_harvested) + sctx->flags |= SI_CONTEXT_INV_L2; + else if (shaders_read_metadata) + sctx->flags |= SI_CONTEXT_INV_L2_METADATA; + } else if (sctx->chip_class == GFX9) { + /* Single-sample depth (not stencil) is coherent with shaders + * on GFX9, but L2 metadata must be flushed if shaders read + * metadata. + */ + if (num_samples >= 2 || include_stencil) + sctx->flags |= SI_CONTEXT_INV_L2; + else if (shaders_read_metadata) + sctx->flags |= SI_CONTEXT_INV_L2_METADATA; + } else { + /* GFX6-GFX8 */ + sctx->flags |= SI_CONTEXT_INV_L2; + } } -static inline bool -si_can_sample_zs(struct si_texture *tex, bool stencil_sampler) +static inline bool si_can_sample_zs(struct si_texture *tex, bool stencil_sampler) { - return (stencil_sampler && tex->can_sample_s) || - (!stencil_sampler && tex->can_sample_z); + return (stencil_sampler && tex->can_sample_s) || (!stencil_sampler && tex->can_sample_z); } -static inline bool -si_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask) +static inline bool si_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask) { - if (zs_mask == PIPE_MASK_S && tex->htile_stencil_disabled) - return false; + if (zs_mask == PIPE_MASK_S && tex->htile_stencil_disabled) + return false; - return tex->surface.htile_offset && level == 0; + return tex->surface.htile_offset && level == 0; } -static inline bool -vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level, unsigned zs_mask) +static inline bool vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level, + unsigned zs_mask) { - assert(!tex->tc_compatible_htile || tex->surface.htile_offset); - return tex->tc_compatible_htile && si_htile_enabled(tex, level, zs_mask); + assert(!tex->tc_compatible_htile || tex->surface.htile_offset); + return tex->tc_compatible_htile && si_htile_enabled(tex, level, zs_mask); } static inline unsigned si_get_ps_iter_samples(struct si_context *sctx) { - if (sctx->ps_uses_fbfetch) - return sctx->framebuffer.nr_color_samples; + if (sctx->ps_uses_fbfetch) + return sctx->framebuffer.nr_color_samples; - return MIN2(sctx->ps_iter_samples, sctx->framebuffer.nr_color_samples); + return 
MIN2(sctx->ps_iter_samples, sctx->framebuffer.nr_color_samples); } static inline unsigned si_get_total_colormask(struct si_context *sctx) { - if (sctx->queued.named.rasterizer->rasterizer_discard) - return 0; + if (sctx->queued.named.rasterizer->rasterizer_discard) + return 0; - struct si_shader_selector *ps = sctx->ps_shader.cso; - if (!ps) - return 0; + struct si_shader_selector *ps = sctx->ps_shader.cso; + if (!ps) + return 0; - unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit & - sctx->queued.named.blend->cb_target_mask; + unsigned colormask = + sctx->framebuffer.colorbuf_enabled_4bit & sctx->queued.named.blend->cb_target_mask; - if (!ps->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) - colormask &= ps->colors_written_4bit; - else if (!ps->colors_written_4bit) - colormask = 0; /* color0 writes all cbufs, but it's not written */ + if (!ps->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) + colormask &= ps->colors_written_4bit; + else if (!ps->colors_written_4bit) + colormask = 0; /* color0 writes all cbufs, but it's not written */ - return colormask; + return colormask; } -#define UTIL_ALL_PRIM_LINE_MODES ((1 << PIPE_PRIM_LINES) | \ - (1 << PIPE_PRIM_LINE_LOOP) | \ - (1 << PIPE_PRIM_LINE_STRIP) | \ - (1 << PIPE_PRIM_LINES_ADJACENCY) | \ - (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY)) +#define UTIL_ALL_PRIM_LINE_MODES \ + ((1 << PIPE_PRIM_LINES) | (1 << PIPE_PRIM_LINE_LOOP) | (1 << PIPE_PRIM_LINE_STRIP) | \ + (1 << PIPE_PRIM_LINES_ADJACENCY) | (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY)) static inline bool util_prim_is_lines(unsigned prim) { - return ((1 << prim) & UTIL_ALL_PRIM_LINE_MODES) != 0; + return ((1 << prim) & UTIL_ALL_PRIM_LINE_MODES) != 0; } static inline bool util_prim_is_points_or_lines(unsigned prim) { - return ((1 << prim) & (UTIL_ALL_PRIM_LINE_MODES | - (1 << PIPE_PRIM_POINTS))) != 0; + return ((1 << prim) & (UTIL_ALL_PRIM_LINE_MODES | (1 << PIPE_PRIM_POINTS))) != 0; } static inline bool util_rast_prim_is_triangles(unsigned prim) { - return ((1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | - (1 << PIPE_PRIM_TRIANGLE_STRIP) | - (1 << PIPE_PRIM_TRIANGLE_FAN) | - (1 << PIPE_PRIM_QUADS) | - (1 << PIPE_PRIM_QUAD_STRIP) | - (1 << PIPE_PRIM_POLYGON) | - (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) | - (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY))); + return ((1 << prim) & + ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) | + (1 << PIPE_PRIM_TRIANGLE_FAN) | (1 << PIPE_PRIM_QUADS) | (1 << PIPE_PRIM_QUAD_STRIP) | + (1 << PIPE_PRIM_POLYGON) | (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) | + (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY))); } /** @@ -1888,20 +1794,18 @@ static inline bool util_rast_prim_is_triangles(unsigned prim) * \param vram VRAM memory size not added to the buffer list yet * \param gtt GTT memory size not added to the buffer list yet */ -static inline bool -radeon_cs_memory_below_limit(struct si_screen *screen, - struct radeon_cmdbuf *cs, - uint64_t vram, uint64_t gtt) +static inline bool radeon_cs_memory_below_limit(struct si_screen *screen, struct radeon_cmdbuf *cs, + uint64_t vram, uint64_t gtt) { - vram += cs->used_vram; - gtt += cs->used_gart; + vram += cs->used_vram; + gtt += cs->used_gart; - /* Anything that goes above the VRAM size should go to GTT. */ - if (vram > screen->info.vram_size) - gtt += vram - screen->info.vram_size; + /* Anything that goes above the VRAM size should go to GTT. */ + if (vram > screen->info.vram_size) + gtt += vram - screen->info.vram_size; - /* Now we just need to check if we have enough GTT. 
*/ - return gtt < screen->info.gart_size * 0.7; + /* Now we just need to check if we have enough GTT. */ + return gtt < screen->info.gart_size * 0.7; } /** @@ -1914,17 +1818,13 @@ radeon_cs_memory_below_limit(struct si_screen *screen, * The buffer list becomes empty after every context flush and must be * rebuilt. */ -static inline void radeon_add_to_buffer_list(struct si_context *sctx, - struct radeon_cmdbuf *cs, - struct si_resource *bo, - enum radeon_bo_usage usage, - enum radeon_bo_priority priority) +static inline void radeon_add_to_buffer_list(struct si_context *sctx, struct radeon_cmdbuf *cs, + struct si_resource *bo, enum radeon_bo_usage usage, + enum radeon_bo_priority priority) { - assert(usage); - sctx->ws->cs_add_buffer( - cs, bo->buf, - (enum radeon_bo_usage)(usage | RADEON_USAGE_SYNCHRONIZED), - bo->domains, priority); + assert(usage); + sctx->ws->cs_add_buffer(cs, bo->buf, (enum radeon_bo_usage)(usage | RADEON_USAGE_SYNCHRONIZED), + bo->domains, priority); } /** @@ -1944,52 +1844,49 @@ static inline void radeon_add_to_buffer_list(struct si_context *sctx, * - if shader resource "enabled_mask" is not up-to-date or there is * a different constraint disallowing a context flush */ -static inline void -radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx, - struct si_resource *bo, - enum radeon_bo_usage usage, - enum radeon_bo_priority priority, - bool check_mem) +static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx, + struct si_resource *bo, + enum radeon_bo_usage usage, + enum radeon_bo_priority priority, + bool check_mem) { - if (check_mem && - !radeon_cs_memory_below_limit(sctx->screen, sctx->gfx_cs, - sctx->vram + bo->vram_usage, - sctx->gtt + bo->gart_usage)) - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + if (check_mem && + !radeon_cs_memory_below_limit(sctx->screen, sctx->gfx_cs, sctx->vram + bo->vram_usage, + sctx->gtt + bo->gart_usage)) + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, bo, usage, priority); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, bo, usage, priority); } static inline bool si_compute_prim_discard_enabled(struct si_context *sctx) { - return sctx->prim_discard_vertex_count_threshold != UINT_MAX; + return sctx->prim_discard_vertex_count_threshold != UINT_MAX; } static inline unsigned si_get_wave_size(struct si_screen *sscreen, - enum pipe_shader_type shader_type, - bool ngg, bool es, bool prim_discard_cs) + enum pipe_shader_type shader_type, bool ngg, bool es, + bool prim_discard_cs) { - if (shader_type == PIPE_SHADER_COMPUTE) - return sscreen->compute_wave_size; - else if (shader_type == PIPE_SHADER_FRAGMENT) - return sscreen->ps_wave_size; - else if ((shader_type == PIPE_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */ - (shader_type == PIPE_SHADER_VERTEX && es && !ngg) || - (shader_type == PIPE_SHADER_TESS_EVAL && es && !ngg) || - (shader_type == PIPE_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */ - return 64; - else - return sscreen->ge_wave_size; + if (shader_type == PIPE_SHADER_COMPUTE) + return sscreen->compute_wave_size; + else if (shader_type == PIPE_SHADER_FRAGMENT) + return sscreen->ps_wave_size; + else if ((shader_type == PIPE_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */ + (shader_type == PIPE_SHADER_VERTEX && es && !ngg) || + (shader_type == PIPE_SHADER_TESS_EVAL && es && !ngg) || + (shader_type == PIPE_SHADER_GEOMETRY && !ngg)) /* legacy GS only 
supports Wave64 */ + return 64; + else + return sscreen->ge_wave_size; } static inline unsigned si_get_shader_wave_size(struct si_shader *shader) { - return si_get_wave_size(shader->selector->screen, shader->selector->type, - shader->key.as_ngg, shader->key.as_es, - shader->key.opt.vs_as_prim_discard_cs); + return si_get_wave_size(shader->selector->screen, shader->selector->type, shader->key.as_ngg, + shader->key.as_es, shader->key.opt.vs_as_prim_discard_cs); } -#define PRINT_ERR(fmt, args...) \ - fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) +#define PRINT_ERR(fmt, args...) \ + fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) #endif diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c index 0b7d53e745d..9b63ba69973 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.c +++ b/src/gallium/drivers/radeonsi/si_pm4.c @@ -22,170 +22,159 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "util/u_memory.h" #include "si_pipe.h" #include "sid.h" +#include "util/u_memory.h" void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode) { - state->last_opcode = opcode; - state->last_pm4 = state->ndw++; + state->last_opcode = opcode; + state->last_pm4 = state->ndw++; } void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw) { - state->pm4[state->ndw++] = dw; + state->pm4[state->ndw++] = dw; } void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate) { - unsigned count; - count = state->ndw - state->last_pm4 - 2; - state->pm4[state->last_pm4] = - PKT3(state->last_opcode, count, predicate); + unsigned count; + count = state->ndw - state->last_pm4 - 2; + state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate); - assert(state->ndw <= SI_PM4_MAX_DW); + assert(state->ndw <= SI_PM4_MAX_DW); } void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val) { - unsigned opcode; + unsigned opcode; - if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) { - opcode = PKT3_SET_CONFIG_REG; - reg -= SI_CONFIG_REG_OFFSET; + if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) { + opcode = PKT3_SET_CONFIG_REG; + reg -= SI_CONFIG_REG_OFFSET; - } else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) { - opcode = PKT3_SET_SH_REG; - reg -= SI_SH_REG_OFFSET; + } else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) { + opcode = PKT3_SET_SH_REG; + reg -= SI_SH_REG_OFFSET; - } else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) { - opcode = PKT3_SET_CONTEXT_REG; - reg -= SI_CONTEXT_REG_OFFSET; + } else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) { + opcode = PKT3_SET_CONTEXT_REG; + reg -= SI_CONTEXT_REG_OFFSET; - } else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) { - opcode = PKT3_SET_UCONFIG_REG; - reg -= CIK_UCONFIG_REG_OFFSET; + } else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) { + opcode = PKT3_SET_UCONFIG_REG; + reg -= CIK_UCONFIG_REG_OFFSET; - } else { - PRINT_ERR("Invalid register offset %08x!\n", reg); - return; - } + } else { + PRINT_ERR("Invalid register offset %08x!\n", reg); + return; + } - reg >>= 2; + reg >>= 2; - if (opcode != state->last_opcode || reg != (state->last_reg + 1)) { - si_pm4_cmd_begin(state, opcode); - si_pm4_cmd_add(state, reg); - } + if (opcode != state->last_opcode || reg != (state->last_reg + 1)) { + si_pm4_cmd_begin(state, opcode); + si_pm4_cmd_add(state, reg); + } - state->last_reg = reg; - si_pm4_cmd_add(state, val); - si_pm4_cmd_end(state, false); + 
state->last_reg = reg; + si_pm4_cmd_add(state, val); + si_pm4_cmd_end(state, false); } -void si_pm4_add_bo(struct si_pm4_state *state, - struct si_resource *bo, - enum radeon_bo_usage usage, - enum radeon_bo_priority priority) +void si_pm4_add_bo(struct si_pm4_state *state, struct si_resource *bo, enum radeon_bo_usage usage, + enum radeon_bo_priority priority) { - unsigned idx = state->nbo++; - assert(idx < SI_PM4_MAX_BO); + unsigned idx = state->nbo++; + assert(idx < SI_PM4_MAX_BO); - si_resource_reference(&state->bo[idx], bo); - state->bo_usage[idx] = usage; - state->bo_priority[idx] = priority; + si_resource_reference(&state->bo[idx], bo); + state->bo_usage[idx] = usage; + state->bo_priority[idx] = priority; } void si_pm4_clear_state(struct si_pm4_state *state) { - for (int i = 0; i < state->nbo; ++i) - si_resource_reference(&state->bo[i], NULL); - si_resource_reference(&state->indirect_buffer, NULL); - state->nbo = 0; - state->ndw = 0; + for (int i = 0; i < state->nbo; ++i) + si_resource_reference(&state->bo[i], NULL); + si_resource_reference(&state->indirect_buffer, NULL); + state->nbo = 0; + state->ndw = 0; } -void si_pm4_free_state(struct si_context *sctx, - struct si_pm4_state *state, - unsigned idx) +void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx) { - if (!state) - return; + if (!state) + return; - if (idx != ~0 && sctx->emitted.array[idx] == state) { - sctx->emitted.array[idx] = NULL; - } + if (idx != ~0 && sctx->emitted.array[idx] == state) { + sctx->emitted.array[idx] = NULL; + } - si_pm4_clear_state(state); - FREE(state); + si_pm4_clear_state(state); + FREE(state); } void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - for (int i = 0; i < state->nbo; ++i) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, state->bo[i], - state->bo_usage[i], state->bo_priority[i]); - } - - if (!state->indirect_buffer) { - radeon_emit_array(cs, state->pm4, state->ndw); - } else { - struct si_resource *ib = state->indirect_buffer; - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, ib, - RADEON_USAGE_READ, - RADEON_PRIO_IB2); - - radeon_emit(cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0)); - radeon_emit(cs, ib->gpu_address); - radeon_emit(cs, ib->gpu_address >> 32); - radeon_emit(cs, (ib->b.b.width0 >> 2) & 0xfffff); - } - - if (state->atom.emit) - state->atom.emit(sctx); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + for (int i = 0; i < state->nbo; ++i) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, state->bo[i], state->bo_usage[i], + state->bo_priority[i]); + } + + if (!state->indirect_buffer) { + radeon_emit_array(cs, state->pm4, state->ndw); + } else { + struct si_resource *ib = state->indirect_buffer; + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, ib, RADEON_USAGE_READ, RADEON_PRIO_IB2); + + radeon_emit(cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0)); + radeon_emit(cs, ib->gpu_address); + radeon_emit(cs, ib->gpu_address >> 32); + radeon_emit(cs, (ib->b.b.width0 >> 2) & 0xfffff); + } + + if (state->atom.emit) + state->atom.emit(sctx); } void si_pm4_reset_emitted(struct si_context *sctx) { - memset(&sctx->emitted, 0, sizeof(sctx->emitted)); - sctx->dirty_states |= u_bit_consecutive(0, SI_NUM_STATES); + memset(&sctx->emitted, 0, sizeof(sctx->emitted)); + sctx->dirty_states |= u_bit_consecutive(0, SI_NUM_STATES); } -void si_pm4_upload_indirect_buffer(struct si_context *sctx, - struct si_pm4_state *state) +void si_pm4_upload_indirect_buffer(struct si_context *sctx, struct si_pm4_state *state) { - 
struct pipe_screen *screen = sctx->b.screen; - unsigned aligned_ndw = align(state->ndw, 8); - - /* only supported on GFX7 and later */ - if (sctx->chip_class < GFX7) - return; - - assert(state->ndw); - assert(aligned_ndw <= SI_PM4_MAX_DW); - - si_resource_reference(&state->indirect_buffer, NULL); - /* TODO: this hangs with 1024 or higher alignment on GFX9. */ - state->indirect_buffer = - si_aligned_buffer_create(screen, 0, - PIPE_USAGE_DEFAULT, aligned_ndw * 4, - 256); - if (!state->indirect_buffer) - return; - - /* Pad the IB to 8 DWs to meet CP fetch alignment requirements. */ - if (sctx->screen->info.gfx_ib_pad_with_type2) { - for (int i = state->ndw; i < aligned_ndw; i++) - state->pm4[i] = 0x80000000; /* type2 nop packet */ - } else { - for (int i = state->ndw; i < aligned_ndw; i++) - state->pm4[i] = 0xffff1000; /* type3 nop packet */ - } - - pipe_buffer_write(&sctx->b, &state->indirect_buffer->b.b, - 0, aligned_ndw *4, state->pm4); + struct pipe_screen *screen = sctx->b.screen; + unsigned aligned_ndw = align(state->ndw, 8); + + /* only supported on GFX7 and later */ + if (sctx->chip_class < GFX7) + return; + + assert(state->ndw); + assert(aligned_ndw <= SI_PM4_MAX_DW); + + si_resource_reference(&state->indirect_buffer, NULL); + /* TODO: this hangs with 1024 or higher alignment on GFX9. */ + state->indirect_buffer = + si_aligned_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, aligned_ndw * 4, 256); + if (!state->indirect_buffer) + return; + + /* Pad the IB to 8 DWs to meet CP fetch alignment requirements. */ + if (sctx->screen->info.gfx_ib_pad_with_type2) { + for (int i = state->ndw; i < aligned_ndw; i++) + state->pm4[i] = 0x80000000; /* type2 nop packet */ + } else { + for (int i = state->ndw; i < aligned_ndw; i++) + state->pm4[i] = 0xffff1000; /* type3 nop packet */ + } + + pipe_buffer_write(&sctx->b, &state->indirect_buffer->b.b, 0, aligned_ndw * 4, state->pm4); } diff --git a/src/gallium/drivers/radeonsi/si_pm4.h b/src/gallium/drivers/radeonsi/si_pm4.h index c91a90bc638..783833e5a42 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.h +++ b/src/gallium/drivers/radeonsi/si_pm4.h @@ -27,8 +27,8 @@ #include "radeon/radeon_winsys.h" -#define SI_PM4_MAX_DW 176 -#define SI_PM4_MAX_BO 3 +#define SI_PM4_MAX_DW 176 +#define SI_PM4_MAX_BO 3 // forward defines struct si_context; @@ -37,32 +37,31 @@ struct si_context; * command buffer (AKA indirect buffer, AKA IB, AKA command stream, AKA CS). 
*/ struct si_atom { - void (*emit)(struct si_context *ctx); + void (*emit)(struct si_context *ctx); }; -struct si_pm4_state -{ - /* optional indirect buffer */ - struct si_resource *indirect_buffer; +struct si_pm4_state { + /* optional indirect buffer */ + struct si_resource *indirect_buffer; - /* PKT3_SET_*_REG handling */ - unsigned last_opcode; - unsigned last_reg; - unsigned last_pm4; + /* PKT3_SET_*_REG handling */ + unsigned last_opcode; + unsigned last_reg; + unsigned last_pm4; - /* commands for the DE */ - unsigned ndw; - uint32_t pm4[SI_PM4_MAX_DW]; + /* commands for the DE */ + unsigned ndw; + uint32_t pm4[SI_PM4_MAX_DW]; - /* BO's referenced by this state */ - unsigned nbo; - struct si_resource *bo[SI_PM4_MAX_BO]; - enum radeon_bo_usage bo_usage[SI_PM4_MAX_BO]; - enum radeon_bo_priority bo_priority[SI_PM4_MAX_BO]; + /* BO's referenced by this state */ + unsigned nbo; + struct si_resource *bo[SI_PM4_MAX_BO]; + enum radeon_bo_usage bo_usage[SI_PM4_MAX_BO]; + enum radeon_bo_priority bo_priority[SI_PM4_MAX_BO]; - /* For shader states only */ - struct si_shader *shader; - struct si_atom atom; + /* For shader states only */ + struct si_shader *shader; + struct si_atom atom; }; void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode); @@ -70,17 +69,12 @@ void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw); void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate); void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val); -void si_pm4_add_bo(struct si_pm4_state *state, - struct si_resource *bo, - enum radeon_bo_usage usage, - enum radeon_bo_priority priority); -void si_pm4_upload_indirect_buffer(struct si_context *sctx, - struct si_pm4_state *state); +void si_pm4_add_bo(struct si_pm4_state *state, struct si_resource *bo, enum radeon_bo_usage usage, + enum radeon_bo_priority priority); +void si_pm4_upload_indirect_buffer(struct si_context *sctx, struct si_pm4_state *state); void si_pm4_clear_state(struct si_pm4_state *state); -void si_pm4_free_state(struct si_context *sctx, - struct si_pm4_state *state, - unsigned idx); +void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx); void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state); void si_pm4_reset_emitted(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index bf80862e095..6ad293301cb 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -24,1368 +24,1312 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_pipe.h" #include "si_query.h" -#include "util/u_memory.h" -#include "util/u_upload_mgr.h" + +#include "amd/common/sid.h" +#include "si_pipe.h" #include "util/os_time.h" +#include "util/u_memory.h" #include "util/u_suballoc.h" -#include "amd/common/sid.h" +#include "util/u_upload_mgr.h" static const struct si_query_ops query_hw_ops; struct si_hw_query_params { - unsigned start_offset; - unsigned end_offset; - unsigned fence_offset; - unsigned pair_stride; - unsigned pair_count; + unsigned start_offset; + unsigned end_offset; + unsigned fence_offset; + unsigned pair_stride; + unsigned pair_count; }; /* Queries without buffer handling or suspend/resume. 
*/ struct si_query_sw { - struct si_query b; + struct si_query b; - uint64_t begin_result; - uint64_t end_result; + uint64_t begin_result; + uint64_t end_result; - uint64_t begin_time; - uint64_t end_time; + uint64_t begin_time; + uint64_t end_time; - /* Fence for GPU_FINISHED. */ - struct pipe_fence_handle *fence; + /* Fence for GPU_FINISHED. */ + struct pipe_fence_handle *fence; }; -static void si_query_sw_destroy(struct si_context *sctx, - struct si_query *squery) +static void si_query_sw_destroy(struct si_context *sctx, struct si_query *squery) { - struct si_query_sw *query = (struct si_query_sw *)squery; + struct si_query_sw *query = (struct si_query_sw *)squery; - sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL); - FREE(query); + sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL); + FREE(query); } static enum radeon_value_id winsys_id_from_type(unsigned type) { - switch (type) { - case SI_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY; - case SI_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY; - case SI_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM; - case SI_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT; - case SI_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS; - case SI_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS; - case SI_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS; - case SI_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS; - case SI_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER; - case SI_QUERY_GFX_IB_SIZE: return RADEON_GFX_IB_SIZE_COUNTER; - case SI_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED; - case SI_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS; - case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS; - case SI_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE; - case SI_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE; - case SI_QUERY_GTT_USAGE: return RADEON_GTT_USAGE; - case SI_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE; - case SI_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK; - case SI_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK; - case SI_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME; - default: unreachable("query type does not correspond to winsys id"); - } + switch (type) { + case SI_QUERY_REQUESTED_VRAM: + return RADEON_REQUESTED_VRAM_MEMORY; + case SI_QUERY_REQUESTED_GTT: + return RADEON_REQUESTED_GTT_MEMORY; + case SI_QUERY_MAPPED_VRAM: + return RADEON_MAPPED_VRAM; + case SI_QUERY_MAPPED_GTT: + return RADEON_MAPPED_GTT; + case SI_QUERY_BUFFER_WAIT_TIME: + return RADEON_BUFFER_WAIT_TIME_NS; + case SI_QUERY_NUM_MAPPED_BUFFERS: + return RADEON_NUM_MAPPED_BUFFERS; + case SI_QUERY_NUM_GFX_IBS: + return RADEON_NUM_GFX_IBS; + case SI_QUERY_NUM_SDMA_IBS: + return RADEON_NUM_SDMA_IBS; + case SI_QUERY_GFX_BO_LIST_SIZE: + return RADEON_GFX_BO_LIST_COUNTER; + case SI_QUERY_GFX_IB_SIZE: + return RADEON_GFX_IB_SIZE_COUNTER; + case SI_QUERY_NUM_BYTES_MOVED: + return RADEON_NUM_BYTES_MOVED; + case SI_QUERY_NUM_EVICTIONS: + return RADEON_NUM_EVICTIONS; + case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: + return RADEON_NUM_VRAM_CPU_PAGE_FAULTS; + case SI_QUERY_VRAM_USAGE: + return RADEON_VRAM_USAGE; + case SI_QUERY_VRAM_VIS_USAGE: + return RADEON_VRAM_VIS_USAGE; + case SI_QUERY_GTT_USAGE: + return RADEON_GTT_USAGE; + case SI_QUERY_GPU_TEMPERATURE: + return RADEON_GPU_TEMPERATURE; + case SI_QUERY_CURRENT_GPU_SCLK: + return RADEON_CURRENT_SCLK; + case SI_QUERY_CURRENT_GPU_MCLK: + return RADEON_CURRENT_MCLK; + case SI_QUERY_CS_THREAD_BUSY: + 
return RADEON_CS_THREAD_TIME; + default: + unreachable("query type does not correspond to winsys id"); + } } static int64_t si_finish_dma_get_cpu_time(struct si_context *sctx) { - struct pipe_fence_handle *fence = NULL; + struct pipe_fence_handle *fence = NULL; - si_flush_dma_cs(sctx, 0, &fence); - if (fence) { - sctx->ws->fence_wait(sctx->ws, fence, PIPE_TIMEOUT_INFINITE); - sctx->ws->fence_reference(&fence, NULL); - } + si_flush_dma_cs(sctx, 0, &fence); + if (fence) { + sctx->ws->fence_wait(sctx->ws, fence, PIPE_TIMEOUT_INFINITE); + sctx->ws->fence_reference(&fence, NULL); + } - return os_time_get_nano(); + return os_time_get_nano(); } -static bool si_query_sw_begin(struct si_context *sctx, - struct si_query *squery) +static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery) { - struct si_query_sw *query = (struct si_query_sw *)squery; - enum radeon_value_id ws_id; - - switch(query->b.type) { - case PIPE_QUERY_TIMESTAMP_DISJOINT: - case PIPE_QUERY_GPU_FINISHED: - break; - case SI_QUERY_TIME_ELAPSED_SDMA_SI: - query->begin_result = si_finish_dma_get_cpu_time(sctx); - break; - case SI_QUERY_DRAW_CALLS: - query->begin_result = sctx->num_draw_calls; - break; - case SI_QUERY_DECOMPRESS_CALLS: - query->begin_result = sctx->num_decompress_calls; - break; - case SI_QUERY_MRT_DRAW_CALLS: - query->begin_result = sctx->num_mrt_draw_calls; - break; - case SI_QUERY_PRIM_RESTART_CALLS: - query->begin_result = sctx->num_prim_restart_calls; - break; - case SI_QUERY_SPILL_DRAW_CALLS: - query->begin_result = sctx->num_spill_draw_calls; - break; - case SI_QUERY_COMPUTE_CALLS: - query->begin_result = sctx->num_compute_calls; - break; - case SI_QUERY_SPILL_COMPUTE_CALLS: - query->begin_result = sctx->num_spill_compute_calls; - break; - case SI_QUERY_DMA_CALLS: - query->begin_result = sctx->num_dma_calls; - break; - case SI_QUERY_CP_DMA_CALLS: - query->begin_result = sctx->num_cp_dma_calls; - break; - case SI_QUERY_NUM_VS_FLUSHES: - query->begin_result = sctx->num_vs_flushes; - break; - case SI_QUERY_NUM_PS_FLUSHES: - query->begin_result = sctx->num_ps_flushes; - break; - case SI_QUERY_NUM_CS_FLUSHES: - query->begin_result = sctx->num_cs_flushes; - break; - case SI_QUERY_NUM_CB_CACHE_FLUSHES: - query->begin_result = sctx->num_cb_cache_flushes; - break; - case SI_QUERY_NUM_DB_CACHE_FLUSHES: - query->begin_result = sctx->num_db_cache_flushes; - break; - case SI_QUERY_NUM_L2_INVALIDATES: - query->begin_result = sctx->num_L2_invalidates; - break; - case SI_QUERY_NUM_L2_WRITEBACKS: - query->begin_result = sctx->num_L2_writebacks; - break; - case SI_QUERY_NUM_RESIDENT_HANDLES: - query->begin_result = sctx->num_resident_handles; - break; - case SI_QUERY_TC_OFFLOADED_SLOTS: - query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0; - break; - case SI_QUERY_TC_DIRECT_SLOTS: - query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0; - break; - case SI_QUERY_TC_NUM_SYNCS: - query->begin_result = sctx->tc ? 
sctx->tc->num_syncs : 0; - break; - case SI_QUERY_REQUESTED_VRAM: - case SI_QUERY_REQUESTED_GTT: - case SI_QUERY_MAPPED_VRAM: - case SI_QUERY_MAPPED_GTT: - case SI_QUERY_VRAM_USAGE: - case SI_QUERY_VRAM_VIS_USAGE: - case SI_QUERY_GTT_USAGE: - case SI_QUERY_GPU_TEMPERATURE: - case SI_QUERY_CURRENT_GPU_SCLK: - case SI_QUERY_CURRENT_GPU_MCLK: - case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO: - case SI_QUERY_NUM_MAPPED_BUFFERS: - query->begin_result = 0; - break; - case SI_QUERY_BUFFER_WAIT_TIME: - case SI_QUERY_GFX_IB_SIZE: - case SI_QUERY_NUM_GFX_IBS: - case SI_QUERY_NUM_SDMA_IBS: - case SI_QUERY_NUM_BYTES_MOVED: - case SI_QUERY_NUM_EVICTIONS: - case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: { - enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); - query->begin_result = sctx->ws->query_value(sctx->ws, ws_id); - break; - } - case SI_QUERY_GFX_BO_LIST_SIZE: - ws_id = winsys_id_from_type(query->b.type); - query->begin_result = sctx->ws->query_value(sctx->ws, ws_id); - query->begin_time = sctx->ws->query_value(sctx->ws, - RADEON_NUM_GFX_IBS); - break; - case SI_QUERY_CS_THREAD_BUSY: - ws_id = winsys_id_from_type(query->b.type); - query->begin_result = sctx->ws->query_value(sctx->ws, ws_id); - query->begin_time = os_time_get_nano(); - break; - case SI_QUERY_GALLIUM_THREAD_BUSY: - query->begin_result = - sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0; - query->begin_time = os_time_get_nano(); - break; - case SI_QUERY_GPU_LOAD: - case SI_QUERY_GPU_SHADERS_BUSY: - case SI_QUERY_GPU_TA_BUSY: - case SI_QUERY_GPU_GDS_BUSY: - case SI_QUERY_GPU_VGT_BUSY: - case SI_QUERY_GPU_IA_BUSY: - case SI_QUERY_GPU_SX_BUSY: - case SI_QUERY_GPU_WD_BUSY: - case SI_QUERY_GPU_BCI_BUSY: - case SI_QUERY_GPU_SC_BUSY: - case SI_QUERY_GPU_PA_BUSY: - case SI_QUERY_GPU_DB_BUSY: - case SI_QUERY_GPU_CP_BUSY: - case SI_QUERY_GPU_CB_BUSY: - case SI_QUERY_GPU_SDMA_BUSY: - case SI_QUERY_GPU_PFP_BUSY: - case SI_QUERY_GPU_MEQ_BUSY: - case SI_QUERY_GPU_ME_BUSY: - case SI_QUERY_GPU_SURF_SYNC_BUSY: - case SI_QUERY_GPU_CP_DMA_BUSY: - case SI_QUERY_GPU_SCRATCH_RAM_BUSY: - query->begin_result = si_begin_counter(sctx->screen, - query->b.type); - break; - case SI_QUERY_NUM_COMPILATIONS: - query->begin_result = p_atomic_read(&sctx->screen->num_compilations); - break; - case SI_QUERY_NUM_SHADERS_CREATED: - query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created); - break; - case SI_QUERY_LIVE_SHADER_CACHE_HITS: - query->begin_result = sctx->screen->live_shader_cache.hits; - break; - case SI_QUERY_LIVE_SHADER_CACHE_MISSES: - query->begin_result = sctx->screen->live_shader_cache.misses; - break; - case SI_QUERY_MEMORY_SHADER_CACHE_HITS: - query->begin_result = sctx->screen->num_memory_shader_cache_hits; - break; - case SI_QUERY_MEMORY_SHADER_CACHE_MISSES: - query->begin_result = sctx->screen->num_memory_shader_cache_misses; - break; - case SI_QUERY_DISK_SHADER_CACHE_HITS: - query->begin_result = sctx->screen->num_disk_shader_cache_hits; - break; - case SI_QUERY_DISK_SHADER_CACHE_MISSES: - query->begin_result = sctx->screen->num_disk_shader_cache_misses; - break; - case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: - query->begin_result = sctx->compute_num_verts_accepted; - break; - case SI_QUERY_PD_NUM_PRIMS_REJECTED: - query->begin_result = sctx->compute_num_verts_rejected; - break; - case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: - query->begin_result = sctx->compute_num_verts_ineligible; - break; - case SI_QUERY_GPIN_ASIC_ID: - case SI_QUERY_GPIN_NUM_SIMD: - case SI_QUERY_GPIN_NUM_RB: - case SI_QUERY_GPIN_NUM_SPI: - case 
SI_QUERY_GPIN_NUM_SE: - break; - default: - unreachable("si_query_sw_begin: bad query type"); - } - - return true; + struct si_query_sw *query = (struct si_query_sw *)squery; + enum radeon_value_id ws_id; + + switch (query->b.type) { + case PIPE_QUERY_TIMESTAMP_DISJOINT: + case PIPE_QUERY_GPU_FINISHED: + break; + case SI_QUERY_TIME_ELAPSED_SDMA_SI: + query->begin_result = si_finish_dma_get_cpu_time(sctx); + break; + case SI_QUERY_DRAW_CALLS: + query->begin_result = sctx->num_draw_calls; + break; + case SI_QUERY_DECOMPRESS_CALLS: + query->begin_result = sctx->num_decompress_calls; + break; + case SI_QUERY_MRT_DRAW_CALLS: + query->begin_result = sctx->num_mrt_draw_calls; + break; + case SI_QUERY_PRIM_RESTART_CALLS: + query->begin_result = sctx->num_prim_restart_calls; + break; + case SI_QUERY_SPILL_DRAW_CALLS: + query->begin_result = sctx->num_spill_draw_calls; + break; + case SI_QUERY_COMPUTE_CALLS: + query->begin_result = sctx->num_compute_calls; + break; + case SI_QUERY_SPILL_COMPUTE_CALLS: + query->begin_result = sctx->num_spill_compute_calls; + break; + case SI_QUERY_DMA_CALLS: + query->begin_result = sctx->num_dma_calls; + break; + case SI_QUERY_CP_DMA_CALLS: + query->begin_result = sctx->num_cp_dma_calls; + break; + case SI_QUERY_NUM_VS_FLUSHES: + query->begin_result = sctx->num_vs_flushes; + break; + case SI_QUERY_NUM_PS_FLUSHES: + query->begin_result = sctx->num_ps_flushes; + break; + case SI_QUERY_NUM_CS_FLUSHES: + query->begin_result = sctx->num_cs_flushes; + break; + case SI_QUERY_NUM_CB_CACHE_FLUSHES: + query->begin_result = sctx->num_cb_cache_flushes; + break; + case SI_QUERY_NUM_DB_CACHE_FLUSHES: + query->begin_result = sctx->num_db_cache_flushes; + break; + case SI_QUERY_NUM_L2_INVALIDATES: + query->begin_result = sctx->num_L2_invalidates; + break; + case SI_QUERY_NUM_L2_WRITEBACKS: + query->begin_result = sctx->num_L2_writebacks; + break; + case SI_QUERY_NUM_RESIDENT_HANDLES: + query->begin_result = sctx->num_resident_handles; + break; + case SI_QUERY_TC_OFFLOADED_SLOTS: + query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0; + break; + case SI_QUERY_TC_DIRECT_SLOTS: + query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0; + break; + case SI_QUERY_TC_NUM_SYNCS: + query->begin_result = sctx->tc ? 
sctx->tc->num_syncs : 0; + break; + case SI_QUERY_REQUESTED_VRAM: + case SI_QUERY_REQUESTED_GTT: + case SI_QUERY_MAPPED_VRAM: + case SI_QUERY_MAPPED_GTT: + case SI_QUERY_VRAM_USAGE: + case SI_QUERY_VRAM_VIS_USAGE: + case SI_QUERY_GTT_USAGE: + case SI_QUERY_GPU_TEMPERATURE: + case SI_QUERY_CURRENT_GPU_SCLK: + case SI_QUERY_CURRENT_GPU_MCLK: + case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO: + case SI_QUERY_NUM_MAPPED_BUFFERS: + query->begin_result = 0; + break; + case SI_QUERY_BUFFER_WAIT_TIME: + case SI_QUERY_GFX_IB_SIZE: + case SI_QUERY_NUM_GFX_IBS: + case SI_QUERY_NUM_SDMA_IBS: + case SI_QUERY_NUM_BYTES_MOVED: + case SI_QUERY_NUM_EVICTIONS: + case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: { + enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); + query->begin_result = sctx->ws->query_value(sctx->ws, ws_id); + break; + } + case SI_QUERY_GFX_BO_LIST_SIZE: + ws_id = winsys_id_from_type(query->b.type); + query->begin_result = sctx->ws->query_value(sctx->ws, ws_id); + query->begin_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS); + break; + case SI_QUERY_CS_THREAD_BUSY: + ws_id = winsys_id_from_type(query->b.type); + query->begin_result = sctx->ws->query_value(sctx->ws, ws_id); + query->begin_time = os_time_get_nano(); + break; + case SI_QUERY_GALLIUM_THREAD_BUSY: + query->begin_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0; + query->begin_time = os_time_get_nano(); + break; + case SI_QUERY_GPU_LOAD: + case SI_QUERY_GPU_SHADERS_BUSY: + case SI_QUERY_GPU_TA_BUSY: + case SI_QUERY_GPU_GDS_BUSY: + case SI_QUERY_GPU_VGT_BUSY: + case SI_QUERY_GPU_IA_BUSY: + case SI_QUERY_GPU_SX_BUSY: + case SI_QUERY_GPU_WD_BUSY: + case SI_QUERY_GPU_BCI_BUSY: + case SI_QUERY_GPU_SC_BUSY: + case SI_QUERY_GPU_PA_BUSY: + case SI_QUERY_GPU_DB_BUSY: + case SI_QUERY_GPU_CP_BUSY: + case SI_QUERY_GPU_CB_BUSY: + case SI_QUERY_GPU_SDMA_BUSY: + case SI_QUERY_GPU_PFP_BUSY: + case SI_QUERY_GPU_MEQ_BUSY: + case SI_QUERY_GPU_ME_BUSY: + case SI_QUERY_GPU_SURF_SYNC_BUSY: + case SI_QUERY_GPU_CP_DMA_BUSY: + case SI_QUERY_GPU_SCRATCH_RAM_BUSY: + query->begin_result = si_begin_counter(sctx->screen, query->b.type); + break; + case SI_QUERY_NUM_COMPILATIONS: + query->begin_result = p_atomic_read(&sctx->screen->num_compilations); + break; + case SI_QUERY_NUM_SHADERS_CREATED: + query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created); + break; + case SI_QUERY_LIVE_SHADER_CACHE_HITS: + query->begin_result = sctx->screen->live_shader_cache.hits; + break; + case SI_QUERY_LIVE_SHADER_CACHE_MISSES: + query->begin_result = sctx->screen->live_shader_cache.misses; + break; + case SI_QUERY_MEMORY_SHADER_CACHE_HITS: + query->begin_result = sctx->screen->num_memory_shader_cache_hits; + break; + case SI_QUERY_MEMORY_SHADER_CACHE_MISSES: + query->begin_result = sctx->screen->num_memory_shader_cache_misses; + break; + case SI_QUERY_DISK_SHADER_CACHE_HITS: + query->begin_result = sctx->screen->num_disk_shader_cache_hits; + break; + case SI_QUERY_DISK_SHADER_CACHE_MISSES: + query->begin_result = sctx->screen->num_disk_shader_cache_misses; + break; + case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: + query->begin_result = sctx->compute_num_verts_accepted; + break; + case SI_QUERY_PD_NUM_PRIMS_REJECTED: + query->begin_result = sctx->compute_num_verts_rejected; + break; + case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: + query->begin_result = sctx->compute_num_verts_ineligible; + break; + case SI_QUERY_GPIN_ASIC_ID: + case SI_QUERY_GPIN_NUM_SIMD: + case SI_QUERY_GPIN_NUM_RB: + case SI_QUERY_GPIN_NUM_SPI: + case 
SI_QUERY_GPIN_NUM_SE: + break; + default: + unreachable("si_query_sw_begin: bad query type"); + } + + return true; } -static bool si_query_sw_end(struct si_context *sctx, - struct si_query *squery) +static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery) { - struct si_query_sw *query = (struct si_query_sw *)squery; - enum radeon_value_id ws_id; - - switch(query->b.type) { - case PIPE_QUERY_TIMESTAMP_DISJOINT: - break; - case PIPE_QUERY_GPU_FINISHED: - sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED); - break; - case SI_QUERY_TIME_ELAPSED_SDMA_SI: - query->end_result = si_finish_dma_get_cpu_time(sctx); - break; - case SI_QUERY_DRAW_CALLS: - query->end_result = sctx->num_draw_calls; - break; - case SI_QUERY_DECOMPRESS_CALLS: - query->end_result = sctx->num_decompress_calls; - break; - case SI_QUERY_MRT_DRAW_CALLS: - query->end_result = sctx->num_mrt_draw_calls; - break; - case SI_QUERY_PRIM_RESTART_CALLS: - query->end_result = sctx->num_prim_restart_calls; - break; - case SI_QUERY_SPILL_DRAW_CALLS: - query->end_result = sctx->num_spill_draw_calls; - break; - case SI_QUERY_COMPUTE_CALLS: - query->end_result = sctx->num_compute_calls; - break; - case SI_QUERY_SPILL_COMPUTE_CALLS: - query->end_result = sctx->num_spill_compute_calls; - break; - case SI_QUERY_DMA_CALLS: - query->end_result = sctx->num_dma_calls; - break; - case SI_QUERY_CP_DMA_CALLS: - query->end_result = sctx->num_cp_dma_calls; - break; - case SI_QUERY_NUM_VS_FLUSHES: - query->end_result = sctx->num_vs_flushes; - break; - case SI_QUERY_NUM_PS_FLUSHES: - query->end_result = sctx->num_ps_flushes; - break; - case SI_QUERY_NUM_CS_FLUSHES: - query->end_result = sctx->num_cs_flushes; - break; - case SI_QUERY_NUM_CB_CACHE_FLUSHES: - query->end_result = sctx->num_cb_cache_flushes; - break; - case SI_QUERY_NUM_DB_CACHE_FLUSHES: - query->end_result = sctx->num_db_cache_flushes; - break; - case SI_QUERY_NUM_L2_INVALIDATES: - query->end_result = sctx->num_L2_invalidates; - break; - case SI_QUERY_NUM_L2_WRITEBACKS: - query->end_result = sctx->num_L2_writebacks; - break; - case SI_QUERY_NUM_RESIDENT_HANDLES: - query->end_result = sctx->num_resident_handles; - break; - case SI_QUERY_TC_OFFLOADED_SLOTS: - query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0; - break; - case SI_QUERY_TC_DIRECT_SLOTS: - query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0; - break; - case SI_QUERY_TC_NUM_SYNCS: - query->end_result = sctx->tc ? 
sctx->tc->num_syncs : 0; - break; - case SI_QUERY_REQUESTED_VRAM: - case SI_QUERY_REQUESTED_GTT: - case SI_QUERY_MAPPED_VRAM: - case SI_QUERY_MAPPED_GTT: - case SI_QUERY_VRAM_USAGE: - case SI_QUERY_VRAM_VIS_USAGE: - case SI_QUERY_GTT_USAGE: - case SI_QUERY_GPU_TEMPERATURE: - case SI_QUERY_CURRENT_GPU_SCLK: - case SI_QUERY_CURRENT_GPU_MCLK: - case SI_QUERY_BUFFER_WAIT_TIME: - case SI_QUERY_GFX_IB_SIZE: - case SI_QUERY_NUM_MAPPED_BUFFERS: - case SI_QUERY_NUM_GFX_IBS: - case SI_QUERY_NUM_SDMA_IBS: - case SI_QUERY_NUM_BYTES_MOVED: - case SI_QUERY_NUM_EVICTIONS: - case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: { - enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); - query->end_result = sctx->ws->query_value(sctx->ws, ws_id); - break; - } - case SI_QUERY_GFX_BO_LIST_SIZE: - ws_id = winsys_id_from_type(query->b.type); - query->end_result = sctx->ws->query_value(sctx->ws, ws_id); - query->end_time = sctx->ws->query_value(sctx->ws, - RADEON_NUM_GFX_IBS); - break; - case SI_QUERY_CS_THREAD_BUSY: - ws_id = winsys_id_from_type(query->b.type); - query->end_result = sctx->ws->query_value(sctx->ws, ws_id); - query->end_time = os_time_get_nano(); - break; - case SI_QUERY_GALLIUM_THREAD_BUSY: - query->end_result = - sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0; - query->end_time = os_time_get_nano(); - break; - case SI_QUERY_GPU_LOAD: - case SI_QUERY_GPU_SHADERS_BUSY: - case SI_QUERY_GPU_TA_BUSY: - case SI_QUERY_GPU_GDS_BUSY: - case SI_QUERY_GPU_VGT_BUSY: - case SI_QUERY_GPU_IA_BUSY: - case SI_QUERY_GPU_SX_BUSY: - case SI_QUERY_GPU_WD_BUSY: - case SI_QUERY_GPU_BCI_BUSY: - case SI_QUERY_GPU_SC_BUSY: - case SI_QUERY_GPU_PA_BUSY: - case SI_QUERY_GPU_DB_BUSY: - case SI_QUERY_GPU_CP_BUSY: - case SI_QUERY_GPU_CB_BUSY: - case SI_QUERY_GPU_SDMA_BUSY: - case SI_QUERY_GPU_PFP_BUSY: - case SI_QUERY_GPU_MEQ_BUSY: - case SI_QUERY_GPU_ME_BUSY: - case SI_QUERY_GPU_SURF_SYNC_BUSY: - case SI_QUERY_GPU_CP_DMA_BUSY: - case SI_QUERY_GPU_SCRATCH_RAM_BUSY: - query->end_result = si_end_counter(sctx->screen, - query->b.type, - query->begin_result); - query->begin_result = 0; - break; - case SI_QUERY_NUM_COMPILATIONS: - query->end_result = p_atomic_read(&sctx->screen->num_compilations); - break; - case SI_QUERY_NUM_SHADERS_CREATED: - query->end_result = p_atomic_read(&sctx->screen->num_shaders_created); - break; - case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO: - query->end_result = sctx->last_tex_ps_draw_ratio; - break; - case SI_QUERY_LIVE_SHADER_CACHE_HITS: - query->end_result = sctx->screen->live_shader_cache.hits; - break; - case SI_QUERY_LIVE_SHADER_CACHE_MISSES: - query->end_result = sctx->screen->live_shader_cache.misses; - break; - case SI_QUERY_MEMORY_SHADER_CACHE_HITS: - query->end_result = sctx->screen->num_memory_shader_cache_hits; - break; - case SI_QUERY_MEMORY_SHADER_CACHE_MISSES: - query->end_result = sctx->screen->num_memory_shader_cache_misses; - break; - case SI_QUERY_DISK_SHADER_CACHE_HITS: - query->end_result = sctx->screen->num_disk_shader_cache_hits; - break; - case SI_QUERY_DISK_SHADER_CACHE_MISSES: - query->end_result = sctx->screen->num_disk_shader_cache_misses; - break; - case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: - query->end_result = sctx->compute_num_verts_accepted; - break; - case SI_QUERY_PD_NUM_PRIMS_REJECTED: - query->end_result = sctx->compute_num_verts_rejected; - break; - case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: - query->end_result = sctx->compute_num_verts_ineligible; - break; - case SI_QUERY_GPIN_ASIC_ID: - case SI_QUERY_GPIN_NUM_SIMD: - case SI_QUERY_GPIN_NUM_RB: - case 
SI_QUERY_GPIN_NUM_SPI: - case SI_QUERY_GPIN_NUM_SE: - break; - default: - unreachable("si_query_sw_end: bad query type"); - } - - return true; + struct si_query_sw *query = (struct si_query_sw *)squery; + enum radeon_value_id ws_id; + + switch (query->b.type) { + case PIPE_QUERY_TIMESTAMP_DISJOINT: + break; + case PIPE_QUERY_GPU_FINISHED: + sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED); + break; + case SI_QUERY_TIME_ELAPSED_SDMA_SI: + query->end_result = si_finish_dma_get_cpu_time(sctx); + break; + case SI_QUERY_DRAW_CALLS: + query->end_result = sctx->num_draw_calls; + break; + case SI_QUERY_DECOMPRESS_CALLS: + query->end_result = sctx->num_decompress_calls; + break; + case SI_QUERY_MRT_DRAW_CALLS: + query->end_result = sctx->num_mrt_draw_calls; + break; + case SI_QUERY_PRIM_RESTART_CALLS: + query->end_result = sctx->num_prim_restart_calls; + break; + case SI_QUERY_SPILL_DRAW_CALLS: + query->end_result = sctx->num_spill_draw_calls; + break; + case SI_QUERY_COMPUTE_CALLS: + query->end_result = sctx->num_compute_calls; + break; + case SI_QUERY_SPILL_COMPUTE_CALLS: + query->end_result = sctx->num_spill_compute_calls; + break; + case SI_QUERY_DMA_CALLS: + query->end_result = sctx->num_dma_calls; + break; + case SI_QUERY_CP_DMA_CALLS: + query->end_result = sctx->num_cp_dma_calls; + break; + case SI_QUERY_NUM_VS_FLUSHES: + query->end_result = sctx->num_vs_flushes; + break; + case SI_QUERY_NUM_PS_FLUSHES: + query->end_result = sctx->num_ps_flushes; + break; + case SI_QUERY_NUM_CS_FLUSHES: + query->end_result = sctx->num_cs_flushes; + break; + case SI_QUERY_NUM_CB_CACHE_FLUSHES: + query->end_result = sctx->num_cb_cache_flushes; + break; + case SI_QUERY_NUM_DB_CACHE_FLUSHES: + query->end_result = sctx->num_db_cache_flushes; + break; + case SI_QUERY_NUM_L2_INVALIDATES: + query->end_result = sctx->num_L2_invalidates; + break; + case SI_QUERY_NUM_L2_WRITEBACKS: + query->end_result = sctx->num_L2_writebacks; + break; + case SI_QUERY_NUM_RESIDENT_HANDLES: + query->end_result = sctx->num_resident_handles; + break; + case SI_QUERY_TC_OFFLOADED_SLOTS: + query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0; + break; + case SI_QUERY_TC_DIRECT_SLOTS: + query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0; + break; + case SI_QUERY_TC_NUM_SYNCS: + query->end_result = sctx->tc ? 
sctx->tc->num_syncs : 0; + break; + case SI_QUERY_REQUESTED_VRAM: + case SI_QUERY_REQUESTED_GTT: + case SI_QUERY_MAPPED_VRAM: + case SI_QUERY_MAPPED_GTT: + case SI_QUERY_VRAM_USAGE: + case SI_QUERY_VRAM_VIS_USAGE: + case SI_QUERY_GTT_USAGE: + case SI_QUERY_GPU_TEMPERATURE: + case SI_QUERY_CURRENT_GPU_SCLK: + case SI_QUERY_CURRENT_GPU_MCLK: + case SI_QUERY_BUFFER_WAIT_TIME: + case SI_QUERY_GFX_IB_SIZE: + case SI_QUERY_NUM_MAPPED_BUFFERS: + case SI_QUERY_NUM_GFX_IBS: + case SI_QUERY_NUM_SDMA_IBS: + case SI_QUERY_NUM_BYTES_MOVED: + case SI_QUERY_NUM_EVICTIONS: + case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: { + enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); + query->end_result = sctx->ws->query_value(sctx->ws, ws_id); + break; + } + case SI_QUERY_GFX_BO_LIST_SIZE: + ws_id = winsys_id_from_type(query->b.type); + query->end_result = sctx->ws->query_value(sctx->ws, ws_id); + query->end_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS); + break; + case SI_QUERY_CS_THREAD_BUSY: + ws_id = winsys_id_from_type(query->b.type); + query->end_result = sctx->ws->query_value(sctx->ws, ws_id); + query->end_time = os_time_get_nano(); + break; + case SI_QUERY_GALLIUM_THREAD_BUSY: + query->end_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0; + query->end_time = os_time_get_nano(); + break; + case SI_QUERY_GPU_LOAD: + case SI_QUERY_GPU_SHADERS_BUSY: + case SI_QUERY_GPU_TA_BUSY: + case SI_QUERY_GPU_GDS_BUSY: + case SI_QUERY_GPU_VGT_BUSY: + case SI_QUERY_GPU_IA_BUSY: + case SI_QUERY_GPU_SX_BUSY: + case SI_QUERY_GPU_WD_BUSY: + case SI_QUERY_GPU_BCI_BUSY: + case SI_QUERY_GPU_SC_BUSY: + case SI_QUERY_GPU_PA_BUSY: + case SI_QUERY_GPU_DB_BUSY: + case SI_QUERY_GPU_CP_BUSY: + case SI_QUERY_GPU_CB_BUSY: + case SI_QUERY_GPU_SDMA_BUSY: + case SI_QUERY_GPU_PFP_BUSY: + case SI_QUERY_GPU_MEQ_BUSY: + case SI_QUERY_GPU_ME_BUSY: + case SI_QUERY_GPU_SURF_SYNC_BUSY: + case SI_QUERY_GPU_CP_DMA_BUSY: + case SI_QUERY_GPU_SCRATCH_RAM_BUSY: + query->end_result = si_end_counter(sctx->screen, query->b.type, query->begin_result); + query->begin_result = 0; + break; + case SI_QUERY_NUM_COMPILATIONS: + query->end_result = p_atomic_read(&sctx->screen->num_compilations); + break; + case SI_QUERY_NUM_SHADERS_CREATED: + query->end_result = p_atomic_read(&sctx->screen->num_shaders_created); + break; + case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO: + query->end_result = sctx->last_tex_ps_draw_ratio; + break; + case SI_QUERY_LIVE_SHADER_CACHE_HITS: + query->end_result = sctx->screen->live_shader_cache.hits; + break; + case SI_QUERY_LIVE_SHADER_CACHE_MISSES: + query->end_result = sctx->screen->live_shader_cache.misses; + break; + case SI_QUERY_MEMORY_SHADER_CACHE_HITS: + query->end_result = sctx->screen->num_memory_shader_cache_hits; + break; + case SI_QUERY_MEMORY_SHADER_CACHE_MISSES: + query->end_result = sctx->screen->num_memory_shader_cache_misses; + break; + case SI_QUERY_DISK_SHADER_CACHE_HITS: + query->end_result = sctx->screen->num_disk_shader_cache_hits; + break; + case SI_QUERY_DISK_SHADER_CACHE_MISSES: + query->end_result = sctx->screen->num_disk_shader_cache_misses; + break; + case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: + query->end_result = sctx->compute_num_verts_accepted; + break; + case SI_QUERY_PD_NUM_PRIMS_REJECTED: + query->end_result = sctx->compute_num_verts_rejected; + break; + case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: + query->end_result = sctx->compute_num_verts_ineligible; + break; + case SI_QUERY_GPIN_ASIC_ID: + case SI_QUERY_GPIN_NUM_SIMD: + case SI_QUERY_GPIN_NUM_RB: + case 
SI_QUERY_GPIN_NUM_SPI: + case SI_QUERY_GPIN_NUM_SE: + break; + default: + unreachable("si_query_sw_end: bad query type"); + } + + return true; } -static bool si_query_sw_get_result(struct si_context *sctx, - struct si_query *squery, - bool wait, - union pipe_query_result *result) +static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squery, bool wait, + union pipe_query_result *result) { - struct si_query_sw *query = (struct si_query_sw *)squery; - - switch (query->b.type) { - case PIPE_QUERY_TIMESTAMP_DISJOINT: - /* Convert from cycles per millisecond to cycles per second (Hz). */ - result->timestamp_disjoint.frequency = - (uint64_t)sctx->screen->info.clock_crystal_freq * 1000; - result->timestamp_disjoint.disjoint = false; - return true; - case PIPE_QUERY_GPU_FINISHED: { - struct pipe_screen *screen = sctx->b.screen; - struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b; - - result->b = screen->fence_finish(screen, ctx, query->fence, - wait ? PIPE_TIMEOUT_INFINITE : 0); - return result->b; - } - - case SI_QUERY_GFX_BO_LIST_SIZE: - result->u64 = (query->end_result - query->begin_result) / - (query->end_time - query->begin_time); - return true; - case SI_QUERY_CS_THREAD_BUSY: - case SI_QUERY_GALLIUM_THREAD_BUSY: - result->u64 = (query->end_result - query->begin_result) * 100 / - (query->end_time - query->begin_time); - return true; - case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: - case SI_QUERY_PD_NUM_PRIMS_REJECTED: - case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: - result->u64 = ((unsigned)query->end_result - - (unsigned)query->begin_result) / 3; - return true; - case SI_QUERY_GPIN_ASIC_ID: - result->u32 = 0; - return true; - case SI_QUERY_GPIN_NUM_SIMD: - result->u32 = sctx->screen->info.num_good_compute_units; - return true; - case SI_QUERY_GPIN_NUM_RB: - result->u32 = sctx->screen->info.num_render_backends; - return true; - case SI_QUERY_GPIN_NUM_SPI: - result->u32 = 1; /* all supported chips have one SPI per SE */ - return true; - case SI_QUERY_GPIN_NUM_SE: - result->u32 = sctx->screen->info.max_se; - return true; - } - - result->u64 = query->end_result - query->begin_result; - - switch (query->b.type) { - case SI_QUERY_BUFFER_WAIT_TIME: - case SI_QUERY_GPU_TEMPERATURE: - result->u64 /= 1000; - break; - case SI_QUERY_CURRENT_GPU_SCLK: - case SI_QUERY_CURRENT_GPU_MCLK: - result->u64 *= 1000000; - break; - } - - return true; + struct si_query_sw *query = (struct si_query_sw *)squery; + + switch (query->b.type) { + case PIPE_QUERY_TIMESTAMP_DISJOINT: + /* Convert from cycles per millisecond to cycles per second (Hz). */ + result->timestamp_disjoint.frequency = (uint64_t)sctx->screen->info.clock_crystal_freq * 1000; + result->timestamp_disjoint.disjoint = false; + return true; + case PIPE_QUERY_GPU_FINISHED: { + struct pipe_screen *screen = sctx->b.screen; + struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b; + + result->b = screen->fence_finish(screen, ctx, query->fence, wait ? 
PIPE_TIMEOUT_INFINITE : 0); + return result->b; + } + + case SI_QUERY_GFX_BO_LIST_SIZE: + result->u64 = + (query->end_result - query->begin_result) / (query->end_time - query->begin_time); + return true; + case SI_QUERY_CS_THREAD_BUSY: + case SI_QUERY_GALLIUM_THREAD_BUSY: + result->u64 = + (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time); + return true; + case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: + case SI_QUERY_PD_NUM_PRIMS_REJECTED: + case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: + result->u64 = ((unsigned)query->end_result - (unsigned)query->begin_result) / 3; + return true; + case SI_QUERY_GPIN_ASIC_ID: + result->u32 = 0; + return true; + case SI_QUERY_GPIN_NUM_SIMD: + result->u32 = sctx->screen->info.num_good_compute_units; + return true; + case SI_QUERY_GPIN_NUM_RB: + result->u32 = sctx->screen->info.num_render_backends; + return true; + case SI_QUERY_GPIN_NUM_SPI: + result->u32 = 1; /* all supported chips have one SPI per SE */ + return true; + case SI_QUERY_GPIN_NUM_SE: + result->u32 = sctx->screen->info.max_se; + return true; + } + + result->u64 = query->end_result - query->begin_result; + + switch (query->b.type) { + case SI_QUERY_BUFFER_WAIT_TIME: + case SI_QUERY_GPU_TEMPERATURE: + result->u64 /= 1000; + break; + case SI_QUERY_CURRENT_GPU_SCLK: + case SI_QUERY_CURRENT_GPU_MCLK: + result->u64 *= 1000000; + break; + } + + return true; } - -static const struct si_query_ops sw_query_ops = { - .destroy = si_query_sw_destroy, - .begin = si_query_sw_begin, - .end = si_query_sw_end, - .get_result = si_query_sw_get_result, - .get_result_resource = NULL -}; +static const struct si_query_ops sw_query_ops = {.destroy = si_query_sw_destroy, + .begin = si_query_sw_begin, + .end = si_query_sw_end, + .get_result = si_query_sw_get_result, + .get_result_resource = NULL}; static struct pipe_query *si_query_sw_create(unsigned query_type) { - struct si_query_sw *query; + struct si_query_sw *query; - query = CALLOC_STRUCT(si_query_sw); - if (!query) - return NULL; + query = CALLOC_STRUCT(si_query_sw); + if (!query) + return NULL; - query->b.type = query_type; - query->b.ops = &sw_query_ops; + query->b.type = query_type; + query->b.ops = &sw_query_ops; - return (struct pipe_query *)query; + return (struct pipe_query *)query; } void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer) { - struct si_query_buffer *prev = buffer->previous; + struct si_query_buffer *prev = buffer->previous; - /* Release all query buffers. */ - while (prev) { - struct si_query_buffer *qbuf = prev; - prev = prev->previous; - si_resource_reference(&qbuf->buf, NULL); - FREE(qbuf); - } + /* Release all query buffers. */ + while (prev) { + struct si_query_buffer *qbuf = prev; + prev = prev->previous; + si_resource_reference(&qbuf->buf, NULL); + FREE(qbuf); + } - si_resource_reference(&buffer->buf, NULL); + si_resource_reference(&buffer->buf, NULL); } void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer) { - /* Discard all query buffers except for the oldest. */ - while (buffer->previous) { - struct si_query_buffer *qbuf = buffer->previous; - buffer->previous = qbuf->previous; - - si_resource_reference(&buffer->buf, NULL); - buffer->buf = qbuf->buf; /* move ownership */ - FREE(qbuf); - } - buffer->results_end = 0; - - if (!buffer->buf) - return; - - /* Discard even the oldest buffer if it can't be mapped without a stall. 
*/ - if (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) || - !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) { - si_resource_reference(&buffer->buf, NULL); - } else { - buffer->unprepared = true; - } + /* Discard all query buffers except for the oldest. */ + while (buffer->previous) { + struct si_query_buffer *qbuf = buffer->previous; + buffer->previous = qbuf->previous; + + si_resource_reference(&buffer->buf, NULL); + buffer->buf = qbuf->buf; /* move ownership */ + FREE(qbuf); + } + buffer->results_end = 0; + + if (!buffer->buf) + return; + + /* Discard even the oldest buffer if it can't be mapped without a stall. */ + if (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) || + !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) { + si_resource_reference(&buffer->buf, NULL); + } else { + buffer->unprepared = true; + } } bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer, - bool (*prepare_buffer)(struct si_context *, struct si_query_buffer*), - unsigned size) + bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *), + unsigned size) { - bool unprepared = buffer->unprepared; - buffer->unprepared = false; - - if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) { - if (buffer->buf) { - struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer); - memcpy(qbuf, buffer, sizeof(*qbuf)); - buffer->previous = qbuf; - } - buffer->results_end = 0; - - /* Queries are normally read by the CPU after - * being written by the gpu, hence staging is probably a good - * usage pattern. - */ - struct si_screen *screen = sctx->screen; - unsigned buf_size = MAX2(size, screen->info.min_alloc_size); - buffer->buf = si_resource( - pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); - if (unlikely(!buffer->buf)) - return false; - unprepared = true; - } - - if (unprepared && prepare_buffer) { - if (unlikely(!prepare_buffer(sctx, buffer))) { - si_resource_reference(&buffer->buf, NULL); - return false; - } - } - - return true; + bool unprepared = buffer->unprepared; + buffer->unprepared = false; + + if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) { + if (buffer->buf) { + struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer); + memcpy(qbuf, buffer, sizeof(*qbuf)); + buffer->previous = qbuf; + } + buffer->results_end = 0; + + /* Queries are normally read by the CPU after + * being written by the gpu, hence staging is probably a good + * usage pattern. 
+ */ + struct si_screen *screen = sctx->screen; + unsigned buf_size = MAX2(size, screen->info.min_alloc_size); + buffer->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); + if (unlikely(!buffer->buf)) + return false; + unprepared = true; + } + + if (unprepared && prepare_buffer) { + if (unlikely(!prepare_buffer(sctx, buffer))) { + si_resource_reference(&buffer->buf, NULL); + return false; + } + } + + return true; } - void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery) { - struct si_query_hw *query = (struct si_query_hw *)squery; + struct si_query_hw *query = (struct si_query_hw *)squery; - si_query_buffer_destroy(sctx->screen, &query->buffer); - si_resource_reference(&query->workaround_buf, NULL); - FREE(squery); + si_query_buffer_destroy(sctx->screen, &query->buffer); + si_resource_reference(&query->workaround_buf, NULL); + FREE(squery); } -static bool si_query_hw_prepare_buffer(struct si_context *sctx, - struct si_query_buffer *qbuf) +static bool si_query_hw_prepare_buffer(struct si_context *sctx, struct si_query_buffer *qbuf) { - static const struct si_query_hw si_query_hw_s; - struct si_query_hw *query = container_of(qbuf, &si_query_hw_s, buffer); - struct si_screen *screen = sctx->screen; - - /* The caller ensures that the buffer is currently unused by the GPU. */ - uint32_t *results = screen->ws->buffer_map(qbuf->buf->buf, NULL, - PIPE_TRANSFER_WRITE | - PIPE_TRANSFER_UNSYNCHRONIZED); - if (!results) - return false; - - memset(results, 0, qbuf->buf->b.b.width0); - - if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER || - query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE || - query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { - unsigned max_rbs = screen->info.num_render_backends; - unsigned enabled_rb_mask = screen->info.enabled_rb_mask; - unsigned num_results; - unsigned i, j; - - /* Set top bits for unused backends. */ - num_results = qbuf->buf->b.b.width0 / query->result_size; - for (j = 0; j < num_results; j++) { - for (i = 0; i < max_rbs; i++) { - if (!(enabled_rb_mask & (1<screen; + + /* The caller ensures that the buffer is currently unused by the GPU. */ + uint32_t *results = screen->ws->buffer_map(qbuf->buf->buf, NULL, + PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED); + if (!results) + return false; + + memset(results, 0, qbuf->buf->b.b.width0); + + if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER || + query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE || + query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { + unsigned max_rbs = screen->info.num_render_backends; + unsigned enabled_rb_mask = screen->info.enabled_rb_mask; + unsigned num_results; + unsigned i, j; + + /* Set top bits for unused backends. 
*/ + num_results = qbuf->buf->b.b.width0 / query->result_size; + for (j = 0; j < num_results; j++) { + for (i = 0; i < max_rbs; i++) { + if (!(enabled_rb_mask & (1 << i))) { + results[(i * 4) + 1] = 0x80000000; + results[(i * 4) + 3] = 0x80000000; + } + } + results += 4 * max_rbs; + } + } + + return true; } -static void si_query_hw_get_result_resource(struct si_context *sctx, - struct si_query *squery, - bool wait, - enum pipe_query_value_type result_type, - int index, - struct pipe_resource *resource, - unsigned offset); - -static void si_query_hw_do_emit_start(struct si_context *sctx, - struct si_query_hw *query, - struct si_resource *buffer, - uint64_t va); -static void si_query_hw_do_emit_stop(struct si_context *sctx, - struct si_query_hw *query, - struct si_resource *buffer, - uint64_t va); -static void si_query_hw_add_result(struct si_screen *sscreen, - struct si_query_hw *, void *buffer, - union pipe_query_result *result); -static void si_query_hw_clear_result(struct si_query_hw *, - union pipe_query_result *); +static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery, + bool wait, enum pipe_query_value_type result_type, + int index, struct pipe_resource *resource, + unsigned offset); + +static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query, + struct si_resource *buffer, uint64_t va); +static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query, + struct si_resource *buffer, uint64_t va); +static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *, void *buffer, + union pipe_query_result *result); +static void si_query_hw_clear_result(struct si_query_hw *, union pipe_query_result *); static struct si_query_hw_ops query_hw_default_hw_ops = { - .prepare_buffer = si_query_hw_prepare_buffer, - .emit_start = si_query_hw_do_emit_start, - .emit_stop = si_query_hw_do_emit_stop, - .clear_result = si_query_hw_clear_result, - .add_result = si_query_hw_add_result, + .prepare_buffer = si_query_hw_prepare_buffer, + .emit_start = si_query_hw_do_emit_start, + .emit_stop = si_query_hw_do_emit_stop, + .clear_result = si_query_hw_clear_result, + .add_result = si_query_hw_add_result, }; -static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, - unsigned query_type, - unsigned index) +static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned query_type, + unsigned index) { - struct si_query_hw *query = CALLOC_STRUCT(si_query_hw); - if (!query) - return NULL; - - query->b.type = query_type; - query->b.ops = &query_hw_ops; - query->ops = &query_hw_default_hw_ops; - - switch (query_type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - query->result_size = 16 * sscreen->info.num_render_backends; - query->result_size += 16; /* for the fence + alignment */ - query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen); - break; - case SI_QUERY_TIME_ELAPSED_SDMA: - /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. 
*/ - query->result_size = 64; - break; - case PIPE_QUERY_TIME_ELAPSED: - query->result_size = 24; - query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen); - break; - case PIPE_QUERY_TIMESTAMP: - query->result_size = 16; - query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen); - query->flags = SI_QUERY_HW_FLAG_NO_START; - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - case PIPE_QUERY_PRIMITIVES_GENERATED: - case PIPE_QUERY_SO_STATISTICS: - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ - query->result_size = 32; - query->b.num_cs_dw_suspend = 6; - query->stream = index; - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ - query->result_size = 32 * SI_MAX_STREAMS; - query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS; - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - /* 11 values on GCN. */ - query->result_size = 11 * 16; - query->result_size += 8; /* for the fence + alignment */ - query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen); - break; - default: - assert(0); - FREE(query); - return NULL; - } - - return (struct pipe_query *)query; + struct si_query_hw *query = CALLOC_STRUCT(si_query_hw); + if (!query) + return NULL; + + query->b.type = query_type; + query->b.ops = &query_hw_ops; + query->ops = &query_hw_default_hw_ops; + + switch (query_type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + query->result_size = 16 * sscreen->info.num_render_backends; + query->result_size += 16; /* for the fence + alignment */ + query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen); + break; + case SI_QUERY_TIME_ELAPSED_SDMA: + /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. */ + query->result_size = 64; + break; + case PIPE_QUERY_TIME_ELAPSED: + query->result_size = 24; + query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen); + break; + case PIPE_QUERY_TIMESTAMP: + query->result_size = 16; + query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen); + query->flags = SI_QUERY_HW_FLAG_NO_START; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ + query->result_size = 32; + query->b.num_cs_dw_suspend = 6; + query->stream = index; + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ + query->result_size = 32 * SI_MAX_STREAMS; + query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + /* 11 values on GCN. 
*/ + query->result_size = 11 * 16; + query->result_size += 8; /* for the fence + alignment */ + query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen); + break; + default: + assert(0); + FREE(query); + return NULL; + } + + return (struct pipe_query *)query; } -static void si_update_occlusion_query_state(struct si_context *sctx, - unsigned type, int diff) +static void si_update_occlusion_query_state(struct si_context *sctx, unsigned type, int diff) { - if (type == PIPE_QUERY_OCCLUSION_COUNTER || - type == PIPE_QUERY_OCCLUSION_PREDICATE || - type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { - bool old_enable = sctx->num_occlusion_queries != 0; - bool old_perfect_enable = - sctx->num_perfect_occlusion_queries != 0; - bool enable, perfect_enable; - - sctx->num_occlusion_queries += diff; - assert(sctx->num_occlusion_queries >= 0); - - if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { - sctx->num_perfect_occlusion_queries += diff; - assert(sctx->num_perfect_occlusion_queries >= 0); - } - - enable = sctx->num_occlusion_queries != 0; - perfect_enable = sctx->num_perfect_occlusion_queries != 0; - - if (enable != old_enable || perfect_enable != old_perfect_enable) { - si_set_occlusion_query_state(sctx, old_perfect_enable); - } - } + if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE || + type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { + bool old_enable = sctx->num_occlusion_queries != 0; + bool old_perfect_enable = sctx->num_perfect_occlusion_queries != 0; + bool enable, perfect_enable; + + sctx->num_occlusion_queries += diff; + assert(sctx->num_occlusion_queries >= 0); + + if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { + sctx->num_perfect_occlusion_queries += diff; + assert(sctx->num_perfect_occlusion_queries >= 0); + } + + enable = sctx->num_occlusion_queries != 0; + perfect_enable = sctx->num_perfect_occlusion_queries != 0; + + if (enable != old_enable || perfect_enable != old_perfect_enable) { + si_set_occlusion_query_state(sctx, old_perfect_enable); + } + } } static unsigned event_type_for_stream(unsigned stream) { - switch (stream) { - default: - case 0: return V_028A90_SAMPLE_STREAMOUTSTATS; - case 1: return V_028A90_SAMPLE_STREAMOUTSTATS1; - case 2: return V_028A90_SAMPLE_STREAMOUTSTATS2; - case 3: return V_028A90_SAMPLE_STREAMOUTSTATS3; - } + switch (stream) { + default: + case 0: + return V_028A90_SAMPLE_STREAMOUTSTATS; + case 1: + return V_028A90_SAMPLE_STREAMOUTSTATS1; + case 2: + return V_028A90_SAMPLE_STREAMOUTSTATS2; + case 3: + return V_028A90_SAMPLE_STREAMOUTSTATS3; + } } -static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, - unsigned stream) +static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); } -static void si_query_hw_do_emit_start(struct si_context *sctx, - struct si_query_hw *query, - struct si_resource *buffer, - uint64_t va) +static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query, + struct si_resource *buffer, uint64_t va) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - switch (query->b.type) { - case SI_QUERY_TIME_ELAPSED_SDMA: - 
si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address); - return; - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - case PIPE_QUERY_PRIMITIVES_GENERATED: - case PIPE_QUERY_SO_STATISTICS: - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - emit_sample_streamout(cs, va, query->stream); - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) - emit_sample_streamout(cs, va + 32 * stream, stream); - break; - case PIPE_QUERY_TIME_ELAPSED: - si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, - EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, - EOP_DATA_SEL_TIMESTAMP, NULL, va, - 0, query->b.type); - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - break; - default: - assert(0); - } - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE, - RADEON_PRIO_QUERY); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + switch (query->b.type) { + case SI_QUERY_TIME_ELAPSED_SDMA: + si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address); + return; + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + emit_sample_streamout(cs, va, query->stream); + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) + emit_sample_streamout(cs, va + 32 * stream, stream); + break; + case PIPE_QUERY_TIME_ELAPSED: + si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, + EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type); + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + break; + default: + assert(0); + } + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); } -static void si_query_hw_emit_start(struct si_context *sctx, - struct si_query_hw *query) +static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query) { - uint64_t va; + uint64_t va; - if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, - query->result_size)) - return; + if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, query->result_size)) + return; - si_update_occlusion_query_state(sctx, query->b.type, 1); - si_update_prims_generated_query_state(sctx, query->b.type, 1); + si_update_occlusion_query_state(sctx, query->b.type, 1); + si_update_prims_generated_query_state(sctx, query->b.type, 1); - if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS) - sctx->num_pipeline_stat_queries++; + if (query->b.type == 
PIPE_QUERY_PIPELINE_STATISTICS) + sctx->num_pipeline_stat_queries++; - if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA) - si_need_gfx_cs_space(sctx); + if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA) + si_need_gfx_cs_space(sctx); - va = query->buffer.buf->gpu_address + query->buffer.results_end; - query->ops->emit_start(sctx, query, query->buffer.buf, va); + va = query->buffer.buf->gpu_address + query->buffer.results_end; + query->ops->emit_start(sctx, query, query->buffer.buf, va); } -static void si_query_hw_do_emit_stop(struct si_context *sctx, - struct si_query_hw *query, - struct si_resource *buffer, - uint64_t va) +static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query, + struct si_resource *buffer, uint64_t va) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - uint64_t fence_va = 0; - - switch (query->b.type) { - case SI_QUERY_TIME_ELAPSED_SDMA: - si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address); - return; - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - va += 8; - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - - fence_va = va + sctx->screen->info.num_render_backends * 16 - 8; - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - case PIPE_QUERY_PRIMITIVES_GENERATED: - case PIPE_QUERY_SO_STATISTICS: - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - va += 16; - emit_sample_streamout(cs, va, query->stream); - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - va += 16; - for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) - emit_sample_streamout(cs, va + 32 * stream, stream); - break; - case PIPE_QUERY_TIME_ELAPSED: - va += 8; - /* fall through */ - case PIPE_QUERY_TIMESTAMP: - si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, - EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, - EOP_DATA_SEL_TIMESTAMP, NULL, va, - 0, query->b.type); - fence_va = va + 8; - break; - case PIPE_QUERY_PIPELINE_STATISTICS: { - unsigned sample_size = (query->result_size - 8) / 2; - - va += sample_size; - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - - fence_va = va + sample_size; - break; - } - default: - assert(0); - } - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE, - RADEON_PRIO_QUERY); - - if (fence_va) { - si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, - EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, - query->buffer.buf, fence_va, 0x80000000, - query->b.type); - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + uint64_t fence_va = 0; + + switch (query->b.type) { + case SI_QUERY_TIME_ELAPSED_SDMA: + si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address); + return; + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + va += 8; + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + + fence_va = va + sctx->screen->info.num_render_backends * 16 - 8; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + va += 16; + emit_sample_streamout(cs, va, query->stream); + 
break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + va += 16; + for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) + emit_sample_streamout(cs, va + 32 * stream, stream); + break; + case PIPE_QUERY_TIME_ELAPSED: + va += 8; + /* fall through */ + case PIPE_QUERY_TIMESTAMP: + si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, + EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type); + fence_va = va + 8; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: { + unsigned sample_size = (query->result_size - 8) / 2; + + va += sample_size; + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + + fence_va = va + sample_size; + break; + } + default: + assert(0); + } + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); + + if (fence_va) { + si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, + EOP_DATA_SEL_VALUE_32BIT, query->buffer.buf, fence_va, 0x80000000, + query->b.type); + } } -static void si_query_hw_emit_stop(struct si_context *sctx, - struct si_query_hw *query) +static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *query) { - uint64_t va; + uint64_t va; - /* The queries which need begin already called this in begin_query. */ - if (query->flags & SI_QUERY_HW_FLAG_NO_START) { - si_need_gfx_cs_space(sctx); - if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, - query->result_size)) - return; - } + /* The queries which need begin already called this in begin_query. */ + if (query->flags & SI_QUERY_HW_FLAG_NO_START) { + si_need_gfx_cs_space(sctx); + if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, + query->result_size)) + return; + } - if (!query->buffer.buf) - return; // previous buffer allocation failure + if (!query->buffer.buf) + return; // previous buffer allocation failure - /* emit end query */ - va = query->buffer.buf->gpu_address + query->buffer.results_end; + /* emit end query */ + va = query->buffer.buf->gpu_address + query->buffer.results_end; - query->ops->emit_stop(sctx, query, query->buffer.buf, va); + query->ops->emit_stop(sctx, query, query->buffer.buf, va); - query->buffer.results_end += query->result_size; + query->buffer.results_end += query->result_size; - si_update_occlusion_query_state(sctx, query->b.type, -1); - si_update_prims_generated_query_state(sctx, query->b.type, -1); + si_update_occlusion_query_state(sctx, query->b.type, -1); + si_update_prims_generated_query_state(sctx, query->b.type, -1); - if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS) - sctx->num_pipeline_stat_queries--; + if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS) + sctx->num_pipeline_stat_queries--; } -static void emit_set_predicate(struct si_context *ctx, - struct si_resource *buf, uint64_t va, - uint32_t op) +static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, uint64_t va, + uint32_t op) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - - if (ctx->chip_class >= GFX9) { - radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0)); - radeon_emit(cs, op); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - } else { - radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); - radeon_emit(cs, va); - radeon_emit(cs, op | ((va >> 32) & 0xFF)); - } - radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_READ, - RADEON_PRIO_QUERY); + struct 
radeon_cmdbuf *cs = ctx->gfx_cs; + + if (ctx->chip_class >= GFX9) { + radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0)); + radeon_emit(cs, op); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + } else { + radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); + radeon_emit(cs, va); + radeon_emit(cs, op | ((va >> 32) & 0xFF)); + } + radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_READ, RADEON_PRIO_QUERY); } static void si_emit_query_predication(struct si_context *ctx) { - struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond; - struct si_query_buffer *qbuf; - uint32_t op; - bool flag_wait, invert; - - if (!query) - return; - - if (ctx->screen->use_ngg_streamout && - (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || - query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) { - assert(!"not implemented"); - } - - invert = ctx->render_cond_invert; - flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT || - ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT; - - if (query->workaround_buf) { - op = PRED_OP(PREDICATION_OP_BOOL64); - } else { - switch (query->b.type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - op = PRED_OP(PREDICATION_OP_ZPASS); - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - op = PRED_OP(PREDICATION_OP_PRIMCOUNT); - invert = !invert; - break; - default: - assert(0); - return; - } - } - - /* if true then invert, see GL_ARB_conditional_render_inverted */ - if (invert) - op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */ - else - op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */ - - /* Use the value written by compute shader as a workaround. Note that - * the wait flag does not apply in this predication mode. - * - * The shader outputs the result value to L2. Workarounds only affect GFX8 - * and later, where the CP reads data from L2, so we don't need an - * additional flush. - */ - if (query->workaround_buf) { - uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset; - emit_set_predicate(ctx, query->workaround_buf, va, op); - return; - } - - op |= flag_wait ? 
PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; - - /* emit predicate packets for all data blocks */ - for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { - unsigned results_base = 0; - uint64_t va_base = qbuf->buf->gpu_address; - - while (results_base < qbuf->results_end) { - uint64_t va = va_base + results_base; - - if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { - for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { - emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op); - - /* set CONTINUE bit for all packets except the first */ - op |= PREDICATION_CONTINUE; - } - } else { - emit_set_predicate(ctx, qbuf->buf, va, op); - op |= PREDICATION_CONTINUE; - } - - results_base += query->result_size; - } - } + struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond; + struct si_query_buffer *qbuf; + uint32_t op; + bool flag_wait, invert; + + if (!query) + return; + + if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) { + assert(!"not implemented"); + } + + invert = ctx->render_cond_invert; + flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT || + ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT; + + if (query->workaround_buf) { + op = PRED_OP(PREDICATION_OP_BOOL64); + } else { + switch (query->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + op = PRED_OP(PREDICATION_OP_ZPASS); + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + op = PRED_OP(PREDICATION_OP_PRIMCOUNT); + invert = !invert; + break; + default: + assert(0); + return; + } + } + + /* if true then invert, see GL_ARB_conditional_render_inverted */ + if (invert) + op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */ + else + op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */ + + /* Use the value written by compute shader as a workaround. Note that + * the wait flag does not apply in this predication mode. + * + * The shader outputs the result value to L2. Workarounds only affect GFX8 + * and later, where the CP reads data from L2, so we don't need an + * additional flush. + */ + if (query->workaround_buf) { + uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset; + emit_set_predicate(ctx, query->workaround_buf, va, op); + return; + } + + op |= flag_wait ? 
PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; + + /* emit predicate packets for all data blocks */ + for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { + unsigned results_base = 0; + uint64_t va_base = qbuf->buf->gpu_address; + + while (results_base < qbuf->results_end) { + uint64_t va = va_base + results_base; + + if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { + for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { + emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op); + + /* set CONTINUE bit for all packets except the first */ + op |= PREDICATION_CONTINUE; + } + } else { + emit_set_predicate(ctx, qbuf->buf, va, op); + op |= PREDICATION_CONTINUE; + } + + results_base += query->result_size; + } + } } -static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index) +static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type, + unsigned index) { - struct si_screen *sscreen = - (struct si_screen *)ctx->screen; - - if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || - query_type == PIPE_QUERY_GPU_FINISHED || - (query_type >= PIPE_QUERY_DRIVER_SPECIFIC && - query_type != SI_QUERY_TIME_ELAPSED_SDMA)) - return si_query_sw_create(query_type); - - if (sscreen->use_ngg_streamout && - (query_type == PIPE_QUERY_PRIMITIVES_EMITTED || - query_type == PIPE_QUERY_PRIMITIVES_GENERATED || - query_type == PIPE_QUERY_SO_STATISTICS || - query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || - query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) - return gfx10_sh_query_create(sscreen, query_type, index); - - return si_query_hw_create(sscreen, query_type, index); + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + + if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED || + (query_type >= PIPE_QUERY_DRIVER_SPECIFIC && query_type != SI_QUERY_TIME_ELAPSED_SDMA)) + return si_query_sw_create(query_type); + + if (sscreen->use_ngg_streamout && + (query_type == PIPE_QUERY_PRIMITIVES_EMITTED || + query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS || + query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) + return gfx10_sh_query_create(sscreen, query_type, index); + + return si_query_hw_create(sscreen, query_type, index); } static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_query *squery = (struct si_query *)query; + struct si_context *sctx = (struct si_context *)ctx; + struct si_query *squery = (struct si_query *)query; - squery->ops->destroy(sctx, squery); + squery->ops->destroy(sctx, squery); } -static bool si_begin_query(struct pipe_context *ctx, - struct pipe_query *query) +static bool si_begin_query(struct pipe_context *ctx, struct pipe_query *query) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_query *squery = (struct si_query *)query; + struct si_context *sctx = (struct si_context *)ctx; + struct si_query *squery = (struct si_query *)query; - return squery->ops->begin(sctx, squery); + return squery->ops->begin(sctx, squery); } -bool si_query_hw_begin(struct si_context *sctx, - struct si_query *squery) +bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery) { - struct si_query_hw *query = (struct si_query_hw *)squery; + struct si_query_hw *query = (struct si_query_hw *)squery; - if (query->flags & SI_QUERY_HW_FLAG_NO_START) { - assert(0); 
- return false; - } + if (query->flags & SI_QUERY_HW_FLAG_NO_START) { + assert(0); + return false; + } - if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES)) - si_query_buffer_reset(sctx, &query->buffer); + if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES)) + si_query_buffer_reset(sctx, &query->buffer); - si_resource_reference(&query->workaround_buf, NULL); + si_resource_reference(&query->workaround_buf, NULL); - si_query_hw_emit_start(sctx, query); - if (!query->buffer.buf) - return false; + si_query_hw_emit_start(sctx, query); + if (!query->buffer.buf) + return false; - list_addtail(&query->b.active_list, &sctx->active_queries); - sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend; - return true; + list_addtail(&query->b.active_list, &sctx->active_queries); + sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend; + return true; } static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_query *squery = (struct si_query *)query; + struct si_context *sctx = (struct si_context *)ctx; + struct si_query *squery = (struct si_query *)query; - return squery->ops->end(sctx, squery); + return squery->ops->end(sctx, squery); } -bool si_query_hw_end(struct si_context *sctx, - struct si_query *squery) +bool si_query_hw_end(struct si_context *sctx, struct si_query *squery) { - struct si_query_hw *query = (struct si_query_hw *)squery; + struct si_query_hw *query = (struct si_query_hw *)squery; - if (query->flags & SI_QUERY_HW_FLAG_NO_START) - si_query_buffer_reset(sctx, &query->buffer); + if (query->flags & SI_QUERY_HW_FLAG_NO_START) + si_query_buffer_reset(sctx, &query->buffer); - si_query_hw_emit_stop(sctx, query); + si_query_hw_emit_stop(sctx, query); - if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) { - list_delinit(&query->b.active_list); - sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend; - } + if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) { + list_delinit(&query->b.active_list); + sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend; + } - if (!query->buffer.buf) - return false; + if (!query->buffer.buf) + return false; - return true; + return true; } -static void si_get_hw_query_params(struct si_context *sctx, - struct si_query_hw *squery, int index, - struct si_hw_query_params *params) +static void si_get_hw_query_params(struct si_context *sctx, struct si_query_hw *squery, int index, + struct si_hw_query_params *params) { - unsigned max_rbs = sctx->screen->info.num_render_backends; - - params->pair_stride = 0; - params->pair_count = 1; - - switch (squery->b.type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - params->start_offset = 0; - params->end_offset = 8; - params->fence_offset = max_rbs * 16; - params->pair_stride = 16; - params->pair_count = max_rbs; - break; - case PIPE_QUERY_TIME_ELAPSED: - params->start_offset = 0; - params->end_offset = 8; - params->fence_offset = 16; - break; - case PIPE_QUERY_TIMESTAMP: - params->start_offset = 0; - params->end_offset = 0; - params->fence_offset = 8; - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - params->start_offset = 8; - params->end_offset = 24; - params->fence_offset = params->end_offset + 4; - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - params->start_offset = 0; - params->end_offset = 16; - params->fence_offset = params->end_offset + 4; - break; - case PIPE_QUERY_SO_STATISTICS: - params->start_offset = 8 - index * 8; - 
params->end_offset = 24 - index * 8; - params->fence_offset = params->end_offset + 4; - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - params->pair_count = SI_MAX_STREAMS; - params->pair_stride = 32; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - params->start_offset = 0; - params->end_offset = 16; - - /* We can re-use the high dword of the last 64-bit value as a - * fence: it is initialized as 0, and the high bit is set by - * the write of the streamout stats event. - */ - params->fence_offset = squery->result_size - 4; - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - { - static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80}; - params->start_offset = offsets[index]; - params->end_offset = 88 + offsets[index]; - params->fence_offset = 2 * 88; - break; - } - default: - unreachable("si_get_hw_query_params unsupported"); - } + unsigned max_rbs = sctx->screen->info.num_render_backends; + + params->pair_stride = 0; + params->pair_count = 1; + + switch (squery->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + params->start_offset = 0; + params->end_offset = 8; + params->fence_offset = max_rbs * 16; + params->pair_stride = 16; + params->pair_count = max_rbs; + break; + case PIPE_QUERY_TIME_ELAPSED: + params->start_offset = 0; + params->end_offset = 8; + params->fence_offset = 16; + break; + case PIPE_QUERY_TIMESTAMP: + params->start_offset = 0; + params->end_offset = 0; + params->fence_offset = 8; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + params->start_offset = 8; + params->end_offset = 24; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + params->start_offset = 0; + params->end_offset = 16; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_SO_STATISTICS: + params->start_offset = 8 - index * 8; + params->end_offset = 24 - index * 8; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + params->pair_count = SI_MAX_STREAMS; + params->pair_stride = 32; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + params->start_offset = 0; + params->end_offset = 16; + + /* We can re-use the high dword of the last 64-bit value as a + * fence: it is initialized as 0, and the high bit is set by + * the write of the streamout stats event. 
+ */ + params->fence_offset = squery->result_size - 4; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: { + static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80}; + params->start_offset = offsets[index]; + params->end_offset = 88 + offsets[index]; + params->fence_offset = 2 * 88; + break; + } + default: + unreachable("si_get_hw_query_params unsupported"); + } } static unsigned si_query_read_result(void *map, unsigned start_index, unsigned end_index, - bool test_status_bit) + bool test_status_bit) { - uint32_t *current_result = (uint32_t*)map; - uint64_t start, end; - - start = (uint64_t)current_result[start_index] | - (uint64_t)current_result[start_index+1] << 32; - end = (uint64_t)current_result[end_index] | - (uint64_t)current_result[end_index+1] << 32; - - if (!test_status_bit || - ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) { - return end - start; - } - return 0; + uint32_t *current_result = (uint32_t *)map; + uint64_t start, end; + + start = (uint64_t)current_result[start_index] | (uint64_t)current_result[start_index + 1] << 32; + end = (uint64_t)current_result[end_index] | (uint64_t)current_result[end_index + 1] << 32; + + if (!test_status_bit || ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) { + return end - start; + } + return 0; } -static void si_query_hw_add_result(struct si_screen *sscreen, - struct si_query_hw *query, - void *buffer, - union pipe_query_result *result) +static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *query, + void *buffer, union pipe_query_result *result) { - unsigned max_rbs = sscreen->info.num_render_backends; - - switch (query->b.type) { - case PIPE_QUERY_OCCLUSION_COUNTER: { - for (unsigned i = 0; i < max_rbs; ++i) { - unsigned results_base = i * 16; - result->u64 += - si_query_read_result(buffer + results_base, 0, 2, true); - } - break; - } - case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: { - for (unsigned i = 0; i < max_rbs; ++i) { - unsigned results_base = i * 16; - result->b = result->b || - si_query_read_result(buffer + results_base, 0, 2, true) != 0; - } - break; - } - case PIPE_QUERY_TIME_ELAPSED: - result->u64 += si_query_read_result(buffer, 0, 2, false); - break; - case SI_QUERY_TIME_ELAPSED_SDMA: - result->u64 += si_query_read_result(buffer, 0, 32/4, false); - break; - case PIPE_QUERY_TIMESTAMP: - result->u64 = *(uint64_t*)buffer; - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - /* SAMPLE_STREAMOUTSTATS stores this structure: - * { - * u64 NumPrimitivesWritten; - * u64 PrimitiveStorageNeeded; - * } - * We only need NumPrimitivesWritten here. */ - result->u64 += si_query_read_result(buffer, 2, 6, true); - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - /* Here we read PrimitiveStorageNeeded. 
*/ - result->u64 += si_query_read_result(buffer, 0, 4, true); - break; - case PIPE_QUERY_SO_STATISTICS: - result->so_statistics.num_primitives_written += - si_query_read_result(buffer, 2, 6, true); - result->so_statistics.primitives_storage_needed += - si_query_read_result(buffer, 0, 4, true); - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - result->b = result->b || - si_query_read_result(buffer, 2, 6, true) != - si_query_read_result(buffer, 0, 4, true); - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { - result->b = result->b || - si_query_read_result(buffer, 2, 6, true) != - si_query_read_result(buffer, 0, 4, true); - buffer = (char *)buffer + 32; - } - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - result->pipeline_statistics.ps_invocations += - si_query_read_result(buffer, 0, 22, false); - result->pipeline_statistics.c_primitives += - si_query_read_result(buffer, 2, 24, false); - result->pipeline_statistics.c_invocations += - si_query_read_result(buffer, 4, 26, false); - result->pipeline_statistics.vs_invocations += - si_query_read_result(buffer, 6, 28, false); - result->pipeline_statistics.gs_invocations += - si_query_read_result(buffer, 8, 30, false); - result->pipeline_statistics.gs_primitives += - si_query_read_result(buffer, 10, 32, false); - result->pipeline_statistics.ia_primitives += - si_query_read_result(buffer, 12, 34, false); - result->pipeline_statistics.ia_vertices += - si_query_read_result(buffer, 14, 36, false); - result->pipeline_statistics.hs_invocations += - si_query_read_result(buffer, 16, 38, false); - result->pipeline_statistics.ds_invocations += - si_query_read_result(buffer, 18, 40, false); - result->pipeline_statistics.cs_invocations += - si_query_read_result(buffer, 20, 42, false); + unsigned max_rbs = sscreen->info.num_render_backends; + + switch (query->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: { + for (unsigned i = 0; i < max_rbs; ++i) { + unsigned results_base = i * 16; + result->u64 += si_query_read_result(buffer + results_base, 0, 2, true); + } + break; + } + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: { + for (unsigned i = 0; i < max_rbs; ++i) { + unsigned results_base = i * 16; + result->b = result->b || si_query_read_result(buffer + results_base, 0, 2, true) != 0; + } + break; + } + case PIPE_QUERY_TIME_ELAPSED: + result->u64 += si_query_read_result(buffer, 0, 2, false); + break; + case SI_QUERY_TIME_ELAPSED_SDMA: + result->u64 += si_query_read_result(buffer, 0, 32 / 4, false); + break; + case PIPE_QUERY_TIMESTAMP: + result->u64 = *(uint64_t *)buffer; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + /* SAMPLE_STREAMOUTSTATS stores this structure: + * { + * u64 NumPrimitivesWritten; + * u64 PrimitiveStorageNeeded; + * } + * We only need NumPrimitivesWritten here. */ + result->u64 += si_query_read_result(buffer, 2, 6, true); + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + /* Here we read PrimitiveStorageNeeded. 
*/ + result->u64 += si_query_read_result(buffer, 0, 4, true); + break; + case PIPE_QUERY_SO_STATISTICS: + result->so_statistics.num_primitives_written += si_query_read_result(buffer, 2, 6, true); + result->so_statistics.primitives_storage_needed += si_query_read_result(buffer, 0, 4, true); + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + result->b = result->b || si_query_read_result(buffer, 2, 6, true) != + si_query_read_result(buffer, 0, 4, true); + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { + result->b = result->b || si_query_read_result(buffer, 2, 6, true) != + si_query_read_result(buffer, 0, 4, true); + buffer = (char *)buffer + 32; + } + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + result->pipeline_statistics.ps_invocations += si_query_read_result(buffer, 0, 22, false); + result->pipeline_statistics.c_primitives += si_query_read_result(buffer, 2, 24, false); + result->pipeline_statistics.c_invocations += si_query_read_result(buffer, 4, 26, false); + result->pipeline_statistics.vs_invocations += si_query_read_result(buffer, 6, 28, false); + result->pipeline_statistics.gs_invocations += si_query_read_result(buffer, 8, 30, false); + result->pipeline_statistics.gs_primitives += si_query_read_result(buffer, 10, 32, false); + result->pipeline_statistics.ia_primitives += si_query_read_result(buffer, 12, 34, false); + result->pipeline_statistics.ia_vertices += si_query_read_result(buffer, 14, 36, false); + result->pipeline_statistics.hs_invocations += si_query_read_result(buffer, 16, 38, false); + result->pipeline_statistics.ds_invocations += si_query_read_result(buffer, 18, 40, false); + result->pipeline_statistics.cs_invocations += si_query_read_result(buffer, 20, 42, false); #if 0 /* for testing */ printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, " "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, " @@ -1402,444 +1346,416 @@ static void si_query_hw_add_result(struct si_screen *sscreen, result->pipeline_statistics.ps_invocations, result->pipeline_statistics.cs_invocations); #endif - break; - default: - assert(0); - } + break; + default: + assert(0); + } } void si_query_hw_suspend(struct si_context *sctx, struct si_query *query) { - si_query_hw_emit_stop(sctx, (struct si_query_hw *)query); + si_query_hw_emit_stop(sctx, (struct si_query_hw *)query); } void si_query_hw_resume(struct si_context *sctx, struct si_query *query) { - si_query_hw_emit_start(sctx, (struct si_query_hw *)query); + si_query_hw_emit_start(sctx, (struct si_query_hw *)query); } static const struct si_query_ops query_hw_ops = { - .destroy = si_query_hw_destroy, - .begin = si_query_hw_begin, - .end = si_query_hw_end, - .get_result = si_query_hw_get_result, - .get_result_resource = si_query_hw_get_result_resource, - - .suspend = si_query_hw_suspend, - .resume = si_query_hw_resume, + .destroy = si_query_hw_destroy, + .begin = si_query_hw_begin, + .end = si_query_hw_end, + .get_result = si_query_hw_get_result, + .get_result_resource = si_query_hw_get_result_resource, + + .suspend = si_query_hw_suspend, + .resume = si_query_hw_resume, }; -static bool si_get_query_result(struct pipe_context *ctx, - struct pipe_query *query, bool wait, - union pipe_query_result *result) +static bool si_get_query_result(struct pipe_context *ctx, struct pipe_query *query, bool wait, + union pipe_query_result *result) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_query *squery = (struct si_query *)query; + struct si_context *sctx 
= (struct si_context *)ctx; + struct si_query *squery = (struct si_query *)query; - return squery->ops->get_result(sctx, squery, wait, result); + return squery->ops->get_result(sctx, squery, wait, result); } -static void si_get_query_result_resource(struct pipe_context *ctx, - struct pipe_query *query, - bool wait, - enum pipe_query_value_type result_type, - int index, - struct pipe_resource *resource, - unsigned offset) +static void si_get_query_result_resource(struct pipe_context *ctx, struct pipe_query *query, + bool wait, enum pipe_query_value_type result_type, + int index, struct pipe_resource *resource, unsigned offset) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_query *squery = (struct si_query *)query; + struct si_context *sctx = (struct si_context *)ctx; + struct si_query *squery = (struct si_query *)query; - squery->ops->get_result_resource(sctx, squery, wait, result_type, index, - resource, offset); + squery->ops->get_result_resource(sctx, squery, wait, result_type, index, resource, offset); } -static void si_query_hw_clear_result(struct si_query_hw *query, - union pipe_query_result *result) +static void si_query_hw_clear_result(struct si_query_hw *query, union pipe_query_result *result) { - util_query_clear_result(result, query->b.type); + util_query_clear_result(result, query->b.type); } -bool si_query_hw_get_result(struct si_context *sctx, - struct si_query *squery, - bool wait, union pipe_query_result *result) +bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait, + union pipe_query_result *result) { - struct si_screen *sscreen = sctx->screen; - struct si_query_hw *query = (struct si_query_hw *)squery; - struct si_query_buffer *qbuf; - - query->ops->clear_result(query, result); - - for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { - unsigned usage = PIPE_TRANSFER_READ | - (wait ? 0 : PIPE_TRANSFER_DONTBLOCK); - unsigned results_base = 0; - void *map; - - if (squery->b.flushed) - map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage); - else - map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage); - - if (!map) - return false; - - while (results_base != qbuf->results_end) { - query->ops->add_result(sscreen, query, map + results_base, - result); - results_base += query->result_size; - } - } - - /* Convert the time to expected units. */ - if (squery->type == PIPE_QUERY_TIME_ELAPSED || - squery->type == SI_QUERY_TIME_ELAPSED_SDMA || - squery->type == PIPE_QUERY_TIMESTAMP) { - result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq; - } - return true; + struct si_screen *sscreen = sctx->screen; + struct si_query_hw *query = (struct si_query_hw *)squery; + struct si_query_buffer *qbuf; + + query->ops->clear_result(query, result); + + for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { + unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK); + unsigned results_base = 0; + void *map; + + if (squery->b.flushed) + map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage); + else + map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage); + + if (!map) + return false; + + while (results_base != qbuf->results_end) { + query->ops->add_result(sscreen, query, map + results_base, result); + results_base += query->result_size; + } + } + + /* Convert the time to expected units. 
*/ + if (squery->type == PIPE_QUERY_TIME_ELAPSED || squery->type == SI_QUERY_TIME_ELAPSED_SDMA || + squery->type == PIPE_QUERY_TIMESTAMP) { + result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq; + } + return true; } -static void si_query_hw_get_result_resource(struct si_context *sctx, - struct si_query *squery, - bool wait, - enum pipe_query_value_type result_type, - int index, - struct pipe_resource *resource, - unsigned offset) +static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery, + bool wait, enum pipe_query_value_type result_type, + int index, struct pipe_resource *resource, + unsigned offset) { - struct si_query_hw *query = (struct si_query_hw *)squery; - struct si_query_buffer *qbuf; - struct si_query_buffer *qbuf_prev; - struct pipe_resource *tmp_buffer = NULL; - unsigned tmp_buffer_offset = 0; - struct si_qbo_state saved_state = {}; - struct pipe_grid_info grid = {}; - struct pipe_constant_buffer constant_buffer = {}; - struct pipe_shader_buffer ssbo[3]; - struct si_hw_query_params params; - struct { - uint32_t end_offset; - uint32_t result_stride; - uint32_t result_count; - uint32_t config; - uint32_t fence_offset; - uint32_t pair_stride; - uint32_t pair_count; - } consts; - - if (!sctx->query_result_shader) { - sctx->query_result_shader = si_create_query_result_cs(sctx); - if (!sctx->query_result_shader) - return; - } - - if (query->buffer.previous) { - u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, - &tmp_buffer_offset, &tmp_buffer); - if (!tmp_buffer) - return; - } - - si_save_qbo_state(sctx, &saved_state); - - si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, ¶ms); - consts.end_offset = params.end_offset - params.start_offset; - consts.fence_offset = params.fence_offset - params.start_offset; - consts.result_stride = query->result_size; - consts.pair_stride = params.pair_stride; - consts.pair_count = params.pair_count; - - constant_buffer.buffer_size = sizeof(consts); - constant_buffer.user_buffer = &consts; - - ssbo[1].buffer = tmp_buffer; - ssbo[1].buffer_offset = tmp_buffer_offset; - ssbo[1].buffer_size = 16; - - ssbo[2] = ssbo[1]; - - sctx->b.bind_compute_state(&sctx->b, sctx->query_result_shader); - - grid.block[0] = 1; - grid.block[1] = 1; - grid.block[2] = 1; - grid.grid[0] = 1; - grid.grid[1] = 1; - grid.grid[2] = 1; - - consts.config = 0; - if (index < 0) - consts.config |= 4; - if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE || - query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) - consts.config |= 8; - else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || - query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) - consts.config |= 8 | 256; - else if (query->b.type == PIPE_QUERY_TIMESTAMP || - query->b.type == PIPE_QUERY_TIME_ELAPSED) - consts.config |= 32; - - switch (result_type) { - case PIPE_QUERY_TYPE_U64: - case PIPE_QUERY_TYPE_I64: - consts.config |= 64; - break; - case PIPE_QUERY_TYPE_I32: - consts.config |= 128; - break; - case PIPE_QUERY_TYPE_U32: - break; - } - - sctx->flags |= sctx->screen->barrier_flags.cp_to_L2; - - for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) { - if (query->b.type != PIPE_QUERY_TIMESTAMP) { - qbuf_prev = qbuf->previous; - consts.result_count = qbuf->results_end / query->result_size; - consts.config &= ~3; - if (qbuf != &query->buffer) - consts.config |= 1; - if (qbuf->previous) - consts.config |= 2; - } else { - /* Only read the last timestamp. 
*/ - qbuf_prev = NULL; - consts.result_count = 0; - consts.config |= 16; - params.start_offset += qbuf->results_end - query->result_size; - } - - sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer); - - ssbo[0].buffer = &qbuf->buf->b.b; - ssbo[0].buffer_offset = params.start_offset; - ssbo[0].buffer_size = qbuf->results_end - params.start_offset; - - if (!qbuf->previous) { - ssbo[2].buffer = resource; - ssbo[2].buffer_offset = offset; - ssbo[2].buffer_size = 8; - - si_resource(resource)->TC_L2_dirty = true; - } - - sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, - 1 << 2); - - if (wait && qbuf == &query->buffer) { - uint64_t va; - - /* Wait for result availability. Wait only for readiness - * of the last entry, since the fence writes should be - * serialized in the CP. - */ - va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size; - va += params.fence_offset; - - si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x80000000, - 0x80000000, WAIT_REG_MEM_EQUAL); - } - - sctx->b.launch_grid(&sctx->b, &grid); - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; - } - - si_restore_qbo_state(sctx, &saved_state); - pipe_resource_reference(&tmp_buffer, NULL); + struct si_query_hw *query = (struct si_query_hw *)squery; + struct si_query_buffer *qbuf; + struct si_query_buffer *qbuf_prev; + struct pipe_resource *tmp_buffer = NULL; + unsigned tmp_buffer_offset = 0; + struct si_qbo_state saved_state = {}; + struct pipe_grid_info grid = {}; + struct pipe_constant_buffer constant_buffer = {}; + struct pipe_shader_buffer ssbo[3]; + struct si_hw_query_params params; + struct { + uint32_t end_offset; + uint32_t result_stride; + uint32_t result_count; + uint32_t config; + uint32_t fence_offset; + uint32_t pair_stride; + uint32_t pair_count; + } consts; + + if (!sctx->query_result_shader) { + sctx->query_result_shader = si_create_query_result_cs(sctx); + if (!sctx->query_result_shader) + return; + } + + if (query->buffer.previous) { + u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer); + if (!tmp_buffer) + return; + } + + si_save_qbo_state(sctx, &saved_state); + + si_get_hw_query_params(sctx, query, index >= 0 ? 
index : 0, &params); + consts.end_offset = params.end_offset - params.start_offset; + consts.fence_offset = params.fence_offset - params.start_offset; + consts.result_stride = query->result_size; + consts.pair_stride = params.pair_stride; + consts.pair_count = params.pair_count; + + constant_buffer.buffer_size = sizeof(consts); + constant_buffer.user_buffer = &consts; + + ssbo[1].buffer = tmp_buffer; + ssbo[1].buffer_offset = tmp_buffer_offset; + ssbo[1].buffer_size = 16; + + ssbo[2] = ssbo[1]; + + sctx->b.bind_compute_state(&sctx->b, sctx->query_result_shader); + + grid.block[0] = 1; + grid.block[1] = 1; + grid.block[2] = 1; + grid.grid[0] = 1; + grid.grid[1] = 1; + grid.grid[2] = 1; + + consts.config = 0; + if (index < 0) + consts.config |= 4; + if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE || + query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) + consts.config |= 8; + else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) + consts.config |= 8 | 256; + else if (query->b.type == PIPE_QUERY_TIMESTAMP || query->b.type == PIPE_QUERY_TIME_ELAPSED) + consts.config |= 32; + + switch (result_type) { + case PIPE_QUERY_TYPE_U64: + case PIPE_QUERY_TYPE_I64: + consts.config |= 64; + break; + case PIPE_QUERY_TYPE_I32: + consts.config |= 128; + break; + case PIPE_QUERY_TYPE_U32: + break; + } + + sctx->flags |= sctx->screen->barrier_flags.cp_to_L2; + + for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) { + if (query->b.type != PIPE_QUERY_TIMESTAMP) { + qbuf_prev = qbuf->previous; + consts.result_count = qbuf->results_end / query->result_size; + consts.config &= ~3; + if (qbuf != &query->buffer) + consts.config |= 1; + if (qbuf->previous) + consts.config |= 2; + } else { + /* Only read the last timestamp. */ + qbuf_prev = NULL; + consts.result_count = 0; + consts.config |= 16; + params.start_offset += qbuf->results_end - query->result_size; + } + + sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer); + + ssbo[0].buffer = &qbuf->buf->b.b; + ssbo[0].buffer_offset = params.start_offset; + ssbo[0].buffer_size = qbuf->results_end - params.start_offset; + + if (!qbuf->previous) { + ssbo[2].buffer = resource; + ssbo[2].buffer_offset = offset; + ssbo[2].buffer_size = 8; + + si_resource(resource)->TC_L2_dirty = true; + } + + sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 1 << 2); + + if (wait && qbuf == &query->buffer) { + uint64_t va; + + /* Wait for result availability. Wait only for readiness + * of the last entry, since the fence writes should be + * serialized in the CP.
+ */ + va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size; + va += params.fence_offset; + + si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x80000000, 0x80000000, WAIT_REG_MEM_EQUAL); + } + + sctx->b.launch_grid(&sctx->b, &grid); + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + } + + si_restore_qbo_state(sctx, &saved_state); + pipe_resource_reference(&tmp_buffer, NULL); } -static void si_render_condition(struct pipe_context *ctx, - struct pipe_query *query, - bool condition, - enum pipe_render_cond_flag mode) +static void si_render_condition(struct pipe_context *ctx, struct pipe_query *query, bool condition, + enum pipe_render_cond_flag mode) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_query_hw *squery = (struct si_query_hw *)query; - struct si_atom *atom = &sctx->atoms.s.render_cond; - - if (query) { - bool needs_workaround = false; - - /* There was a firmware regression in GFX8 which causes successive - * SET_PREDICATION packets to give the wrong answer for - * non-inverted stream overflow predication. - */ - if (((sctx->chip_class == GFX8 && sctx->screen->info.pfp_fw_feature < 49) || - (sctx->chip_class == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) && - !condition && - (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE || - (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE && - (squery->buffer.previous || - squery->buffer.results_end > squery->result_size)))) { - needs_workaround = true; - } - - if (needs_workaround && !squery->workaround_buf) { - bool old_force_off = sctx->render_cond_force_off; - sctx->render_cond_force_off = true; - - u_suballocator_alloc( - sctx->allocator_zeroed_memory, 8, 8, - &squery->workaround_offset, - (struct pipe_resource **)&squery->workaround_buf); - - /* Reset to NULL to avoid a redundant SET_PREDICATION - * from launching the compute grid. - */ - sctx->render_cond = NULL; - - ctx->get_query_result_resource( - ctx, query, true, PIPE_QUERY_TYPE_U64, 0, - &squery->workaround_buf->b.b, squery->workaround_offset); - - /* Settings this in the render cond atom is too late, - * so set it here. */ - sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | - SI_CONTEXT_FLUSH_FOR_RENDER_COND; - - sctx->render_cond_force_off = old_force_off; - } - } - - sctx->render_cond = query; - sctx->render_cond_invert = condition; - sctx->render_cond_mode = mode; - - si_set_atom_dirty(sctx, atom, query != NULL); + struct si_context *sctx = (struct si_context *)ctx; + struct si_query_hw *squery = (struct si_query_hw *)query; + struct si_atom *atom = &sctx->atoms.s.render_cond; + + if (query) { + bool needs_workaround = false; + + /* There was a firmware regression in GFX8 which causes successive + * SET_PREDICATION packets to give the wrong answer for + * non-inverted stream overflow predication. 
+ */ + if (((sctx->chip_class == GFX8 && sctx->screen->info.pfp_fw_feature < 49) || + (sctx->chip_class == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) && + !condition && + (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE || + (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE && + (squery->buffer.previous || squery->buffer.results_end > squery->result_size)))) { + needs_workaround = true; + } + + if (needs_workaround && !squery->workaround_buf) { + bool old_force_off = sctx->render_cond_force_off; + sctx->render_cond_force_off = true; + + u_suballocator_alloc(sctx->allocator_zeroed_memory, 8, 8, &squery->workaround_offset, + (struct pipe_resource **)&squery->workaround_buf); + + /* Reset to NULL to avoid a redundant SET_PREDICATION + * from launching the compute grid. + */ + sctx->render_cond = NULL; + + ctx->get_query_result_resource(ctx, query, true, PIPE_QUERY_TYPE_U64, 0, + &squery->workaround_buf->b.b, squery->workaround_offset); + + /* Settings this in the render cond atom is too late, + * so set it here. */ + sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND; + + sctx->render_cond_force_off = old_force_off; + } + } + + sctx->render_cond = query; + sctx->render_cond_invert = condition; + sctx->render_cond_mode = mode; + + si_set_atom_dirty(sctx, atom, query != NULL); } void si_suspend_queries(struct si_context *sctx) { - struct si_query *query; + struct si_query *query; - LIST_FOR_EACH_ENTRY(query, &sctx->active_queries, active_list) - query->ops->suspend(sctx, query); + LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list) + query->ops->suspend(sctx, query); } void si_resume_queries(struct si_context *sctx) { - struct si_query *query; + struct si_query *query; - /* Check CS space here. Resuming must not be interrupted by flushes. */ - si_need_gfx_cs_space(sctx); + /* Check CS space here. Resuming must not be interrupted by flushes. 
*/ + si_need_gfx_cs_space(sctx); - LIST_FOR_EACH_ENTRY(query, &sctx->active_queries, active_list) - query->ops->resume(sctx, query); + LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list) + query->ops->resume(sctx, query); } -#define XFULL(name_, query_type_, type_, result_type_, group_id_) \ - { \ - .name = name_, \ - .query_type = SI_QUERY_##query_type_, \ - .type = PIPE_DRIVER_QUERY_TYPE_##type_, \ - .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \ - .group_id = group_id_ \ - } +#define XFULL(name_, query_type_, type_, result_type_, group_id_) \ + { \ + .name = name_, .query_type = SI_QUERY_##query_type_, .type = PIPE_DRIVER_QUERY_TYPE_##type_, \ + .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, .group_id = group_id_ \ + } -#define X(name_, query_type_, type_, result_type_) \ - XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0) +#define X(name_, query_type_, type_, result_type_) \ + XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0) -#define XG(group_, name_, query_type_, type_, result_type_) \ - XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_) +#define XG(group_, name_, query_type_, type_, result_type_) \ + XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_) static struct pipe_driver_query_info si_driver_query_list[] = { - X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE), - X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE), - X("draw-calls", DRAW_CALLS, UINT64, AVERAGE), - X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE), - X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE), - X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE), - X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE), - X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE), - X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE), - X("dma-calls", DMA_CALLS, UINT64, AVERAGE), - X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE), - X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE), - X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE), - X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE), - X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE), - X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE), - X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE), - X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE), - X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE), - X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE), - X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE), - X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE), - X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE), - X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE), - X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE), - X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE), - X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE), - X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE), - X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE), - X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE), - X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE), - X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE), - X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE), - X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE), - X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE), - X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE), - X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE), - X("VRAM-usage", VRAM_USAGE, 
BYTES, AVERAGE), - X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE), - X("GTT-usage", GTT_USAGE, BYTES, AVERAGE), - X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE), - X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE), - X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE), - X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE), - X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE), - X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE), - X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE), - - /* GPIN queries are for the benefit of old versions of GPUPerfStudio, - * which use it as a fallback path to detect the GPU type. - * - * Note: The names of these queries are significant for GPUPerfStudio - * (and possibly their order as well). */ - XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE), - XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE), - XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE), - XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE), - XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE), - - X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE), - X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE), - X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE), - - /* The following queries must be at the end of the list because their - * availability is adjusted dynamically based on the DRM version. */ - X("GPU-load", GPU_LOAD, UINT64, AVERAGE), - X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE), - X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE), - X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE), - X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE), - X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE), - X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE), - X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE), - X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE), - X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE), - X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE), - X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE), - X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE), - X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE), - - /* SRBM_STATUS2 */ - X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE), - - /* CP_STAT */ - X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE), - X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE), - X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE), - X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE), - X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE), - X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE), - - X("pd-num-prims-accepted", PD_NUM_PRIMS_ACCEPTED, UINT64, AVERAGE), - X("pd-num-prims-rejected", PD_NUM_PRIMS_REJECTED, UINT64, AVERAGE), - X("pd-num-prims-ineligible", PD_NUM_PRIMS_INELIGIBLE,UINT64, AVERAGE), + X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE), + X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE), + X("draw-calls", DRAW_CALLS, UINT64, AVERAGE), + X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE), + X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE), + X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE), + X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE), + X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE), + X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE), + X("dma-calls", DMA_CALLS, UINT64, AVERAGE), + X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE), + X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE), + X("num-ps-flushes", 
NUM_PS_FLUSHES, UINT64, AVERAGE), + X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE), + X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE), + X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE), + X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE), + X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE), + X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE), + X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE), + X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE), + X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE), + X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE), + X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE), + X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE), + X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE), + X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE), + X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE), + X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE), + X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE), + X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE), + X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE), + X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE), + X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE), + X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE), + X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE), + X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE), + X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE), + X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE), + X("GTT-usage", GTT_USAGE, BYTES, AVERAGE), + X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE), + X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE), + X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE), + X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE), + X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE), + X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE), + X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE), + + /* GPIN queries are for the benefit of old versions of GPUPerfStudio, + * which use it as a fallback path to detect the GPU type. + * + * Note: The names of these queries are significant for GPUPerfStudio + * (and possibly their order as well). */ + XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE), + XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE), + XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE), + XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE), + XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE), + + X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE), + X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE), + X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE), + + /* The following queries must be at the end of the list because their + * availability is adjusted dynamically based on the DRM version. 
*/ + X("GPU-load", GPU_LOAD, UINT64, AVERAGE), + X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE), + X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE), + X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE), + X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE), + X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE), + X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE), + X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE), + X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE), + X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE), + X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE), + X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE), + X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE), + X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE), + + /* SRBM_STATUS2 */ + X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE), + + /* CP_STAT */ + X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE), + X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE), + X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE), + X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE), + X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE), + X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE), + + X("pd-num-prims-accepted", PD_NUM_PRIMS_ACCEPTED, UINT64, AVERAGE), + X("pd-num-prims-rejected", PD_NUM_PRIMS_REJECTED, UINT64, AVERAGE), + X("pd-num-prims-ineligible", PD_NUM_PRIMS_INELIGIBLE, UINT64, AVERAGE), }; #undef X @@ -1848,119 +1764,116 @@ static struct pipe_driver_query_info si_driver_query_list[] = { static unsigned si_get_num_queries(struct si_screen *sscreen) { - /* amdgpu */ - if (sscreen->info.is_amdgpu) { - if (sscreen->info.chip_class >= GFX8) - return ARRAY_SIZE(si_driver_query_list); - else - return ARRAY_SIZE(si_driver_query_list) - 7; - } - - /* radeon */ - if (sscreen->info.has_read_registers_query) { - if (sscreen->info.chip_class == GFX7) - return ARRAY_SIZE(si_driver_query_list) - 6; - else - return ARRAY_SIZE(si_driver_query_list) - 7; - } - - return ARRAY_SIZE(si_driver_query_list) - 21; + /* amdgpu */ + if (sscreen->info.is_amdgpu) { + if (sscreen->info.chip_class >= GFX8) + return ARRAY_SIZE(si_driver_query_list); + else + return ARRAY_SIZE(si_driver_query_list) - 7; + } + + /* radeon */ + if (sscreen->info.has_read_registers_query) { + if (sscreen->info.chip_class == GFX7) + return ARRAY_SIZE(si_driver_query_list) - 6; + else + return ARRAY_SIZE(si_driver_query_list) - 7; + } + + return ARRAY_SIZE(si_driver_query_list) - 21; } -static int si_get_driver_query_info(struct pipe_screen *screen, - unsigned index, - struct pipe_driver_query_info *info) +static int si_get_driver_query_info(struct pipe_screen *screen, unsigned index, + struct pipe_driver_query_info *info) { - struct si_screen *sscreen = (struct si_screen*)screen; - unsigned num_queries = si_get_num_queries(sscreen); - - if (!info) { - unsigned num_perfcounters = - si_get_perfcounter_info(sscreen, 0, NULL); - - return num_queries + num_perfcounters; - } - - if (index >= num_queries) - return si_get_perfcounter_info(sscreen, index - num_queries, info); - - *info = si_driver_query_list[index]; - - switch (info->query_type) { - case SI_QUERY_REQUESTED_VRAM: - case SI_QUERY_VRAM_USAGE: - case SI_QUERY_MAPPED_VRAM: - info->max_value.u64 = sscreen->info.vram_size; - break; - case SI_QUERY_REQUESTED_GTT: - case SI_QUERY_GTT_USAGE: - case SI_QUERY_MAPPED_GTT: - info->max_value.u64 = sscreen->info.gart_size; - break; - case SI_QUERY_GPU_TEMPERATURE: - info->max_value.u64 = 125; - break; - case SI_QUERY_VRAM_VIS_USAGE: - info->max_value.u64 = sscreen->info.vram_vis_size; - 
break; - } - - if (info->group_id != ~(unsigned)0 && sscreen->perfcounters) - info->group_id += sscreen->perfcounters->num_groups; - - return 1; + struct si_screen *sscreen = (struct si_screen *)screen; + unsigned num_queries = si_get_num_queries(sscreen); + + if (!info) { + unsigned num_perfcounters = si_get_perfcounter_info(sscreen, 0, NULL); + + return num_queries + num_perfcounters; + } + + if (index >= num_queries) + return si_get_perfcounter_info(sscreen, index - num_queries, info); + + *info = si_driver_query_list[index]; + + switch (info->query_type) { + case SI_QUERY_REQUESTED_VRAM: + case SI_QUERY_VRAM_USAGE: + case SI_QUERY_MAPPED_VRAM: + info->max_value.u64 = sscreen->info.vram_size; + break; + case SI_QUERY_REQUESTED_GTT: + case SI_QUERY_GTT_USAGE: + case SI_QUERY_MAPPED_GTT: + info->max_value.u64 = sscreen->info.gart_size; + break; + case SI_QUERY_GPU_TEMPERATURE: + info->max_value.u64 = 125; + break; + case SI_QUERY_VRAM_VIS_USAGE: + info->max_value.u64 = sscreen->info.vram_vis_size; + break; + } + + if (info->group_id != ~(unsigned)0 && sscreen->perfcounters) + info->group_id += sscreen->perfcounters->num_groups; + + return 1; } /* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware * performance counter groups, so be careful when changing this and related * functions. */ -static int si_get_driver_query_group_info(struct pipe_screen *screen, - unsigned index, - struct pipe_driver_query_group_info *info) +static int si_get_driver_query_group_info(struct pipe_screen *screen, unsigned index, + struct pipe_driver_query_group_info *info) { - struct si_screen *sscreen = (struct si_screen *)screen; - unsigned num_pc_groups = 0; + struct si_screen *sscreen = (struct si_screen *)screen; + unsigned num_pc_groups = 0; - if (sscreen->perfcounters) - num_pc_groups = sscreen->perfcounters->num_groups; + if (sscreen->perfcounters) + num_pc_groups = sscreen->perfcounters->num_groups; - if (!info) - return num_pc_groups + SI_NUM_SW_QUERY_GROUPS; + if (!info) + return num_pc_groups + SI_NUM_SW_QUERY_GROUPS; - if (index < num_pc_groups) - return si_get_perfcounter_group_info(sscreen, index, info); + if (index < num_pc_groups) + return si_get_perfcounter_group_info(sscreen, index, info); - index -= num_pc_groups; - if (index >= SI_NUM_SW_QUERY_GROUPS) - return 0; + index -= num_pc_groups; + if (index >= SI_NUM_SW_QUERY_GROUPS) + return 0; - info->name = "GPIN"; - info->max_active_queries = 5; - info->num_queries = 5; - return 1; + info->name = "GPIN"; + info->max_active_queries = 5; + info->num_queries = 5; + return 1; } void si_init_query_functions(struct si_context *sctx) { - sctx->b.create_query = si_create_query; - sctx->b.create_batch_query = si_create_batch_query; - sctx->b.destroy_query = si_destroy_query; - sctx->b.begin_query = si_begin_query; - sctx->b.end_query = si_end_query; - sctx->b.get_query_result = si_get_query_result; - sctx->b.get_query_result_resource = si_get_query_result_resource; - - if (sctx->has_graphics) { - sctx->atoms.s.render_cond.emit = si_emit_query_predication; - sctx->b.render_condition = si_render_condition; - } - - list_inithead(&sctx->active_queries); + sctx->b.create_query = si_create_query; + sctx->b.create_batch_query = si_create_batch_query; + sctx->b.destroy_query = si_destroy_query; + sctx->b.begin_query = si_begin_query; + sctx->b.end_query = si_end_query; + sctx->b.get_query_result = si_get_query_result; + sctx->b.get_query_result_resource = si_get_query_result_resource; + + if (sctx->has_graphics) { + sctx->atoms.s.render_cond.emit = 
si_emit_query_predication; + sctx->b.render_condition = si_render_condition; + } + + list_inithead(&sctx->active_queries); } void si_init_screen_query_functions(struct si_screen *sscreen) { - sscreen->b.get_driver_query_info = si_get_driver_query_info; - sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info; + sscreen->b.get_driver_query_info = si_get_driver_query_info; + sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info; } diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h index 6c4386451cc..1eaa3b255a6 100644 --- a/src/gallium/drivers/radeonsi/si_query.h +++ b/src/gallium/drivers/radeonsi/si_query.h @@ -40,236 +40,220 @@ struct si_resource; #define SI_MAX_STREAMS 4 -enum { - SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC, - SI_QUERY_DECOMPRESS_CALLS, - SI_QUERY_MRT_DRAW_CALLS, - SI_QUERY_PRIM_RESTART_CALLS, - SI_QUERY_SPILL_DRAW_CALLS, - SI_QUERY_COMPUTE_CALLS, - SI_QUERY_SPILL_COMPUTE_CALLS, - SI_QUERY_DMA_CALLS, - SI_QUERY_CP_DMA_CALLS, - SI_QUERY_NUM_VS_FLUSHES, - SI_QUERY_NUM_PS_FLUSHES, - SI_QUERY_NUM_CS_FLUSHES, - SI_QUERY_NUM_CB_CACHE_FLUSHES, - SI_QUERY_NUM_DB_CACHE_FLUSHES, - SI_QUERY_NUM_L2_INVALIDATES, - SI_QUERY_NUM_L2_WRITEBACKS, - SI_QUERY_NUM_RESIDENT_HANDLES, - SI_QUERY_TC_OFFLOADED_SLOTS, - SI_QUERY_TC_DIRECT_SLOTS, - SI_QUERY_TC_NUM_SYNCS, - SI_QUERY_CS_THREAD_BUSY, - SI_QUERY_GALLIUM_THREAD_BUSY, - SI_QUERY_REQUESTED_VRAM, - SI_QUERY_REQUESTED_GTT, - SI_QUERY_MAPPED_VRAM, - SI_QUERY_MAPPED_GTT, - SI_QUERY_BUFFER_WAIT_TIME, - SI_QUERY_NUM_MAPPED_BUFFERS, - SI_QUERY_NUM_GFX_IBS, - SI_QUERY_NUM_SDMA_IBS, - SI_QUERY_GFX_BO_LIST_SIZE, - SI_QUERY_GFX_IB_SIZE, - SI_QUERY_NUM_BYTES_MOVED, - SI_QUERY_NUM_EVICTIONS, - SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS, - SI_QUERY_VRAM_USAGE, - SI_QUERY_VRAM_VIS_USAGE, - SI_QUERY_GTT_USAGE, - SI_QUERY_GPU_TEMPERATURE, - SI_QUERY_CURRENT_GPU_SCLK, - SI_QUERY_CURRENT_GPU_MCLK, - SI_QUERY_GPU_LOAD, - SI_QUERY_GPU_SHADERS_BUSY, - SI_QUERY_GPU_TA_BUSY, - SI_QUERY_GPU_GDS_BUSY, - SI_QUERY_GPU_VGT_BUSY, - SI_QUERY_GPU_IA_BUSY, - SI_QUERY_GPU_SX_BUSY, - SI_QUERY_GPU_WD_BUSY, - SI_QUERY_GPU_BCI_BUSY, - SI_QUERY_GPU_SC_BUSY, - SI_QUERY_GPU_PA_BUSY, - SI_QUERY_GPU_DB_BUSY, - SI_QUERY_GPU_CP_BUSY, - SI_QUERY_GPU_CB_BUSY, - SI_QUERY_GPU_SDMA_BUSY, - SI_QUERY_GPU_PFP_BUSY, - SI_QUERY_GPU_MEQ_BUSY, - SI_QUERY_GPU_ME_BUSY, - SI_QUERY_GPU_SURF_SYNC_BUSY, - SI_QUERY_GPU_CP_DMA_BUSY, - SI_QUERY_GPU_SCRATCH_RAM_BUSY, - SI_QUERY_NUM_COMPILATIONS, - SI_QUERY_NUM_SHADERS_CREATED, - SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO, - SI_QUERY_GPIN_ASIC_ID, - SI_QUERY_GPIN_NUM_SIMD, - SI_QUERY_GPIN_NUM_RB, - SI_QUERY_GPIN_NUM_SPI, - SI_QUERY_GPIN_NUM_SE, - SI_QUERY_TIME_ELAPSED_SDMA, - SI_QUERY_TIME_ELAPSED_SDMA_SI, /* emulated, measured on the CPU */ - SI_QUERY_PD_NUM_PRIMS_ACCEPTED, - SI_QUERY_PD_NUM_PRIMS_REJECTED, - SI_QUERY_PD_NUM_PRIMS_INELIGIBLE, - SI_QUERY_LIVE_SHADER_CACHE_HITS, - SI_QUERY_LIVE_SHADER_CACHE_MISSES, - SI_QUERY_MEMORY_SHADER_CACHE_HITS, - SI_QUERY_MEMORY_SHADER_CACHE_MISSES, - SI_QUERY_DISK_SHADER_CACHE_HITS, - SI_QUERY_DISK_SHADER_CACHE_MISSES, - - SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100, +enum +{ + SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC, + SI_QUERY_DECOMPRESS_CALLS, + SI_QUERY_MRT_DRAW_CALLS, + SI_QUERY_PRIM_RESTART_CALLS, + SI_QUERY_SPILL_DRAW_CALLS, + SI_QUERY_COMPUTE_CALLS, + SI_QUERY_SPILL_COMPUTE_CALLS, + SI_QUERY_DMA_CALLS, + SI_QUERY_CP_DMA_CALLS, + SI_QUERY_NUM_VS_FLUSHES, + SI_QUERY_NUM_PS_FLUSHES, + SI_QUERY_NUM_CS_FLUSHES, + 
SI_QUERY_NUM_CB_CACHE_FLUSHES, + SI_QUERY_NUM_DB_CACHE_FLUSHES, + SI_QUERY_NUM_L2_INVALIDATES, + SI_QUERY_NUM_L2_WRITEBACKS, + SI_QUERY_NUM_RESIDENT_HANDLES, + SI_QUERY_TC_OFFLOADED_SLOTS, + SI_QUERY_TC_DIRECT_SLOTS, + SI_QUERY_TC_NUM_SYNCS, + SI_QUERY_CS_THREAD_BUSY, + SI_QUERY_GALLIUM_THREAD_BUSY, + SI_QUERY_REQUESTED_VRAM, + SI_QUERY_REQUESTED_GTT, + SI_QUERY_MAPPED_VRAM, + SI_QUERY_MAPPED_GTT, + SI_QUERY_BUFFER_WAIT_TIME, + SI_QUERY_NUM_MAPPED_BUFFERS, + SI_QUERY_NUM_GFX_IBS, + SI_QUERY_NUM_SDMA_IBS, + SI_QUERY_GFX_BO_LIST_SIZE, + SI_QUERY_GFX_IB_SIZE, + SI_QUERY_NUM_BYTES_MOVED, + SI_QUERY_NUM_EVICTIONS, + SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS, + SI_QUERY_VRAM_USAGE, + SI_QUERY_VRAM_VIS_USAGE, + SI_QUERY_GTT_USAGE, + SI_QUERY_GPU_TEMPERATURE, + SI_QUERY_CURRENT_GPU_SCLK, + SI_QUERY_CURRENT_GPU_MCLK, + SI_QUERY_GPU_LOAD, + SI_QUERY_GPU_SHADERS_BUSY, + SI_QUERY_GPU_TA_BUSY, + SI_QUERY_GPU_GDS_BUSY, + SI_QUERY_GPU_VGT_BUSY, + SI_QUERY_GPU_IA_BUSY, + SI_QUERY_GPU_SX_BUSY, + SI_QUERY_GPU_WD_BUSY, + SI_QUERY_GPU_BCI_BUSY, + SI_QUERY_GPU_SC_BUSY, + SI_QUERY_GPU_PA_BUSY, + SI_QUERY_GPU_DB_BUSY, + SI_QUERY_GPU_CP_BUSY, + SI_QUERY_GPU_CB_BUSY, + SI_QUERY_GPU_SDMA_BUSY, + SI_QUERY_GPU_PFP_BUSY, + SI_QUERY_GPU_MEQ_BUSY, + SI_QUERY_GPU_ME_BUSY, + SI_QUERY_GPU_SURF_SYNC_BUSY, + SI_QUERY_GPU_CP_DMA_BUSY, + SI_QUERY_GPU_SCRATCH_RAM_BUSY, + SI_QUERY_NUM_COMPILATIONS, + SI_QUERY_NUM_SHADERS_CREATED, + SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO, + SI_QUERY_GPIN_ASIC_ID, + SI_QUERY_GPIN_NUM_SIMD, + SI_QUERY_GPIN_NUM_RB, + SI_QUERY_GPIN_NUM_SPI, + SI_QUERY_GPIN_NUM_SE, + SI_QUERY_TIME_ELAPSED_SDMA, + SI_QUERY_TIME_ELAPSED_SDMA_SI, /* emulated, measured on the CPU */ + SI_QUERY_PD_NUM_PRIMS_ACCEPTED, + SI_QUERY_PD_NUM_PRIMS_REJECTED, + SI_QUERY_PD_NUM_PRIMS_INELIGIBLE, + SI_QUERY_LIVE_SHADER_CACHE_HITS, + SI_QUERY_LIVE_SHADER_CACHE_MISSES, + SI_QUERY_MEMORY_SHADER_CACHE_HITS, + SI_QUERY_MEMORY_SHADER_CACHE_MISSES, + SI_QUERY_DISK_SHADER_CACHE_HITS, + SI_QUERY_DISK_SHADER_CACHE_MISSES, + + SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100, }; -enum { - SI_QUERY_GROUP_GPIN = 0, - SI_NUM_SW_QUERY_GROUPS +enum +{ + SI_QUERY_GROUP_GPIN = 0, + SI_NUM_SW_QUERY_GROUPS }; struct si_query_ops { - void (*destroy)(struct si_context *, struct si_query *); - bool (*begin)(struct si_context *, struct si_query *); - bool (*end)(struct si_context *, struct si_query *); - bool (*get_result)(struct si_context *, - struct si_query *, bool wait, - union pipe_query_result *result); - void (*get_result_resource)(struct si_context *, - struct si_query *, bool wait, - enum pipe_query_value_type result_type, - int index, - struct pipe_resource *resource, - unsigned offset); - - void (*suspend)(struct si_context *, struct si_query *); - void (*resume)(struct si_context *, struct si_query *); + void (*destroy)(struct si_context *, struct si_query *); + bool (*begin)(struct si_context *, struct si_query *); + bool (*end)(struct si_context *, struct si_query *); + bool (*get_result)(struct si_context *, struct si_query *, bool wait, + union pipe_query_result *result); + void (*get_result_resource)(struct si_context *, struct si_query *, bool wait, + enum pipe_query_value_type result_type, int index, + struct pipe_resource *resource, unsigned offset); + + void (*suspend)(struct si_context *, struct si_query *); + void (*resume)(struct si_context *, struct si_query *); }; struct si_query { - struct threaded_query b; - const struct si_query_ops *ops; + struct threaded_query b; + const struct si_query_ops *ops; - /* The PIPE_QUERY_xxx type 
of query */ - unsigned type; + /* The PIPE_QUERY_xxx type of query */ + unsigned type; - /* The number of dwords for suspend. */ - unsigned num_cs_dw_suspend; + /* The number of dwords for suspend. */ + unsigned num_cs_dw_suspend; - /* Linked list of queries that must be suspended at end of CS. */ - struct list_head active_list; + /* Linked list of queries that must be suspended at end of CS. */ + struct list_head active_list; }; -enum { - SI_QUERY_HW_FLAG_NO_START = (1 << 0), - /* gap */ - /* whether begin_query doesn't clear the result */ - SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2), +enum +{ + SI_QUERY_HW_FLAG_NO_START = (1 << 0), + /* gap */ + /* whether begin_query doesn't clear the result */ + SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2), }; struct si_query_hw_ops { - bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *); - void (*emit_start)(struct si_context *, - struct si_query_hw *, - struct si_resource *buffer, uint64_t va); - void (*emit_stop)(struct si_context *, - struct si_query_hw *, - struct si_resource *buffer, uint64_t va); - void (*clear_result)(struct si_query_hw *, union pipe_query_result *); - void (*add_result)(struct si_screen *screen, - struct si_query_hw *, void *buffer, - union pipe_query_result *result); + bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *); + void (*emit_start)(struct si_context *, struct si_query_hw *, struct si_resource *buffer, + uint64_t va); + void (*emit_stop)(struct si_context *, struct si_query_hw *, struct si_resource *buffer, + uint64_t va); + void (*clear_result)(struct si_query_hw *, union pipe_query_result *); + void (*add_result)(struct si_screen *screen, struct si_query_hw *, void *buffer, + union pipe_query_result *result); }; struct si_query_buffer { - /* The buffer where query results are stored. */ - struct si_resource *buf; - /* If a query buffer is full, a new buffer is created and the old one - * is put in here. When we calculate the result, we sum up the samples - * from all buffers. */ - struct si_query_buffer *previous; - /* Offset of the next free result after current query data */ - unsigned results_end; - bool unprepared; + /* The buffer where query results are stored. */ + struct si_resource *buf; + /* If a query buffer is full, a new buffer is created and the old one + * is put in here. When we calculate the result, we sum up the samples + * from all buffers. */ + struct si_query_buffer *previous; + /* Offset of the next free result after current query data */ + unsigned results_end; + bool unprepared; }; void si_query_buffer_destroy(struct si_screen *sctx, struct si_query_buffer *buffer); void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer); bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer, - bool (*prepare_buffer)(struct si_context *, struct si_query_buffer*), - unsigned size); - + bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *), + unsigned size); struct si_query_hw { - struct si_query b; - struct si_query_hw_ops *ops; - unsigned flags; - - /* The query buffer and how many results are in it. */ - struct si_query_buffer buffer; - /* Size of the result in memory for both begin_query and end_query, - * this can be one or two numbers, or it could even be a size of a structure. 
*/ - unsigned result_size; - /* For transform feedback: which stream the query is for */ - unsigned stream; - - /* Workaround via compute shader */ - struct si_resource *workaround_buf; - unsigned workaround_offset; + struct si_query b; + struct si_query_hw_ops *ops; + unsigned flags; + + /* The query buffer and how many results are in it. */ + struct si_query_buffer buffer; + /* Size of the result in memory for both begin_query and end_query, + * this can be one or two numbers, or it could even be a size of a structure. */ + unsigned result_size; + /* For transform feedback: which stream the query is for */ + unsigned stream; + + /* Workaround via compute shader */ + struct si_resource *workaround_buf; + unsigned workaround_offset; }; -void si_query_hw_destroy(struct si_context *sctx, - struct si_query *squery); -bool si_query_hw_begin(struct si_context *sctx, - struct si_query *squery); -bool si_query_hw_end(struct si_context *sctx, - struct si_query *squery); -bool si_query_hw_get_result(struct si_context *sctx, - struct si_query *squery, - bool wait, - union pipe_query_result *result); +void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery); +bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery); +bool si_query_hw_end(struct si_context *sctx, struct si_query *squery); +bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait, + union pipe_query_result *result); void si_query_hw_suspend(struct si_context *sctx, struct si_query *query); void si_query_hw_resume(struct si_context *sctx, struct si_query *query); - /* Shader-based queries */ -struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, - enum pipe_query_type query_type, - unsigned index); - +struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type, + unsigned index); /* Performance counters */ struct si_perfcounters { - unsigned num_groups; - unsigned num_blocks; - struct si_pc_block *blocks; + unsigned num_groups; + unsigned num_blocks; + struct si_pc_block *blocks; - unsigned num_stop_cs_dwords; - unsigned num_instance_cs_dwords; + unsigned num_stop_cs_dwords; + unsigned num_instance_cs_dwords; - bool separate_se; - bool separate_instance; + bool separate_se; + bool separate_instance; }; -struct pipe_query *si_create_batch_query(struct pipe_context *ctx, - unsigned num_queries, - unsigned *query_types); +struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries, + unsigned *query_types); -int si_get_perfcounter_info(struct si_screen *, - unsigned index, - struct pipe_driver_query_info *info); -int si_get_perfcounter_group_info(struct si_screen *, - unsigned index, - struct pipe_driver_query_group_info *info); +int si_get_perfcounter_info(struct si_screen *, unsigned index, + struct pipe_driver_query_info *info); +int si_get_perfcounter_group_info(struct si_screen *, unsigned index, + struct pipe_driver_query_group_info *info); struct si_qbo_state { - void *saved_compute; - struct pipe_constant_buffer saved_const0; - struct pipe_shader_buffer saved_ssbo[3]; - unsigned saved_ssbo_writable_mask; + void *saved_compute; + struct pipe_constant_buffer saved_const0; + struct pipe_shader_buffer saved_ssbo[3]; + unsigned saved_ssbo_writable_mask; }; #endif /* SI_QUERY_H */ diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index f0e60087dbf..e615b81c293 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ 
b/src/gallium/drivers/radeonsi/si_shader.c @@ -22,43 +22,38 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "util/u_memory.h" -#include "tgsi/tgsi_strings.h" -#include "tgsi/tgsi_from_mesa.h" - #include "ac_exp_param.h" #include "ac_rtld.h" -#include "si_shader_internal.h" -#include "si_pipe.h" -#include "sid.h" - #include "compiler/nir/nir.h" #include "compiler/nir/nir_serialize.h" +#include "si_pipe.h" +#include "si_shader_internal.h" +#include "sid.h" +#include "tgsi/tgsi_from_mesa.h" +#include "tgsi/tgsi_strings.h" +#include "util/u_memory.h" -static const char scratch_rsrc_dword0_symbol[] = - "SCRATCH_RSRC_DWORD0"; +static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0"; -static const char scratch_rsrc_dword1_symbol[] = - "SCRATCH_RSRC_DWORD1"; +static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1"; static void si_dump_shader_key(const struct si_shader *shader, FILE *f); /** Whether the shader runs as a combination of multiple API shaders */ bool si_is_multi_part_shader(struct si_shader *shader) { - if (shader->selector->screen->info.chip_class <= GFX8) - return false; + if (shader->selector->screen->info.chip_class <= GFX8) + return false; - return shader->key.as_ls || - shader->key.as_es || - shader->selector->type == PIPE_SHADER_TESS_CTRL || - shader->selector->type == PIPE_SHADER_GEOMETRY; + return shader->key.as_ls || shader->key.as_es || + shader->selector->type == PIPE_SHADER_TESS_CTRL || + shader->selector->type == PIPE_SHADER_GEOMETRY; } /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */ bool si_is_merged_shader(struct si_shader *shader) { - return shader->key.as_ngg || si_is_multi_part_shader(shader); + return shader->key.as_ngg || si_is_multi_part_shader(shader); } /** @@ -68,19 +63,19 @@ bool si_is_merged_shader(struct si_shader *shader) */ unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index) { - switch (semantic_name) { - case TGSI_SEMANTIC_TESSOUTER: - return 0; - case TGSI_SEMANTIC_TESSINNER: - return 1; - case TGSI_SEMANTIC_PATCH: - assert(index < 30); - return 2 + index; - - default: - assert(!"invalid semantic name"); - return 0; - } + switch (semantic_name) { + case TGSI_SEMANTIC_TESSOUTER: + return 0; + case TGSI_SEMANTIC_TESSINNER: + return 1; + case TGSI_SEMANTIC_PATCH: + assert(index < 30); + return 2 + index; + + default: + assert(!"invalid semantic name"); + return 0; + } } /** @@ -88,1527 +83,1420 @@ unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned in * less than 64, so that a 64-bit bitmask of used inputs or outputs can be * calculated. */ -unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, - unsigned is_varying) +unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, unsigned is_varying) { - switch (semantic_name) { - case TGSI_SEMANTIC_POSITION: - return 0; - case TGSI_SEMANTIC_GENERIC: - /* Since some shader stages use the the highest used IO index - * to determine the size to allocate for inputs/outputs - * (in LDS, tess and GS rings). GENERIC should be placed right - * after POSITION to make that size as small as possible. - */ - if (index < SI_MAX_IO_GENERIC) - return 1 + index; - - assert(!"invalid generic index"); - return 0; - case TGSI_SEMANTIC_FOG: - return SI_MAX_IO_GENERIC + 1; - case TGSI_SEMANTIC_COLOR: - assert(index < 2); - return SI_MAX_IO_GENERIC + 2 + index; - case TGSI_SEMANTIC_BCOLOR: - assert(index < 2); - /* If it's a varying, COLOR and BCOLOR alias. 
*/ - if (is_varying) - return SI_MAX_IO_GENERIC + 2 + index; - else - return SI_MAX_IO_GENERIC + 4 + index; - case TGSI_SEMANTIC_TEXCOORD: - assert(index < 8); - return SI_MAX_IO_GENERIC + 6 + index; - - /* These are rarely used between LS and HS or ES and GS. */ - case TGSI_SEMANTIC_CLIPDIST: - assert(index < 2); - return SI_MAX_IO_GENERIC + 6 + 8 + index; - case TGSI_SEMANTIC_CLIPVERTEX: - return SI_MAX_IO_GENERIC + 6 + 8 + 2; - case TGSI_SEMANTIC_PSIZE: - return SI_MAX_IO_GENERIC + 6 + 8 + 3; - - /* These can't be written by LS, HS, and ES. */ - case TGSI_SEMANTIC_LAYER: - return SI_MAX_IO_GENERIC + 6 + 8 + 4; - case TGSI_SEMANTIC_VIEWPORT_INDEX: - return SI_MAX_IO_GENERIC + 6 + 8 + 5; - case TGSI_SEMANTIC_PRIMID: - STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63); - return SI_MAX_IO_GENERIC + 6 + 8 + 6; - default: - fprintf(stderr, "invalid semantic name = %u\n", semantic_name); - assert(!"invalid semantic name"); - return 0; - } + switch (semantic_name) { + case TGSI_SEMANTIC_POSITION: + return 0; + case TGSI_SEMANTIC_GENERIC: + /* Since some shader stages use the the highest used IO index + * to determine the size to allocate for inputs/outputs + * (in LDS, tess and GS rings). GENERIC should be placed right + * after POSITION to make that size as small as possible. + */ + if (index < SI_MAX_IO_GENERIC) + return 1 + index; + + assert(!"invalid generic index"); + return 0; + case TGSI_SEMANTIC_FOG: + return SI_MAX_IO_GENERIC + 1; + case TGSI_SEMANTIC_COLOR: + assert(index < 2); + return SI_MAX_IO_GENERIC + 2 + index; + case TGSI_SEMANTIC_BCOLOR: + assert(index < 2); + /* If it's a varying, COLOR and BCOLOR alias. */ + if (is_varying) + return SI_MAX_IO_GENERIC + 2 + index; + else + return SI_MAX_IO_GENERIC + 4 + index; + case TGSI_SEMANTIC_TEXCOORD: + assert(index < 8); + return SI_MAX_IO_GENERIC + 6 + index; + + /* These are rarely used between LS and HS or ES and GS. */ + case TGSI_SEMANTIC_CLIPDIST: + assert(index < 2); + return SI_MAX_IO_GENERIC + 6 + 8 + index; + case TGSI_SEMANTIC_CLIPVERTEX: + return SI_MAX_IO_GENERIC + 6 + 8 + 2; + case TGSI_SEMANTIC_PSIZE: + return SI_MAX_IO_GENERIC + 6 + 8 + 3; + + /* These can't be written by LS, HS, and ES. */ + case TGSI_SEMANTIC_LAYER: + return SI_MAX_IO_GENERIC + 6 + 8 + 4; + case TGSI_SEMANTIC_VIEWPORT_INDEX: + return SI_MAX_IO_GENERIC + 6 + 8 + 5; + case TGSI_SEMANTIC_PRIMID: + STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63); + return SI_MAX_IO_GENERIC + 6 + 8 + 6; + default: + fprintf(stderr, "invalid semantic name = %u\n", semantic_name); + assert(!"invalid semantic name"); + return 0; + } } static void si_dump_streamout(struct pipe_stream_output_info *so) { - unsigned i; - - if (so->num_outputs) - fprintf(stderr, "STREAMOUT\n"); - - for (i = 0; i < so->num_outputs; i++) { - unsigned mask = ((1 << so->output[i].num_components) - 1) << - so->output[i].start_component; - fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", - i, so->output[i].output_buffer, - so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, - so->output[i].register_index, - mask & 1 ? "x" : "", - mask & 2 ? "y" : "", - mask & 4 ? "z" : "", - mask & 8 ? 
"w" : ""); - } + unsigned i; + + if (so->num_outputs) + fprintf(stderr, "STREAMOUT\n"); + + for (i = 0; i < so->num_outputs; i++) { + unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component; + fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", i, so->output[i].output_buffer, + so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, + so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "", + mask & 4 ? "z" : "", mask & 8 ? "w" : ""); + } } static void declare_streamout_params(struct si_shader_context *ctx, - struct pipe_stream_output_info *so) + struct pipe_stream_output_info *so) { - if (ctx->screen->use_ngg_streamout) { - if (ctx->type == PIPE_SHADER_TESS_EVAL) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - return; - } - - /* Streamout SGPRs. */ - if (so->num_outputs) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index); - } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - } - - /* A streamout buffer offset is loaded if the stride is non-zero. */ - for (int i = 0; i < 4; i++) { - if (!so->stride[i]) - continue; - - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]); - } + if (ctx->screen->use_ngg_streamout) { + if (ctx->type == PIPE_SHADER_TESS_EVAL) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + return; + } + + /* Streamout SGPRs. */ + if (so->num_outputs) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index); + } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + } + + /* A streamout buffer offset is loaded if the stride is non-zero. */ + for (int i = 0; i < 4; i++) { + if (!so->stride[i]) + continue; + + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]); + } } unsigned si_get_max_workgroup_size(const struct si_shader *shader) { - switch (shader->selector->type) { - case PIPE_SHADER_VERTEX: - case PIPE_SHADER_TESS_EVAL: - return shader->key.as_ngg ? 128 : 0; - - case PIPE_SHADER_TESS_CTRL: - /* Return this so that LLVM doesn't remove s_barrier - * instructions on chips where we use s_barrier. */ - return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0; - - case PIPE_SHADER_GEOMETRY: - return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0; - - case PIPE_SHADER_COMPUTE: - break; /* see below */ - - default: - return 0; - } - - const unsigned *properties = shader->selector->info.properties; - unsigned max_work_group_size = - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; - - if (!max_work_group_size) { - /* This is a variable group size compute shader, - * compile it for the maximum possible group size. - */ - max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; - } - return max_work_group_size; + switch (shader->selector->type) { + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_TESS_EVAL: + return shader->key.as_ngg ? 128 : 0; + + case PIPE_SHADER_TESS_CTRL: + /* Return this so that LLVM doesn't remove s_barrier + * instructions on chips where we use s_barrier. */ + return shader->selector->screen->info.chip_class >= GFX7 ? 
128 : 0; + + case PIPE_SHADER_GEOMETRY: + return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0; + + case PIPE_SHADER_COMPUTE: + break; /* see below */ + + default: + return 0; + } + + const unsigned *properties = shader->selector->info.properties; + unsigned max_work_group_size = properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; + + if (!max_work_group_size) { + /* This is a variable group size compute shader, + * compile it for the maximum possible group size. + */ + max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; + } + return max_work_group_size; } -static void declare_const_and_shader_buffers(struct si_shader_context *ctx, - bool assign_params) +static void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params) { - enum ac_arg_type const_shader_buf_type; + enum ac_arg_type const_shader_buf_type; - if (ctx->shader->selector->info.const_buffers_declared == 1 && - ctx->shader->selector->info.shader_buffers_declared == 0) - const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR; - else - const_shader_buf_type = AC_ARG_CONST_DESC_PTR; + if (ctx->shader->selector->info.const_buffers_declared == 1 && + ctx->shader->selector->info.shader_buffers_declared == 0) + const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR; + else + const_shader_buf_type = AC_ARG_CONST_DESC_PTR; - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type, - assign_params ? &ctx->const_and_shader_buffers : - &ctx->other_const_and_shader_buffers); + ac_add_arg( + &ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type, + assign_params ? &ctx->const_and_shader_buffers : &ctx->other_const_and_shader_buffers); } -static void declare_samplers_and_images(struct si_shader_context *ctx, - bool assign_params) +static void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, - assign_params ? &ctx->samplers_and_images : - &ctx->other_samplers_and_images); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, + assign_params ? 
&ctx->samplers_and_images : &ctx->other_samplers_and_images); } -static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, - bool assign_params) +static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool assign_params) { - declare_const_and_shader_buffers(ctx, assign_params); - declare_samplers_and_images(ctx, assign_params); + declare_const_and_shader_buffers(ctx, assign_params); + declare_samplers_and_images(ctx, assign_params); } static void declare_global_desc_pointers(struct si_shader_context *ctx) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, - &ctx->rw_buffers); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, - &ctx->bindless_samplers_and_images); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->rw_buffers); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, + &ctx->bindless_samplers_and_images); } static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); - if (!ctx->shader->is_gs_copy_shader) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id); - } + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); + if (!ctx->shader->is_gs_copy_shader) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id); + } } static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers); - unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; - if (num_vbos_in_user_sgprs) { - unsigned user_sgprs = ctx->args.num_sgprs_used; + unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; + if (num_vbos_in_user_sgprs) { + unsigned user_sgprs = ctx->args.num_sgprs_used; - if (si_is_merged_shader(ctx->shader)) - user_sgprs -= 8; - assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST); + if (si_is_merged_shader(ctx->shader)) + user_sgprs -= 8; + assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST); - /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */ - for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). 
*/ + for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ - assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors)); - for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]); - } + assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors)); + for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]); + } } -static void declare_vs_input_vgprs(struct si_shader_context *ctx, - unsigned *num_prolog_vgprs, - bool ngg_cull_shader) +static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs, + bool ngg_cull_shader) { - struct si_shader *shader = ctx->shader; - - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id); - if (shader->key.as_ls) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id); - if (ctx->screen->info.chip_class >= GFX10) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); - } else { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ - } - } else if (ctx->screen->info.chip_class >= GFX10) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, - &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); - } else { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ - } - - if (!shader->is_gs_copy_shader) { - if (shader->key.opt.ngg_culling && !ngg_cull_shader) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, - &ctx->ngg_old_thread_id); - } - - /* Vertex load indices. 
*/ - if (shader->selector->info.num_inputs) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, - &ctx->vertex_index0); - for (unsigned i = 1; i < shader->selector->info.num_inputs; i++) - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); - } - *num_prolog_vgprs += shader->selector->info.num_inputs; - } + struct si_shader *shader = ctx->shader; + + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id); + if (shader->key.as_ls) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id); + if (ctx->screen->info.chip_class >= GFX10) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); + } else { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ + } + } else if (ctx->screen->info.chip_class >= GFX10) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, + &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); + } else { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ + } + + if (!shader->is_gs_copy_shader) { + if (shader->key.opt.ngg_culling && !ngg_cull_shader) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id); + } + + /* Vertex load indices. */ + if (shader->selector->info.num_inputs) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0); + for (unsigned i = 1; i < shader->selector->info.num_inputs; i++) + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); + } + *num_prolog_vgprs += shader->selector->info.num_inputs; + } } -static void declare_vs_blit_inputs(struct si_shader_context *ctx, - unsigned vs_blit_property) +static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->vs_blit_inputs); /* i16 x1, y1 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */ - - if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */ - } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */ - } + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, 
NULL); /* depth */ + + if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */ + } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */ + } } static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id); - - if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, - &ctx->ngg_old_thread_id); - } + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id); + + if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id); + } } -enum { - /* Convenient merged shader definitions. */ - SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES, - SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY, +enum +{ + /* Convenient merged shader definitions. */ + SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES, + SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY, }; -void si_add_arg_checked(struct ac_shader_args *args, - enum ac_arg_regfile file, - unsigned registers, enum ac_arg_type type, - struct ac_arg *arg, - unsigned idx) +void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers, + enum ac_arg_type type, struct ac_arg *arg, unsigned idx) { - assert(args->arg_count == idx); - ac_add_arg(args, file, registers, type, arg); + assert(args->arg_count == idx); + ac_add_arg(args, file, registers, type, arg); } void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader) { - struct si_shader *shader = ctx->shader; - LLVMTypeRef returns[AC_MAX_ARGS]; - unsigned i, num_return_sgprs; - unsigned num_returns = 0; - unsigned num_prolog_vgprs = 0; - unsigned type = ctx->type; - unsigned vs_blit_property = - shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - /* Set MERGED shaders. 
*/ - if (ctx->screen->info.chip_class >= GFX9) { - if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL) - type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */ - else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY) - type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY; - } - - switch (type) { - case PIPE_SHADER_VERTEX: - declare_global_desc_pointers(ctx); - - if (vs_blit_property) { - declare_vs_blit_inputs(ctx, vs_blit_property); - - /* VGPRs */ - declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); - break; - } - - declare_per_stage_desc_pointers(ctx, true); - declare_vs_specific_input_sgprs(ctx); - if (!shader->is_gs_copy_shader) - declare_vb_descriptor_input_sgprs(ctx); - - if (shader->key.as_es) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->es2gs_offset); - } else if (shader->key.as_ls) { - /* no extra parameters */ - } else { - /* The locations of the other parameters are assigned dynamically. */ - declare_streamout_params(ctx, &shader->selector->so); - } - - /* VGPRs */ - declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); - - /* Return values */ - if (shader->key.opt.vs_as_prim_discard_cs) { - for (i = 0; i < 4; i++) - returns[num_returns++] = ctx->ac.f32; /* VGPRs */ - } - break; - - case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */ - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, true); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset); - - /* VGPRs */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); - - /* param_tcs_offchip_offset and param_tcs_factor_offset are - * placed after the user SGPRs. - */ - for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++) - returns[num_returns++] = ctx->ac.i32; /* SGPRs */ - for (i = 0; i < 11; i++) - returns[num_returns++] = ctx->ac.f32; /* VGPRs */ - break; - - case SI_SHADER_MERGED_VERTEX_TESSCTRL: - /* Merged stages have 8 system SGPRs at the beginning. 
*/ - /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */ - declare_per_stage_desc_pointers(ctx, - ctx->type == PIPE_SHADER_TESS_CTRL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ - - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, - ctx->type == PIPE_SHADER_VERTEX); - declare_vs_specific_input_sgprs(ctx); - - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); - declare_vb_descriptor_input_sgprs(ctx); - - /* VGPRs (first TCS, then VS) */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); - - if (ctx->type == PIPE_SHADER_VERTEX) { - declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); - - /* LS return values are inputs to the TCS main shader part. */ - for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++) - returns[num_returns++] = ctx->ac.i32; /* SGPRs */ - for (i = 0; i < 2; i++) - returns[num_returns++] = ctx->ac.f32; /* VGPRs */ - } else { - /* TCS return values are inputs to the TCS epilog. - * - * param_tcs_offchip_offset, param_tcs_factor_offset, - * param_tcs_offchip_layout, and param_rw_buffers - * should be passed to the epilog. - */ - for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++) - returns[num_returns++] = ctx->ac.i32; /* SGPRs */ - for (i = 0; i < 11; i++) - returns[num_returns++] = ctx->ac.f32; /* VGPRs */ - } - break; - - case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY: - /* Merged stages have 8 system SGPRs at the beginning. 
*/ - /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */ - declare_per_stage_desc_pointers(ctx, - ctx->type == PIPE_SHADER_GEOMETRY); - - if (ctx->shader->key.as_ngg) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info); - else - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset); - - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, - &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */ - - declare_global_desc_pointers(ctx); - if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) { - declare_per_stage_desc_pointers(ctx, - (ctx->type == PIPE_SHADER_VERTEX || - ctx->type == PIPE_SHADER_TESS_EVAL)); - } - - if (ctx->type == PIPE_SHADER_VERTEX) { - if (vs_blit_property) - declare_vs_blit_inputs(ctx, vs_blit_property); - else - declare_vs_specific_input_sgprs(ctx); - } else { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); - /* Declare as many input SGPRs as the VS has. */ - } - - if (ctx->type == PIPE_SHADER_VERTEX) - declare_vb_descriptor_input_sgprs(ctx); - - /* VGPRs (first GS, then VS/TES) */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset); - - if (ctx->type == PIPE_SHADER_VERTEX) { - declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); - } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { - declare_tes_input_vgprs(ctx, ngg_cull_shader); - } - - if ((ctx->shader->key.as_es || ngg_cull_shader) && - (ctx->type == PIPE_SHADER_VERTEX || - ctx->type == PIPE_SHADER_TESS_EVAL)) { - unsigned num_user_sgprs, num_vgprs; - - if (ctx->type == PIPE_SHADER_VERTEX) { - /* For the NGG cull shader, add 1 SGPR to hold - * the vertex buffer pointer. - */ - num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader; - - if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) { - assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST); - num_user_sgprs = SI_SGPR_VS_VB_DESCRIPTOR_FIRST + - shader->selector->num_vbos_in_user_sgprs * 4; - } - } else { - num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; - } - - /* The NGG cull shader has to return all 9 VGPRs + the old thread ID. - * - * The normal merged ESGS shader only has to return the 5 VGPRs - * for the GS stage. - */ - num_vgprs = ngg_cull_shader ? 10 : 5; - - /* ES return values are inputs to GS. 
*/ - for (i = 0; i < 8 + num_user_sgprs; i++) - returns[num_returns++] = ctx->ac.i32; /* SGPRs */ - for (i = 0; i < num_vgprs; i++) - returns[num_returns++] = ctx->ac.f32; /* VGPRs */ - } - break; - - case PIPE_SHADER_TESS_EVAL: - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, true); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); - - if (shader->key.as_es) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset); - } else { - declare_streamout_params(ctx, &shader->selector->so); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); - } - - /* VGPRs */ - declare_tes_input_vgprs(ctx, ngg_cull_shader); - break; - - case PIPE_SHADER_GEOMETRY: - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, true); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id); - - /* VGPRs */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); - break; - - case PIPE_SHADER_FRAGMENT: - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, true); - si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, - SI_PARAM_ALPHA_REF); - si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->args.prim_mask, SI_PARAM_PRIM_MASK); - - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample, - SI_PARAM_PERSP_SAMPLE); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, - &ctx->args.persp_center, SI_PARAM_PERSP_CENTER); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, - &ctx->args.persp_centroid, SI_PARAM_PERSP_CENTROID); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, - NULL, SI_PARAM_PERSP_PULL_MODEL); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, - &ctx->args.linear_sample, SI_PARAM_LINEAR_SAMPLE); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, - &ctx->args.linear_center, SI_PARAM_LINEAR_CENTER); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, - &ctx->args.linear_centroid, SI_PARAM_LINEAR_CENTROID); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, - NULL, SI_PARAM_LINE_STIPPLE_TEX); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, - &ctx->args.frag_pos[0], SI_PARAM_POS_X_FLOAT); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, - &ctx->args.frag_pos[1], SI_PARAM_POS_Y_FLOAT); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, - &ctx->args.frag_pos[2], SI_PARAM_POS_Z_FLOAT); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, - &ctx->args.frag_pos[3], 
SI_PARAM_POS_W_FLOAT); - shader->info.face_vgpr_index = ctx->args.num_vgprs_used; - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, - &ctx->args.front_face, SI_PARAM_FRONT_FACE); - shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used; - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, - &ctx->args.ancillary, SI_PARAM_ANCILLARY); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, - &ctx->args.sample_coverage, SI_PARAM_SAMPLE_COVERAGE); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, - &ctx->pos_fixed_pt, SI_PARAM_POS_FIXED_PT); - - /* Color inputs from the prolog. */ - if (shader->selector->info.colors_read) { - unsigned num_color_elements = - util_bitcount(shader->selector->info.colors_read); - - for (i = 0; i < num_color_elements; i++) - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); - - num_prolog_vgprs += num_color_elements; - } - - /* Outputs for the epilog. */ - num_return_sgprs = SI_SGPR_ALPHA_REF + 1; - num_returns = - num_return_sgprs + - util_bitcount(shader->selector->info.colors_written) * 4 + - shader->selector->info.writes_z + - shader->selector->info.writes_stencil + - shader->selector->info.writes_samplemask + - 1 /* SampleMaskIn */; - - num_returns = MAX2(num_returns, - num_return_sgprs + - PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); - - for (i = 0; i < num_return_sgprs; i++) - returns[i] = ctx->ac.i32; - for (; i < num_returns; i++) - returns[i] = ctx->ac.f32; - break; - - case PIPE_SHADER_COMPUTE: - declare_global_desc_pointers(ctx); - declare_per_stage_desc_pointers(ctx, true); - if (shader->selector->info.uses_grid_size) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, - &ctx->args.num_work_groups); - if (shader->selector->info.uses_block_size && - shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size); - - unsigned cs_user_data_dwords = - shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD]; - if (cs_user_data_dwords) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, - &ctx->cs_user_data); - } - - /* Hardware SGPRs. */ - for (i = 0; i < 3; i++) { - if (shader->selector->info.uses_block_id[i]) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->args.workgroup_ids[i]); - } - } - if (shader->selector->info.uses_subgroup_info) - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size); - - /* Hardware VGPRs. */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, - &ctx->args.local_invocation_ids); - break; - default: - assert(0 && "unimplemented shader"); - return; - } - - si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", - returns, num_returns, si_get_max_workgroup_size(shader)); - - /* Reserve register locations for VGPR inputs the PS prolog may need. 
*/ - if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, - "InitialPSInputAddr", - S_0286D0_PERSP_SAMPLE_ENA(1) | - S_0286D0_PERSP_CENTER_ENA(1) | - S_0286D0_PERSP_CENTROID_ENA(1) | - S_0286D0_LINEAR_SAMPLE_ENA(1) | - S_0286D0_LINEAR_CENTER_ENA(1) | - S_0286D0_LINEAR_CENTROID_ENA(1) | - S_0286D0_FRONT_FACE_ENA(1) | - S_0286D0_ANCILLARY_ENA(1) | - S_0286D0_POS_FIXED_PT_ENA(1)); - } - - shader->info.num_input_sgprs = ctx->args.num_sgprs_used; - shader->info.num_input_vgprs = ctx->args.num_vgprs_used; - - assert(shader->info.num_input_vgprs >= num_prolog_vgprs); - shader->info.num_input_vgprs -= num_prolog_vgprs; - - if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) { - if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) { - /* The LSHS size is not known until draw time, so we append it - * at the end of whatever LDS use there may be in the rest of - * the shader (currently none, unless LLVM decides to do its - * own LDS-based lowering). - */ - ctx->ac.lds = LLVMAddGlobalInAddressSpace( - ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), - "__lds_end", AC_ADDR_SPACE_LDS); - LLVMSetAlignment(ctx->ac.lds, 256); - } else { - ac_declare_lds_as_pointer(&ctx->ac); - } - } - - /* Unlike radv, we override these arguments in the prolog, so to the - * API shader they appear as normal arguments. - */ - if (ctx->type == PIPE_SHADER_VERTEX) { - ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id); - ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id); - } else if (ctx->type == PIPE_SHADER_FRAGMENT) { - ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid); - ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid); - } + struct si_shader *shader = ctx->shader; + LLVMTypeRef returns[AC_MAX_ARGS]; + unsigned i, num_return_sgprs; + unsigned num_returns = 0; + unsigned num_prolog_vgprs = 0; + unsigned type = ctx->type; + unsigned vs_blit_property = shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + /* Set MERGED shaders. */ + if (ctx->screen->info.chip_class >= GFX9) { + if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL) + type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */ + else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY) + type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY; + } + + switch (type) { + case PIPE_SHADER_VERTEX: + declare_global_desc_pointers(ctx); + + if (vs_blit_property) { + declare_vs_blit_inputs(ctx, vs_blit_property); + + /* VGPRs */ + declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); + break; + } + + declare_per_stage_desc_pointers(ctx, true); + declare_vs_specific_input_sgprs(ctx); + if (!shader->is_gs_copy_shader) + declare_vb_descriptor_input_sgprs(ctx); + + if (shader->key.as_es) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset); + } else if (shader->key.as_ls) { + /* no extra parameters */ + } else { + /* The locations of the other parameters are assigned dynamically. 
*/ + declare_streamout_params(ctx, &shader->selector->so); + } + + /* VGPRs */ + declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); + + /* Return values */ + if (shader->key.opt.vs_as_prim_discard_cs) { + for (i = 0; i < 4; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + } + break; + + case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */ + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, true); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset); + + /* VGPRs */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); + + /* param_tcs_offchip_offset and param_tcs_factor_offset are + * placed after the user SGPRs. + */ + for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++) + returns[num_returns++] = ctx->ac.i32; /* SGPRs */ + for (i = 0; i < 11; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + break; + + case SI_SHADER_MERGED_VERTEX_TESSCTRL: + /* Merged stages have 8 system SGPRs at the beginning. */ + /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */ + declare_per_stage_desc_pointers(ctx, ctx->type == PIPE_SHADER_TESS_CTRL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ + + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, ctx->type == PIPE_SHADER_VERTEX); + declare_vs_specific_input_sgprs(ctx); + + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); + declare_vb_descriptor_input_sgprs(ctx); + + /* VGPRs (first TCS, then VS) */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); + + if (ctx->type == PIPE_SHADER_VERTEX) { + declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); + + /* LS return values are inputs to the TCS main shader part. */ + for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++) + returns[num_returns++] = ctx->ac.i32; /* SGPRs */ + for (i = 0; i < 2; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + } else { + /* TCS return values are inputs to the TCS epilog. + * + * param_tcs_offchip_offset, param_tcs_factor_offset, + * param_tcs_offchip_layout, and param_rw_buffers + * should be passed to the epilog. 
+ */ + for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++) + returns[num_returns++] = ctx->ac.i32; /* SGPRs */ + for (i = 0; i < 11; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + } + break; + + case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY: + /* Merged stages have 8 system SGPRs at the beginning. */ + /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */ + declare_per_stage_desc_pointers(ctx, ctx->type == PIPE_SHADER_GEOMETRY); + + if (ctx->shader->key.as_ngg) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info); + else + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset); + + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, + &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */ + + declare_global_desc_pointers(ctx); + if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) { + declare_per_stage_desc_pointers( + ctx, (ctx->type == PIPE_SHADER_VERTEX || ctx->type == PIPE_SHADER_TESS_EVAL)); + } + + if (ctx->type == PIPE_SHADER_VERTEX) { + if (vs_blit_property) + declare_vs_blit_inputs(ctx, vs_blit_property); + else + declare_vs_specific_input_sgprs(ctx); + } else { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); + /* Declare as many input SGPRs as the VS has. */ + } + + if (ctx->type == PIPE_SHADER_VERTEX) + declare_vb_descriptor_input_sgprs(ctx); + + /* VGPRs (first GS, then VS/TES) */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset); + + if (ctx->type == PIPE_SHADER_VERTEX) { + declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader); + } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { + declare_tes_input_vgprs(ctx, ngg_cull_shader); + } + + if ((ctx->shader->key.as_es || ngg_cull_shader) && + (ctx->type == PIPE_SHADER_VERTEX || ctx->type == PIPE_SHADER_TESS_EVAL)) { + unsigned num_user_sgprs, num_vgprs; + + if (ctx->type == PIPE_SHADER_VERTEX) { + /* For the NGG cull shader, add 1 SGPR to hold + * the vertex buffer pointer. + */ + num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader; + + if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) { + assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST); + num_user_sgprs = + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4; + } + } else { + num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; + } + + /* The NGG cull shader has to return all 9 VGPRs + the old thread ID. + * + * The normal merged ESGS shader only has to return the 5 VGPRs + * for the GS stage. + */ + num_vgprs = ngg_cull_shader ? 10 : 5; + + /* ES return values are inputs to GS. 
*/ + for (i = 0; i < 8 + num_user_sgprs; i++) + returns[num_returns++] = ctx->ac.i32; /* SGPRs */ + for (i = 0; i < num_vgprs; i++) + returns[num_returns++] = ctx->ac.f32; /* VGPRs */ + } + break; + + case PIPE_SHADER_TESS_EVAL: + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, true); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); + + if (shader->key.as_es) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset); + } else { + declare_streamout_params(ctx, &shader->selector->so); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + } + + /* VGPRs */ + declare_tes_input_vgprs(ctx, ngg_cull_shader); + break; + + case PIPE_SHADER_GEOMETRY: + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, true); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id); + + /* VGPRs */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); + break; + + case PIPE_SHADER_FRAGMENT: + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, true); + si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF); + si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask, + SI_PARAM_PRIM_MASK); + + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample, + SI_PARAM_PERSP_SAMPLE); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center, + SI_PARAM_PERSP_CENTER); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid, + SI_PARAM_PERSP_CENTROID); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample, + SI_PARAM_LINEAR_SAMPLE); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center, + SI_PARAM_LINEAR_CENTER); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid, + SI_PARAM_LINEAR_CENTROID); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0], + SI_PARAM_POS_X_FLOAT); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1], + SI_PARAM_POS_Y_FLOAT); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2], + SI_PARAM_POS_Z_FLOAT); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3], + 
SI_PARAM_POS_W_FLOAT); + shader->info.face_vgpr_index = ctx->args.num_vgprs_used; + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face, + SI_PARAM_FRONT_FACE); + shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used; + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary, + SI_PARAM_ANCILLARY); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage, + SI_PARAM_SAMPLE_COVERAGE); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt, + SI_PARAM_POS_FIXED_PT); + + /* Color inputs from the prolog. */ + if (shader->selector->info.colors_read) { + unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read); + + for (i = 0; i < num_color_elements; i++) + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); + + num_prolog_vgprs += num_color_elements; + } + + /* Outputs for the epilog. */ + num_return_sgprs = SI_SGPR_ALPHA_REF + 1; + num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 + + shader->selector->info.writes_z + shader->selector->info.writes_stencil + + shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */; + + num_returns = MAX2(num_returns, num_return_sgprs + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + + for (i = 0; i < num_return_sgprs; i++) + returns[i] = ctx->ac.i32; + for (; i < num_returns; i++) + returns[i] = ctx->ac.f32; + break; + + case PIPE_SHADER_COMPUTE: + declare_global_desc_pointers(ctx); + declare_per_stage_desc_pointers(ctx, true); + if (shader->selector->info.uses_grid_size) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups); + if (shader->selector->info.uses_block_size && + shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size); + + unsigned cs_user_data_dwords = + shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD]; + if (cs_user_data_dwords) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data); + } + + /* Hardware SGPRs. */ + for (i = 0; i < 3; i++) { + if (shader->selector->info.uses_block_id[i]) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]); + } + } + if (shader->selector->info.uses_subgroup_info) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size); + + /* Hardware VGPRs. */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids); + break; + default: + assert(0 && "unimplemented shader"); + return; + } + + si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", returns, num_returns, + si_get_max_workgroup_size(shader)); + + /* Reserve register locations for VGPR inputs the PS prolog may need. 
*/ + if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) { + ac_llvm_add_target_dep_function_attr( + ctx->main_fn, "InitialPSInputAddr", + S_0286D0_PERSP_SAMPLE_ENA(1) | S_0286D0_PERSP_CENTER_ENA(1) | + S_0286D0_PERSP_CENTROID_ENA(1) | S_0286D0_LINEAR_SAMPLE_ENA(1) | + S_0286D0_LINEAR_CENTER_ENA(1) | S_0286D0_LINEAR_CENTROID_ENA(1) | + S_0286D0_FRONT_FACE_ENA(1) | S_0286D0_ANCILLARY_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1)); + } + + shader->info.num_input_sgprs = ctx->args.num_sgprs_used; + shader->info.num_input_vgprs = ctx->args.num_vgprs_used; + + assert(shader->info.num_input_vgprs >= num_prolog_vgprs); + shader->info.num_input_vgprs -= num_prolog_vgprs; + + if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) { + if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) { + /* The LSHS size is not known until draw time, so we append it + * at the end of whatever LDS use there may be in the rest of + * the shader (currently none, unless LLVM decides to do its + * own LDS-based lowering). + */ + ctx->ac.lds = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), + "__lds_end", AC_ADDR_SPACE_LDS); + LLVMSetAlignment(ctx->ac.lds, 256); + } else { + ac_declare_lds_as_pointer(&ctx->ac); + } + } + + /* Unlike radv, we override these arguments in the prolog, so to the + * API shader they appear as normal arguments. + */ + if (ctx->type == PIPE_SHADER_VERTEX) { + ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id); + ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id); + } else if (ctx->type == PIPE_SHADER_FRAGMENT) { + ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid); + ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid); + } } /* For the UMR disassembler. 
*/ -#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ -#define DEBUGGER_NUM_MARKERS 5 +#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ +#define DEBUGGER_NUM_MARKERS 5 -static bool si_shader_binary_open(struct si_screen *screen, - struct si_shader *shader, - struct ac_rtld_binary *rtld) +static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader, + struct ac_rtld_binary *rtld) { - const struct si_shader_selector *sel = shader->selector; - const char *part_elfs[5]; - size_t part_sizes[5]; - unsigned num_parts = 0; - -#define add_part(shader_or_part) \ - if (shader_or_part) { \ - part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \ - part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \ - num_parts++; \ - } - - add_part(shader->prolog); - add_part(shader->previous_stage); - add_part(shader->prolog2); - add_part(shader); - add_part(shader->epilog); + const struct si_shader_selector *sel = shader->selector; + const char *part_elfs[5]; + size_t part_sizes[5]; + unsigned num_parts = 0; + +#define add_part(shader_or_part) \ + if (shader_or_part) { \ + part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \ + part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \ + num_parts++; \ + } + + add_part(shader->prolog); + add_part(shader->previous_stage); + add_part(shader->prolog2); + add_part(shader); + add_part(shader->epilog); #undef add_part - struct ac_rtld_symbol lds_symbols[2]; - unsigned num_lds_symbols = 0; - - if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader && - (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) { - /* We add this symbol even on LLVM <= 8 to ensure that - * shader->config.lds_size is set correctly below. - */ - struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; - sym->name = "esgs_ring"; - sym->size = shader->gs_info.esgs_ring_size; - sym->align = 64 * 1024; - } - - if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) { - struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; - sym->name = "ngg_emit"; - sym->size = shader->ngg.ngg_emit_size * 4; - sym->align = 4; - } - - bool ok = ac_rtld_open(rtld, (struct ac_rtld_open_info){ - .info = &screen->info, - .options = { - .halt_at_entry = screen->options.halt_shaders, - }, - .shader_type = tgsi_processor_to_shader_stage(sel->type), - .wave_size = si_get_shader_wave_size(shader), - .num_parts = num_parts, - .elf_ptrs = part_elfs, - .elf_sizes = part_sizes, - .num_shared_lds_symbols = num_lds_symbols, - .shared_lds_symbols = lds_symbols }); - - if (rtld->lds_size > 0) { - unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256; - shader->config.lds_size = - align(rtld->lds_size, alloc_granularity) / alloc_granularity; - } - - return ok; + struct ac_rtld_symbol lds_symbols[2]; + unsigned num_lds_symbols = 0; + + if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader && + (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) { + /* We add this symbol even on LLVM <= 8 to ensure that + * shader->config.lds_size is set correctly below. 
+ */ + struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; + sym->name = "esgs_ring"; + sym->size = shader->gs_info.esgs_ring_size; + sym->align = 64 * 1024; + } + + if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) { + struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; + sym->name = "ngg_emit"; + sym->size = shader->ngg.ngg_emit_size * 4; + sym->align = 4; + } + + bool ok = ac_rtld_open( + rtld, (struct ac_rtld_open_info){.info = &screen->info, + .options = + { + .halt_at_entry = screen->options.halt_shaders, + }, + .shader_type = tgsi_processor_to_shader_stage(sel->type), + .wave_size = si_get_shader_wave_size(shader), + .num_parts = num_parts, + .elf_ptrs = part_elfs, + .elf_sizes = part_sizes, + .num_shared_lds_symbols = num_lds_symbols, + .shared_lds_symbols = lds_symbols}); + + if (rtld->lds_size > 0) { + unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256; + shader->config.lds_size = align(rtld->lds_size, alloc_granularity) / alloc_granularity; + } + + return ok; } static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader) { - struct ac_rtld_binary rtld; - si_shader_binary_open(screen, shader, &rtld); - return rtld.exec_size; + struct ac_rtld_binary rtld; + si_shader_binary_open(screen, shader, &rtld); + return rtld.exec_size; } static bool si_get_external_symbol(void *data, const char *name, uint64_t *value) { - uint64_t *scratch_va = data; - - if (!strcmp(scratch_rsrc_dword0_symbol, name)) { - *value = (uint32_t)*scratch_va; - return true; - } - if (!strcmp(scratch_rsrc_dword1_symbol, name)) { - /* Enable scratch coalescing. */ - *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | - S_008F04_SWIZZLE_ENABLE(1); - return true; - } - - return false; + uint64_t *scratch_va = data; + + if (!strcmp(scratch_rsrc_dword0_symbol, name)) { + *value = (uint32_t)*scratch_va; + return true; + } + if (!strcmp(scratch_rsrc_dword1_symbol, name)) { + /* Enable scratch coalescing. */ + *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1); + return true; + } + + return false; } bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader, - uint64_t scratch_va) + uint64_t scratch_va) { - struct ac_rtld_binary binary; - if (!si_shader_binary_open(sscreen, shader, &binary)) - return false; - - si_resource_reference(&shader->bo, NULL); - shader->bo = si_aligned_buffer_create(&sscreen->b, - sscreen->info.cpdma_prefetch_writes_memory ? - 0 : SI_RESOURCE_FLAG_READ_ONLY, - PIPE_USAGE_IMMUTABLE, - align(binary.rx_size, SI_CPDMA_ALIGNMENT), - 256); - if (!shader->bo) - return false; - - /* Upload. */ - struct ac_rtld_upload_info u = {}; - u.binary = &binary; - u.get_external_symbol = si_get_external_symbol; - u.cb_data = &scratch_va; - u.rx_va = shader->bo->gpu_address; - u.rx_ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL, - PIPE_TRANSFER_READ_WRITE | - PIPE_TRANSFER_UNSYNCHRONIZED | - RADEON_TRANSFER_TEMPORARY); - if (!u.rx_ptr) - return false; - - bool ok = ac_rtld_upload(&u); - - sscreen->ws->buffer_unmap(shader->bo->buf); - ac_rtld_close(&binary); - - return ok; + struct ac_rtld_binary binary; + if (!si_shader_binary_open(sscreen, shader, &binary)) + return false; + + si_resource_reference(&shader->bo, NULL); + shader->bo = si_aligned_buffer_create( + &sscreen->b, sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY, + PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256); + if (!shader->bo) + return false; + + /* Upload. 
*/ + struct ac_rtld_upload_info u = {}; + u.binary = &binary; + u.get_external_symbol = si_get_external_symbol; + u.cb_data = &scratch_va; + u.rx_va = shader->bo->gpu_address; + u.rx_ptr = sscreen->ws->buffer_map( + shader->bo->buf, NULL, + PIPE_TRANSFER_READ_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED | RADEON_TRANSFER_TEMPORARY); + if (!u.rx_ptr) + return false; + + bool ok = ac_rtld_upload(&u); + + sscreen->ws->buffer_unmap(shader->bo->buf); + ac_rtld_close(&binary); + + return ok; } static void si_shader_dump_disassembly(struct si_screen *screen, - const struct si_shader_binary *binary, - enum pipe_shader_type shader_type, - unsigned wave_size, - struct pipe_debug_callback *debug, - const char *name, FILE *file) + const struct si_shader_binary *binary, + enum pipe_shader_type shader_type, unsigned wave_size, + struct pipe_debug_callback *debug, const char *name, + FILE *file) { - struct ac_rtld_binary rtld_binary; - - if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){ - .info = &screen->info, - .shader_type = tgsi_processor_to_shader_stage(shader_type), - .wave_size = wave_size, - .num_parts = 1, - .elf_ptrs = &binary->elf_buffer, - .elf_sizes = &binary->elf_size })) - return; - - const char *disasm; - size_t nbytes; - - if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes)) - goto out; - - if (nbytes > INT_MAX) - goto out; - - if (debug && debug->debug_message) { - /* Very long debug messages are cut off, so send the - * disassembly one line at a time. This causes more - * overhead, but on the plus side it simplifies - * parsing of resulting logs. - */ - pipe_debug_message(debug, SHADER_INFO, - "Shader Disassembly Begin"); - - uint64_t line = 0; - while (line < nbytes) { - int count = nbytes - line; - const char *nl = memchr(disasm + line, '\n', nbytes - line); - if (nl) - count = nl - (disasm + line); - - if (count) { - pipe_debug_message(debug, SHADER_INFO, - "%.*s", count, disasm + line); - } - - line += count + 1; - } - - pipe_debug_message(debug, SHADER_INFO, - "Shader Disassembly End"); - } - - if (file) { - fprintf(file, "Shader %s disassembly:\n", name); - fprintf(file, "%*s", (int)nbytes, disasm); - } + struct ac_rtld_binary rtld_binary; + + if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){ + .info = &screen->info, + .shader_type = tgsi_processor_to_shader_stage(shader_type), + .wave_size = wave_size, + .num_parts = 1, + .elf_ptrs = &binary->elf_buffer, + .elf_sizes = &binary->elf_size})) + return; + + const char *disasm; + size_t nbytes; + + if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes)) + goto out; + + if (nbytes > INT_MAX) + goto out; + + if (debug && debug->debug_message) { + /* Very long debug messages are cut off, so send the + * disassembly one line at a time. This causes more + * overhead, but on the plus side it simplifies + * parsing of resulting logs. 
+ */ + pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin"); + + uint64_t line = 0; + while (line < nbytes) { + int count = nbytes - line; + const char *nl = memchr(disasm + line, '\n', nbytes - line); + if (nl) + count = nl - (disasm + line); + + if (count) { + pipe_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line); + } + + line += count + 1; + } + + pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly End"); + } + + if (file) { + fprintf(file, "Shader %s disassembly:\n", name); + fprintf(file, "%*s", (int)nbytes, disasm); + } out: - ac_rtld_close(&rtld_binary); + ac_rtld_close(&rtld_binary); } static void si_calculate_max_simd_waves(struct si_shader *shader) { - struct si_screen *sscreen = shader->selector->screen; - struct ac_shader_config *conf = &shader->config; - unsigned num_inputs = shader->selector->info.num_inputs; - unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256; - unsigned lds_per_wave = 0; - unsigned max_simd_waves; - - max_simd_waves = sscreen->info.max_wave64_per_simd; - - /* Compute LDS usage for PS. */ - switch (shader->selector->type) { - case PIPE_SHADER_FRAGMENT: - /* The minimum usage per wave is (num_inputs * 48). The maximum - * usage is (num_inputs * 48 * 16). - * We can get anything in between and it varies between waves. - * - * The 48 bytes per input for a single primitive is equal to - * 4 bytes/component * 4 components/input * 3 points. - * - * Other stages don't know the size at compile time or don't - * allocate LDS per wave, but instead they do it per thread group. - */ - lds_per_wave = conf->lds_size * lds_increment + - align(num_inputs * 48, lds_increment); - break; - case PIPE_SHADER_COMPUTE: - if (shader->selector) { - unsigned max_workgroup_size = - si_get_max_workgroup_size(shader); - lds_per_wave = (conf->lds_size * lds_increment) / - DIV_ROUND_UP(max_workgroup_size, - sscreen->compute_wave_size); - } - break; - default:; - } - - /* Compute the per-SIMD wave counts. */ - if (conf->num_sgprs) { - max_simd_waves = - MIN2(max_simd_waves, - sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs); - } - - if (conf->num_vgprs) { - /* Always print wave limits as Wave64, so that we can compare - * Wave32 and Wave64 with shader-db fairly. */ - unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd; - max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs); - } - - unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4; - if (lds_per_wave) - max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave); - - shader->info.max_simd_waves = max_simd_waves; + struct si_screen *sscreen = shader->selector->screen; + struct ac_shader_config *conf = &shader->config; + unsigned num_inputs = shader->selector->info.num_inputs; + unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256; + unsigned lds_per_wave = 0; + unsigned max_simd_waves; + + max_simd_waves = sscreen->info.max_wave64_per_simd; + + /* Compute LDS usage for PS. */ + switch (shader->selector->type) { + case PIPE_SHADER_FRAGMENT: + /* The minimum usage per wave is (num_inputs * 48). The maximum + * usage is (num_inputs * 48 * 16). + * We can get anything in between and it varies between waves. + * + * The 48 bytes per input for a single primitive is equal to + * 4 bytes/component * 4 components/input * 3 points. + * + * Other stages don't know the size at compile time or don't + * allocate LDS per wave, but instead they do it per thread group. 
+ */ + lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment); + break; + case PIPE_SHADER_COMPUTE: + if (shader->selector) { + unsigned max_workgroup_size = si_get_max_workgroup_size(shader); + lds_per_wave = (conf->lds_size * lds_increment) / + DIV_ROUND_UP(max_workgroup_size, sscreen->compute_wave_size); + } + break; + default:; + } + + /* Compute the per-SIMD wave counts. */ + if (conf->num_sgprs) { + max_simd_waves = + MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs); + } + + if (conf->num_vgprs) { + /* Always print wave limits as Wave64, so that we can compare + * Wave32 and Wave64 with shader-db fairly. */ + unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd; + max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs); + } + + unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4; + if (lds_per_wave) + max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave); + + shader->info.max_simd_waves = max_simd_waves; } -void si_shader_dump_stats_for_shader_db(struct si_screen *screen, - struct si_shader *shader, - struct pipe_debug_callback *debug) +void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader, + struct pipe_debug_callback *debug) { - const struct ac_shader_config *conf = &shader->config; - - if (screen->options.debug_disassembly) - si_shader_dump_disassembly(screen, &shader->binary, - shader->selector->type, - si_get_shader_wave_size(shader), - debug, "main", NULL); - - pipe_debug_message(debug, SHADER_INFO, - "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " - "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d " - "Spilled VGPRs: %d PrivMem VGPRs: %d", - conf->num_sgprs, conf->num_vgprs, - si_get_shader_binary_size(screen, shader), - conf->lds_size, conf->scratch_bytes_per_wave, - shader->info.max_simd_waves, conf->spilled_sgprs, - conf->spilled_vgprs, shader->info.private_mem_vgprs); + const struct ac_shader_config *conf = &shader->config; + + if (screen->options.debug_disassembly) + si_shader_dump_disassembly(screen, &shader->binary, shader->selector->type, + si_get_shader_wave_size(shader), debug, "main", NULL); + + pipe_debug_message(debug, SHADER_INFO, + "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " + "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d " + "Spilled VGPRs: %d PrivMem VGPRs: %d", + conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader), + conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves, + conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs); } -static void si_shader_dump_stats(struct si_screen *sscreen, - struct si_shader *shader, - FILE *file, - bool check_debug_option) +static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file, + bool check_debug_option) { - const struct ac_shader_config *conf = &shader->config; - - if (!check_debug_option || - si_can_dump_shader(sscreen, shader->selector->type)) { - if (shader->selector->type == PIPE_SHADER_FRAGMENT) { - fprintf(file, "*** SHADER CONFIG ***\n" - "SPI_PS_INPUT_ADDR = 0x%04x\n" - "SPI_PS_INPUT_ENA = 0x%04x\n", - conf->spi_ps_input_addr, conf->spi_ps_input_ena); - } - - fprintf(file, "*** SHADER STATS ***\n" - "SGPRS: %d\n" - "VGPRS: %d\n" - "Spilled SGPRs: %d\n" - "Spilled VGPRs: %d\n" - "Private memory VGPRs: %d\n" - "Code Size: %d bytes\n" - "LDS: %d blocks\n" - "Scratch: %d bytes per wave\n" - "Max Waves: %d\n" - 
"********************\n\n\n", - conf->num_sgprs, conf->num_vgprs, - conf->spilled_sgprs, conf->spilled_vgprs, - shader->info.private_mem_vgprs, - si_get_shader_binary_size(sscreen, shader), - conf->lds_size, conf->scratch_bytes_per_wave, - shader->info.max_simd_waves); - } + const struct ac_shader_config *conf = &shader->config; + + if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->type)) { + if (shader->selector->type == PIPE_SHADER_FRAGMENT) { + fprintf(file, + "*** SHADER CONFIG ***\n" + "SPI_PS_INPUT_ADDR = 0x%04x\n" + "SPI_PS_INPUT_ENA = 0x%04x\n", + conf->spi_ps_input_addr, conf->spi_ps_input_ena); + } + + fprintf(file, + "*** SHADER STATS ***\n" + "SGPRS: %d\n" + "VGPRS: %d\n" + "Spilled SGPRs: %d\n" + "Spilled VGPRs: %d\n" + "Private memory VGPRs: %d\n" + "Code Size: %d bytes\n" + "LDS: %d blocks\n" + "Scratch: %d bytes per wave\n" + "Max Waves: %d\n" + "********************\n\n\n", + conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs, + shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader), + conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves); + } } const char *si_get_shader_name(const struct si_shader *shader) { - switch (shader->selector->type) { - case PIPE_SHADER_VERTEX: - if (shader->key.as_es) - return "Vertex Shader as ES"; - else if (shader->key.as_ls) - return "Vertex Shader as LS"; - else if (shader->key.opt.vs_as_prim_discard_cs) - return "Vertex Shader as Primitive Discard CS"; - else if (shader->key.as_ngg) - return "Vertex Shader as ESGS"; - else - return "Vertex Shader as VS"; - case PIPE_SHADER_TESS_CTRL: - return "Tessellation Control Shader"; - case PIPE_SHADER_TESS_EVAL: - if (shader->key.as_es) - return "Tessellation Evaluation Shader as ES"; - else if (shader->key.as_ngg) - return "Tessellation Evaluation Shader as ESGS"; - else - return "Tessellation Evaluation Shader as VS"; - case PIPE_SHADER_GEOMETRY: - if (shader->is_gs_copy_shader) - return "GS Copy Shader as VS"; - else - return "Geometry Shader"; - case PIPE_SHADER_FRAGMENT: - return "Pixel Shader"; - case PIPE_SHADER_COMPUTE: - return "Compute Shader"; - default: - return "Unknown Shader"; - } + switch (shader->selector->type) { + case PIPE_SHADER_VERTEX: + if (shader->key.as_es) + return "Vertex Shader as ES"; + else if (shader->key.as_ls) + return "Vertex Shader as LS"; + else if (shader->key.opt.vs_as_prim_discard_cs) + return "Vertex Shader as Primitive Discard CS"; + else if (shader->key.as_ngg) + return "Vertex Shader as ESGS"; + else + return "Vertex Shader as VS"; + case PIPE_SHADER_TESS_CTRL: + return "Tessellation Control Shader"; + case PIPE_SHADER_TESS_EVAL: + if (shader->key.as_es) + return "Tessellation Evaluation Shader as ES"; + else if (shader->key.as_ngg) + return "Tessellation Evaluation Shader as ESGS"; + else + return "Tessellation Evaluation Shader as VS"; + case PIPE_SHADER_GEOMETRY: + if (shader->is_gs_copy_shader) + return "GS Copy Shader as VS"; + else + return "Geometry Shader"; + case PIPE_SHADER_FRAGMENT: + return "Pixel Shader"; + case PIPE_SHADER_COMPUTE: + return "Compute Shader"; + default: + return "Unknown Shader"; + } } void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, - struct pipe_debug_callback *debug, - FILE *file, bool check_debug_option) + struct pipe_debug_callback *debug, FILE *file, bool check_debug_option) { - enum pipe_shader_type shader_type = shader->selector->type; - - if (!check_debug_option || - si_can_dump_shader(sscreen, shader_type)) 
- si_dump_shader_key(shader, file); - - if (!check_debug_option && shader->binary.llvm_ir_string) { - if (shader->previous_stage && - shader->previous_stage->binary.llvm_ir_string) { - fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", - si_get_shader_name(shader)); - fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string); - } - - fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", - si_get_shader_name(shader)); - fprintf(file, "%s\n", shader->binary.llvm_ir_string); - } - - if (!check_debug_option || - (si_can_dump_shader(sscreen, shader_type) && - !(sscreen->debug_flags & DBG(NO_ASM)))) { - unsigned wave_size = si_get_shader_wave_size(shader); - - fprintf(file, "\n%s:\n", si_get_shader_name(shader)); - - if (shader->prolog) - si_shader_dump_disassembly(sscreen, &shader->prolog->binary, - shader_type, wave_size, debug, "prolog", file); - if (shader->previous_stage) - si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, - shader_type, wave_size, debug, "previous stage", file); - if (shader->prolog2) - si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, - shader_type, wave_size, debug, "prolog2", file); - - si_shader_dump_disassembly(sscreen, &shader->binary, shader_type, - wave_size, debug, "main", file); - - if (shader->epilog) - si_shader_dump_disassembly(sscreen, &shader->epilog->binary, - shader_type, wave_size, debug, "epilog", file); - fprintf(file, "\n"); - } - - si_shader_dump_stats(sscreen, shader, file, check_debug_option); + enum pipe_shader_type shader_type = shader->selector->type; + + if (!check_debug_option || si_can_dump_shader(sscreen, shader_type)) + si_dump_shader_key(shader, file); + + if (!check_debug_option && shader->binary.llvm_ir_string) { + if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) { + fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader)); + fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string); + } + + fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader)); + fprintf(file, "%s\n", shader->binary.llvm_ir_string); + } + + if (!check_debug_option || + (si_can_dump_shader(sscreen, shader_type) && !(sscreen->debug_flags & DBG(NO_ASM)))) { + unsigned wave_size = si_get_shader_wave_size(shader); + + fprintf(file, "\n%s:\n", si_get_shader_name(shader)); + + if (shader->prolog) + si_shader_dump_disassembly(sscreen, &shader->prolog->binary, shader_type, wave_size, debug, + "prolog", file); + if (shader->previous_stage) + si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, shader_type, + wave_size, debug, "previous stage", file); + if (shader->prolog2) + si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, shader_type, wave_size, + debug, "prolog2", file); + + si_shader_dump_disassembly(sscreen, &shader->binary, shader_type, wave_size, debug, "main", + file); + + if (shader->epilog) + si_shader_dump_disassembly(sscreen, &shader->epilog->binary, shader_type, wave_size, debug, + "epilog", file); + fprintf(file, "\n"); + } + + si_shader_dump_stats(sscreen, shader, file, check_debug_option); } static void si_dump_shader_key_vs(const struct si_shader_key *key, - const struct si_vs_prolog_bits *prolog, - const char *prefix, FILE *f) + const struct si_vs_prolog_bits *prolog, const char *prefix, + FILE *f) { - fprintf(f, " %s.instance_divisor_is_one = %u\n", - prefix, prolog->instance_divisor_is_one); - fprintf(f, " %s.instance_divisor_is_fetched = %u\n", - prefix, prolog->instance_divisor_is_fetched); - 
fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n", - prefix, prolog->unpack_instance_id_from_vertex_id); - fprintf(f, " %s.ls_vgpr_fix = %u\n", - prefix, prolog->ls_vgpr_fix); - - fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode); - fprintf(f, " mono.vs.fix_fetch = {"); - for (int i = 0; i < SI_MAX_ATTRIBS; i++) { - union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i]; - if (i) - fprintf(f, ", "); - if (!fix.bits) - fprintf(f, "0"); - else - fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, - fix.u.num_channels_m1, fix.u.format); - } - fprintf(f, "}\n"); + fprintf(f, " %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one); + fprintf(f, " %s.instance_divisor_is_fetched = %u\n", prefix, + prolog->instance_divisor_is_fetched); + fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n", prefix, + prolog->unpack_instance_id_from_vertex_id); + fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix); + + fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode); + fprintf(f, " mono.vs.fix_fetch = {"); + for (int i = 0; i < SI_MAX_ATTRIBS; i++) { + union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i]; + if (i) + fprintf(f, ", "); + if (!fix.bits) + fprintf(f, "0"); + else + fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1, + fix.u.format); + } + fprintf(f, "}\n"); } static void si_dump_shader_key(const struct si_shader *shader, FILE *f) { - const struct si_shader_key *key = &shader->key; - enum pipe_shader_type shader_type = shader->selector->type; - - fprintf(f, "SHADER KEY\n"); - - switch (shader_type) { - case PIPE_SHADER_VERTEX: - si_dump_shader_key_vs(key, &key->part.vs.prolog, - "part.vs.prolog", f); - fprintf(f, " as_es = %u\n", key->as_es); - fprintf(f, " as_ls = %u\n", key->as_ls); - fprintf(f, " as_ngg = %u\n", key->as_ngg); - fprintf(f, " mono.u.vs_export_prim_id = %u\n", - key->mono.u.vs_export_prim_id); - fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", - key->opt.vs_as_prim_discard_cs); - fprintf(f, " opt.cs_prim_type = %s\n", - tgsi_primitive_names[key->opt.cs_prim_type]); - fprintf(f, " opt.cs_indexed = %u\n", - key->opt.cs_indexed); - fprintf(f, " opt.cs_instancing = %u\n", - key->opt.cs_instancing); - fprintf(f, " opt.cs_primitive_restart = %u\n", - key->opt.cs_primitive_restart); - fprintf(f, " opt.cs_provoking_vertex_first = %u\n", - key->opt.cs_provoking_vertex_first); - fprintf(f, " opt.cs_need_correct_orientation = %u\n", - key->opt.cs_need_correct_orientation); - fprintf(f, " opt.cs_cull_front = %u\n", - key->opt.cs_cull_front); - fprintf(f, " opt.cs_cull_back = %u\n", - key->opt.cs_cull_back); - fprintf(f, " opt.cs_cull_z = %u\n", - key->opt.cs_cull_z); - fprintf(f, " opt.cs_halfz_clip_space = %u\n", - key->opt.cs_halfz_clip_space); - break; - - case PIPE_SHADER_TESS_CTRL: - if (shader->selector->screen->info.chip_class >= GFX9) { - si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, - "part.tcs.ls_prolog", f); - } - fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode); - fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy); - break; - - case PIPE_SHADER_TESS_EVAL: - fprintf(f, " as_es = %u\n", key->as_es); - fprintf(f, " as_ngg = %u\n", key->as_ngg); - fprintf(f, " mono.u.vs_export_prim_id = %u\n", - key->mono.u.vs_export_prim_id); - break; - - case PIPE_SHADER_GEOMETRY: - if (shader->is_gs_copy_shader) - break; - - if (shader->selector->screen->info.chip_class >= GFX9 && - 
key->part.gs.es->type == PIPE_SHADER_VERTEX) { - si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, - "part.gs.vs_prolog", f); - } - fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix); - fprintf(f, " part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs); - fprintf(f, " as_ngg = %u\n", key->as_ngg); - break; - - case PIPE_SHADER_COMPUTE: - break; - - case PIPE_SHADER_FRAGMENT: - fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side); - fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors); - fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple); - fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp); - fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp); - fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp); - fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp); - fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp); - fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear); - fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n", key->part.ps.prolog.samplemask_log_ps_iter); - fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format); - fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8); - fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10); - fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf); - fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func); - fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one); - fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing); - fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color); - fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n", key->mono.u.ps.interpolate_at_sample_force_center); - fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa); - fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D); - fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered); - break; - - default: - assert(0); - } - - if ((shader_type == PIPE_SHADER_GEOMETRY || - shader_type == PIPE_SHADER_TESS_EVAL || - shader_type == PIPE_SHADER_VERTEX) && - !key->as_es && !key->as_ls) { - fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs); - fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable); - if (shader_type != PIPE_SHADER_GEOMETRY) - fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling); - } + const struct si_shader_key *key = &shader->key; + enum pipe_shader_type shader_type = shader->selector->type; + + fprintf(f, "SHADER KEY\n"); + + switch (shader_type) { + case PIPE_SHADER_VERTEX: + si_dump_shader_key_vs(key, &key->part.vs.prolog, "part.vs.prolog", f); + fprintf(f, " as_es = %u\n", key->as_es); + fprintf(f, " as_ls = %u\n", key->as_ls); + fprintf(f, " as_ngg = %u\n", key->as_ngg); + fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); + fprintf(f, " 
opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs); + fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]); + fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed); + fprintf(f, " opt.cs_instancing = %u\n", key->opt.cs_instancing); + fprintf(f, " opt.cs_primitive_restart = %u\n", key->opt.cs_primitive_restart); + fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first); + fprintf(f, " opt.cs_need_correct_orientation = %u\n", key->opt.cs_need_correct_orientation); + fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front); + fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back); + fprintf(f, " opt.cs_cull_z = %u\n", key->opt.cs_cull_z); + fprintf(f, " opt.cs_halfz_clip_space = %u\n", key->opt.cs_halfz_clip_space); + break; + + case PIPE_SHADER_TESS_CTRL: + if (shader->selector->screen->info.chip_class >= GFX9) { + si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, "part.tcs.ls_prolog", f); + } + fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode); + fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n", + key->mono.u.ff_tcs_inputs_to_copy); + break; + + case PIPE_SHADER_TESS_EVAL: + fprintf(f, " as_es = %u\n", key->as_es); + fprintf(f, " as_ngg = %u\n", key->as_ngg); + fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); + break; + + case PIPE_SHADER_GEOMETRY: + if (shader->is_gs_copy_shader) + break; + + if (shader->selector->screen->info.chip_class >= GFX9 && + key->part.gs.es->type == PIPE_SHADER_VERTEX) { + si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, "part.gs.vs_prolog", f); + } + fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", + key->part.gs.prolog.tri_strip_adj_fix); + fprintf(f, " part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs); + fprintf(f, " as_ngg = %u\n", key->as_ngg); + break; + + case PIPE_SHADER_COMPUTE: + break; + + case PIPE_SHADER_FRAGMENT: + fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side); + fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors); + fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple); + fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", + key->part.ps.prolog.force_persp_sample_interp); + fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", + key->part.ps.prolog.force_linear_sample_interp); + fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", + key->part.ps.prolog.force_persp_center_interp); + fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", + key->part.ps.prolog.force_linear_center_interp); + fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", + key->part.ps.prolog.bc_optimize_for_persp); + fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", + key->part.ps.prolog.bc_optimize_for_linear); + fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n", + key->part.ps.prolog.samplemask_log_ps_iter); + fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", + key->part.ps.epilog.spi_shader_col_format); + fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8); + fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10); + fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf); + fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func); + fprintf(f, " part.ps.epilog.alpha_to_one = 
%u\n", key->part.ps.epilog.alpha_to_one); + fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", + key->part.ps.epilog.poly_line_smoothing); + fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color); + fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n", + key->mono.u.ps.interpolate_at_sample_force_center); + fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa); + fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D); + fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered); + break; + + default: + assert(0); + } + + if ((shader_type == PIPE_SHADER_GEOMETRY || shader_type == PIPE_SHADER_TESS_EVAL || + shader_type == PIPE_SHADER_VERTEX) && + !key->as_es && !key->as_ls) { + fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->opt.kill_outputs); + fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable); + if (shader_type != PIPE_SHADER_GEOMETRY) + fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling); + } } static void si_optimize_vs_outputs(struct si_shader_context *ctx) { - struct si_shader *shader = ctx->shader; - struct si_shader_info *info = &shader->selector->info; - - if ((ctx->type != PIPE_SHADER_VERTEX && - ctx->type != PIPE_SHADER_TESS_EVAL) || - shader->key.as_ls || - shader->key.as_es) - return; - - ac_optimize_vs_outputs(&ctx->ac, - ctx->main_fn, - shader->info.vs_output_param_offset, - info->num_outputs, - &shader->info.nr_param_exports); + struct si_shader *shader = ctx->shader; + struct si_shader_info *info = &shader->selector->info; + + if ((ctx->type != PIPE_SHADER_VERTEX && ctx->type != PIPE_SHADER_TESS_EVAL) || + shader->key.as_ls || shader->key.as_es) + return; + + ac_optimize_vs_outputs(&ctx->ac, ctx->main_fn, shader->info.vs_output_param_offset, + info->num_outputs, &shader->info.nr_param_exports); } static bool si_vs_needs_prolog(const struct si_shader_selector *sel, - const struct si_vs_prolog_bits *prolog_key, - const struct si_shader_key *key, - bool ngg_cull_shader) + const struct si_vs_prolog_bits *prolog_key, + const struct si_shader_key *key, bool ngg_cull_shader) { - /* VGPR initialization fixup for Vega10 and Raven is always done in the - * VS prolog. */ - return sel->vs_needs_prolog || - prolog_key->ls_vgpr_fix || - prolog_key->unpack_instance_id_from_vertex_id || - (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); + /* VGPR initialization fixup for Vega10 and Raven is always done in the + * VS prolog. 
*/ + return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix || + prolog_key->unpack_instance_id_from_vertex_id || + (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); } -static bool si_build_main_function(struct si_shader_context *ctx, - struct si_shader *shader, - struct nir_shader *nir, bool free_nir, - bool ngg_cull_shader) +static bool si_build_main_function(struct si_shader_context *ctx, struct si_shader *shader, + struct nir_shader *nir, bool free_nir, bool ngg_cull_shader) { - struct si_shader_selector *sel = shader->selector; - const struct si_shader_info *info = &sel->info; - - ctx->shader = shader; - ctx->type = sel->type; - - ctx->num_const_buffers = util_last_bit(info->const_buffers_declared); - ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared); - - ctx->num_samplers = util_last_bit(info->samplers_declared); - ctx->num_images = util_last_bit(info->images_declared); - - si_llvm_init_resource_callbacks(ctx); - - switch (ctx->type) { - case PIPE_SHADER_VERTEX: - si_llvm_init_vs_callbacks(ctx, ngg_cull_shader); - break; - case PIPE_SHADER_TESS_CTRL: - si_llvm_init_tcs_callbacks(ctx); - break; - case PIPE_SHADER_TESS_EVAL: - si_llvm_init_tes_callbacks(ctx, ngg_cull_shader); - break; - case PIPE_SHADER_GEOMETRY: - si_llvm_init_gs_callbacks(ctx); - break; - case PIPE_SHADER_FRAGMENT: - si_llvm_init_ps_callbacks(ctx); - break; - case PIPE_SHADER_COMPUTE: - ctx->abi.load_local_group_size = si_llvm_get_block_size; - break; - default: - assert(!"Unsupported shader type"); - return false; - } - - si_create_function(ctx, ngg_cull_shader); - - if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY) - si_preload_esgs_ring(ctx); - - if (ctx->type == PIPE_SHADER_GEOMETRY) - si_preload_gs_rings(ctx); - else if (ctx->type == PIPE_SHADER_TESS_EVAL) - si_llvm_preload_tes_rings(ctx); - - if (ctx->type == PIPE_SHADER_TESS_CTRL && - sel->info.tessfactors_are_def_in_all_invocs) { - for (unsigned i = 0; i < 6; i++) { - ctx->invoc0_tess_factors[i] = - ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - } - } - - if (ctx->type == PIPE_SHADER_GEOMETRY) { - for (unsigned i = 0; i < 4; i++) { - ctx->gs_next_vertex[i] = - ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); - } - if (shader->key.as_ngg) { - for (unsigned i = 0; i < 4; ++i) { - ctx->gs_curprim_verts[i] = - ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); - ctx->gs_generated_prims[i] = - ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); - } - - unsigned scratch_size = 8; - if (sel->so.num_outputs) - scratch_size = 44; - - assert(!ctx->gs_ngg_scratch); - LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, scratch_size); - ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module, - ai32, "ngg_scratch", AC_ADDR_SPACE_LDS); - LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32)); - LLVMSetAlignment(ctx->gs_ngg_scratch, 4); - - ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module, - LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); - LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage); - LLVMSetAlignment(ctx->gs_ngg_emit, 4); - } - } - - if (ctx->type != PIPE_SHADER_GEOMETRY && - (shader->key.as_ngg && !shader->key.as_es)) { - /* Unconditionally declare scratch space base for streamout and - * vertex compaction. Whether space is actually allocated is - * determined during linking / PM4 creation. - * - * Add an extra dword per vertex to ensure an odd stride, which - * avoids bank conflicts for SoA accesses. 
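
The "odd stride" comment above is about LDS banking: LDS is typically organized as 32 dword-wide banks on this class of hardware, so an SoA attribute array whose per-vertex stride is a multiple of the bank count keeps hitting the same bank and the accesses serialize. A small self-contained arithmetic sketch (the bank count is an illustrative constant here, not queried from the device):

#include <stdio.h>

#define NUM_LDS_BANKS 32 /* illustrative; typical dword-wide bank count on GCN */

static unsigned lds_bank(unsigned dword_index)
{
   return dword_index % NUM_LDS_BANKS;
}

int main(void)
{
   /* Attribute 0 of 8 consecutive vertices, SoA layout: address = vertex * stride. */
   for (unsigned stride = 64; stride <= 65; stride++) {
      printf("stride %2u dwords:", stride);
      for (unsigned vtx = 0; vtx < 8; vtx++)
         printf(" %2u", lds_bank(vtx * stride));
      printf("\n");
   }
   /* stride 64 maps every vertex to bank 0 (accesses serialize); the odd
    * stride 65 rotates through banks 0,1,2,... and they can run in parallel. */
   return 0;
}
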
- */ - if (!gfx10_is_ngg_passthrough(shader)) - si_llvm_declare_esgs_ring(ctx); - - /* This is really only needed when streamout and / or vertex - * compaction is enabled. - */ - if (!ctx->gs_ngg_scratch && - (sel->so.num_outputs || shader->key.opt.ngg_culling)) { - LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, 8); - ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module, - asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); - LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32)); - LLVMSetAlignment(ctx->gs_ngg_scratch, 4); - } - } - - /* For GFX9 merged shaders: - * - Set EXEC for the first shader. If the prolog is present, set - * EXEC there instead. - * - Add a barrier before the second shader. - * - In the second shader, reset EXEC to ~0 and wrap the main part in - * an if-statement. This is required for correctness in geometry - * shaders, to ensure that empty GS waves do not send GS_EMIT and - * GS_CUT messages. - * - * For monolithic merged shaders, the first shader is wrapped in an - * if-block together with its prolog in si_build_wrapper_function. - * - * NGG vertex and tess eval shaders running as the last - * vertex/geometry stage handle execution explicitly using - * if-statements. - */ - if (ctx->screen->info.chip_class >= GFX9) { - if (!shader->is_monolithic && - (shader->key.as_es || shader->key.as_ls) && - (ctx->type == PIPE_SHADER_TESS_EVAL || - (ctx->type == PIPE_SHADER_VERTEX && - !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, - &shader->key, ngg_cull_shader)))) { - si_init_exec_from_input(ctx, - ctx->merged_wave_info, 0); - } else if (ctx->type == PIPE_SHADER_TESS_CTRL || - ctx->type == PIPE_SHADER_GEOMETRY || - (shader->key.as_ngg && !shader->key.as_es)) { - LLVMValueRef thread_enabled; - bool nested_barrier; - - if (!shader->is_monolithic || - (ctx->type == PIPE_SHADER_TESS_EVAL && - shader->key.as_ngg && !shader->key.as_es && - !shader->key.opt.ngg_culling)) - ac_init_exec_full_mask(&ctx->ac); - - if ((ctx->type == PIPE_SHADER_VERTEX || - ctx->type == PIPE_SHADER_TESS_EVAL) && - shader->key.as_ngg && !shader->key.as_es && - !shader->key.opt.ngg_culling) { - gfx10_ngg_build_sendmsg_gs_alloc_req(ctx); - - /* Build the primitive export at the beginning - * of the shader if possible. - */ - if (gfx10_ngg_export_prim_early(shader)) - gfx10_ngg_build_export_prim(ctx, NULL, NULL); - } - - if (ctx->type == PIPE_SHADER_TESS_CTRL || - ctx->type == PIPE_SHADER_GEOMETRY) { - if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) { - gfx10_ngg_gs_emit_prologue(ctx); - nested_barrier = false; - } else { - nested_barrier = true; - } - - thread_enabled = si_is_gs_thread(ctx); - } else { - thread_enabled = si_is_es_thread(ctx); - nested_barrier = false; - } - - ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder); - ctx->merged_wrap_if_label = 11500; - ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label); - - if (nested_barrier) { - /* Execute a barrier before the second shader in - * a merged shader. - * - * Execute the barrier inside the conditional block, - * so that empty waves can jump directly to s_endpgm, - * which will also signal the barrier. - * - * This is possible in gfx9, because an empty wave - * for the second shader does not participate in - * the epilogue. With NGG, empty waves may still - * be required to export data (e.g. GS output vertices), - * so we cannot let them exit early. - * - * If the shader is TCS and the TCS epilog is present - * and contains a barrier, it will wait there and then - * reach s_endpgm. 
- */ - si_llvm_emit_barrier(ctx); - } - } - } - - bool success = si_nir_build_llvm(ctx, nir); - if (free_nir) - ralloc_free(nir); - if (!success) { - fprintf(stderr, "Failed to translate shader from NIR to LLVM\n"); - return false; - } - - si_llvm_build_ret(ctx, ctx->return_value); - return true; + struct si_shader_selector *sel = shader->selector; + const struct si_shader_info *info = &sel->info; + + ctx->shader = shader; + ctx->type = sel->type; + + ctx->num_const_buffers = util_last_bit(info->const_buffers_declared); + ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared); + + ctx->num_samplers = util_last_bit(info->samplers_declared); + ctx->num_images = util_last_bit(info->images_declared); + + si_llvm_init_resource_callbacks(ctx); + + switch (ctx->type) { + case PIPE_SHADER_VERTEX: + si_llvm_init_vs_callbacks(ctx, ngg_cull_shader); + break; + case PIPE_SHADER_TESS_CTRL: + si_llvm_init_tcs_callbacks(ctx); + break; + case PIPE_SHADER_TESS_EVAL: + si_llvm_init_tes_callbacks(ctx, ngg_cull_shader); + break; + case PIPE_SHADER_GEOMETRY: + si_llvm_init_gs_callbacks(ctx); + break; + case PIPE_SHADER_FRAGMENT: + si_llvm_init_ps_callbacks(ctx); + break; + case PIPE_SHADER_COMPUTE: + ctx->abi.load_local_group_size = si_llvm_get_block_size; + break; + default: + assert(!"Unsupported shader type"); + return false; + } + + si_create_function(ctx, ngg_cull_shader); + + if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY) + si_preload_esgs_ring(ctx); + + if (ctx->type == PIPE_SHADER_GEOMETRY) + si_preload_gs_rings(ctx); + else if (ctx->type == PIPE_SHADER_TESS_EVAL) + si_llvm_preload_tes_rings(ctx); + + if (ctx->type == PIPE_SHADER_TESS_CTRL && sel->info.tessfactors_are_def_in_all_invocs) { + for (unsigned i = 0; i < 6; i++) { + ctx->invoc0_tess_factors[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + } + } + + if (ctx->type == PIPE_SHADER_GEOMETRY) { + for (unsigned i = 0; i < 4; i++) { + ctx->gs_next_vertex[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); + } + if (shader->key.as_ngg) { + for (unsigned i = 0; i < 4; ++i) { + ctx->gs_curprim_verts[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); + ctx->gs_generated_prims[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); + } + + unsigned scratch_size = 8; + if (sel->so.num_outputs) + scratch_size = 44; + + assert(!ctx->gs_ngg_scratch); + LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, scratch_size); + ctx->gs_ngg_scratch = + LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS); + LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32)); + LLVMSetAlignment(ctx->gs_ngg_scratch, 4); + + ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace( + ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); + LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage); + LLVMSetAlignment(ctx->gs_ngg_emit, 4); + } + } + + if (ctx->type != PIPE_SHADER_GEOMETRY && (shader->key.as_ngg && !shader->key.as_es)) { + /* Unconditionally declare scratch space base for streamout and + * vertex compaction. Whether space is actually allocated is + * determined during linking / PM4 creation. + * + * Add an extra dword per vertex to ensure an odd stride, which + * avoids bank conflicts for SoA accesses. + */ + if (!gfx10_is_ngg_passthrough(shader)) + si_llvm_declare_esgs_ring(ctx); + + /* This is really only needed when streamout and / or vertex + * compaction is enabled. 
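
The ngg_scratch/ngg_emit declarations above go through Mesa's ac_llvm wrappers, but the underlying calls are plain LLVM-C. A condensed sketch of that pattern follows; the address-space value 3 (AMDGPU local/LDS memory) stands in for the driver's AC_ADDR_SPACE_LDS constant and should be treated as an assumption, as should the helper name.

#include <llvm-c/Core.h>

/* Sketch: declare an i32 array in LDS, mirroring the gs_ngg_scratch setup
 * above (8 dwords, or 44 when streamout needs extra space). */
static LLVMValueRef declare_lds_scratch(LLVMModuleRef module, LLVMContextRef llvm_ctx,
                                        unsigned num_dwords)
{
   const unsigned ADDR_SPACE_LDS = 3; /* assumed AMDGPU local address space */
   LLVMTypeRef i32 = LLVMInt32TypeInContext(llvm_ctx);
   LLVMTypeRef arr = LLVMArrayType(i32, num_dwords);

   LLVMValueRef scratch =
      LLVMAddGlobalInAddressSpace(module, arr, "ngg_scratch", ADDR_SPACE_LDS);
   LLVMSetInitializer(scratch, LLVMGetUndef(arr)); /* contents are undefined until written */
   LLVMSetAlignment(scratch, 4);
   return scratch;
}
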
+ */ + if (!ctx->gs_ngg_scratch && (sel->so.num_outputs || shader->key.opt.ngg_culling)) { + LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, 8); + ctx->gs_ngg_scratch = + LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); + LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32)); + LLVMSetAlignment(ctx->gs_ngg_scratch, 4); + } + } + + /* For GFX9 merged shaders: + * - Set EXEC for the first shader. If the prolog is present, set + * EXEC there instead. + * - Add a barrier before the second shader. + * - In the second shader, reset EXEC to ~0 and wrap the main part in + * an if-statement. This is required for correctness in geometry + * shaders, to ensure that empty GS waves do not send GS_EMIT and + * GS_CUT messages. + * + * For monolithic merged shaders, the first shader is wrapped in an + * if-block together with its prolog in si_build_wrapper_function. + * + * NGG vertex and tess eval shaders running as the last + * vertex/geometry stage handle execution explicitly using + * if-statements. + */ + if (ctx->screen->info.chip_class >= GFX9) { + if (!shader->is_monolithic && (shader->key.as_es || shader->key.as_ls) && + (ctx->type == PIPE_SHADER_TESS_EVAL || + (ctx->type == PIPE_SHADER_VERTEX && + !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) { + si_init_exec_from_input(ctx, ctx->merged_wave_info, 0); + } else if (ctx->type == PIPE_SHADER_TESS_CTRL || ctx->type == PIPE_SHADER_GEOMETRY || + (shader->key.as_ngg && !shader->key.as_es)) { + LLVMValueRef thread_enabled; + bool nested_barrier; + + if (!shader->is_monolithic || (ctx->type == PIPE_SHADER_TESS_EVAL && shader->key.as_ngg && + !shader->key.as_es && !shader->key.opt.ngg_culling)) + ac_init_exec_full_mask(&ctx->ac); + + if ((ctx->type == PIPE_SHADER_VERTEX || ctx->type == PIPE_SHADER_TESS_EVAL) && + shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) { + gfx10_ngg_build_sendmsg_gs_alloc_req(ctx); + + /* Build the primitive export at the beginning + * of the shader if possible. + */ + if (gfx10_ngg_export_prim_early(shader)) + gfx10_ngg_build_export_prim(ctx, NULL, NULL); + } + + if (ctx->type == PIPE_SHADER_TESS_CTRL || ctx->type == PIPE_SHADER_GEOMETRY) { + if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) { + gfx10_ngg_gs_emit_prologue(ctx); + nested_barrier = false; + } else { + nested_barrier = true; + } + + thread_enabled = si_is_gs_thread(ctx); + } else { + thread_enabled = si_is_es_thread(ctx); + nested_barrier = false; + } + + ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder); + ctx->merged_wrap_if_label = 11500; + ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label); + + if (nested_barrier) { + /* Execute a barrier before the second shader in + * a merged shader. + * + * Execute the barrier inside the conditional block, + * so that empty waves can jump directly to s_endpgm, + * which will also signal the barrier. + * + * This is possible in gfx9, because an empty wave + * for the second shader does not participate in + * the epilogue. With NGG, empty waves may still + * be required to export data (e.g. GS output vertices), + * so we cannot let them exit early. + * + * If the shader is TCS and the TCS epilog is present + * and contains a barrier, it will wait there and then + * reach s_endpgm. 
+ */ + si_llvm_emit_barrier(ctx); + } + } + } + + bool success = si_nir_build_llvm(ctx, nir); + if (free_nir) + ralloc_free(nir); + if (!success) { + fprintf(stderr, "Failed to translate shader from NIR to LLVM\n"); + return false; + } + + si_llvm_build_ret(ctx, ctx->return_value); + return true; } /** @@ -1622,425 +1510,385 @@ static bool si_build_main_function(struct si_shader_context *ctx, * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS. * \param key Output shader part key. */ -static void si_get_vs_prolog_key(const struct si_shader_info *info, - unsigned num_input_sgprs, - bool ngg_cull_shader, - const struct si_vs_prolog_bits *prolog_key, - struct si_shader *shader_out, - union si_shader_part_key *key) +static void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs, + bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key, + struct si_shader *shader_out, union si_shader_part_key *key) { - memset(key, 0, sizeof(*key)); - key->vs_prolog.states = *prolog_key; - key->vs_prolog.num_input_sgprs = num_input_sgprs; - key->vs_prolog.num_inputs = info->num_inputs; - key->vs_prolog.as_ls = shader_out->key.as_ls; - key->vs_prolog.as_es = shader_out->key.as_es; - key->vs_prolog.as_ngg = shader_out->key.as_ngg; - key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs; - - if (ngg_cull_shader) { - key->vs_prolog.gs_fast_launch_tri_list = !!(shader_out->key.opt.ngg_culling & - SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST); - key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling & - SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP); - } else { - key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling; - } - - if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) { - key->vs_prolog.as_ls = 1; - key->vs_prolog.num_merged_next_stage_vgprs = 2; - } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) { - key->vs_prolog.as_es = 1; - key->vs_prolog.num_merged_next_stage_vgprs = 5; - } else if (shader_out->key.as_ngg) { - key->vs_prolog.num_merged_next_stage_vgprs = 5; - } - - /* Only one of these combinations can be set. as_ngg can be set with as_es. */ - assert(key->vs_prolog.as_ls + - key->vs_prolog.as_ngg + - (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + - key->vs_prolog.as_prim_discard_cs <= 1); - - /* Enable loading the InstanceID VGPR. 
*/ - uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); - - if ((key->vs_prolog.states.instance_divisor_is_one | - key->vs_prolog.states.instance_divisor_is_fetched) & input_mask) - shader_out->info.uses_instanceid = true; + memset(key, 0, sizeof(*key)); + key->vs_prolog.states = *prolog_key; + key->vs_prolog.num_input_sgprs = num_input_sgprs; + key->vs_prolog.num_inputs = info->num_inputs; + key->vs_prolog.as_ls = shader_out->key.as_ls; + key->vs_prolog.as_es = shader_out->key.as_es; + key->vs_prolog.as_ngg = shader_out->key.as_ngg; + key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs; + + if (ngg_cull_shader) { + key->vs_prolog.gs_fast_launch_tri_list = + !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST); + key->vs_prolog.gs_fast_launch_tri_strip = + !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP); + } else { + key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling; + } + + if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) { + key->vs_prolog.as_ls = 1; + key->vs_prolog.num_merged_next_stage_vgprs = 2; + } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) { + key->vs_prolog.as_es = 1; + key->vs_prolog.num_merged_next_stage_vgprs = 5; + } else if (shader_out->key.as_ngg) { + key->vs_prolog.num_merged_next_stage_vgprs = 5; + } + + /* Only one of these combinations can be set. as_ngg can be set with as_es. */ + assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg + + (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <= + 1); + + /* Enable loading the InstanceID VGPR. */ + uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); + + if ((key->vs_prolog.states.instance_divisor_is_one | + key->vs_prolog.states.instance_divisor_is_fetched) & + input_mask) + shader_out->info.uses_instanceid = true; } static bool si_should_optimize_less(struct ac_llvm_compiler *compiler, - struct si_shader_selector *sel) + struct si_shader_selector *sel) { - if (!compiler->low_opt_passes) - return false; + if (!compiler->low_opt_passes) + return false; - /* Assume a slow CPU. */ - assert(!sel->screen->info.has_dedicated_vram && - sel->screen->info.chip_class <= GFX8); + /* Assume a slow CPU. */ + assert(!sel->screen->info.has_dedicated_vram && sel->screen->info.chip_class <= GFX8); - /* For a crazy dEQP test containing 2597 memory opcodes, mostly - * buffer stores. */ - return sel->type == PIPE_SHADER_COMPUTE && - sel->info.num_memory_instructions > 1000; + /* For a crazy dEQP test containing 2597 memory opcodes, mostly + * buffer stores. 
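
The InstanceID check at the end of si_get_vs_prolog_key above is pure mask arithmetic: u_bit_consecutive(0, num_inputs) builds a mask with one bit per declared attribute, and AND-ing it with the OR of the two divisor state words tells whether any real input needs InstanceID. A worked example, with the mask helper restated locally (same semantics as util's u_bit_consecutive for count < 32):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static unsigned bit_consecutive(unsigned start, unsigned count)
{
   return ((1u << count) - 1) << start; /* count < 32 assumed */
}

int main(void)
{
   unsigned num_inputs = 3;           /* attributes 0..2 are declared */
   uint16_t divisor_is_one = 0x4;     /* attribute 2 has instance divisor == 1 */
   uint16_t divisor_is_fetched = 0x0; /* no divisors fetched from a buffer */

   uint16_t input_mask = bit_consecutive(0, num_inputs); /* 0b0111 */
   bool uses_instanceid = (divisor_is_one | divisor_is_fetched) & input_mask;

   printf("input_mask=0x%x uses_instanceid=%d\n", input_mask, uses_instanceid); /* 0x7, 1 */
   return 0;
}
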
*/ + return sel->type == PIPE_SHADER_COMPUTE && sel->info.num_memory_instructions > 1000; } -static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, - bool *free_nir) +static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, bool *free_nir) { - *free_nir = false; - - if (sel->nir) { - return sel->nir; - } else if (sel->nir_binary) { - struct pipe_screen *screen = &sel->screen->b; - const void *options = - screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, - sel->type); - - struct blob_reader blob_reader; - blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size); - *free_nir = true; - return nir_deserialize(NULL, options, &blob_reader); - } - return NULL; + *free_nir = false; + + if (sel->nir) { + return sel->nir; + } else if (sel->nir_binary) { + struct pipe_screen *screen = &sel->screen->b; + const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, sel->type); + + struct blob_reader blob_reader; + blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size); + *free_nir = true; + return nir_deserialize(NULL, options, &blob_reader); + } + return NULL; } -static bool si_llvm_compile_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug, - struct nir_shader *nir, - bool free_nir) +static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug, + struct nir_shader *nir, bool free_nir) { - struct si_shader_selector *sel = shader->selector; - struct si_shader_context ctx; - - si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader)); - - LLVMValueRef ngg_cull_main_fn = NULL; - if (shader->key.opt.ngg_culling) { - if (!si_build_main_function(&ctx, shader, nir, false, true)) { - si_llvm_dispose(&ctx); - return false; - } - ngg_cull_main_fn = ctx.main_fn; - ctx.main_fn = NULL; - } - - if (!si_build_main_function(&ctx, shader, nir, free_nir, false)) { - si_llvm_dispose(&ctx); - return false; - } - - if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) { - LLVMValueRef parts[4]; - unsigned num_parts = 0; - bool has_prolog = false; - LLVMValueRef main_fn = ctx.main_fn; - - if (ngg_cull_main_fn) { - if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, - &shader->key, true)) { - union si_shader_part_key prolog_key; - si_get_vs_prolog_key(&sel->info, - shader->info.num_input_sgprs, - true, - &shader->key.part.vs.prolog, - shader, &prolog_key); - prolog_key.vs_prolog.is_monolithic = true; - si_llvm_build_vs_prolog(&ctx, &prolog_key); - parts[num_parts++] = ctx.main_fn; - has_prolog = true; - } - parts[num_parts++] = ngg_cull_main_fn; - } - - if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, - &shader->key, false)) { - union si_shader_part_key prolog_key; - si_get_vs_prolog_key(&sel->info, - shader->info.num_input_sgprs, - false, - &shader->key.part.vs.prolog, - shader, &prolog_key); - prolog_key.vs_prolog.is_monolithic = true; - si_llvm_build_vs_prolog(&ctx, &prolog_key); - parts[num_parts++] = ctx.main_fn; - has_prolog = true; - } - parts[num_parts++] = main_fn; - - si_build_wrapper_function(&ctx, parts, num_parts, - has_prolog ? 
1 : 0, 0); - - if (ctx.shader->key.opt.vs_as_prim_discard_cs) - si_build_prim_discard_compute_shader(&ctx); - } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL && - ngg_cull_main_fn) { - LLVMValueRef parts[2]; - - parts[0] = ngg_cull_main_fn; - parts[1] = ctx.main_fn; - - si_build_wrapper_function(&ctx, parts, 2, 0, 0); - } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) { - if (sscreen->info.chip_class >= GFX9) { - struct si_shader_selector *ls = shader->key.part.tcs.ls; - LLVMValueRef parts[4]; - bool vs_needs_prolog = - si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, - &shader->key, false); - - /* TCS main part */ - parts[2] = ctx.main_fn; - - /* TCS epilog */ - union si_shader_part_key tcs_epilog_key; - memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key)); - tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; - si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key); - parts[3] = ctx.main_fn; - - /* VS as LS main part */ - nir = get_nir_shader(ls, &free_nir); - struct si_shader shader_ls = {}; - shader_ls.selector = ls; - shader_ls.key.as_ls = 1; - shader_ls.key.mono = shader->key.mono; - shader_ls.key.opt = shader->key.opt; - shader_ls.is_monolithic = true; - - if (!si_build_main_function(&ctx, &shader_ls, nir, free_nir, false)) { - si_llvm_dispose(&ctx); - return false; - } - shader->info.uses_instanceid |= ls->info.uses_instanceid; - parts[1] = ctx.main_fn; - - /* LS prolog */ - if (vs_needs_prolog) { - union si_shader_part_key vs_prolog_key; - si_get_vs_prolog_key(&ls->info, - shader_ls.info.num_input_sgprs, - false, - &shader->key.part.tcs.ls_prolog, - shader, &vs_prolog_key); - vs_prolog_key.vs_prolog.is_monolithic = true; - si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); - parts[0] = ctx.main_fn; - } - - /* Reset the shader context. */ - ctx.shader = shader; - ctx.type = PIPE_SHADER_TESS_CTRL; - - si_build_wrapper_function(&ctx, - parts + !vs_needs_prolog, - 4 - !vs_needs_prolog, vs_needs_prolog, - vs_needs_prolog ? 
2 : 1); - } else { - LLVMValueRef parts[2]; - union si_shader_part_key epilog_key; - - parts[0] = ctx.main_fn; - - memset(&epilog_key, 0, sizeof(epilog_key)); - epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; - si_llvm_build_tcs_epilog(&ctx, &epilog_key); - parts[1] = ctx.main_fn; - - si_build_wrapper_function(&ctx, parts, 2, 0, 0); - } - } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) { - if (ctx.screen->info.chip_class >= GFX9) { - struct si_shader_selector *es = shader->key.part.gs.es; - LLVMValueRef es_prolog = NULL; - LLVMValueRef es_main = NULL; - LLVMValueRef gs_prolog = NULL; - LLVMValueRef gs_main = ctx.main_fn; - - /* GS prolog */ - union si_shader_part_key gs_prolog_key; - memset(&gs_prolog_key, 0, sizeof(gs_prolog_key)); - gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog; - gs_prolog_key.gs_prolog.is_monolithic = true; - gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; - si_llvm_build_gs_prolog(&ctx, &gs_prolog_key); - gs_prolog = ctx.main_fn; - - /* ES main part */ - nir = get_nir_shader(es, &free_nir); - struct si_shader shader_es = {}; - shader_es.selector = es; - shader_es.key.as_es = 1; - shader_es.key.as_ngg = shader->key.as_ngg; - shader_es.key.mono = shader->key.mono; - shader_es.key.opt = shader->key.opt; - shader_es.is_monolithic = true; - - if (!si_build_main_function(&ctx, &shader_es, nir, free_nir, false)) { - si_llvm_dispose(&ctx); - return false; - } - shader->info.uses_instanceid |= es->info.uses_instanceid; - es_main = ctx.main_fn; - - /* ES prolog */ - if (es->type == PIPE_SHADER_VERTEX && - si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, - &shader->key, false)) { - union si_shader_part_key vs_prolog_key; - si_get_vs_prolog_key(&es->info, - shader_es.info.num_input_sgprs, - false, - &shader->key.part.gs.vs_prolog, - shader, &vs_prolog_key); - vs_prolog_key.vs_prolog.is_monolithic = true; - si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); - es_prolog = ctx.main_fn; - } - - /* Reset the shader context. */ - ctx.shader = shader; - ctx.type = PIPE_SHADER_GEOMETRY; - - /* Prepare the array of shader parts. */ - LLVMValueRef parts[4]; - unsigned num_parts = 0, main_part, next_first_part; - - if (es_prolog) - parts[num_parts++] = es_prolog; - - parts[main_part = num_parts++] = es_main; - parts[next_first_part = num_parts++] = gs_prolog; - parts[num_parts++] = gs_main; - - si_build_wrapper_function(&ctx, parts, num_parts, - main_part, next_first_part); - } else { - LLVMValueRef parts[2]; - union si_shader_part_key prolog_key; - - parts[1] = ctx.main_fn; - - memset(&prolog_key, 0, sizeof(prolog_key)); - prolog_key.gs_prolog.states = shader->key.part.gs.prolog; - si_llvm_build_gs_prolog(&ctx, &prolog_key); - parts[0] = ctx.main_fn; - - si_build_wrapper_function(&ctx, parts, 2, 1, 0); - } - } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) { - si_llvm_build_monolithic_ps(&ctx, shader); - } - - si_llvm_optimize_module(&ctx); - - /* Post-optimization transformations and analysis. */ - si_optimize_vs_outputs(&ctx); - - if ((debug && debug->debug_message) || - si_can_dump_shader(sscreen, ctx.type)) { - ctx.shader->info.private_mem_vgprs = - ac_count_scratch_private_memory(ctx.main_fn); - } - - /* Make sure the input is a pointer and not integer followed by inttoptr. */ - assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == - LLVMPointerTypeKind); - - /* Compile to bytecode. 
*/ - if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, - &ctx.ac, debug, ctx.type, si_get_shader_name(shader), - si_should_optimize_less(compiler, shader->selector))) { - si_llvm_dispose(&ctx); - fprintf(stderr, "LLVM failed to compile shader\n"); - return false; - } - - si_llvm_dispose(&ctx); - return true; + struct si_shader_selector *sel = shader->selector; + struct si_shader_context ctx; + + si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader)); + + LLVMValueRef ngg_cull_main_fn = NULL; + if (shader->key.opt.ngg_culling) { + if (!si_build_main_function(&ctx, shader, nir, false, true)) { + si_llvm_dispose(&ctx); + return false; + } + ngg_cull_main_fn = ctx.main_fn; + ctx.main_fn = NULL; + } + + if (!si_build_main_function(&ctx, shader, nir, free_nir, false)) { + si_llvm_dispose(&ctx); + return false; + } + + if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) { + LLVMValueRef parts[4]; + unsigned num_parts = 0; + bool has_prolog = false; + LLVMValueRef main_fn = ctx.main_fn; + + if (ngg_cull_main_fn) { + if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, true)) { + union si_shader_part_key prolog_key; + si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, true, + &shader->key.part.vs.prolog, shader, &prolog_key); + prolog_key.vs_prolog.is_monolithic = true; + si_llvm_build_vs_prolog(&ctx, &prolog_key); + parts[num_parts++] = ctx.main_fn; + has_prolog = true; + } + parts[num_parts++] = ngg_cull_main_fn; + } + + if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, false)) { + union si_shader_part_key prolog_key; + si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, false, + &shader->key.part.vs.prolog, shader, &prolog_key); + prolog_key.vs_prolog.is_monolithic = true; + si_llvm_build_vs_prolog(&ctx, &prolog_key); + parts[num_parts++] = ctx.main_fn; + has_prolog = true; + } + parts[num_parts++] = main_fn; + + si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 
1 : 0, 0); + + if (ctx.shader->key.opt.vs_as_prim_discard_cs) + si_build_prim_discard_compute_shader(&ctx); + } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL && ngg_cull_main_fn) { + LLVMValueRef parts[2]; + + parts[0] = ngg_cull_main_fn; + parts[1] = ctx.main_fn; + + si_build_wrapper_function(&ctx, parts, 2, 0, 0); + } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) { + if (sscreen->info.chip_class >= GFX9) { + struct si_shader_selector *ls = shader->key.part.tcs.ls; + LLVMValueRef parts[4]; + bool vs_needs_prolog = + si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, &shader->key, false); + + /* TCS main part */ + parts[2] = ctx.main_fn; + + /* TCS epilog */ + union si_shader_part_key tcs_epilog_key; + memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key)); + tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; + si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key); + parts[3] = ctx.main_fn; + + /* VS as LS main part */ + nir = get_nir_shader(ls, &free_nir); + struct si_shader shader_ls = {}; + shader_ls.selector = ls; + shader_ls.key.as_ls = 1; + shader_ls.key.mono = shader->key.mono; + shader_ls.key.opt = shader->key.opt; + shader_ls.is_monolithic = true; + + if (!si_build_main_function(&ctx, &shader_ls, nir, free_nir, false)) { + si_llvm_dispose(&ctx); + return false; + } + shader->info.uses_instanceid |= ls->info.uses_instanceid; + parts[1] = ctx.main_fn; + + /* LS prolog */ + if (vs_needs_prolog) { + union si_shader_part_key vs_prolog_key; + si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, false, + &shader->key.part.tcs.ls_prolog, shader, &vs_prolog_key); + vs_prolog_key.vs_prolog.is_monolithic = true; + si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); + parts[0] = ctx.main_fn; + } + + /* Reset the shader context. */ + ctx.shader = shader; + ctx.type = PIPE_SHADER_TESS_CTRL; + + si_build_wrapper_function(&ctx, parts + !vs_needs_prolog, 4 - !vs_needs_prolog, + vs_needs_prolog, vs_needs_prolog ? 
2 : 1); + } else { + LLVMValueRef parts[2]; + union si_shader_part_key epilog_key; + + parts[0] = ctx.main_fn; + + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; + si_llvm_build_tcs_epilog(&ctx, &epilog_key); + parts[1] = ctx.main_fn; + + si_build_wrapper_function(&ctx, parts, 2, 0, 0); + } + } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) { + if (ctx.screen->info.chip_class >= GFX9) { + struct si_shader_selector *es = shader->key.part.gs.es; + LLVMValueRef es_prolog = NULL; + LLVMValueRef es_main = NULL; + LLVMValueRef gs_prolog = NULL; + LLVMValueRef gs_main = ctx.main_fn; + + /* GS prolog */ + union si_shader_part_key gs_prolog_key; + memset(&gs_prolog_key, 0, sizeof(gs_prolog_key)); + gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog; + gs_prolog_key.gs_prolog.is_monolithic = true; + gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; + si_llvm_build_gs_prolog(&ctx, &gs_prolog_key); + gs_prolog = ctx.main_fn; + + /* ES main part */ + nir = get_nir_shader(es, &free_nir); + struct si_shader shader_es = {}; + shader_es.selector = es; + shader_es.key.as_es = 1; + shader_es.key.as_ngg = shader->key.as_ngg; + shader_es.key.mono = shader->key.mono; + shader_es.key.opt = shader->key.opt; + shader_es.is_monolithic = true; + + if (!si_build_main_function(&ctx, &shader_es, nir, free_nir, false)) { + si_llvm_dispose(&ctx); + return false; + } + shader->info.uses_instanceid |= es->info.uses_instanceid; + es_main = ctx.main_fn; + + /* ES prolog */ + if (es->type == PIPE_SHADER_VERTEX && + si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, &shader->key, false)) { + union si_shader_part_key vs_prolog_key; + si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, false, + &shader->key.part.gs.vs_prolog, shader, &vs_prolog_key); + vs_prolog_key.vs_prolog.is_monolithic = true; + si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); + es_prolog = ctx.main_fn; + } + + /* Reset the shader context. */ + ctx.shader = shader; + ctx.type = PIPE_SHADER_GEOMETRY; + + /* Prepare the array of shader parts. */ + LLVMValueRef parts[4]; + unsigned num_parts = 0, main_part, next_first_part; + + if (es_prolog) + parts[num_parts++] = es_prolog; + + parts[main_part = num_parts++] = es_main; + parts[next_first_part = num_parts++] = gs_prolog; + parts[num_parts++] = gs_main; + + si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part); + } else { + LLVMValueRef parts[2]; + union si_shader_part_key prolog_key; + + parts[1] = ctx.main_fn; + + memset(&prolog_key, 0, sizeof(prolog_key)); + prolog_key.gs_prolog.states = shader->key.part.gs.prolog; + si_llvm_build_gs_prolog(&ctx, &prolog_key); + parts[0] = ctx.main_fn; + + si_build_wrapper_function(&ctx, parts, 2, 1, 0); + } + } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) { + si_llvm_build_monolithic_ps(&ctx, shader); + } + + si_llvm_optimize_module(&ctx); + + /* Post-optimization transformations and analysis. */ + si_optimize_vs_outputs(&ctx); + + if ((debug && debug->debug_message) || si_can_dump_shader(sscreen, ctx.type)) { + ctx.shader->info.private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_fn); + } + + /* Make sure the input is a pointer and not integer followed by inttoptr. */ + assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind); + + /* Compile to bytecode. 
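
The index arithmetic in the merged GFX9 cases above is easier to read as an explicit part layout. For the monolithic ES+GS case, si_build_wrapper_function() receives the LLVM functions in pipeline order plus the index of the first stage's main part and the index where the second stage begins; the enum below simply names the indices computed above for the case where an ES prolog is present (without one, everything shifts down by one and the two indices become 0 and 1).

/* Part order handed to si_build_wrapper_function() for a GFX9 monolithic GS,
 * as built above; purely a named restatement of num_parts/main_part/
 * next_first_part. */
enum merged_gs_part {
   MERGED_ES_PROLOG = 0, /* optional VS prolog (instance divisors, NGG cull inputs) */
   MERGED_ES_MAIN = 1,   /* "main_part": the VS or TES compiled with as_es = 1 */
   MERGED_GS_PROLOG = 2, /* "next_first_part": the second stage starts here */
   MERGED_GS_MAIN = 3,
   MERGED_GS_NUM_PARTS
};

The monolithic LS+HS case is laid out the same way: an optional LS (vertex) prolog, the VS main part compiled with as_ls = 1, the TCS main part, and the TCS epilog, which is why the wrapper there is called with parts + !vs_needs_prolog and main/next indices of (1, 2) or (0, 1).
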
*/ + if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug, + ctx.type, si_get_shader_name(shader), + si_should_optimize_less(compiler, shader->selector))) { + si_llvm_dispose(&ctx); + fprintf(stderr, "LLVM failed to compile shader\n"); + return false; + } + + si_llvm_dispose(&ctx); + return true; } -bool si_compile_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug) +bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug) { - struct si_shader_selector *sel = shader->selector; - bool free_nir; - struct nir_shader *nir = get_nir_shader(sel, &free_nir); - - /* Dump NIR before doing NIR->LLVM conversion in case the - * conversion fails. */ - if (si_can_dump_shader(sscreen, sel->type) && - !(sscreen->debug_flags & DBG(NO_NIR))) { - nir_print_shader(nir, stderr); - si_dump_streamout(&sel->so); - } - - memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, - sizeof(shader->info.vs_output_param_offset)); - - shader->info.uses_instanceid = sel->info.uses_instanceid; - - /* TODO: ACO could compile non-monolithic shaders here (starting - * with PS and NGG VS), but monolithic shaders should be compiled - * by LLVM due to more complicated compilation. - */ - if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir)) - return false; - - /* Validate SGPR and VGPR usage for compute to detect compiler bugs. - * LLVM 3.9svn has this bug. - */ - if (sel->type == PIPE_SHADER_COMPUTE) { - unsigned wave_size = sscreen->compute_wave_size; - unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd * - (wave_size == 32 ? 2 : 1); - unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd; - unsigned max_sgprs_per_wave = 128; - unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */ - unsigned threads_per_tg = si_get_max_workgroup_size(shader); - unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size); - unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg); - - max_vgprs = max_vgprs / waves_per_simd; - max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave); - - if (shader->config.num_sgprs > max_sgprs || - shader->config.num_vgprs > max_vgprs) { - fprintf(stderr, "LLVM failed to compile a shader correctly: " - "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n", - shader->config.num_sgprs, shader->config.num_vgprs, - max_sgprs, max_vgprs); - - /* Just terminate the process, because dependent - * shaders can hang due to bad input data, but use - * the env var to allow shader-db to work. - */ - if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false)) - abort(); - } - } - - /* Add the scratch offset to input SGPRs. */ - if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader)) - shader->info.num_input_sgprs += 1; /* scratch byte offset */ - - /* Calculate the number of fragment input VGPRs. 
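
si_compile_shader() above starts by rebuilding NIR with get_nir_shader(), which either returns the live sel->nir or deserializes sel->nir_binary through blob_reader_init() + nir_deserialize(). The writer side of that round trip is not shown in this patch; a rough sketch of producing such a blob is below (the include paths and helper name are illustrative, and the real code that fills sel->nir_binary may differ in detail).

#include <stddef.h>

#include "compiler/nir/nir_serialize.h" /* Mesa-internal header; path illustrative */
#include "util/blob.h"

/* Serialize a NIR shader into a heap buffer that could later be handed to
 * blob_reader_init() + nir_deserialize(), as get_nir_shader() does above. */
static void *serialize_nir_sketch(nir_shader *nir, size_t *size_out)
{
   struct blob blob;

   blob_init(&blob);
   nir_serialize(&blob, nir, true /* strip names/debug info to shrink the blob */);

   *size_out = blob.size;
   return blob.data; /* caller keeps the buffer (e.g. as sel->nir_binary / nir_size) */
}
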
*/ - if (sel->type == PIPE_SHADER_FRAGMENT) { - shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(&shader->config, - &shader->info.face_vgpr_index, - &shader->info.ancillary_vgpr_index); - } - - si_calculate_max_simd_waves(shader); - si_shader_dump_stats_for_shader_db(sscreen, shader, debug); - return true; + struct si_shader_selector *sel = shader->selector; + bool free_nir; + struct nir_shader *nir = get_nir_shader(sel, &free_nir); + + /* Dump NIR before doing NIR->LLVM conversion in case the + * conversion fails. */ + if (si_can_dump_shader(sscreen, sel->type) && !(sscreen->debug_flags & DBG(NO_NIR))) { + nir_print_shader(nir, stderr); + si_dump_streamout(&sel->so); + } + + memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, + sizeof(shader->info.vs_output_param_offset)); + + shader->info.uses_instanceid = sel->info.uses_instanceid; + + /* TODO: ACO could compile non-monolithic shaders here (starting + * with PS and NGG VS), but monolithic shaders should be compiled + * by LLVM due to more complicated compilation. + */ + if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir)) + return false; + + /* Validate SGPR and VGPR usage for compute to detect compiler bugs. + * LLVM 3.9svn has this bug. + */ + if (sel->type == PIPE_SHADER_COMPUTE) { + unsigned wave_size = sscreen->compute_wave_size; + unsigned max_vgprs = + sscreen->info.num_physical_wave64_vgprs_per_simd * (wave_size == 32 ? 2 : 1); + unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd; + unsigned max_sgprs_per_wave = 128; + unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */ + unsigned threads_per_tg = si_get_max_workgroup_size(shader); + unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size); + unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg); + + max_vgprs = max_vgprs / waves_per_simd; + max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave); + + if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) { + fprintf(stderr, + "LLVM failed to compile a shader correctly: " + "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n", + shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs); + + /* Just terminate the process, because dependent + * shaders can hang due to bad input data, but use + * the env var to allow shader-db to work. + */ + if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false)) + abort(); + } + } + + /* Add the scratch offset to input SGPRs. */ + if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader)) + shader->info.num_input_sgprs += 1; /* scratch byte offset */ + + /* Calculate the number of fragment input VGPRs. 
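
The compute sanity check above scales each per-SIMD register file by the number of workgroup waves that can share one SIMD, then compares the compiled shader's usage against the result. A worked example with illustrative hardware numbers (the per-SIMD totals below are typical GCN values used as assumptions, not read from radeon_info):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
   unsigned physical_wave64_vgprs_per_simd = 256; /* illustrative */
   unsigned physical_sgprs_per_simd = 800;        /* illustrative */
   unsigned max_sgprs_per_wave = 128;
   unsigned simds_per_tg = 4;      /* WGP-mode assumption, as in the code above */
   unsigned wave_size = 64;
   unsigned threads_per_tg = 1024; /* a 1024-thread workgroup */

   unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);    /* 16 */
   unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg); /*  4 */

   unsigned max_vgprs = physical_wave64_vgprs_per_simd / waves_per_simd; /* 64 */
   unsigned max_sgprs = MIN2(physical_sgprs_per_simd / waves_per_simd, max_sgprs_per_wave);

   printf("per-wave budget: %u VGPRs, %u SGPRs\n", max_vgprs, max_sgprs); /* 64, 128 */
   return 0;
}
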
*/ + if (sel->type == PIPE_SHADER_FRAGMENT) { + shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt( + &shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index); + } + + si_calculate_max_simd_waves(shader); + si_shader_dump_stats_for_shader_db(sscreen, shader, debug); + return true; } /** @@ -2057,335 +1905,300 @@ bool si_compile_shader(struct si_screen *sscreen, * \return non-NULL on success */ static struct si_shader_part * -si_get_shader_part(struct si_screen *sscreen, - struct si_shader_part **list, - enum pipe_shader_type type, - bool prolog, - union si_shader_part_key *key, - struct ac_llvm_compiler *compiler, - struct pipe_debug_callback *debug, - void (*build)(struct si_shader_context *, - union si_shader_part_key *), - const char *name) +si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, + enum pipe_shader_type type, bool prolog, union si_shader_part_key *key, + struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug, + void (*build)(struct si_shader_context *, union si_shader_part_key *), + const char *name) { - struct si_shader_part *result; - - simple_mtx_lock(&sscreen->shader_parts_mutex); - - /* Find existing. */ - for (result = *list; result; result = result->next) { - if (memcmp(&result->key, key, sizeof(*key)) == 0) { - simple_mtx_unlock(&sscreen->shader_parts_mutex); - return result; - } - } - - /* Compile a new one. */ - result = CALLOC_STRUCT(si_shader_part); - result->key = *key; - - struct si_shader_selector sel = {}; - sel.screen = sscreen; - - struct si_shader shader = {}; - shader.selector = &sel; - - switch (type) { - case PIPE_SHADER_VERTEX: - shader.key.as_ls = key->vs_prolog.as_ls; - shader.key.as_es = key->vs_prolog.as_es; - shader.key.as_ngg = key->vs_prolog.as_ngg; - shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs; - break; - case PIPE_SHADER_TESS_CTRL: - assert(!prolog); - shader.key.part.tcs.epilog = key->tcs_epilog.states; - break; - case PIPE_SHADER_GEOMETRY: - assert(prolog); - shader.key.as_ngg = key->gs_prolog.as_ngg; - break; - case PIPE_SHADER_FRAGMENT: - if (prolog) - shader.key.part.ps.prolog = key->ps_prolog.states; - else - shader.key.part.ps.epilog = key->ps_epilog.states; - break; - default: - unreachable("bad shader part"); - } - - struct si_shader_context ctx; - si_llvm_context_init(&ctx, sscreen, compiler, - si_get_wave_size(sscreen, type, shader.key.as_ngg, - shader.key.as_es, - shader.key.opt.vs_as_prim_discard_cs)); - ctx.shader = &shader; - ctx.type = type; - - build(&ctx, key); - - /* Compile. */ - si_llvm_optimize_module(&ctx); - - if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, - &ctx.ac, debug, ctx.type, name, false)) { - FREE(result); - result = NULL; - goto out; - } - - result->next = *list; - *list = result; + struct si_shader_part *result; + + simple_mtx_lock(&sscreen->shader_parts_mutex); + + /* Find existing. */ + for (result = *list; result; result = result->next) { + if (memcmp(&result->key, key, sizeof(*key)) == 0) { + simple_mtx_unlock(&sscreen->shader_parts_mutex); + return result; + } + } + + /* Compile a new one. 
*/ + result = CALLOC_STRUCT(si_shader_part); + result->key = *key; + + struct si_shader_selector sel = {}; + sel.screen = sscreen; + + struct si_shader shader = {}; + shader.selector = &sel; + + switch (type) { + case PIPE_SHADER_VERTEX: + shader.key.as_ls = key->vs_prolog.as_ls; + shader.key.as_es = key->vs_prolog.as_es; + shader.key.as_ngg = key->vs_prolog.as_ngg; + shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs; + break; + case PIPE_SHADER_TESS_CTRL: + assert(!prolog); + shader.key.part.tcs.epilog = key->tcs_epilog.states; + break; + case PIPE_SHADER_GEOMETRY: + assert(prolog); + shader.key.as_ngg = key->gs_prolog.as_ngg; + break; + case PIPE_SHADER_FRAGMENT: + if (prolog) + shader.key.part.ps.prolog = key->ps_prolog.states; + else + shader.key.part.ps.epilog = key->ps_epilog.states; + break; + default: + unreachable("bad shader part"); + } + + struct si_shader_context ctx; + si_llvm_context_init(&ctx, sscreen, compiler, + si_get_wave_size(sscreen, type, shader.key.as_ngg, shader.key.as_es, + shader.key.opt.vs_as_prim_discard_cs)); + ctx.shader = &shader; + ctx.type = type; + + build(&ctx, key); + + /* Compile. */ + si_llvm_optimize_module(&ctx); + + if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug, + ctx.type, name, false)) { + FREE(result); + result = NULL; + goto out; + } + + result->next = *list; + *list = result; out: - si_llvm_dispose(&ctx); - simple_mtx_unlock(&sscreen->shader_parts_mutex); - return result; + si_llvm_dispose(&ctx); + simple_mtx_unlock(&sscreen->shader_parts_mutex); + return result; } -static bool si_get_vs_prolog(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug, - struct si_shader *main_part, - const struct si_vs_prolog_bits *key) +static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug, + struct si_shader *main_part, const struct si_vs_prolog_bits *key) { - struct si_shader_selector *vs = main_part->selector; - - if (!si_vs_needs_prolog(vs, key, &shader->key, false)) - return true; - - /* Get the prolog. */ - union si_shader_part_key prolog_key; - si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, - key, shader, &prolog_key); - - shader->prolog = - si_get_shader_part(sscreen, &sscreen->vs_prologs, - PIPE_SHADER_VERTEX, true, &prolog_key, compiler, - debug, si_llvm_build_vs_prolog, - "Vertex Shader Prolog"); - return shader->prolog != NULL; + struct si_shader_selector *vs = main_part->selector; + + if (!si_vs_needs_prolog(vs, key, &shader->key, false)) + return true; + + /* Get the prolog. */ + union si_shader_part_key prolog_key; + si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader, + &prolog_key); + + shader->prolog = + si_get_shader_part(sscreen, &sscreen->vs_prologs, PIPE_SHADER_VERTEX, true, &prolog_key, + compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog"); + return shader->prolog != NULL; } /** * Select and compile (or reuse) vertex shader parts (prolog & epilog). 
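
si_get_shader_part() above is a compile-once cache: a mutex-protected singly linked list whose entries are matched by memcmp() over the entire key union, which is why callers memset their keys to zero before filling them (so padding and unused fields compare equal). Stripped of the LLVM plumbing, the pattern is sketched below, with pthreads standing in for simple_mtx and a hypothetical compile callback.

#include <pthread.h>
#include <string.h>

struct part_key { unsigned bits[4]; }; /* stand-in for union si_shader_part_key */
struct part {
   struct part_key key;
   struct part *next;                  /* the real entry also holds binary + config */
};

static struct part *get_part(struct part **list, pthread_mutex_t *mutex,
                             const struct part_key *key,
                             struct part *(*compile)(const struct part_key *))
{
   struct part *result;

   pthread_mutex_lock(mutex);

   /* Fast path: reuse a previously compiled part with an identical key. */
   for (result = *list; result; result = result->next) {
      if (memcmp(&result->key, key, sizeof(*key)) == 0)
         goto out;
   }

   /* Slow path: compile once (still under the lock, as above) and publish it. */
   result = compile(key);
   if (result) {
      result->key = *key;
      result->next = *list;
      *list = result;
   }
out:
   pthread_mutex_unlock(mutex);
   return result;
}
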
*/ -static bool si_shader_select_vs_parts(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug) +static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug) { - return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, - &shader->key.part.vs.prolog); + return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.part.vs.prolog); } /** * Select and compile (or reuse) TCS parts (epilog). */ -static bool si_shader_select_tcs_parts(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug) +static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug) { - if (sscreen->info.chip_class >= GFX9) { - struct si_shader *ls_main_part = - shader->key.part.tcs.ls->main_shader_part_ls; - - if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part, - &shader->key.part.tcs.ls_prolog)) - return false; - - shader->previous_stage = ls_main_part; - } - - /* Get the epilog. */ - union si_shader_part_key epilog_key; - memset(&epilog_key, 0, sizeof(epilog_key)); - epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; - - shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, - PIPE_SHADER_TESS_CTRL, false, - &epilog_key, compiler, debug, - si_llvm_build_tcs_epilog, - "Tessellation Control Shader Epilog"); - return shader->epilog != NULL; + if (sscreen->info.chip_class >= GFX9) { + struct si_shader *ls_main_part = shader->key.part.tcs.ls->main_shader_part_ls; + + if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part, + &shader->key.part.tcs.ls_prolog)) + return false; + + shader->previous_stage = ls_main_part; + } + + /* Get the epilog. */ + union si_shader_part_key epilog_key; + memset(&epilog_key, 0, sizeof(epilog_key)); + epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; + + shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, PIPE_SHADER_TESS_CTRL, false, + &epilog_key, compiler, debug, si_llvm_build_tcs_epilog, + "Tessellation Control Shader Epilog"); + return shader->epilog != NULL; } /** * Select and compile (or reuse) GS parts (prolog). 
*/ -static bool si_shader_select_gs_parts(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug) +static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug) { - if (sscreen->info.chip_class >= GFX9) { - struct si_shader *es_main_part; - enum pipe_shader_type es_type = shader->key.part.gs.es->type; - - if (shader->key.as_ngg) - es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es; - else - es_main_part = shader->key.part.gs.es->main_shader_part_es; - - if (es_type == PIPE_SHADER_VERTEX && - !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part, - &shader->key.part.gs.vs_prolog)) - return false; - - shader->previous_stage = es_main_part; - } - - if (!shader->key.part.gs.prolog.tri_strip_adj_fix) - return true; - - union si_shader_part_key prolog_key; - memset(&prolog_key, 0, sizeof(prolog_key)); - prolog_key.gs_prolog.states = shader->key.part.gs.prolog; - prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; - - shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs, - PIPE_SHADER_GEOMETRY, true, - &prolog_key, compiler, debug, - si_llvm_build_gs_prolog, - "Geometry Shader Prolog"); - return shader->prolog2 != NULL; + if (sscreen->info.chip_class >= GFX9) { + struct si_shader *es_main_part; + enum pipe_shader_type es_type = shader->key.part.gs.es->type; + + if (shader->key.as_ngg) + es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es; + else + es_main_part = shader->key.part.gs.es->main_shader_part_es; + + if (es_type == PIPE_SHADER_VERTEX && + !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part, + &shader->key.part.gs.vs_prolog)) + return false; + + shader->previous_stage = es_main_part; + } + + if (!shader->key.part.gs.prolog.tri_strip_adj_fix) + return true; + + union si_shader_part_key prolog_key; + memset(&prolog_key, 0, sizeof(prolog_key)); + prolog_key.gs_prolog.states = shader->key.part.gs.prolog; + prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; + + shader->prolog2 = + si_get_shader_part(sscreen, &sscreen->gs_prologs, PIPE_SHADER_GEOMETRY, true, &prolog_key, + compiler, debug, si_llvm_build_gs_prolog, "Geometry Shader Prolog"); + return shader->prolog2 != NULL; } /** * Compute the PS prolog key, which contains all the information needed to * build the PS prolog function, and set related bits in shader->config. 
*/ -void si_get_ps_prolog_key(struct si_shader *shader, - union si_shader_part_key *key, - bool separate_prolog) +void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key, + bool separate_prolog) { - struct si_shader_info *info = &shader->selector->info; - - memset(key, 0, sizeof(*key)); - key->ps_prolog.states = shader->key.part.ps.prolog; - key->ps_prolog.colors_read = info->colors_read; - key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; - key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; - key->ps_prolog.wqm = info->uses_derivatives && - (key->ps_prolog.colors_read || - key->ps_prolog.states.force_persp_sample_interp || - key->ps_prolog.states.force_linear_sample_interp || - key->ps_prolog.states.force_persp_center_interp || - key->ps_prolog.states.force_linear_center_interp || - key->ps_prolog.states.bc_optimize_for_persp || - key->ps_prolog.states.bc_optimize_for_linear); - key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index; - - if (info->colors_read) { - unsigned *color = shader->selector->color_attr_index; - - if (shader->key.part.ps.prolog.color_two_side) { - /* BCOLORs are stored after the last input. */ - key->ps_prolog.num_interp_inputs = info->num_inputs; - key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; - if (separate_prolog) - shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); - } - - for (unsigned i = 0; i < 2; i++) { - unsigned interp = info->input_interpolate[color[i]]; - unsigned location = info->input_interpolate_loc[color[i]]; - - if (!(info->colors_read & (0xf << i*4))) - continue; - - key->ps_prolog.color_attr_index[i] = color[i]; - - if (shader->key.part.ps.prolog.flatshade_colors && - interp == TGSI_INTERPOLATE_COLOR) - interp = TGSI_INTERPOLATE_CONSTANT; - - switch (interp) { - case TGSI_INTERPOLATE_CONSTANT: - key->ps_prolog.color_interp_vgpr_index[i] = -1; - break; - case TGSI_INTERPOLATE_PERSPECTIVE: - case TGSI_INTERPOLATE_COLOR: - /* Force the interpolation location for colors here. */ - if (shader->key.part.ps.prolog.force_persp_sample_interp) - location = TGSI_INTERPOLATE_LOC_SAMPLE; - if (shader->key.part.ps.prolog.force_persp_center_interp) - location = TGSI_INTERPOLATE_LOC_CENTER; - - switch (location) { - case TGSI_INTERPOLATE_LOC_SAMPLE: - key->ps_prolog.color_interp_vgpr_index[i] = 0; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_PERSP_SAMPLE_ENA(1); - } - break; - case TGSI_INTERPOLATE_LOC_CENTER: - key->ps_prolog.color_interp_vgpr_index[i] = 2; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_PERSP_CENTER_ENA(1); - } - break; - case TGSI_INTERPOLATE_LOC_CENTROID: - key->ps_prolog.color_interp_vgpr_index[i] = 4; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_PERSP_CENTROID_ENA(1); - } - break; - default: - assert(0); - } - break; - case TGSI_INTERPOLATE_LINEAR: - /* Force the interpolation location for colors here. */ - if (shader->key.part.ps.prolog.force_linear_sample_interp) - location = TGSI_INTERPOLATE_LOC_SAMPLE; - if (shader->key.part.ps.prolog.force_linear_center_interp) - location = TGSI_INTERPOLATE_LOC_CENTER; - - /* The VGPR assignment for non-monolithic shaders - * works because InitialPSInputAddr is set on the - * main shader and PERSP_PULL_MODEL is never used. - */ - switch (location) { - case TGSI_INTERPOLATE_LOC_SAMPLE: - key->ps_prolog.color_interp_vgpr_index[i] = - separate_prolog ? 
6 : 9; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_LINEAR_SAMPLE_ENA(1); - } - break; - case TGSI_INTERPOLATE_LOC_CENTER: - key->ps_prolog.color_interp_vgpr_index[i] = - separate_prolog ? 8 : 11; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_LINEAR_CENTER_ENA(1); - } - break; - case TGSI_INTERPOLATE_LOC_CENTROID: - key->ps_prolog.color_interp_vgpr_index[i] = - separate_prolog ? 10 : 13; - if (separate_prolog) { - shader->config.spi_ps_input_ena |= - S_0286CC_LINEAR_CENTROID_ENA(1); - } - break; - default: - assert(0); - } - break; - default: - assert(0); - } - } - } + struct si_shader_info *info = &shader->selector->info; + + memset(key, 0, sizeof(*key)); + key->ps_prolog.states = shader->key.part.ps.prolog; + key->ps_prolog.colors_read = info->colors_read; + key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; + key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; + key->ps_prolog.wqm = + info->uses_derivatives && + (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp || + key->ps_prolog.states.force_linear_sample_interp || + key->ps_prolog.states.force_persp_center_interp || + key->ps_prolog.states.force_linear_center_interp || + key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear); + key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index; + + if (info->colors_read) { + unsigned *color = shader->selector->color_attr_index; + + if (shader->key.part.ps.prolog.color_two_side) { + /* BCOLORs are stored after the last input. */ + key->ps_prolog.num_interp_inputs = info->num_inputs; + key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; + if (separate_prolog) + shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); + } + + for (unsigned i = 0; i < 2; i++) { + unsigned interp = info->input_interpolate[color[i]]; + unsigned location = info->input_interpolate_loc[color[i]]; + + if (!(info->colors_read & (0xf << i * 4))) + continue; + + key->ps_prolog.color_attr_index[i] = color[i]; + + if (shader->key.part.ps.prolog.flatshade_colors && interp == TGSI_INTERPOLATE_COLOR) + interp = TGSI_INTERPOLATE_CONSTANT; + + switch (interp) { + case TGSI_INTERPOLATE_CONSTANT: + key->ps_prolog.color_interp_vgpr_index[i] = -1; + break; + case TGSI_INTERPOLATE_PERSPECTIVE: + case TGSI_INTERPOLATE_COLOR: + /* Force the interpolation location for colors here. */ + if (shader->key.part.ps.prolog.force_persp_sample_interp) + location = TGSI_INTERPOLATE_LOC_SAMPLE; + if (shader->key.part.ps.prolog.force_persp_center_interp) + location = TGSI_INTERPOLATE_LOC_CENTER; + + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + key->ps_prolog.color_interp_vgpr_index[i] = 0; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); + } + break; + case TGSI_INTERPOLATE_LOC_CENTER: + key->ps_prolog.color_interp_vgpr_index[i] = 2; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); + } + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + key->ps_prolog.color_interp_vgpr_index[i] = 4; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1); + } + break; + default: + assert(0); + } + break; + case TGSI_INTERPOLATE_LINEAR: + /* Force the interpolation location for colors here. 
*/ + if (shader->key.part.ps.prolog.force_linear_sample_interp) + location = TGSI_INTERPOLATE_LOC_SAMPLE; + if (shader->key.part.ps.prolog.force_linear_center_interp) + location = TGSI_INTERPOLATE_LOC_CENTER; + + /* The VGPR assignment for non-monolithic shaders + * works because InitialPSInputAddr is set on the + * main shader and PERSP_PULL_MODEL is never used. + */ + switch (location) { + case TGSI_INTERPOLATE_LOC_SAMPLE: + key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); + } + break; + case TGSI_INTERPOLATE_LOC_CENTER: + key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); + } + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13; + if (separate_prolog) { + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1); + } + break; + default: + assert(0); + } + break; + default: + assert(0); + } + } + } } /** @@ -2393,331 +2206,308 @@ void si_get_ps_prolog_key(struct si_shader *shader, */ bool si_need_ps_prolog(const union si_shader_part_key *key) { - return key->ps_prolog.colors_read || - key->ps_prolog.states.force_persp_sample_interp || - key->ps_prolog.states.force_linear_sample_interp || - key->ps_prolog.states.force_persp_center_interp || - key->ps_prolog.states.force_linear_center_interp || - key->ps_prolog.states.bc_optimize_for_persp || - key->ps_prolog.states.bc_optimize_for_linear || - key->ps_prolog.states.poly_stipple || - key->ps_prolog.states.samplemask_log_ps_iter; + return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp || + key->ps_prolog.states.force_linear_sample_interp || + key->ps_prolog.states.force_persp_center_interp || + key->ps_prolog.states.force_linear_center_interp || + key->ps_prolog.states.bc_optimize_for_persp || + key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple || + key->ps_prolog.states.samplemask_log_ps_iter; } /** * Compute the PS epilog key, which contains all the information needed to * build the PS epilog function. */ -void si_get_ps_epilog_key(struct si_shader *shader, - union si_shader_part_key *key) +void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key) { - struct si_shader_info *info = &shader->selector->info; - memset(key, 0, sizeof(*key)); - key->ps_epilog.colors_written = info->colors_written; - key->ps_epilog.writes_z = info->writes_z; - key->ps_epilog.writes_stencil = info->writes_stencil; - key->ps_epilog.writes_samplemask = info->writes_samplemask; - key->ps_epilog.states = shader->key.part.ps.epilog; + struct si_shader_info *info = &shader->selector->info; + memset(key, 0, sizeof(*key)); + key->ps_epilog.colors_written = info->colors_written; + key->ps_epilog.writes_z = info->writes_z; + key->ps_epilog.writes_stencil = info->writes_stencil; + key->ps_epilog.writes_samplemask = info->writes_samplemask; + key->ps_epilog.states = shader->key.part.ps.epilog; } /** * Select and compile (or reuse) pixel shader parts (prolog & epilog). 
*/ -static bool si_shader_select_ps_parts(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug) +static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug) { - union si_shader_part_key prolog_key; - union si_shader_part_key epilog_key; - - /* Get the prolog. */ - si_get_ps_prolog_key(shader, &prolog_key, true); - - /* The prolog is a no-op if these aren't set. */ - if (si_need_ps_prolog(&prolog_key)) { - shader->prolog = - si_get_shader_part(sscreen, &sscreen->ps_prologs, - PIPE_SHADER_FRAGMENT, true, - &prolog_key, compiler, debug, - si_llvm_build_ps_prolog, - "Fragment Shader Prolog"); - if (!shader->prolog) - return false; - } - - /* Get the epilog. */ - si_get_ps_epilog_key(shader, &epilog_key); - - shader->epilog = - si_get_shader_part(sscreen, &sscreen->ps_epilogs, - PIPE_SHADER_FRAGMENT, false, - &epilog_key, compiler, debug, - si_llvm_build_ps_epilog, - "Fragment Shader Epilog"); - if (!shader->epilog) - return false; - - /* Enable POS_FIXED_PT if polygon stippling is enabled. */ - if (shader->key.part.ps.prolog.poly_stipple) { - shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); - assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); - } - - /* Set up the enable bits for per-sample shading if needed. */ - if (shader->key.part.ps.prolog.force_persp_sample_interp && - (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || - G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { - shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; - shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; - shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); - } - if (shader->key.part.ps.prolog.force_linear_sample_interp && - (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || - G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { - shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; - shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; - shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); - } - if (shader->key.part.ps.prolog.force_persp_center_interp && - (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) || - G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { - shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA; - shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; - shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); - } - if (shader->key.part.ps.prolog.force_linear_center_interp && - (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) || - G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { - shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA; - shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; - shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); - } - - /* POW_W_FLOAT requires that one of the perspective weights is enabled. */ - if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && - !(shader->config.spi_ps_input_ena & 0xf)) { - shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); - assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)); - } - - /* At least one pair of interpolation weights must be enabled. 
*/ - if (!(shader->config.spi_ps_input_ena & 0x7f)) { - shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); - assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); - } - - /* Samplemask fixup requires the sample ID. */ - if (shader->key.part.ps.prolog.samplemask_log_ps_iter) { - shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1); - assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)); - } - - /* The sample mask input is always enabled, because the API shader always - * passes it through to the epilog. Disable it here if it's unused. - */ - if (!shader->key.part.ps.epilog.poly_line_smoothing && - !shader->selector->info.reads_samplemask) - shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; - - return true; + union si_shader_part_key prolog_key; + union si_shader_part_key epilog_key; + + /* Get the prolog. */ + si_get_ps_prolog_key(shader, &prolog_key, true); + + /* The prolog is a no-op if these aren't set. */ + if (si_need_ps_prolog(&prolog_key)) { + shader->prolog = + si_get_shader_part(sscreen, &sscreen->ps_prologs, PIPE_SHADER_FRAGMENT, true, &prolog_key, + compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog"); + if (!shader->prolog) + return false; + } + + /* Get the epilog. */ + si_get_ps_epilog_key(shader, &epilog_key); + + shader->epilog = + si_get_shader_part(sscreen, &sscreen->ps_epilogs, PIPE_SHADER_FRAGMENT, false, &epilog_key, + compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog"); + if (!shader->epilog) + return false; + + /* Enable POS_FIXED_PT if polygon stippling is enabled. */ + if (shader->key.part.ps.prolog.poly_stipple) { + shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); + assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); + } + + /* Set up the enable bits for per-sample shading if needed. */ + if (shader->key.part.ps.prolog.force_persp_sample_interp && + (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); + } + if (shader->key.part.ps.prolog.force_linear_sample_interp && + (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); + } + if (shader->key.part.ps.prolog.force_persp_center_interp && + (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); + } + if (shader->key.part.ps.prolog.force_linear_center_interp && + (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA; + shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); + } + + /* POW_W_FLOAT requires that one of the perspective weights is enabled. 
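The SPI_PS_INPUT_ENA fixups in the hunk above all follow the same S_*/C_*/G_* register-field macro pattern used throughout sid.h: G_* reads a field, C_* is the AND-mask that clears it, S_* produces the shifted value to OR back in. The following self-contained sketch is illustrative only (toy one-bit fields at made-up offsets, not the real SPI_PS_INPUT_ENA layout) and is not part of the patch:

#include <assert.h>
#include <stdio.h>

/* Toy 1-bit fields at invented positions; the real offsets live in sid.h. */
#define S_FIELD_PERSP_SAMPLE(x) (((unsigned)(x) & 0x1) << 0) /* set */
#define C_FIELD_PERSP_SAMPLE    0xFFFFFFFE                   /* clear mask */
#define G_FIELD_PERSP_SAMPLE(x) (((x) >> 0) & 0x1)           /* get */
#define S_FIELD_PERSP_CENTER(x) (((unsigned)(x) & 0x1) << 1)
#define C_FIELD_PERSP_CENTER    0xFFFFFFFD
#define G_FIELD_PERSP_CENTER(x) (((x) >> 1) & 0x1)

int main(void)
{
   unsigned ena = S_FIELD_PERSP_CENTER(1);

   /* Force per-sample interpolation: drop the center weight and request the
    * sample weight instead, mirroring the fixups in the code above. */
   if (G_FIELD_PERSP_CENTER(ena)) {
      ena &= C_FIELD_PERSP_CENTER;
      ena |= S_FIELD_PERSP_SAMPLE(1);
   }

   assert(G_FIELD_PERSP_SAMPLE(ena) && !G_FIELD_PERSP_CENTER(ena));
   printf("toy input_ena = 0x%08x\n", ena); /* prints 0x00000001 */
   return 0;
}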
*/ + if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && + !(shader->config.spi_ps_input_ena & 0xf)) { + shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); + assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)); + } + + /* At least one pair of interpolation weights must be enabled. */ + if (!(shader->config.spi_ps_input_ena & 0x7f)) { + shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); + assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); + } + + /* Samplemask fixup requires the sample ID. */ + if (shader->key.part.ps.prolog.samplemask_log_ps_iter) { + shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1); + assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)); + } + + /* The sample mask input is always enabled, because the API shader always + * passes it through to the epilog. Disable it here if it's unused. + */ + if (!shader->key.part.ps.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask) + shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; + + return true; } -void si_multiwave_lds_size_workaround(struct si_screen *sscreen, - unsigned *lds_size) +void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size) { - /* If tessellation is all offchip and on-chip GS isn't used, this - * workaround is not needed. - */ - return; - - /* SPI barrier management bug: - * Make sure we have at least 4k of LDS in use to avoid the bug. - * It applies to workgroup sizes of more than one wavefront. - */ - if (sscreen->info.family == CHIP_BONAIRE || - sscreen->info.family == CHIP_KABINI) - *lds_size = MAX2(*lds_size, 8); + /* If tessellation is all offchip and on-chip GS isn't used, this + * workaround is not needed. + */ + return; + + /* SPI barrier management bug: + * Make sure we have at least 4k of LDS in use to avoid the bug. + * It applies to workgroup sizes of more than one wavefront. + */ + if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI) + *lds_size = MAX2(*lds_size, 8); } void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader) { - unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */ + unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */ - shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs); + shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs); - if (shader->selector->type == PIPE_SHADER_COMPUTE && - si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) { - si_multiwave_lds_size_workaround(sscreen, - &shader->config.lds_size); - } + if (shader->selector->type == PIPE_SHADER_COMPUTE && + si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) { + si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size); + } } -bool si_create_shader_variant(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug) +bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug) { - struct si_shader_selector *sel = shader->selector; - struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key); - - /* LS, ES, VS are compiled on demand if the main part hasn't been - * compiled for that stage. - * - * GS are compiled on demand if the main part hasn't been compiled - * for the chosen NGG-ness. 
- * - * Vertex shaders are compiled on demand when a vertex fetch - * workaround must be applied. - */ - if (shader->is_monolithic) { - /* Monolithic shader (compiled as a whole, has many variants, - * may take a long time to compile). - */ - if (!si_compile_shader(sscreen, compiler, shader, debug)) - return false; - } else { - /* The shader consists of several parts: - * - * - the middle part is the user shader, it has 1 variant only - * and it was compiled during the creation of the shader - * selector - * - the prolog part is inserted at the beginning - * - the epilog part is inserted at the end - * - * The prolog and epilog have many (but simple) variants. - * - * Starting with gfx9, geometry and tessellation control - * shaders also contain the prolog and user shader parts of - * the previous shader stage. - */ - - if (!mainp) - return false; - - /* Copy the compiled shader data over. */ - shader->is_binary_shared = true; - shader->binary = mainp->binary; - shader->config = mainp->config; - shader->info.num_input_sgprs = mainp->info.num_input_sgprs; - shader->info.num_input_vgprs = mainp->info.num_input_vgprs; - shader->info.face_vgpr_index = mainp->info.face_vgpr_index; - shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; - memcpy(shader->info.vs_output_param_offset, - mainp->info.vs_output_param_offset, - sizeof(mainp->info.vs_output_param_offset)); - shader->info.uses_instanceid = mainp->info.uses_instanceid; - shader->info.nr_pos_exports = mainp->info.nr_pos_exports; - shader->info.nr_param_exports = mainp->info.nr_param_exports; - - /* Select prologs and/or epilogs. */ - switch (sel->type) { - case PIPE_SHADER_VERTEX: - if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug)) - return false; - break; - case PIPE_SHADER_TESS_CTRL: - if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug)) - return false; - break; - case PIPE_SHADER_TESS_EVAL: - break; - case PIPE_SHADER_GEOMETRY: - if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug)) - return false; - break; - case PIPE_SHADER_FRAGMENT: - if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug)) - return false; - - /* Make sure we have at least as many VGPRs as there - * are allocated inputs. - */ - shader->config.num_vgprs = MAX2(shader->config.num_vgprs, - shader->info.num_input_vgprs); - break; - default:; - } - - /* Update SGPR and VGPR counts. 
*/ - if (shader->prolog) { - shader->config.num_sgprs = MAX2(shader->config.num_sgprs, - shader->prolog->config.num_sgprs); - shader->config.num_vgprs = MAX2(shader->config.num_vgprs, - shader->prolog->config.num_vgprs); - } - if (shader->previous_stage) { - shader->config.num_sgprs = MAX2(shader->config.num_sgprs, - shader->previous_stage->config.num_sgprs); - shader->config.num_vgprs = MAX2(shader->config.num_vgprs, - shader->previous_stage->config.num_vgprs); - shader->config.spilled_sgprs = - MAX2(shader->config.spilled_sgprs, - shader->previous_stage->config.spilled_sgprs); - shader->config.spilled_vgprs = - MAX2(shader->config.spilled_vgprs, - shader->previous_stage->config.spilled_vgprs); - shader->info.private_mem_vgprs = - MAX2(shader->info.private_mem_vgprs, - shader->previous_stage->info.private_mem_vgprs); - shader->config.scratch_bytes_per_wave = - MAX2(shader->config.scratch_bytes_per_wave, - shader->previous_stage->config.scratch_bytes_per_wave); - shader->info.uses_instanceid |= - shader->previous_stage->info.uses_instanceid; - } - if (shader->prolog2) { - shader->config.num_sgprs = MAX2(shader->config.num_sgprs, - shader->prolog2->config.num_sgprs); - shader->config.num_vgprs = MAX2(shader->config.num_vgprs, - shader->prolog2->config.num_vgprs); - } - if (shader->epilog) { - shader->config.num_sgprs = MAX2(shader->config.num_sgprs, - shader->epilog->config.num_sgprs); - shader->config.num_vgprs = MAX2(shader->config.num_vgprs, - shader->epilog->config.num_vgprs); - } - si_calculate_max_simd_waves(shader); - } - - if (shader->key.as_ngg) { - assert(!shader->key.as_es && !shader->key.as_ls); - gfx10_ngg_calculate_subgroup_info(shader); - } else if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) { - gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info); - } - - si_fix_resource_usage(sscreen, shader); - si_shader_dump(sscreen, shader, debug, stderr, true); - - /* Upload. */ - if (!si_shader_binary_upload(sscreen, shader, 0)) { - fprintf(stderr, "LLVM failed to upload shader\n"); - return false; - } - - return true; + struct si_shader_selector *sel = shader->selector; + struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key); + + /* LS, ES, VS are compiled on demand if the main part hasn't been + * compiled for that stage. + * + * GS are compiled on demand if the main part hasn't been compiled + * for the chosen NGG-ness. + * + * Vertex shaders are compiled on demand when a vertex fetch + * workaround must be applied. + */ + if (shader->is_monolithic) { + /* Monolithic shader (compiled as a whole, has many variants, + * may take a long time to compile). + */ + if (!si_compile_shader(sscreen, compiler, shader, debug)) + return false; + } else { + /* The shader consists of several parts: + * + * - the middle part is the user shader, it has 1 variant only + * and it was compiled during the creation of the shader + * selector + * - the prolog part is inserted at the beginning + * - the epilog part is inserted at the end + * + * The prolog and epilog have many (but simple) variants. + * + * Starting with gfx9, geometry and tessellation control + * shaders also contain the prolog and user shader parts of + * the previous shader stage. + */ + + if (!mainp) + return false; + + /* Copy the compiled shader data over. 
*/ + shader->is_binary_shared = true; + shader->binary = mainp->binary; + shader->config = mainp->config; + shader->info.num_input_sgprs = mainp->info.num_input_sgprs; + shader->info.num_input_vgprs = mainp->info.num_input_vgprs; + shader->info.face_vgpr_index = mainp->info.face_vgpr_index; + shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; + memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset, + sizeof(mainp->info.vs_output_param_offset)); + shader->info.uses_instanceid = mainp->info.uses_instanceid; + shader->info.nr_pos_exports = mainp->info.nr_pos_exports; + shader->info.nr_param_exports = mainp->info.nr_param_exports; + + /* Select prologs and/or epilogs. */ + switch (sel->type) { + case PIPE_SHADER_VERTEX: + if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug)) + return false; + break; + case PIPE_SHADER_TESS_CTRL: + if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug)) + return false; + break; + case PIPE_SHADER_TESS_EVAL: + break; + case PIPE_SHADER_GEOMETRY: + if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug)) + return false; + break; + case PIPE_SHADER_FRAGMENT: + if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug)) + return false; + + /* Make sure we have at least as many VGPRs as there + * are allocated inputs. + */ + shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs); + break; + default:; + } + + /* Update SGPR and VGPR counts. */ + if (shader->prolog) { + shader->config.num_sgprs = + MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs); + shader->config.num_vgprs = + MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs); + } + if (shader->previous_stage) { + shader->config.num_sgprs = + MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs); + shader->config.num_vgprs = + MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs); + shader->config.spilled_sgprs = + MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs); + shader->config.spilled_vgprs = + MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs); + shader->info.private_mem_vgprs = + MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs); + shader->config.scratch_bytes_per_wave = + MAX2(shader->config.scratch_bytes_per_wave, + shader->previous_stage->config.scratch_bytes_per_wave); + shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid; + } + if (shader->prolog2) { + shader->config.num_sgprs = + MAX2(shader->config.num_sgprs, shader->prolog2->config.num_sgprs); + shader->config.num_vgprs = + MAX2(shader->config.num_vgprs, shader->prolog2->config.num_vgprs); + } + if (shader->epilog) { + shader->config.num_sgprs = + MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs); + shader->config.num_vgprs = + MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs); + } + si_calculate_max_simd_waves(shader); + } + + if (shader->key.as_ngg) { + assert(!shader->key.as_es && !shader->key.as_ls); + gfx10_ngg_calculate_subgroup_info(shader); + } else if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) { + gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info); + } + + si_fix_resource_usage(sscreen, shader); + si_shader_dump(sscreen, shader, debug, stderr, true); + + /* Upload. 
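Because a non-monolithic shader is the concatenation of prolog, main part, and epilog, its register budget must cover the largest requirement of any part, which is what the MAX2 calls above compute. The stripped-down sketch below uses hypothetical structures and counts (toy_config and merge_part are not real driver types or functions) purely to illustrate that merge rule:

#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

/* Heavily reduced, hypothetical stand-in for the per-part config. */
struct toy_config {
   unsigned num_sgprs;
   unsigned num_vgprs;
};

/* The combined shader must satisfy the largest requirement of any part. */
static void merge_part(struct toy_config *shader, const struct toy_config *part)
{
   shader->num_sgprs = MAX2(shader->num_sgprs, part->num_sgprs);
   shader->num_vgprs = MAX2(shader->num_vgprs, part->num_vgprs);
}

int main(void)
{
   struct toy_config main_part = {.num_sgprs = 24, .num_vgprs = 40};
   struct toy_config prolog = {.num_sgprs = 30, .num_vgprs = 16};
   struct toy_config epilog = {.num_sgprs = 12, .num_vgprs = 48};
   struct toy_config shader = main_part;

   merge_part(&shader, &prolog);
   merge_part(&shader, &epilog);

   printf("SGPRs %u, VGPRs %u\n", shader.num_sgprs, shader.num_vgprs); /* 30, 48 */
   return 0;
}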
*/ + if (!si_shader_binary_upload(sscreen, shader, 0)) { + fprintf(stderr, "LLVM failed to upload shader\n"); + return false; + } + + return true; } void si_shader_binary_clean(struct si_shader_binary *binary) { - free((void *)binary->elf_buffer); - binary->elf_buffer = NULL; + free((void *)binary->elf_buffer); + binary->elf_buffer = NULL; - free(binary->llvm_ir_string); - binary->llvm_ir_string = NULL; + free(binary->llvm_ir_string); + binary->llvm_ir_string = NULL; } void si_shader_destroy(struct si_shader *shader) { - if (shader->scratch_bo) - si_resource_reference(&shader->scratch_bo, NULL); + if (shader->scratch_bo) + si_resource_reference(&shader->scratch_bo, NULL); - si_resource_reference(&shader->bo, NULL); + si_resource_reference(&shader->bo, NULL); - if (!shader->is_binary_shared) - si_shader_binary_clean(&shader->binary); + if (!shader->is_binary_shared) + si_shader_binary_clean(&shader->binary); - free(shader->shader_log); + free(shader->shader_log); } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index ef571a5d684..4b3bdf4a30e 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -131,14 +131,13 @@ #ifndef SI_SHADER_H #define SI_SHADER_H -#include "util/u_inlines.h" -#include "util/u_live_shader_cache.h" -#include "util/u_queue.h" -#include "util/simple_mtx.h" - #include "ac_binary.h" #include "ac_llvm_build.h" #include "ac_llvm_util.h" +#include "util/simple_mtx.h" +#include "util/u_inlines.h" +#include "util/u_live_shader_cache.h" +#include "util/u_queue.h" #include @@ -150,136 +149,139 @@ struct nir_shader; struct si_shader; struct si_context; -#define SI_MAX_ATTRIBS 16 -#define SI_MAX_VS_OUTPUTS 40 +#define SI_MAX_ATTRIBS 16 +#define SI_MAX_VS_OUTPUTS 40 /* Shader IO unique indices are supported for TGSI_SEMANTIC_GENERIC with an * index smaller than this. */ -#define SI_MAX_IO_GENERIC 32 +#define SI_MAX_IO_GENERIC 32 #define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29)) /* SGPR user data indices */ -enum { - SI_SGPR_RW_BUFFERS, /* rings (& stream-out, VS only) */ - SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, - SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */ - SI_SGPR_SAMPLERS_AND_IMAGES, - SI_NUM_RESOURCE_SGPRS, - - /* API VS, TES without GS, GS copy shader */ - SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS, - SI_NUM_VS_STATE_RESOURCE_SGPRS, - - /* all VS variants */ - SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS, - SI_SGPR_START_INSTANCE, - SI_SGPR_DRAWID, - SI_VS_NUM_USER_SGPR, - - SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS, - - /* TES */ - SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS, - SI_SGPR_TES_OFFCHIP_ADDR, - SI_TES_NUM_USER_SGPR, - - /* GFX6-8: TCS only */ - GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS, - GFX6_SGPR_TCS_OUT_OFFSETS, - GFX6_SGPR_TCS_OUT_LAYOUT, - GFX6_SGPR_TCS_IN_LAYOUT, - GFX6_TCS_NUM_USER_SGPR, - - /* GFX9: Merged shaders. */ - /* 2ND_CONST_AND_SHADER_BUFFERS is set in USER_DATA_ADDR_LO (SGPR0). */ - /* 2ND_SAMPLERS_AND_IMAGES is set in USER_DATA_ADDR_HI (SGPR1). */ - GFX9_MERGED_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR, - - /* GFX9: Merged LS-HS (VS-TCS) only. 
*/ - GFX9_SGPR_TCS_OFFCHIP_LAYOUT = GFX9_MERGED_NUM_USER_SGPR, - GFX9_SGPR_TCS_OUT_OFFSETS, - GFX9_SGPR_TCS_OUT_LAYOUT, - GFX9_TCS_NUM_USER_SGPR, - - /* GS limits */ - GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS, - GFX9_VSGS_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR, - GFX9_TESGS_NUM_USER_SGPR = SI_TES_NUM_USER_SGPR, - SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS, - - /* PS only */ - SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS, - SI_PS_NUM_USER_SGPR, - - /* The value has to be 12, because the hw requires that descriptors - * are aligned to 4 SGPRs. - */ - SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12, +enum +{ + SI_SGPR_RW_BUFFERS, /* rings (& stream-out, VS only) */ + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, + SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */ + SI_SGPR_SAMPLERS_AND_IMAGES, + SI_NUM_RESOURCE_SGPRS, + + /* API VS, TES without GS, GS copy shader */ + SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS, + SI_NUM_VS_STATE_RESOURCE_SGPRS, + + /* all VS variants */ + SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS, + SI_SGPR_START_INSTANCE, + SI_SGPR_DRAWID, + SI_VS_NUM_USER_SGPR, + + SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS, + + /* TES */ + SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS, + SI_SGPR_TES_OFFCHIP_ADDR, + SI_TES_NUM_USER_SGPR, + + /* GFX6-8: TCS only */ + GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS, + GFX6_SGPR_TCS_OUT_OFFSETS, + GFX6_SGPR_TCS_OUT_LAYOUT, + GFX6_SGPR_TCS_IN_LAYOUT, + GFX6_TCS_NUM_USER_SGPR, + + /* GFX9: Merged shaders. */ + /* 2ND_CONST_AND_SHADER_BUFFERS is set in USER_DATA_ADDR_LO (SGPR0). */ + /* 2ND_SAMPLERS_AND_IMAGES is set in USER_DATA_ADDR_HI (SGPR1). */ + GFX9_MERGED_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR, + + /* GFX9: Merged LS-HS (VS-TCS) only. */ + GFX9_SGPR_TCS_OFFCHIP_LAYOUT = GFX9_MERGED_NUM_USER_SGPR, + GFX9_SGPR_TCS_OUT_OFFSETS, + GFX9_SGPR_TCS_OUT_LAYOUT, + GFX9_TCS_NUM_USER_SGPR, + + /* GS limits */ + GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS, + GFX9_VSGS_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR, + GFX9_TESGS_NUM_USER_SGPR = SI_TES_NUM_USER_SGPR, + SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS, + + /* PS only */ + SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS, + SI_PS_NUM_USER_SGPR, + + /* The value has to be 12, because the hw requires that descriptors + * are aligned to 4 SGPRs. 
+ */ + SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12, }; /* LLVM function parameter indices */ -enum { - SI_NUM_RESOURCE_PARAMS = 4, - - /* PS only parameters */ - SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS, - SI_PARAM_PRIM_MASK, - SI_PARAM_PERSP_SAMPLE, - SI_PARAM_PERSP_CENTER, - SI_PARAM_PERSP_CENTROID, - SI_PARAM_PERSP_PULL_MODEL, - SI_PARAM_LINEAR_SAMPLE, - SI_PARAM_LINEAR_CENTER, - SI_PARAM_LINEAR_CENTROID, - SI_PARAM_LINE_STIPPLE_TEX, - SI_PARAM_POS_X_FLOAT, - SI_PARAM_POS_Y_FLOAT, - SI_PARAM_POS_Z_FLOAT, - SI_PARAM_POS_W_FLOAT, - SI_PARAM_FRONT_FACE, - SI_PARAM_ANCILLARY, - SI_PARAM_SAMPLE_COVERAGE, - SI_PARAM_POS_FIXED_PT, - - SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */ +enum +{ + SI_NUM_RESOURCE_PARAMS = 4, + + /* PS only parameters */ + SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS, + SI_PARAM_PRIM_MASK, + SI_PARAM_PERSP_SAMPLE, + SI_PARAM_PERSP_CENTER, + SI_PARAM_PERSP_CENTROID, + SI_PARAM_PERSP_PULL_MODEL, + SI_PARAM_LINEAR_SAMPLE, + SI_PARAM_LINEAR_CENTER, + SI_PARAM_LINEAR_CENTROID, + SI_PARAM_LINE_STIPPLE_TEX, + SI_PARAM_POS_X_FLOAT, + SI_PARAM_POS_Y_FLOAT, + SI_PARAM_POS_Z_FLOAT, + SI_PARAM_POS_W_FLOAT, + SI_PARAM_FRONT_FACE, + SI_PARAM_ANCILLARY, + SI_PARAM_SAMPLE_COVERAGE, + SI_PARAM_POS_FIXED_PT, + + SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */ }; /* Fields of driver-defined VS state SGPR. */ -#define S_VS_STATE_CLAMP_VERTEX_COLOR(x) (((unsigned)(x) & 0x1) << 0) -#define C_VS_STATE_CLAMP_VERTEX_COLOR 0xFFFFFFFE -#define S_VS_STATE_INDEXED(x) (((unsigned)(x) & 0x1) << 1) -#define C_VS_STATE_INDEXED 0xFFFFFFFD -#define S_VS_STATE_OUTPRIM(x) (((unsigned)(x) & 0x3) << 2) -#define C_VS_STATE_OUTPRIM 0xFFFFFFF3 -#define S_VS_STATE_PROVOKING_VTX_INDEX(x) (((unsigned)(x) & 0x3) << 4) -#define C_VS_STATE_PROVOKING_VTX_INDEX 0xFFFFFFCF -#define S_VS_STATE_STREAMOUT_QUERY_ENABLED(x) (((unsigned)(x) & 0x1) << 6) -#define C_VS_STATE_STREAMOUT_QUERY_ENABLED 0xFFFFFFBF -#define S_VS_STATE_SMALL_PRIM_PRECISION(x) (((unsigned)(x) & 0xF) << 7) -#define C_VS_STATE_SMALL_PRIM_PRECISION 0xFFFFF87F -#define S_VS_STATE_LS_OUT_PATCH_SIZE(x) (((unsigned)(x) & 0x1FFF) << 11) -#define C_VS_STATE_LS_OUT_PATCH_SIZE 0xFF0007FF -#define S_VS_STATE_LS_OUT_VERTEX_SIZE(x) (((unsigned)(x) & 0xFF) << 24) -#define C_VS_STATE_LS_OUT_VERTEX_SIZE 0x00FFFFFF - -enum { - /* Use a property enum that CS wouldn't use. */ - TGSI_PROPERTY_CS_LOCAL_SIZE = TGSI_PROPERTY_FS_COORD_ORIGIN, - - /* These represent the number of SGPRs the shader uses. 
*/ - SI_VS_BLIT_SGPRS_POS = 3, - SI_VS_BLIT_SGPRS_POS_COLOR = 7, - SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9, +#define S_VS_STATE_CLAMP_VERTEX_COLOR(x) (((unsigned)(x)&0x1) << 0) +#define C_VS_STATE_CLAMP_VERTEX_COLOR 0xFFFFFFFE +#define S_VS_STATE_INDEXED(x) (((unsigned)(x)&0x1) << 1) +#define C_VS_STATE_INDEXED 0xFFFFFFFD +#define S_VS_STATE_OUTPRIM(x) (((unsigned)(x)&0x3) << 2) +#define C_VS_STATE_OUTPRIM 0xFFFFFFF3 +#define S_VS_STATE_PROVOKING_VTX_INDEX(x) (((unsigned)(x)&0x3) << 4) +#define C_VS_STATE_PROVOKING_VTX_INDEX 0xFFFFFFCF +#define S_VS_STATE_STREAMOUT_QUERY_ENABLED(x) (((unsigned)(x)&0x1) << 6) +#define C_VS_STATE_STREAMOUT_QUERY_ENABLED 0xFFFFFFBF +#define S_VS_STATE_SMALL_PRIM_PRECISION(x) (((unsigned)(x)&0xF) << 7) +#define C_VS_STATE_SMALL_PRIM_PRECISION 0xFFFFF87F +#define S_VS_STATE_LS_OUT_PATCH_SIZE(x) (((unsigned)(x)&0x1FFF) << 11) +#define C_VS_STATE_LS_OUT_PATCH_SIZE 0xFF0007FF +#define S_VS_STATE_LS_OUT_VERTEX_SIZE(x) (((unsigned)(x)&0xFF) << 24) +#define C_VS_STATE_LS_OUT_VERTEX_SIZE 0x00FFFFFF + +enum +{ + /* Use a property enum that CS wouldn't use. */ + TGSI_PROPERTY_CS_LOCAL_SIZE = TGSI_PROPERTY_FS_COORD_ORIGIN, + + /* These represent the number of SGPRs the shader uses. */ + SI_VS_BLIT_SGPRS_POS = 3, + SI_VS_BLIT_SGPRS_POS_COLOR = 7, + SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9, }; -#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */ -#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */ -#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0x3 << 3) /* GS fast launch (both prim types) */ +#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */ +#define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */ +#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */ +#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */ +#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */ +#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0x3 << 3) /* GS fast launch (both prim types) */ /** * For VS shader keys, describe any fixups required for vertex fetch. @@ -292,190 +294,190 @@ enum { * buffer_load_format_xyzw). */ union si_vs_fix_fetch { - struct { - uint8_t log_size : 2; /* 1, 2, 4, 8 or bytes per channel */ - uint8_t num_channels_m1 : 2; /* number of channels minus 1 */ - uint8_t format : 3; /* AC_FETCH_FORMAT_xxx */ - uint8_t reverse : 1; /* reverse XYZ channels */ - } u; - uint8_t bits; + struct { + uint8_t log_size : 2; /* 1, 2, 4, 8 or bytes per channel */ + uint8_t num_channels_m1 : 2; /* number of channels minus 1 */ + uint8_t format : 3; /* AC_FETCH_FORMAT_xxx */ + uint8_t reverse : 1; /* reverse XYZ channels */ + } u; + uint8_t bits; }; struct si_shader; /* State of the context creating the shader object. */ struct si_compiler_ctx_state { - /* Should only be used by si_init_shader_selector_async and - * si_build_shader_variant if thread_index == -1 (non-threaded). */ - struct ac_llvm_compiler *compiler; + /* Should only be used by si_init_shader_selector_async and + * si_build_shader_variant if thread_index == -1 (non-threaded). */ + struct ac_llvm_compiler *compiler; - /* Used if thread_index == -1 or if debug.async is true. */ - struct pipe_debug_callback debug; + /* Used if thread_index == -1 or if debug.async is true. 
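The S_VS_STATE_*/C_VS_STATE_* pairs above pack several driver-defined fields into the single VS state SGPR; updating one field means AND-ing with its clear mask and OR-ing in the new shifted value. A small standalone demonstration using the same definitions as in si_shader.h (the main() wrapper and the chosen values are illustrative only):

#include <stdio.h>

/* Same definitions as in si_shader.h above. */
#define S_VS_STATE_INDEXED(x)            (((unsigned)(x)&0x1) << 1)
#define C_VS_STATE_INDEXED               0xFFFFFFFD
#define S_VS_STATE_OUTPRIM(x)            (((unsigned)(x)&0x3) << 2)
#define C_VS_STATE_OUTPRIM               0xFFFFFFF3
#define S_VS_STATE_LS_OUT_VERTEX_SIZE(x) (((unsigned)(x)&0xFF) << 24)
#define C_VS_STATE_LS_OUT_VERTEX_SIZE    0x00FFFFFF

int main(void)
{
   /* Pack an indexed draw with output prim type 2 and a 16-dword LS vertex. */
   unsigned vs_state = S_VS_STATE_INDEXED(1) |
                       S_VS_STATE_OUTPRIM(2) |
                       S_VS_STATE_LS_OUT_VERTEX_SIZE(16);

   /* Switch to a non-indexed draw: clear the field, then set the new value. */
   vs_state &= C_VS_STATE_INDEXED;
   vs_state |= S_VS_STATE_INDEXED(0);

   printf("vs_state = 0x%08x\n", vs_state); /* prints 0x10000008 */
   return 0;
}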
*/ + struct pipe_debug_callback debug; - /* Used for creating the log string for gallium/ddebug. */ - bool is_debug_context; + /* Used for creating the log string for gallium/ddebug. */ + bool is_debug_context; }; struct si_shader_info { - ubyte num_inputs; - ubyte num_outputs; - ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */ - ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS]; - ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS]; - ubyte input_interpolate_loc[PIPE_MAX_SHADER_INPUTS]; - ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS]; - ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */ - ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; - ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; - ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS]; - - ubyte processor; - - int constbuf0_num_slots; - unsigned const_buffers_declared; /**< bitmask of declared const buffers */ - unsigned samplers_declared; /**< bitmask of declared samplers */ - ubyte num_stream_output_components[4]; - - uint num_memory_instructions; /**< sampler, buffer, and image instructions */ - - /** - * If a tessellation control shader reads outputs, this describes which ones. - */ - bool reads_pervertex_outputs; - bool reads_perpatch_outputs; - bool reads_tessfactor_outputs; - - ubyte colors_read; /**< which color components are read by the FS */ - ubyte colors_written; - bool reads_samplemask; /**< does fragment shader read sample mask? */ - bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */ - bool writes_z; /**< does fragment shader write Z value? */ - bool writes_stencil; /**< does fragment shader write stencil value? */ - bool writes_samplemask; /**< does fragment shader write sample mask? */ - bool writes_edgeflag; /**< vertex shader outputs edgeflag */ - bool uses_kill; /**< KILL or KILL_IF instruction used? */ - bool uses_persp_center; - bool uses_persp_centroid; - bool uses_persp_sample; - bool uses_linear_center; - bool uses_linear_centroid; - bool uses_linear_sample; - bool uses_persp_opcode_interp_sample; - bool uses_linear_opcode_interp_sample; - bool uses_instanceid; - bool uses_vertexid; - bool uses_vertexid_nobase; - bool uses_basevertex; - bool uses_drawid; - bool uses_primid; - bool uses_frontface; - bool uses_invocationid; - bool uses_thread_id[3]; - bool uses_block_id[3]; - bool uses_block_size; - bool uses_grid_size; - bool uses_subgroup_info; - bool writes_position; - bool writes_psize; - bool writes_clipvertex; - bool writes_primid; - bool writes_viewport_index; - bool writes_layer; - bool writes_memory; /**< contains stores or atomics to buffers or images */ - bool uses_derivatives; - bool uses_bindless_samplers; - bool uses_bindless_images; - bool uses_fbfetch; - unsigned clipdist_writemask; - unsigned culldist_writemask; - unsigned num_written_culldistance; - unsigned num_written_clipdistance; - - unsigned images_declared; /**< bitmask of declared images */ - unsigned msaa_images_declared; /**< bitmask of declared MSAA images */ - unsigned shader_buffers_declared; /**< bitmask of declared shader buffers */ - - unsigned properties[TGSI_PROPERTY_COUNT]; /* index with TGSI_PROPERTY_ */ - - /** Whether all codepaths write tess factors in all invocations. 
*/ - bool tessfactors_are_def_in_all_invocs; + ubyte num_inputs; + ubyte num_outputs; + ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */ + ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS]; + ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS]; + ubyte input_interpolate_loc[PIPE_MAX_SHADER_INPUTS]; + ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS]; + ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */ + ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; + ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; + ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS]; + + ubyte processor; + + int constbuf0_num_slots; + unsigned const_buffers_declared; /**< bitmask of declared const buffers */ + unsigned samplers_declared; /**< bitmask of declared samplers */ + ubyte num_stream_output_components[4]; + + uint num_memory_instructions; /**< sampler, buffer, and image instructions */ + + /** + * If a tessellation control shader reads outputs, this describes which ones. + */ + bool reads_pervertex_outputs; + bool reads_perpatch_outputs; + bool reads_tessfactor_outputs; + + ubyte colors_read; /**< which color components are read by the FS */ + ubyte colors_written; + bool reads_samplemask; /**< does fragment shader read sample mask? */ + bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */ + bool writes_z; /**< does fragment shader write Z value? */ + bool writes_stencil; /**< does fragment shader write stencil value? */ + bool writes_samplemask; /**< does fragment shader write sample mask? */ + bool writes_edgeflag; /**< vertex shader outputs edgeflag */ + bool uses_kill; /**< KILL or KILL_IF instruction used? */ + bool uses_persp_center; + bool uses_persp_centroid; + bool uses_persp_sample; + bool uses_linear_center; + bool uses_linear_centroid; + bool uses_linear_sample; + bool uses_persp_opcode_interp_sample; + bool uses_linear_opcode_interp_sample; + bool uses_instanceid; + bool uses_vertexid; + bool uses_vertexid_nobase; + bool uses_basevertex; + bool uses_drawid; + bool uses_primid; + bool uses_frontface; + bool uses_invocationid; + bool uses_thread_id[3]; + bool uses_block_id[3]; + bool uses_block_size; + bool uses_grid_size; + bool uses_subgroup_info; + bool writes_position; + bool writes_psize; + bool writes_clipvertex; + bool writes_primid; + bool writes_viewport_index; + bool writes_layer; + bool writes_memory; /**< contains stores or atomics to buffers or images */ + bool uses_derivatives; + bool uses_bindless_samplers; + bool uses_bindless_images; + bool uses_fbfetch; + unsigned clipdist_writemask; + unsigned culldist_writemask; + unsigned num_written_culldistance; + unsigned num_written_clipdistance; + + unsigned images_declared; /**< bitmask of declared images */ + unsigned msaa_images_declared; /**< bitmask of declared MSAA images */ + unsigned shader_buffers_declared; /**< bitmask of declared shader buffers */ + + unsigned properties[TGSI_PROPERTY_COUNT]; /* index with TGSI_PROPERTY_ */ + + /** Whether all codepaths write tess factors in all invocations. */ + bool tessfactors_are_def_in_all_invocs; }; /* A shader selector is a gallium CSO and contains shader variants and * binaries for one NIR program. This can be shared by multiple contexts. 
*/ struct si_shader_selector { - struct util_live_shader base; - struct si_screen *screen; - struct util_queue_fence ready; - struct si_compiler_ctx_state compiler_ctx_state; - - simple_mtx_t mutex; - struct si_shader *first_variant; /* immutable after the first variant */ - struct si_shader *last_variant; /* mutable */ - - /* The compiled NIR shader without a prolog and/or epilog (not - * uploaded to a buffer object). - */ - struct si_shader *main_shader_part; - struct si_shader *main_shader_part_ls; /* as_ls is set in the key */ - struct si_shader *main_shader_part_es; /* as_es is set in the key */ - struct si_shader *main_shader_part_ngg; /* as_ngg is set in the key */ - struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */ - - struct si_shader *gs_copy_shader; - - struct nir_shader *nir; - void *nir_binary; - unsigned nir_size; - - struct pipe_stream_output_info so; - struct si_shader_info info; - - /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */ - enum pipe_shader_type type; - bool vs_needs_prolog; - bool prim_discard_cs_allowed; - bool ngg_culling_allowed; - unsigned num_vs_inputs; - unsigned num_vbos_in_user_sgprs; - unsigned pa_cl_vs_out_cntl; - ubyte clipdist_mask; - ubyte culldist_mask; - unsigned rast_prim; - - /* ES parameters. */ - unsigned esgs_itemsize; /* vertex stride */ - unsigned lshs_vertex_stride; - - /* GS parameters. */ - unsigned gs_input_verts_per_prim; - unsigned gs_output_prim; - unsigned gs_max_out_vertices; - unsigned gs_num_invocations; - unsigned max_gs_stream; /* count - 1 */ - unsigned gsvs_vertex_size; - unsigned max_gsvs_emit_size; - unsigned enabled_streamout_buffer_mask; - bool tess_turns_off_ngg; - - /* PS parameters. */ - unsigned color_attr_index[2]; - unsigned db_shader_control; - /* Set 0xf or 0x0 (4 bits) per each written output. - * ANDed with spi_shader_col_format. - */ - unsigned colors_written_4bit; - - uint64_t outputs_written_before_ps; /* "get_unique_index" bits */ - uint64_t outputs_written; /* "get_unique_index" bits */ - uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */ - - uint64_t inputs_read; /* "get_unique_index" bits */ - - /* bitmasks of used descriptor slots */ - uint32_t active_const_and_shader_buffers; - uint64_t active_samplers_and_images; + struct util_live_shader base; + struct si_screen *screen; + struct util_queue_fence ready; + struct si_compiler_ctx_state compiler_ctx_state; + + simple_mtx_t mutex; + struct si_shader *first_variant; /* immutable after the first variant */ + struct si_shader *last_variant; /* mutable */ + + /* The compiled NIR shader without a prolog and/or epilog (not + * uploaded to a buffer object). + */ + struct si_shader *main_shader_part; + struct si_shader *main_shader_part_ls; /* as_ls is set in the key */ + struct si_shader *main_shader_part_es; /* as_es is set in the key */ + struct si_shader *main_shader_part_ngg; /* as_ngg is set in the key */ + struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */ + + struct si_shader *gs_copy_shader; + + struct nir_shader *nir; + void *nir_binary; + unsigned nir_size; + + struct pipe_stream_output_info so; + struct si_shader_info info; + + /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */ + enum pipe_shader_type type; + bool vs_needs_prolog; + bool prim_discard_cs_allowed; + bool ngg_culling_allowed; + unsigned num_vs_inputs; + unsigned num_vbos_in_user_sgprs; + unsigned pa_cl_vs_out_cntl; + ubyte clipdist_mask; + ubyte culldist_mask; + unsigned rast_prim; + + /* ES parameters. 
*/ + unsigned esgs_itemsize; /* vertex stride */ + unsigned lshs_vertex_stride; + + /* GS parameters. */ + unsigned gs_input_verts_per_prim; + unsigned gs_output_prim; + unsigned gs_max_out_vertices; + unsigned gs_num_invocations; + unsigned max_gs_stream; /* count - 1 */ + unsigned gsvs_vertex_size; + unsigned max_gsvs_emit_size; + unsigned enabled_streamout_buffer_mask; + bool tess_turns_off_ngg; + + /* PS parameters. */ + unsigned color_attr_index[2]; + unsigned db_shader_control; + /* Set 0xf or 0x0 (4 bits) per each written output. + * ANDed with spi_shader_col_format. + */ + unsigned colors_written_4bit; + + uint64_t outputs_written_before_ps; /* "get_unique_index" bits */ + uint64_t outputs_written; /* "get_unique_index" bits */ + uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */ + + uint64_t inputs_read; /* "get_unique_index" bits */ + + /* bitmasks of used descriptor slots */ + uint32_t active_const_and_shader_buffers; + uint64_t active_samplers_and_images; }; /* Valid shader configurations: @@ -506,184 +508,184 @@ struct si_shader_selector { /* Common VS bits between the shader key and the prolog key. */ struct si_vs_prolog_bits { - /* - If neither "is_one" nor "is_fetched" has a bit set, the instance - * divisor is 0. - * - If "is_one" has a bit set, the instance divisor is 1. - * - If "is_fetched" has a bit set, the instance divisor will be loaded - * from the constant buffer. - */ - uint16_t instance_divisor_is_one; /* bitmask of inputs */ - uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ - unsigned ls_vgpr_fix:1; - unsigned unpack_instance_id_from_vertex_id:1; + /* - If neither "is_one" nor "is_fetched" has a bit set, the instance + * divisor is 0. + * - If "is_one" has a bit set, the instance divisor is 1. + * - If "is_fetched" has a bit set, the instance divisor will be loaded + * from the constant buffer. + */ + uint16_t instance_divisor_is_one; /* bitmask of inputs */ + uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ + unsigned ls_vgpr_fix : 1; + unsigned unpack_instance_id_from_vertex_id : 1; }; /* Common TCS bits between the shader key and the epilog key. */ struct si_tcs_epilog_bits { - unsigned prim_mode:3; - unsigned invoc0_tess_factors_are_def:1; - unsigned tes_reads_tess_factors:1; + unsigned prim_mode : 3; + unsigned invoc0_tess_factors_are_def : 1; + unsigned tes_reads_tess_factors : 1; }; struct si_gs_prolog_bits { - unsigned tri_strip_adj_fix:1; - unsigned gfx9_prev_is_vs:1; + unsigned tri_strip_adj_fix : 1; + unsigned gfx9_prev_is_vs : 1; }; /* Common PS bits between the shader key and the prolog key. */ struct si_ps_prolog_bits { - unsigned color_two_side:1; - unsigned flatshade_colors:1; - unsigned poly_stipple:1; - unsigned force_persp_sample_interp:1; - unsigned force_linear_sample_interp:1; - unsigned force_persp_center_interp:1; - unsigned force_linear_center_interp:1; - unsigned bc_optimize_for_persp:1; - unsigned bc_optimize_for_linear:1; - unsigned samplemask_log_ps_iter:3; + unsigned color_two_side : 1; + unsigned flatshade_colors : 1; + unsigned poly_stipple : 1; + unsigned force_persp_sample_interp : 1; + unsigned force_linear_sample_interp : 1; + unsigned force_persp_center_interp : 1; + unsigned force_linear_center_interp : 1; + unsigned bc_optimize_for_persp : 1; + unsigned bc_optimize_for_linear : 1; + unsigned samplemask_log_ps_iter : 3; }; /* Common PS bits between the shader key and the epilog key. 
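The instance-divisor encoding in si_vs_prolog_bits above collapses the common cases into two per-input bitmasks: a set bit in instance_divisor_is_one means divisor 1, a set bit in instance_divisor_is_fetched means the divisor is loaded from the constant buffer, and neither bit set means divisor 0. The hypothetical decoder below (describe_divisor and its inputs are invented for illustration, not driver code) shows how a consumer would interpret the pair:

#include <stdint.h>
#include <stdio.h>

/* Per-input divisor classification, mirroring the two bitmasks in
 * si_vs_prolog_bits. */
static const char *describe_divisor(unsigned input, uint16_t is_one, uint16_t is_fetched)
{
   if (is_one & (1u << input))
      return "divisor = 1 (steps once per instance)";
   if (is_fetched & (1u << input))
      return "divisor fetched from the constant buffer";
   return "divisor = 0 (per-vertex attribute)";
}

int main(void)
{
   uint16_t is_one = 0x2;     /* input 1 uses divisor 1 */
   uint16_t is_fetched = 0x4; /* input 2 fetches its divisor */

   for (unsigned i = 0; i < 3; i++)
      printf("input %u: %s\n", i, describe_divisor(i, is_one, is_fetched));
   return 0;
}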
*/ struct si_ps_epilog_bits { - unsigned spi_shader_col_format; - unsigned color_is_int8:8; - unsigned color_is_int10:8; - unsigned last_cbuf:3; - unsigned alpha_func:3; - unsigned alpha_to_one:1; - unsigned poly_line_smoothing:1; - unsigned clamp_color:1; + unsigned spi_shader_col_format; + unsigned color_is_int8 : 8; + unsigned color_is_int10 : 8; + unsigned last_cbuf : 3; + unsigned alpha_func : 3; + unsigned alpha_to_one : 1; + unsigned poly_line_smoothing : 1; + unsigned clamp_color : 1; }; union si_shader_part_key { - struct { - struct si_vs_prolog_bits states; - unsigned num_input_sgprs:6; - /* For merged stages such as LS-HS, HS input VGPRs are first. */ - unsigned num_merged_next_stage_vgprs:3; - unsigned num_inputs:5; - unsigned as_ls:1; - unsigned as_es:1; - unsigned as_ngg:1; - unsigned as_prim_discard_cs:1; - unsigned has_ngg_cull_inputs:1; /* from the NGG cull shader */ - unsigned gs_fast_launch_tri_list:1; /* for NGG culling */ - unsigned gs_fast_launch_tri_strip:1; /* for NGG culling */ - /* Prologs for monolithic shaders shouldn't set EXEC. */ - unsigned is_monolithic:1; - } vs_prolog; - struct { - struct si_tcs_epilog_bits states; - } tcs_epilog; - struct { - struct si_gs_prolog_bits states; - /* Prologs of monolithic shaders shouldn't set EXEC. */ - unsigned is_monolithic:1; - unsigned as_ngg:1; - } gs_prolog; - struct { - struct si_ps_prolog_bits states; - unsigned num_input_sgprs:6; - unsigned num_input_vgprs:5; - /* Color interpolation and two-side color selection. */ - unsigned colors_read:8; /* color input components read */ - unsigned num_interp_inputs:5; /* BCOLOR is at this location */ - unsigned face_vgpr_index:5; - unsigned ancillary_vgpr_index:5; - unsigned wqm:1; - char color_attr_index[2]; - signed char color_interp_vgpr_index[2]; /* -1 == constant */ - } ps_prolog; - struct { - struct si_ps_epilog_bits states; - unsigned colors_written:8; - unsigned writes_z:1; - unsigned writes_stencil:1; - unsigned writes_samplemask:1; - } ps_epilog; + struct { + struct si_vs_prolog_bits states; + unsigned num_input_sgprs : 6; + /* For merged stages such as LS-HS, HS input VGPRs are first. */ + unsigned num_merged_next_stage_vgprs : 3; + unsigned num_inputs : 5; + unsigned as_ls : 1; + unsigned as_es : 1; + unsigned as_ngg : 1; + unsigned as_prim_discard_cs : 1; + unsigned has_ngg_cull_inputs : 1; /* from the NGG cull shader */ + unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */ + unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */ + /* Prologs for monolithic shaders shouldn't set EXEC. */ + unsigned is_monolithic : 1; + } vs_prolog; + struct { + struct si_tcs_epilog_bits states; + } tcs_epilog; + struct { + struct si_gs_prolog_bits states; + /* Prologs of monolithic shaders shouldn't set EXEC. */ + unsigned is_monolithic : 1; + unsigned as_ngg : 1; + } gs_prolog; + struct { + struct si_ps_prolog_bits states; + unsigned num_input_sgprs : 6; + unsigned num_input_vgprs : 5; + /* Color interpolation and two-side color selection. 
*/ + unsigned colors_read : 8; /* color input components read */ + unsigned num_interp_inputs : 5; /* BCOLOR is at this location */ + unsigned face_vgpr_index : 5; + unsigned ancillary_vgpr_index : 5; + unsigned wqm : 1; + char color_attr_index[2]; + signed char color_interp_vgpr_index[2]; /* -1 == constant */ + } ps_prolog; + struct { + struct si_ps_epilog_bits states; + unsigned colors_written : 8; + unsigned writes_z : 1; + unsigned writes_stencil : 1; + unsigned writes_samplemask : 1; + } ps_epilog; }; struct si_shader_key { - /* Prolog and epilog flags. */ - union { - struct { - struct si_vs_prolog_bits prolog; - } vs; - struct { - struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */ - struct si_shader_selector *ls; /* for merged LS-HS */ - struct si_tcs_epilog_bits epilog; - } tcs; /* tessellation control shader */ - struct { - struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */ - struct si_shader_selector *es; /* for merged ES-GS */ - struct si_gs_prolog_bits prolog; - } gs; - struct { - struct si_ps_prolog_bits prolog; - struct si_ps_epilog_bits epilog; - } ps; - } part; - - /* These three are initially set according to the NEXT_SHADER property, - * or guessed if the property doesn't seem correct. - */ - unsigned as_es:1; /* export shader, which precedes GS */ - unsigned as_ls:1; /* local shader, which precedes TCS */ - unsigned as_ngg:1; /* VS, TES, or GS compiled as NGG primitive shader */ - - /* Flags for monolithic compilation only. */ - struct { - /* Whether fetch should be opencoded according to vs_fix_fetch. - * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw - * with minimal fixups is used. */ - uint16_t vs_fetch_opencode; - union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS]; - - union { - uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */ - /* When PS needs PrimID and GS is disabled. */ - unsigned vs_export_prim_id:1; - struct { - unsigned interpolate_at_sample_force_center:1; - unsigned fbfetch_msaa:1; - unsigned fbfetch_is_1D:1; - unsigned fbfetch_layered:1; - } ps; - } u; - } mono; - - /* Optimization flags for asynchronous compilation only. */ - struct { - /* For HW VS (it can be VS, TES, GS) */ - uint64_t kill_outputs; /* "get_unique_index" bits */ - unsigned clip_disable:1; - - /* For NGG VS and TES. */ - unsigned ngg_culling:5; /* SI_NGG_CULL_* */ - - /* For shaders where monolithic variants have better code. - * - * This is a flag that has no effect on code generation, - * but forces monolithic shaders to be used as soon as - * possible, because it's in the "opt" group. - */ - unsigned prefer_mono:1; - - /* Primitive discard compute shader. */ - unsigned vs_as_prim_discard_cs:1; - unsigned cs_prim_type:4; - unsigned cs_indexed:1; - unsigned cs_instancing:1; - unsigned cs_primitive_restart:1; - unsigned cs_provoking_vertex_first:1; - unsigned cs_need_correct_orientation:1; - unsigned cs_cull_front:1; - unsigned cs_cull_back:1; - unsigned cs_cull_z:1; - unsigned cs_halfz_clip_space:1; - } opt; + /* Prolog and epilog flags. 
*/ + union { + struct { + struct si_vs_prolog_bits prolog; + } vs; + struct { + struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */ + struct si_shader_selector *ls; /* for merged LS-HS */ + struct si_tcs_epilog_bits epilog; + } tcs; /* tessellation control shader */ + struct { + struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */ + struct si_shader_selector *es; /* for merged ES-GS */ + struct si_gs_prolog_bits prolog; + } gs; + struct { + struct si_ps_prolog_bits prolog; + struct si_ps_epilog_bits epilog; + } ps; + } part; + + /* These three are initially set according to the NEXT_SHADER property, + * or guessed if the property doesn't seem correct. + */ + unsigned as_es : 1; /* export shader, which precedes GS */ + unsigned as_ls : 1; /* local shader, which precedes TCS */ + unsigned as_ngg : 1; /* VS, TES, or GS compiled as NGG primitive shader */ + + /* Flags for monolithic compilation only. */ + struct { + /* Whether fetch should be opencoded according to vs_fix_fetch. + * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw + * with minimal fixups is used. */ + uint16_t vs_fetch_opencode; + union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS]; + + union { + uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */ + /* When PS needs PrimID and GS is disabled. */ + unsigned vs_export_prim_id : 1; + struct { + unsigned interpolate_at_sample_force_center : 1; + unsigned fbfetch_msaa : 1; + unsigned fbfetch_is_1D : 1; + unsigned fbfetch_layered : 1; + } ps; + } u; + } mono; + + /* Optimization flags for asynchronous compilation only. */ + struct { + /* For HW VS (it can be VS, TES, GS) */ + uint64_t kill_outputs; /* "get_unique_index" bits */ + unsigned clip_disable : 1; + + /* For NGG VS and TES. */ + unsigned ngg_culling : 5; /* SI_NGG_CULL_* */ + + /* For shaders where monolithic variants have better code. + * + * This is a flag that has no effect on code generation, + * but forces monolithic shaders to be used as soon as + * possible, because it's in the "opt" group. + */ + unsigned prefer_mono : 1; + + /* Primitive discard compute shader. */ + unsigned vs_as_prim_discard_cs : 1; + unsigned cs_prim_type : 4; + unsigned cs_indexed : 1; + unsigned cs_instancing : 1; + unsigned cs_primitive_restart : 1; + unsigned cs_provoking_vertex_first : 1; + unsigned cs_need_correct_orientation : 1; + unsigned cs_cull_front : 1; + unsigned cs_cull_back : 1; + unsigned cs_cull_z : 1; + unsigned cs_halfz_clip_space : 1; + } opt; }; /* Restore the pack alignment to default. */ @@ -691,232 +693,214 @@ struct si_shader_key { /* GCN-specific shader info. 
*/ struct si_shader_binary_info { - ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; - ubyte num_input_sgprs; - ubyte num_input_vgprs; - signed char face_vgpr_index; - signed char ancillary_vgpr_index; - bool uses_instanceid; - ubyte nr_pos_exports; - ubyte nr_param_exports; - unsigned private_mem_vgprs; - unsigned max_simd_waves; + ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; + ubyte num_input_sgprs; + ubyte num_input_vgprs; + signed char face_vgpr_index; + signed char ancillary_vgpr_index; + bool uses_instanceid; + ubyte nr_pos_exports; + ubyte nr_param_exports; + unsigned private_mem_vgprs; + unsigned max_simd_waves; }; struct si_shader_binary { - const char *elf_buffer; - size_t elf_size; + const char *elf_buffer; + size_t elf_size; - char *llvm_ir_string; + char *llvm_ir_string; }; struct gfx9_gs_info { - unsigned es_verts_per_subgroup; - unsigned gs_prims_per_subgroup; - unsigned gs_inst_prims_in_subgroup; - unsigned max_prims_per_subgroup; - unsigned esgs_ring_size; /* in bytes */ + unsigned es_verts_per_subgroup; + unsigned gs_prims_per_subgroup; + unsigned gs_inst_prims_in_subgroup; + unsigned max_prims_per_subgroup; + unsigned esgs_ring_size; /* in bytes */ }; struct si_shader { - struct si_compiler_ctx_state compiler_ctx_state; - - struct si_shader_selector *selector; - struct si_shader_selector *previous_stage_sel; /* for refcounting */ - struct si_shader *next_variant; - - struct si_shader_part *prolog; - struct si_shader *previous_stage; /* for GFX9 */ - struct si_shader_part *prolog2; - struct si_shader_part *epilog; - - struct si_pm4_state *pm4; - struct si_resource *bo; - struct si_resource *scratch_bo; - struct si_shader_key key; - struct util_queue_fence ready; - bool compilation_failed; - bool is_monolithic; - bool is_optimized; - bool is_binary_shared; - bool is_gs_copy_shader; - - /* The following data is all that's needed for binary shaders. */ - struct si_shader_binary binary; - struct ac_shader_config config; - struct si_shader_binary_info info; - - struct { - uint16_t ngg_emit_size; /* in dwords */ - uint16_t hw_max_esverts; - uint16_t max_gsprims; - uint16_t max_out_verts; - uint16_t prim_amp_factor; - bool max_vert_out_per_gs_instance; - } ngg; - - /* Shader key + LLVM IR + disassembly + statistics. - * Generated for debug contexts only. - */ - char *shader_log; - size_t shader_log_size; - - struct gfx9_gs_info gs_info; - - /* For save precompute context registers values. 
*/ - union { - struct { - unsigned vgt_gsvs_ring_offset_1; - unsigned vgt_gsvs_ring_offset_2; - unsigned vgt_gsvs_ring_offset_3; - unsigned vgt_gsvs_ring_itemsize; - unsigned vgt_gs_max_vert_out; - unsigned vgt_gs_vert_itemsize; - unsigned vgt_gs_vert_itemsize_1; - unsigned vgt_gs_vert_itemsize_2; - unsigned vgt_gs_vert_itemsize_3; - unsigned vgt_gs_instance_cnt; - unsigned vgt_gs_onchip_cntl; - unsigned vgt_gs_max_prims_per_subgroup; - unsigned vgt_esgs_ring_itemsize; - } gs; - - struct { - unsigned ge_max_output_per_subgroup; - unsigned ge_ngg_subgrp_cntl; - unsigned vgt_primitiveid_en; - unsigned vgt_gs_onchip_cntl; - unsigned vgt_gs_instance_cnt; - unsigned vgt_esgs_ring_itemsize; - unsigned spi_vs_out_config; - unsigned spi_shader_idx_format; - unsigned spi_shader_pos_format; - unsigned pa_cl_vte_cntl; - unsigned pa_cl_ngg_cntl; - unsigned vgt_gs_max_vert_out; /* for API GS */ - unsigned ge_pc_alloc; /* uconfig register */ - } ngg; - - struct { - unsigned vgt_gs_mode; - unsigned vgt_primitiveid_en; - unsigned vgt_reuse_off; - unsigned spi_vs_out_config; - unsigned spi_shader_pos_format; - unsigned pa_cl_vte_cntl; - unsigned ge_pc_alloc; /* uconfig register */ - } vs; - - struct { - unsigned spi_ps_input_ena; - unsigned spi_ps_input_addr; - unsigned spi_baryc_cntl; - unsigned spi_ps_in_control; - unsigned spi_shader_z_format; - unsigned spi_shader_col_format; - unsigned cb_shader_mask; - } ps; - } ctx_reg; - - /*For save precompute registers value */ - unsigned vgt_tf_param; /* VGT_TF_PARAM */ - unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */ - unsigned pa_cl_vs_out_cntl; - unsigned ge_cntl; + struct si_compiler_ctx_state compiler_ctx_state; + + struct si_shader_selector *selector; + struct si_shader_selector *previous_stage_sel; /* for refcounting */ + struct si_shader *next_variant; + + struct si_shader_part *prolog; + struct si_shader *previous_stage; /* for GFX9 */ + struct si_shader_part *prolog2; + struct si_shader_part *epilog; + + struct si_pm4_state *pm4; + struct si_resource *bo; + struct si_resource *scratch_bo; + struct si_shader_key key; + struct util_queue_fence ready; + bool compilation_failed; + bool is_monolithic; + bool is_optimized; + bool is_binary_shared; + bool is_gs_copy_shader; + + /* The following data is all that's needed for binary shaders. */ + struct si_shader_binary binary; + struct ac_shader_config config; + struct si_shader_binary_info info; + + struct { + uint16_t ngg_emit_size; /* in dwords */ + uint16_t hw_max_esverts; + uint16_t max_gsprims; + uint16_t max_out_verts; + uint16_t prim_amp_factor; + bool max_vert_out_per_gs_instance; + } ngg; + + /* Shader key + LLVM IR + disassembly + statistics. + * Generated for debug contexts only. + */ + char *shader_log; + size_t shader_log_size; + + struct gfx9_gs_info gs_info; + + /* For save precompute context registers values. 
*/ + union { + struct { + unsigned vgt_gsvs_ring_offset_1; + unsigned vgt_gsvs_ring_offset_2; + unsigned vgt_gsvs_ring_offset_3; + unsigned vgt_gsvs_ring_itemsize; + unsigned vgt_gs_max_vert_out; + unsigned vgt_gs_vert_itemsize; + unsigned vgt_gs_vert_itemsize_1; + unsigned vgt_gs_vert_itemsize_2; + unsigned vgt_gs_vert_itemsize_3; + unsigned vgt_gs_instance_cnt; + unsigned vgt_gs_onchip_cntl; + unsigned vgt_gs_max_prims_per_subgroup; + unsigned vgt_esgs_ring_itemsize; + } gs; + + struct { + unsigned ge_max_output_per_subgroup; + unsigned ge_ngg_subgrp_cntl; + unsigned vgt_primitiveid_en; + unsigned vgt_gs_onchip_cntl; + unsigned vgt_gs_instance_cnt; + unsigned vgt_esgs_ring_itemsize; + unsigned spi_vs_out_config; + unsigned spi_shader_idx_format; + unsigned spi_shader_pos_format; + unsigned pa_cl_vte_cntl; + unsigned pa_cl_ngg_cntl; + unsigned vgt_gs_max_vert_out; /* for API GS */ + unsigned ge_pc_alloc; /* uconfig register */ + } ngg; + + struct { + unsigned vgt_gs_mode; + unsigned vgt_primitiveid_en; + unsigned vgt_reuse_off; + unsigned spi_vs_out_config; + unsigned spi_shader_pos_format; + unsigned pa_cl_vte_cntl; + unsigned ge_pc_alloc; /* uconfig register */ + } vs; + + struct { + unsigned spi_ps_input_ena; + unsigned spi_ps_input_addr; + unsigned spi_baryc_cntl; + unsigned spi_ps_in_control; + unsigned spi_shader_z_format; + unsigned spi_shader_col_format; + unsigned cb_shader_mask; + } ps; + } ctx_reg; + + /*For save precompute registers value */ + unsigned vgt_tf_param; /* VGT_TF_PARAM */ + unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */ + unsigned pa_cl_vs_out_cntl; + unsigned ge_cntl; }; struct si_shader_part { - struct si_shader_part *next; - union si_shader_part_key key; - struct si_shader_binary binary; - struct ac_shader_config config; + struct si_shader_part *next; + union si_shader_part_key key; + struct si_shader_binary binary; + struct ac_shader_config config; }; /* si_shader.c */ -bool si_compile_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug); -bool si_create_shader_variant(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader *shader, - struct pipe_debug_callback *debug); +bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug); +bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, + struct si_shader *shader, struct pipe_debug_callback *debug); void si_shader_destroy(struct si_shader *shader); unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index); -unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, - unsigned is_varying); +unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, unsigned is_varying); bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader, - uint64_t scratch_va); + uint64_t scratch_va); void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, - struct pipe_debug_callback *debug, - FILE *f, bool check_debug_option); -void si_shader_dump_stats_for_shader_db(struct si_screen *screen, - struct si_shader *shader, - struct pipe_debug_callback *debug); -void si_multiwave_lds_size_workaround(struct si_screen *sscreen, - unsigned *lds_size); + struct pipe_debug_callback *debug, FILE *f, bool check_debug_option); +void si_shader_dump_stats_for_shader_db(struct si_screen 
*screen, struct si_shader *shader, + struct pipe_debug_callback *debug); +void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size); const char *si_get_shader_name(const struct si_shader *shader); void si_shader_binary_clean(struct si_shader_binary *binary); /* si_shader_llvm_gs.c */ -struct si_shader * -si_generate_gs_copy_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader_selector *gs_selector, - struct pipe_debug_callback *debug); +struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + struct si_shader_selector *gs_selector, + struct pipe_debug_callback *debug); /* si_shader_nir.c */ -void si_nir_scan_shader(const struct nir_shader *nir, - struct si_shader_info *info); +void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info); void si_nir_adjust_driver_locations(struct nir_shader *nir); void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize); /* si_state_shaders.c */ -void gfx9_get_gs_info(struct si_shader_selector *es, - struct si_shader_selector *gs, - struct gfx9_gs_info *out); +void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs, + struct gfx9_gs_info *out); /* Inline helpers. */ /* Return the pointer to the main shader part's pointer. */ -static inline struct si_shader ** -si_get_main_shader_part(struct si_shader_selector *sel, - struct si_shader_key *key) +static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel, + struct si_shader_key *key) { - if (key->as_ls) - return &sel->main_shader_part_ls; - if (key->as_es && key->as_ngg) - return &sel->main_shader_part_ngg_es; - if (key->as_es) - return &sel->main_shader_part_es; - if (key->as_ngg) - return &sel->main_shader_part_ngg; - return &sel->main_shader_part; + if (key->as_ls) + return &sel->main_shader_part_ls; + if (key->as_es && key->as_ngg) + return &sel->main_shader_part_ngg_es; + if (key->as_es) + return &sel->main_shader_part_es; + if (key->as_ngg) + return &sel->main_shader_part_ngg; + return &sel->main_shader_part; } -static inline bool -gfx10_is_ngg_passthrough(struct si_shader *shader) +static inline bool gfx10_is_ngg_passthrough(struct si_shader *shader) { - struct si_shader_selector *sel = shader->selector; - - return sel->type != PIPE_SHADER_GEOMETRY && - !sel->so.num_outputs && - !sel->info.writes_edgeflag && - !shader->key.opt.ngg_culling && - (sel->type != PIPE_SHADER_VERTEX || - !shader->key.mono.u.vs_export_prim_id); + struct si_shader_selector *sel = shader->selector; + + return sel->type != PIPE_SHADER_GEOMETRY && !sel->so.num_outputs && !sel->info.writes_edgeflag && + !shader->key.opt.ngg_culling && + (sel->type != PIPE_SHADER_VERTEX || !shader->key.mono.u.vs_export_prim_id); } -static inline bool -si_shader_uses_bindless_samplers(struct si_shader_selector *selector) +static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector) { - return selector ? selector->info.uses_bindless_samplers : false; + return selector ? selector->info.uses_bindless_samplers : false; } -static inline bool -si_shader_uses_bindless_images(struct si_shader_selector *selector) +static inline bool si_shader_uses_bindless_images(struct si_shader_selector *selector) { - return selector ? selector->info.uses_bindless_images : false; + return selector ? 
selector->info.uses_bindless_images : false; } #endif diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 47173142d44..2191604b706 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -25,8 +25,8 @@ #ifndef SI_SHADER_PRIVATE_H #define SI_SHADER_PRIVATE_H -#include "si_shader.h" #include "ac_shader_abi.h" +#include "si_shader.h" struct pipe_debug_callback; @@ -38,275 +38,245 @@ struct pipe_debug_callback; #define PS_EPILOG_SAMPLEMASK_MIN_LOC 14 struct si_shader_output_values { - LLVMValueRef values[4]; - unsigned semantic_name; - unsigned semantic_index; - ubyte vertex_stream[4]; + LLVMValueRef values[4]; + unsigned semantic_name; + unsigned semantic_index; + ubyte vertex_stream[4]; }; struct si_shader_context { - struct ac_llvm_context ac; - struct si_shader *shader; - struct si_screen *screen; - - unsigned type; /* PIPE_SHADER_* specifies the type of shader. */ - - /* For clamping the non-constant index in resource indexing: */ - unsigned num_const_buffers; - unsigned num_shader_buffers; - unsigned num_images; - unsigned num_samplers; - - struct ac_shader_args args; - struct ac_shader_abi abi; - - LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS]; - - LLVMBasicBlockRef merged_wrap_if_entry_block; - int merged_wrap_if_label; - - LLVMValueRef main_fn; - LLVMTypeRef return_type; - - struct ac_arg const_and_shader_buffers; - struct ac_arg samplers_and_images; - - /* For merged shaders, the per-stage descriptors for the stage other - * than the one we're processing, used to pass them through from the - * first stage to the second. - */ - struct ac_arg other_const_and_shader_buffers; - struct ac_arg other_samplers_and_images; - - struct ac_arg rw_buffers; - struct ac_arg bindless_samplers_and_images; - /* Common inputs for merged shaders. */ - struct ac_arg merged_wave_info; - struct ac_arg merged_scratch_offset; - struct ac_arg small_prim_cull_info; - /* API VS */ - struct ac_arg vertex_buffers; - struct ac_arg vb_descriptors[5]; - struct ac_arg rel_auto_id; - struct ac_arg vs_prim_id; - struct ac_arg vertex_index0; - /* VS states and layout of LS outputs / TCS inputs at the end - * [0] = clamp vertex color - * [1] = indexed - * [2:3] = NGG: output primitive type - * [4:5] = NGG: provoking vertex index - * [6] = NGG: streamout queries enabled - * [7:10] = NGG: small prim filter precision = num_samples / quant_mode, - * but in reality it's: 1/2^n, from 1/16 to 1/4096 = 1/2^4 to 1/2^12 - * Only the first 4 bits of the exponent are stored. 
- * Set it like this: (fui(num_samples / quant_mode) >> 23) - * Expand to FP32 like this: ((0x70 | value) << 23); - * With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15) - * = 1/2^(15 - value) in FP32 - * [11:23] = stride between patches in DW = num_inputs * num_vertices * 4 - * max = 32*32*4 + 32*4 - * [24:31] = stride between vertices in DW = num_inputs * 4 - * max = 32*4 - */ - struct ac_arg vs_state_bits; - struct ac_arg vs_blit_inputs; - struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */ - /* HW VS */ - struct ac_arg streamout_config; - struct ac_arg streamout_write_index; - struct ac_arg streamout_offset[4]; - - /* API TCS & TES */ - /* Layout of TCS outputs in the offchip buffer - * # 6 bits - * [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40) - * # 6 bits - * [6:11] = the number of output vertices per patch, max = 32 - * # 20 bits - * [12:31] = the offset of per patch attributes in the buffer in bytes. - * max = NUM_PATCHES*32*32*16 - */ - struct ac_arg tcs_offchip_layout; - - /* API TCS */ - /* Offsets where TCS outputs and TCS patch outputs live in LDS: - * [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32 - * [16:31] = TCS output patch0 offset for per-patch / 16 - * max = (NUM_PATCHES + 1) * 32*32 - */ - struct ac_arg tcs_out_lds_offsets; - /* Layout of TCS outputs / TES inputs: - * [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4 - * max = 32*32*4 + 32*4 - * [13:18] = gl_PatchVerticesIn, max = 32 - * [19:31] = high 13 bits of the 32-bit address of tessellation ring buffers - */ - struct ac_arg tcs_out_lds_layout; - struct ac_arg tcs_offchip_offset; - struct ac_arg tcs_factor_offset; - - /* API TES */ - struct ac_arg tes_offchip_addr; - struct ac_arg tes_u; - struct ac_arg tes_v; - struct ac_arg tes_rel_patch_id; - /* HW ES */ - struct ac_arg es2gs_offset; - /* HW GS */ - /* On gfx10: - * - bits 0..11: ordered_wave_id - * - bits 12..20: number of vertices in group - * - bits 22..30: number of primitives in group - */ - struct ac_arg gs_tg_info; - /* API GS */ - struct ac_arg gs2vs_offset; - struct ac_arg gs_wave_id; /* GFX6 */ - struct ac_arg gs_vtx_offset[6]; /* in dwords (GFX6) */ - struct ac_arg gs_vtx01_offset; /* in dwords (GFX9) */ - struct ac_arg gs_vtx23_offset; /* in dwords (GFX9) */ - struct ac_arg gs_vtx45_offset; /* in dwords (GFX9) */ - /* PS */ - struct ac_arg pos_fixed_pt; - /* CS */ - struct ac_arg block_size; - struct ac_arg cs_user_data; - - struct ac_llvm_compiler *compiler; - - /* Preloaded descriptors. */ - LLVMValueRef esgs_ring; - LLVMValueRef gsvs_ring[4]; - LLVMValueRef tess_offchip_ring; - - LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */ - LLVMValueRef gs_next_vertex[4]; - LLVMValueRef gs_curprim_verts[4]; - LLVMValueRef gs_generated_prims[4]; - LLVMValueRef gs_ngg_emit; - LLVMValueRef gs_ngg_scratch; - LLVMValueRef postponed_kill; - LLVMValueRef return_value; + struct ac_llvm_context ac; + struct si_shader *shader; + struct si_screen *screen; + + unsigned type; /* PIPE_SHADER_* specifies the type of shader. 
*/ + + /* For clamping the non-constant index in resource indexing: */ + unsigned num_const_buffers; + unsigned num_shader_buffers; + unsigned num_images; + unsigned num_samplers; + + struct ac_shader_args args; + struct ac_shader_abi abi; + + LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS]; + + LLVMBasicBlockRef merged_wrap_if_entry_block; + int merged_wrap_if_label; + + LLVMValueRef main_fn; + LLVMTypeRef return_type; + + struct ac_arg const_and_shader_buffers; + struct ac_arg samplers_and_images; + + /* For merged shaders, the per-stage descriptors for the stage other + * than the one we're processing, used to pass them through from the + * first stage to the second. + */ + struct ac_arg other_const_and_shader_buffers; + struct ac_arg other_samplers_and_images; + + struct ac_arg rw_buffers; + struct ac_arg bindless_samplers_and_images; + /* Common inputs for merged shaders. */ + struct ac_arg merged_wave_info; + struct ac_arg merged_scratch_offset; + struct ac_arg small_prim_cull_info; + /* API VS */ + struct ac_arg vertex_buffers; + struct ac_arg vb_descriptors[5]; + struct ac_arg rel_auto_id; + struct ac_arg vs_prim_id; + struct ac_arg vertex_index0; + /* VS states and layout of LS outputs / TCS inputs at the end + * [0] = clamp vertex color + * [1] = indexed + * [2:3] = NGG: output primitive type + * [4:5] = NGG: provoking vertex index + * [6] = NGG: streamout queries enabled + * [7:10] = NGG: small prim filter precision = num_samples / quant_mode, + * but in reality it's: 1/2^n, from 1/16 to 1/4096 = 1/2^4 to 1/2^12 + * Only the first 4 bits of the exponent are stored. + * Set it like this: (fui(num_samples / quant_mode) >> 23) + * Expand to FP32 like this: ((0x70 | value) << 23); + * With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15) + * = 1/2^(15 - value) in FP32 + * [11:23] = stride between patches in DW = num_inputs * num_vertices * 4 + * max = 32*32*4 + 32*4 + * [24:31] = stride between vertices in DW = num_inputs * 4 + * max = 32*4 + */ + struct ac_arg vs_state_bits; + struct ac_arg vs_blit_inputs; + struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */ + /* HW VS */ + struct ac_arg streamout_config; + struct ac_arg streamout_write_index; + struct ac_arg streamout_offset[4]; + + /* API TCS & TES */ + /* Layout of TCS outputs in the offchip buffer + * # 6 bits + * [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40) + * # 6 bits + * [6:11] = the number of output vertices per patch, max = 32 + * # 20 bits + * [12:31] = the offset of per patch attributes in the buffer in bytes. 
+ * max = NUM_PATCHES*32*32*16 + */ + struct ac_arg tcs_offchip_layout; + + /* API TCS */ + /* Offsets where TCS outputs and TCS patch outputs live in LDS: + * [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32 + * [16:31] = TCS output patch0 offset for per-patch / 16 + * max = (NUM_PATCHES + 1) * 32*32 + */ + struct ac_arg tcs_out_lds_offsets; + /* Layout of TCS outputs / TES inputs: + * [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4 + * max = 32*32*4 + 32*4 + * [13:18] = gl_PatchVerticesIn, max = 32 + * [19:31] = high 13 bits of the 32-bit address of tessellation ring buffers + */ + struct ac_arg tcs_out_lds_layout; + struct ac_arg tcs_offchip_offset; + struct ac_arg tcs_factor_offset; + + /* API TES */ + struct ac_arg tes_offchip_addr; + struct ac_arg tes_u; + struct ac_arg tes_v; + struct ac_arg tes_rel_patch_id; + /* HW ES */ + struct ac_arg es2gs_offset; + /* HW GS */ + /* On gfx10: + * - bits 0..11: ordered_wave_id + * - bits 12..20: number of vertices in group + * - bits 22..30: number of primitives in group + */ + struct ac_arg gs_tg_info; + /* API GS */ + struct ac_arg gs2vs_offset; + struct ac_arg gs_wave_id; /* GFX6 */ + struct ac_arg gs_vtx_offset[6]; /* in dwords (GFX6) */ + struct ac_arg gs_vtx01_offset; /* in dwords (GFX9) */ + struct ac_arg gs_vtx23_offset; /* in dwords (GFX9) */ + struct ac_arg gs_vtx45_offset; /* in dwords (GFX9) */ + /* PS */ + struct ac_arg pos_fixed_pt; + /* CS */ + struct ac_arg block_size; + struct ac_arg cs_user_data; + + struct ac_llvm_compiler *compiler; + + /* Preloaded descriptors. */ + LLVMValueRef esgs_ring; + LLVMValueRef gsvs_ring[4]; + LLVMValueRef tess_offchip_ring; + + LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */ + LLVMValueRef gs_next_vertex[4]; + LLVMValueRef gs_curprim_verts[4]; + LLVMValueRef gs_generated_prims[4]; + LLVMValueRef gs_ngg_emit; + LLVMValueRef gs_ngg_scratch; + LLVMValueRef postponed_kill; + LLVMValueRef return_value; }; -static inline struct si_shader_context * -si_shader_context_from_abi(struct ac_shader_abi *abi) +static inline struct si_shader_context *si_shader_context_from_abi(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = NULL; - return container_of(abi, ctx, abi); + struct si_shader_context *ctx = NULL; + return container_of(abi, ctx, abi); } bool si_is_multi_part_shader(struct si_shader *shader); bool si_is_merged_shader(struct si_shader *shader); -void si_add_arg_checked(struct ac_shader_args *args, - enum ac_arg_regfile file, - unsigned registers, enum ac_arg_type type, - struct ac_arg *arg, - unsigned idx); +void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers, + enum ac_arg_type type, struct ac_arg *arg, unsigned idx); unsigned si_get_max_workgroup_size(const struct si_shader *shader); bool si_need_ps_prolog(const union si_shader_part_key *key); -void si_get_ps_prolog_key(struct si_shader *shader, - union si_shader_part_key *key, - bool separate_prolog); -void si_get_ps_epilog_key(struct si_shader *shader, - union si_shader_part_key *key); +void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key, + bool separate_prolog); +void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key); void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader); void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader); bool gfx10_ngg_export_prim_early(struct si_shader *shader); void 
gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx); -void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, - LLVMValueRef user_edgeflags[3], - LLVMValueRef prim_passthrough); -void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs); -void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs); -void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, - unsigned stream, - LLVMValueRef *addrs); +void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3], + LLVMValueRef prim_passthrough); +void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs); +void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs); void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx); void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx); void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader); /* si_shader_llvm.c */ -bool si_compile_llvm(struct si_screen *sscreen, - struct si_shader_binary *binary, - struct ac_shader_config *conf, - struct ac_llvm_compiler *compiler, - struct ac_llvm_context *ac, - struct pipe_debug_callback *debug, - enum pipe_shader_type shader_type, - const char *name, - bool less_optimized); -void si_llvm_context_init(struct si_shader_context *ctx, - struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - unsigned wave_size); -void si_llvm_create_func(struct si_shader_context *ctx, const char *name, - LLVMTypeRef *return_types, unsigned num_return_elems, - unsigned max_workgroup_size); +bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary, + struct ac_shader_config *conf, struct ac_llvm_compiler *compiler, + struct ac_llvm_context *ac, struct pipe_debug_callback *debug, + enum pipe_shader_type shader_type, const char *name, bool less_optimized); +void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, unsigned wave_size); +void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types, + unsigned num_return_elems, unsigned max_workgroup_size); void si_llvm_optimize_module(struct si_shader_context *ctx); void si_llvm_dispose(struct si_shader_context *ctx); -LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, - LLVMValueRef resource, LLVMValueRef offset); +LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource, + LLVMValueRef offset); void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret); LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index); + struct ac_arg param, unsigned return_index); LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index); + struct ac_arg param, unsigned return_index); LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index); + struct ac_arg param, unsigned return_index); LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx); -LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, - LLVMTypeRef type, LLVMValueRef val1, - LLVMValueRef 
val2); +LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMTypeRef type, + LLVMValueRef val1, LLVMValueRef val2); void si_llvm_emit_barrier(struct si_shader_context *ctx); void si_llvm_declare_esgs_ring(struct si_shader_context *ctx); void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, - unsigned bitoffset); -LLVMValueRef si_unpack_param(struct si_shader_context *ctx, - struct ac_arg param, unsigned rshift, - unsigned bitwidth); -LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, - unsigned swizzle); + unsigned bitoffset); +LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift, + unsigned bitwidth); +LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle); LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi); void si_llvm_declare_compute_memory(struct si_shader_context *ctx); bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir); void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts, - unsigned num_parts, unsigned main_part, - unsigned next_shader_first_part); + unsigned num_parts, unsigned main_part, + unsigned next_shader_first_part); /* si_shader_llvm_gs.c */ LLVMValueRef si_is_es_thread(struct si_shader_context *ctx); LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx); -void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs); +void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); void si_preload_esgs_ring(struct si_shader_context *ctx); void si_preload_gs_rings(struct si_shader_context *ctx); -void si_llvm_build_gs_prolog(struct si_shader_context *ctx, - union si_shader_part_key *key); +void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key); void si_llvm_init_gs_callbacks(struct si_shader_context *ctx); /* si_shader_llvm_tess.c */ void si_llvm_preload_tes_rings(struct si_shader_context *ctx); -void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs); -void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, - union si_shader_part_key *key); +void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key); void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx); void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader); /* si_shader_llvm_ps.c */ LLVMValueRef si_get_sample_id(struct si_shader_context *ctx); -void si_llvm_build_ps_prolog(struct si_shader_context *ctx, - union si_shader_part_key *key); -void si_llvm_build_ps_epilog(struct si_shader_context *ctx, - union si_shader_part_key *key); -void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, - struct si_shader *shader); +void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part_key *key); +void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part_key *key); +void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, struct si_shader *shader); void si_llvm_init_ps_callbacks(struct si_shader_context *ctx); /* si_shader_llvm_resources.c */ @@ -314,21 +284,16 @@ void si_llvm_init_resource_callbacks(struct si_shader_context *ctx); /* si_shader_llvm_vs.c */ void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir); -void 
si_llvm_streamout_store_output(struct si_shader_context *ctx, - LLVMValueRef const *so_buffers, - LLVMValueRef const *so_write_offsets, - struct pipe_stream_output *stream_out, - struct si_shader_output_values *shader_out); -void si_llvm_emit_streamout(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput, unsigned stream); +void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers, + LLVMValueRef const *so_write_offsets, + struct pipe_stream_output *stream_out, + struct si_shader_output_values *shader_out); +void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs, + unsigned noutput, unsigned stream); void si_llvm_build_vs_exports(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput); -void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs); -void si_llvm_build_vs_prolog(struct si_shader_context *ctx, - union si_shader_part_key *key); + struct si_shader_output_values *outputs, unsigned noutput); +void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key); void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader); #endif diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index dca604afe40..d8bcb4ad55c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -22,298 +22,272 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_shader_internal.h" -#include "si_pipe.h" -#include "ac_rtld.h" #include "ac_nir_to_llvm.h" +#include "ac_rtld.h" +#include "si_pipe.h" +#include "si_shader_internal.h" #include "sid.h" - #include "tgsi/tgsi_from_mesa.h" #include "util/u_memory.h" struct si_llvm_diagnostics { - struct pipe_debug_callback *debug; - unsigned retval; + struct pipe_debug_callback *debug; + unsigned retval; }; static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context) { - struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context; - LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di); - const char *severity_str = NULL; - - switch (severity) { - case LLVMDSError: - severity_str = "error"; - break; - case LLVMDSWarning: - severity_str = "warning"; - break; - case LLVMDSRemark: - case LLVMDSNote: - default: - return; - } - - char *description = LLVMGetDiagInfoDescription(di); - - pipe_debug_message(diag->debug, SHADER_INFO, - "LLVM diagnostic (%s): %s", severity_str, description); - - if (severity == LLVMDSError) { - diag->retval = 1; - fprintf(stderr,"LLVM triggered Diagnostic Handler: %s\n", description); - } - - LLVMDisposeMessage(description); + struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context; + LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di); + const char *severity_str = NULL; + + switch (severity) { + case LLVMDSError: + severity_str = "error"; + break; + case LLVMDSWarning: + severity_str = "warning"; + break; + case LLVMDSRemark: + case LLVMDSNote: + default: + return; + } + + char *description = LLVMGetDiagInfoDescription(di); + + pipe_debug_message(diag->debug, SHADER_INFO, "LLVM diagnostic (%s): %s", severity_str, + description); + + if (severity == LLVMDSError) { + diag->retval = 1; + fprintf(stderr, "LLVM triggered 
Diagnostic Handler: %s\n", description); + } + + LLVMDisposeMessage(description); } -bool si_compile_llvm(struct si_screen *sscreen, - struct si_shader_binary *binary, - struct ac_shader_config *conf, - struct ac_llvm_compiler *compiler, - struct ac_llvm_context *ac, - struct pipe_debug_callback *debug, - enum pipe_shader_type shader_type, - const char *name, - bool less_optimized) +bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary, + struct ac_shader_config *conf, struct ac_llvm_compiler *compiler, + struct ac_llvm_context *ac, struct pipe_debug_callback *debug, + enum pipe_shader_type shader_type, const char *name, bool less_optimized) { - unsigned count = p_atomic_inc_return(&sscreen->num_compilations); - - if (si_can_dump_shader(sscreen, shader_type)) { - fprintf(stderr, "radeonsi: Compiling shader %d\n", count); - - if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) { - fprintf(stderr, "%s LLVM IR:\n\n", name); - ac_dump_module(ac->module); - fprintf(stderr, "\n"); - } - } - - if (sscreen->record_llvm_ir) { - char *ir = LLVMPrintModuleToString(ac->module); - binary->llvm_ir_string = strdup(ir); - LLVMDisposeMessage(ir); - } - - if (!si_replace_shader(count, binary)) { - struct ac_compiler_passes *passes = compiler->passes; - - if (ac->wave_size == 32) - passes = compiler->passes_wave32; - else if (less_optimized && compiler->low_opt_passes) - passes = compiler->low_opt_passes; - - struct si_llvm_diagnostics diag = {debug}; - LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag); - - if (!ac_compile_module_to_elf(passes, ac->module, - (char **)&binary->elf_buffer, - &binary->elf_size)) - diag.retval = 1; - - if (diag.retval != 0) { - pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed"); - return false; - } - } - - struct ac_rtld_binary rtld; - if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){ - .info = &sscreen->info, - .shader_type = tgsi_processor_to_shader_stage(shader_type), - .wave_size = ac->wave_size, - .num_parts = 1, - .elf_ptrs = &binary->elf_buffer, - .elf_sizes = &binary->elf_size })) - return false; - - bool ok = ac_rtld_read_config(&rtld, conf); - ac_rtld_close(&rtld); - return ok; + unsigned count = p_atomic_inc_return(&sscreen->num_compilations); + + if (si_can_dump_shader(sscreen, shader_type)) { + fprintf(stderr, "radeonsi: Compiling shader %d\n", count); + + if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) { + fprintf(stderr, "%s LLVM IR:\n\n", name); + ac_dump_module(ac->module); + fprintf(stderr, "\n"); + } + } + + if (sscreen->record_llvm_ir) { + char *ir = LLVMPrintModuleToString(ac->module); + binary->llvm_ir_string = strdup(ir); + LLVMDisposeMessage(ir); + } + + if (!si_replace_shader(count, binary)) { + struct ac_compiler_passes *passes = compiler->passes; + + if (ac->wave_size == 32) + passes = compiler->passes_wave32; + else if (less_optimized && compiler->low_opt_passes) + passes = compiler->low_opt_passes; + + struct si_llvm_diagnostics diag = {debug}; + LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag); + + if (!ac_compile_module_to_elf(passes, ac->module, (char **)&binary->elf_buffer, + &binary->elf_size)) + diag.retval = 1; + + if (diag.retval != 0) { + pipe_debug_message(debug, SHADER_INFO, "LLVM compilation failed"); + return false; + } + } + + struct ac_rtld_binary rtld; + if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){ + .info = &sscreen->info, + .shader_type = tgsi_processor_to_shader_stage(shader_type), + .wave_size = 
ac->wave_size, + .num_parts = 1, + .elf_ptrs = &binary->elf_buffer, + .elf_sizes = &binary->elf_size})) + return false; + + bool ok = ac_rtld_read_config(&rtld, conf); + ac_rtld_close(&rtld); + return ok; } -void si_llvm_context_init(struct si_shader_context *ctx, - struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - unsigned wave_size) +void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, unsigned wave_size) { - memset(ctx, 0, sizeof(*ctx)); - ctx->screen = sscreen; - ctx->compiler = compiler; - - ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, - sscreen->info.family, - AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, - wave_size, 64); + memset(ctx, 0, sizeof(*ctx)); + ctx->screen = sscreen; + ctx->compiler = compiler; + + ac_llvm_context_init(&ctx->ac, compiler, sscreen->info.chip_class, sscreen->info.family, + AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, wave_size, 64); } -void si_llvm_create_func(struct si_shader_context *ctx, const char *name, - LLVMTypeRef *return_types, unsigned num_return_elems, - unsigned max_workgroup_size) +void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types, + unsigned num_return_elems, unsigned max_workgroup_size) { - LLVMTypeRef ret_type; - enum ac_llvm_calling_convention call_conv; - enum pipe_shader_type real_shader_type; - - if (num_return_elems) - ret_type = LLVMStructTypeInContext(ctx->ac.context, - return_types, - num_return_elems, true); - else - ret_type = ctx->ac.voidt; - - real_shader_type = ctx->type; - - /* LS is merged into HS (TCS), and ES is merged into GS. */ - if (ctx->screen->info.chip_class >= GFX9) { - if (ctx->shader->key.as_ls) - real_shader_type = PIPE_SHADER_TESS_CTRL; - else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg) - real_shader_type = PIPE_SHADER_GEOMETRY; - } - - switch (real_shader_type) { - case PIPE_SHADER_VERTEX: - case PIPE_SHADER_TESS_EVAL: - call_conv = AC_LLVM_AMDGPU_VS; - break; - case PIPE_SHADER_TESS_CTRL: - call_conv = AC_LLVM_AMDGPU_HS; - break; - case PIPE_SHADER_GEOMETRY: - call_conv = AC_LLVM_AMDGPU_GS; - break; - case PIPE_SHADER_FRAGMENT: - call_conv = AC_LLVM_AMDGPU_PS; - break; - case PIPE_SHADER_COMPUTE: - call_conv = AC_LLVM_AMDGPU_CS; - break; - default: - unreachable("Unhandle shader type"); - } - - /* Setup the function */ - ctx->return_type = ret_type; - ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name, - ret_type, ctx->ac.module); - ctx->return_value = LLVMGetUndef(ctx->return_type); - - if (ctx->screen->info.address32_hi) { - ac_llvm_add_target_dep_function_attr(ctx->main_fn, - "amdgpu-32bit-address-high-bits", - ctx->screen->info.address32_hi); - } - - LLVMAddTargetDependentFunctionAttr(ctx->main_fn, - "no-signed-zeros-fp-math", - "true"); - - ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size); + LLVMTypeRef ret_type; + enum ac_llvm_calling_convention call_conv; + enum pipe_shader_type real_shader_type; + + if (num_return_elems) + ret_type = LLVMStructTypeInContext(ctx->ac.context, return_types, num_return_elems, true); + else + ret_type = ctx->ac.voidt; + + real_shader_type = ctx->type; + + /* LS is merged into HS (TCS), and ES is merged into GS. 
*/ + if (ctx->screen->info.chip_class >= GFX9) { + if (ctx->shader->key.as_ls) + real_shader_type = PIPE_SHADER_TESS_CTRL; + else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg) + real_shader_type = PIPE_SHADER_GEOMETRY; + } + + switch (real_shader_type) { + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_TESS_EVAL: + call_conv = AC_LLVM_AMDGPU_VS; + break; + case PIPE_SHADER_TESS_CTRL: + call_conv = AC_LLVM_AMDGPU_HS; + break; + case PIPE_SHADER_GEOMETRY: + call_conv = AC_LLVM_AMDGPU_GS; + break; + case PIPE_SHADER_FRAGMENT: + call_conv = AC_LLVM_AMDGPU_PS; + break; + case PIPE_SHADER_COMPUTE: + call_conv = AC_LLVM_AMDGPU_CS; + break; + default: + unreachable("Unhandle shader type"); + } + + /* Setup the function */ + ctx->return_type = ret_type; + ctx->main_fn = ac_build_main(&ctx->args, &ctx->ac, call_conv, name, ret_type, ctx->ac.module); + ctx->return_value = LLVMGetUndef(ctx->return_type); + + if (ctx->screen->info.address32_hi) { + ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-32bit-address-high-bits", + ctx->screen->info.address32_hi); + } + + LLVMAddTargetDependentFunctionAttr(ctx->main_fn, "no-signed-zeros-fp-math", "true"); + + ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size); } void si_llvm_optimize_module(struct si_shader_context *ctx) { - /* Dump LLVM IR before any optimization passes */ - if (ctx->screen->debug_flags & DBG(PREOPT_IR) && - si_can_dump_shader(ctx->screen, ctx->type)) - LLVMDumpModule(ctx->ac.module); - - /* Run the pass */ - LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module); - LLVMDisposeBuilder(ctx->ac.builder); + /* Dump LLVM IR before any optimization passes */ + if (ctx->screen->debug_flags & DBG(PREOPT_IR) && si_can_dump_shader(ctx->screen, ctx->type)) + LLVMDumpModule(ctx->ac.module); + + /* Run the pass */ + LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module); + LLVMDisposeBuilder(ctx->ac.builder); } void si_llvm_dispose(struct si_shader_context *ctx) { - LLVMDisposeModule(ctx->ac.module); - LLVMContextDispose(ctx->ac.context); - ac_llvm_context_dispose(&ctx->ac); + LLVMDisposeModule(ctx->ac.module); + LLVMContextDispose(ctx->ac.context); + ac_llvm_context_dispose(&ctx->ac); } /** * Load a dword from a constant buffer. 
*/ -LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, - LLVMValueRef resource, LLVMValueRef offset) +LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource, + LLVMValueRef offset) { - return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, - 0, 0, true, true); + return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, 0, 0, true, true); } void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret) { - if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) - LLVMBuildRetVoid(ctx->ac.builder); - else - LLVMBuildRet(ctx->ac.builder, ret); + if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) + LLVMBuildRetVoid(ctx->ac.builder); + else + LLVMBuildRet(ctx->ac.builder, ret); } LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index) + struct ac_arg param, unsigned return_index) { - return LLVMBuildInsertValue(ctx->ac.builder, ret, - ac_get_arg(&ctx->ac, param), - return_index, ""); + return LLVMBuildInsertValue(ctx->ac.builder, ret, ac_get_arg(&ctx->ac, param), return_index, ""); } LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index) + struct ac_arg param, unsigned return_index) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef p = ac_get_arg(&ctx->ac, param); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef p = ac_get_arg(&ctx->ac, param); - return LLVMBuildInsertValue(builder, ret, - ac_to_float(&ctx->ac, p), - return_index, ""); + return LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, p), return_index, ""); } LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret, - struct ac_arg param, unsigned return_index) + struct ac_arg param, unsigned return_index) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef ptr = ac_get_arg(&ctx->ac, param); - ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, ""); - return LLVMBuildInsertValue(builder, ret, ptr, return_index, ""); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef ptr = ac_get_arg(&ctx->ac, param); + ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, ""); + return LLVMBuildInsertValue(builder, ret, ptr, return_index, ""); } LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx) { - LLVMValueRef ptr[2], list; - bool merged_shader = si_is_merged_shader(ctx->shader); + LLVMValueRef ptr[2], list; + bool merged_shader = si_is_merged_shader(ctx->shader); - ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS); - list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], - ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); - return list; + ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 
8 : 0) + SI_SGPR_RW_BUFFERS); + list = + LLVMBuildIntToPtr(ctx->ac.builder, ptr[0], ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); + return list; } -LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, - LLVMTypeRef type, LLVMValueRef val1, - LLVMValueRef val2) +LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMTypeRef type, + LLVMValueRef val1, LLVMValueRef val2) { - LLVMValueRef values[2] = { - ac_to_integer(&ctx->ac, val1), - ac_to_integer(&ctx->ac, val2), - }; - LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2); - return LLVMBuildBitCast(ctx->ac.builder, result, type, ""); + LLVMValueRef values[2] = { + ac_to_integer(&ctx->ac, val1), + ac_to_integer(&ctx->ac, val2), + }; + LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2); + return LLVMBuildBitCast(ctx->ac.builder, result, type, ""); } void si_llvm_emit_barrier(struct si_shader_context *ctx) { - /* GFX6 only (thanks to a hw bug workaround): - * The real barrier instruction isn’t needed, because an entire patch - * always fits into a single wave. - */ - if (ctx->screen->info.chip_class == GFX6 && - ctx->type == PIPE_SHADER_TESS_CTRL) { - ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); - return; - } - - ac_build_s_barrier(&ctx->ac); + /* GFX6 only (thanks to a hw bug workaround): + * The real barrier instruction isn’t needed, because an entire patch + * always fits into a single wave. + */ + if (ctx->screen->info.chip_class == GFX6 && ctx->type == PIPE_SHADER_TESS_CTRL) { + ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE); + return; + } + + ac_build_s_barrier(&ctx->ac); } /* Ensure that the esgs ring is declared. @@ -323,187 +297,169 @@ void si_llvm_emit_barrier(struct si_shader_context *ctx) */ void si_llvm_declare_esgs_ring(struct si_shader_context *ctx) { - if (ctx->esgs_ring) - return; + if (ctx->esgs_ring) + return; - assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); + assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); - ctx->esgs_ring = LLVMAddGlobalInAddressSpace( - ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), - "esgs_ring", - AC_ADDR_SPACE_LDS); - LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); - LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); + ctx->esgs_ring = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), + "esgs_ring", AC_ADDR_SPACE_LDS); + LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); + LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); } -void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, - unsigned bitoffset) +void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, unsigned bitoffset) { - LLVMValueRef args[] = { - ac_get_arg(&ctx->ac, param), - LLVMConstInt(ctx->ac.i32, bitoffset, 0), - }; - ac_build_intrinsic(&ctx->ac, - "llvm.amdgcn.init.exec.from.input", - ctx->ac.voidt, args, 2, AC_FUNC_ATTR_CONVERGENT); + LLVMValueRef args[] = { + ac_get_arg(&ctx->ac, param), + LLVMConstInt(ctx->ac.i32, bitoffset, 0), + }; + ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.init.exec.from.input", ctx->ac.voidt, args, 2, + AC_FUNC_ATTR_CONVERGENT); } /** * Get the value of a shader input parameter and extract a bitfield. 
*/ -static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, - LLVMValueRef value, unsigned rshift, - unsigned bitwidth) +static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, LLVMValueRef value, + unsigned rshift, unsigned bitwidth) { - if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) - value = ac_to_integer(&ctx->ac, value); + if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind) + value = ac_to_integer(&ctx->ac, value); - if (rshift) - value = LLVMBuildLShr(ctx->ac.builder, value, - LLVMConstInt(ctx->ac.i32, rshift, 0), ""); + if (rshift) + value = LLVMBuildLShr(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, rshift, 0), ""); - if (rshift + bitwidth < 32) { - unsigned mask = (1 << bitwidth) - 1; - value = LLVMBuildAnd(ctx->ac.builder, value, - LLVMConstInt(ctx->ac.i32, mask, 0), ""); - } + if (rshift + bitwidth < 32) { + unsigned mask = (1 << bitwidth) - 1; + value = LLVMBuildAnd(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, mask, 0), ""); + } - return value; + return value; } -LLVMValueRef si_unpack_param(struct si_shader_context *ctx, - struct ac_arg param, unsigned rshift, - unsigned bitwidth) +LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift, + unsigned bitwidth) { - LLVMValueRef value = ac_get_arg(&ctx->ac, param); + LLVMValueRef value = ac_get_arg(&ctx->ac, param); - return unpack_llvm_param(ctx, value, rshift, bitwidth); + return unpack_llvm_param(ctx, value, rshift, bitwidth); } -LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, - unsigned swizzle) +LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle) { - if (swizzle > 0) - return ctx->ac.i32_0; - - switch (ctx->type) { - case PIPE_SHADER_VERTEX: - return ac_get_arg(&ctx->ac, ctx->vs_prim_id); - case PIPE_SHADER_TESS_CTRL: - return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id); - case PIPE_SHADER_TESS_EVAL: - return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id); - case PIPE_SHADER_GEOMETRY: - return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id); - default: - assert(0); - return ctx->ac.i32_0; - } + if (swizzle > 0) + return ctx->ac.i32_0; + + switch (ctx->type) { + case PIPE_SHADER_VERTEX: + return ac_get_arg(&ctx->ac, ctx->vs_prim_id); + case PIPE_SHADER_TESS_CTRL: + return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id); + case PIPE_SHADER_TESS_EVAL: + return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id); + case PIPE_SHADER_GEOMETRY: + return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id); + default: + assert(0); + return ctx->ac.i32_0; + } } LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef values[3]; - LLVMValueRef result; - unsigned i; - unsigned *properties = ctx->shader->selector->info.properties; + LLVMValueRef values[3]; + LLVMValueRef result; + unsigned i; + unsigned *properties = ctx->shader->selector->info.properties; - if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { - unsigned sizes[3] = { - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] - }; + if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { + unsigned sizes[3] = {properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]}; - for (i = 0; i < 3; ++i) - values[i] = 
LLVMConstInt(ctx->ac.i32, sizes[i], 0); + for (i = 0; i < 3; ++i) + values[i] = LLVMConstInt(ctx->ac.i32, sizes[i], 0); - result = ac_build_gather_values(&ctx->ac, values, 3); - } else { - result = ac_get_arg(&ctx->ac, ctx->block_size); - } + result = ac_build_gather_values(&ctx->ac, values, 3); + } else { + result = ac_get_arg(&ctx->ac, ctx->block_size); + } - return result; + return result; } void si_llvm_declare_compute_memory(struct si_shader_context *ctx) { - struct si_shader_selector *sel = ctx->shader->selector; - unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]; + struct si_shader_selector *sel = ctx->shader->selector; + unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]; - LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS); - LLVMValueRef var; + LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS); + LLVMValueRef var; - assert(!ctx->ac.lds); + assert(!ctx->ac.lds); - var = LLVMAddGlobalInAddressSpace(ctx->ac.module, - LLVMArrayType(ctx->ac.i8, lds_size), - "compute_lds", - AC_ADDR_SPACE_LDS); - LLVMSetAlignment(var, 64 * 1024); + var = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i8, lds_size), + "compute_lds", AC_ADDR_SPACE_LDS); + LLVMSetAlignment(var, 64 * 1024); - ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); + ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); } bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) { - if (nir->info.stage == MESA_SHADER_VERTEX) { - si_llvm_load_vs_inputs(ctx, nir); - } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { - unsigned colors_read = - ctx->shader->selector->info.colors_read; - LLVMValueRef main_fn = ctx->main_fn; - - LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32); - - unsigned offset = SI_PARAM_POS_FIXED_PT + 1; - - if (colors_read & 0x0f) { - unsigned mask = colors_read & 0x0f; - LLVMValueRef values[4]; - values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; - values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; - values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; - values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; - ctx->abi.color0 = - ac_to_integer(&ctx->ac, - ac_build_gather_values(&ctx->ac, values, 4)); - } - if (colors_read & 0xf0) { - unsigned mask = (colors_read & 0xf0) >> 4; - LLVMValueRef values[4]; - values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; - values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; - values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; - values[3] = mask & 0x8 ? 
LLVMGetParam(main_fn, offset++) : undef; - ctx->abi.color1 = - ac_to_integer(&ctx->ac, - ac_build_gather_values(&ctx->ac, values, 4)); - } - - ctx->abi.interp_at_sample_force_center = - ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center; - } else if (nir->info.stage == MESA_SHADER_COMPUTE) { - if (nir->info.cs.user_data_components_amd) { - ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data); - ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data, - nir->info.cs.user_data_components_amd); - } - } - - ctx->abi.inputs = &ctx->inputs[0]; - ctx->abi.clamp_shadow_reference = true; - ctx->abi.robust_buffer_access = true; - - if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) { - assert(gl_shader_stage_is_compute(nir->info.stage)); - si_llvm_declare_compute_memory(ctx); - } - ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir); - - return true; + if (nir->info.stage == MESA_SHADER_VERTEX) { + si_llvm_load_vs_inputs(ctx, nir); + } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { + unsigned colors_read = ctx->shader->selector->info.colors_read; + LLVMValueRef main_fn = ctx->main_fn; + + LLVMValueRef undef = LLVMGetUndef(ctx->ac.f32); + + unsigned offset = SI_PARAM_POS_FIXED_PT + 1; + + if (colors_read & 0x0f) { + unsigned mask = colors_read & 0x0f; + LLVMValueRef values[4]; + values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; + values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; + values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; + values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; + ctx->abi.color0 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4)); + } + if (colors_read & 0xf0) { + unsigned mask = (colors_read & 0xf0) >> 4; + LLVMValueRef values[4]; + values[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef; + values[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef; + values[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef; + values[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef; + ctx->abi.color1 = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4)); + } + + ctx->abi.interp_at_sample_force_center = + ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center; + } else if (nir->info.stage == MESA_SHADER_COMPUTE) { + if (nir->info.cs.user_data_components_amd) { + ctx->abi.user_data = ac_get_arg(&ctx->ac, ctx->cs_user_data); + ctx->abi.user_data = ac_build_expand_to_vec4(&ctx->ac, ctx->abi.user_data, + nir->info.cs.user_data_components_amd); + } + } + + ctx->abi.inputs = &ctx->inputs[0]; + ctx->abi.clamp_shadow_reference = true; + ctx->abi.robust_buffer_access = true; + + if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) { + assert(gl_shader_stage_is_compute(nir->info.stage)); + si_llvm_declare_compute_memory(ctx); + } + ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir); + + return true; } /** @@ -511,278 +467,270 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) * runs them in sequence to form a monolithic shader. */ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts, - unsigned num_parts, unsigned main_part, - unsigned next_shader_first_part) + unsigned num_parts, unsigned main_part, + unsigned next_shader_first_part) { - LLVMBuilderRef builder = ctx->ac.builder; - /* PS epilog has one arg per color component; gfx9 merged shader - * prologs need to forward 40 SGPRs. 
- */ - LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS]; - LLVMTypeRef function_type; - unsigned num_first_params; - unsigned num_out, initial_num_out; - ASSERTED unsigned num_out_sgpr; /* used in debug checks */ - ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */ - unsigned num_sgprs, num_vgprs; - unsigned gprs; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - for (unsigned i = 0; i < num_parts; ++i) { - ac_add_function_attr(ctx->ac.context, parts[i], -1, - AC_FUNC_ATTR_ALWAYSINLINE); - LLVMSetLinkage(parts[i], LLVMPrivateLinkage); - } - - /* The parameters of the wrapper function correspond to those of the - * first part in terms of SGPRs and VGPRs, but we use the types of the - * main part to get the right types. This is relevant for the - * dereferenceable attribute on descriptor table pointers. - */ - num_sgprs = 0; - num_vgprs = 0; - - function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); - num_first_params = LLVMCountParamTypes(function_type); - - for (unsigned i = 0; i < num_first_params; ++i) { - LLVMValueRef param = LLVMGetParam(parts[0], i); - - if (ac_is_sgpr_param(param)) { - assert(num_vgprs == 0); - num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; - } else { - num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; - } - } - - gprs = 0; - while (gprs < num_sgprs + num_vgprs) { - LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count); - LLVMTypeRef type = LLVMTypeOf(param); - unsigned size = ac_get_type_size(type) / 4; - - /* This is going to get casted anyways, so we don't have to - * have the exact same type. But we do have to preserve the - * pointer-ness so that LLVM knows about it. - */ - enum ac_arg_type arg_type = AC_ARG_INT; - if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { - type = LLVMGetElementType(type); - - if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { - if (LLVMGetVectorSize(type) == 4) - arg_type = AC_ARG_CONST_DESC_PTR; - else if (LLVMGetVectorSize(type) == 8) - arg_type = AC_ARG_CONST_IMAGE_PTR; - else - assert(0); - } else if (type == ctx->ac.f32) { - arg_type = AC_ARG_CONST_FLOAT_PTR; - } else { - assert(0); - } - } - - ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR, - size, arg_type, NULL); - - assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); - assert(gprs + size <= num_sgprs + num_vgprs && - (gprs >= num_sgprs || gprs + size <= num_sgprs)); - - gprs += size; - } - - /* Prepare the return type. */ - unsigned num_returns = 0; - LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type; - - last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1])); - return_type = LLVMGetReturnType(last_func_type); - - switch (LLVMGetTypeKind(return_type)) { - case LLVMStructTypeKind: - num_returns = LLVMCountStructElementTypes(return_type); - assert(num_returns <= ARRAY_SIZE(returns)); - LLVMGetStructElementTypes(return_type, returns); - break; - case LLVMVoidTypeKind: - break; - default: - unreachable("unexpected type"); - } - - si_llvm_create_func(ctx, "wrapper", returns, num_returns, - si_get_max_workgroup_size(ctx->shader)); - - if (si_is_merged_shader(ctx->shader)) - ac_init_exec_full_mask(&ctx->ac); - - /* Record the arguments of the function as if they were an output of - * a previous part. - */ - num_out = 0; - num_out_sgpr = 0; - - for (unsigned i = 0; i < ctx->args.arg_count; ++i) { - LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); - LLVMTypeRef param_type = LLVMTypeOf(param); - LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? 
ctx->ac.i32 : ctx->ac.f32; - unsigned size = ac_get_type_size(param_type) / 4; - - if (size == 1) { - if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { - param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, ""); - param_type = ctx->ac.i32; - } - - if (param_type != out_type) - param = LLVMBuildBitCast(builder, param, out_type, ""); - out[num_out++] = param; - } else { - LLVMTypeRef vector_type = LLVMVectorType(out_type, size); - - if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { - param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, ""); - param_type = ctx->ac.i64; - } - - if (param_type != vector_type) - param = LLVMBuildBitCast(builder, param, vector_type, ""); - - for (unsigned j = 0; j < size; ++j) - out[num_out++] = LLVMBuildExtractElement( - builder, param, LLVMConstInt(ctx->ac.i32, j, 0), ""); - } - - if (ctx->args.args[i].file == AC_ARG_SGPR) - num_out_sgpr = num_out; - } - - memcpy(initial, out, sizeof(out)); - initial_num_out = num_out; - initial_num_out_sgpr = num_out_sgpr; - - /* Now chain the parts. */ - LLVMValueRef ret = NULL; - for (unsigned part = 0; part < num_parts; ++part) { - LLVMValueRef in[AC_MAX_ARGS]; - LLVMTypeRef ret_type; - unsigned out_idx = 0; - unsigned num_params = LLVMCountParams(parts[part]); - - /* Merged shaders are executed conditionally depending - * on the number of enabled threads passed in the input SGPRs. */ - if (si_is_multi_part_shader(ctx->shader) && part == 0) { - LLVMValueRef ena, count = initial[3]; - - count = LLVMBuildAnd(builder, count, - LLVMConstInt(ctx->ac.i32, 0x7f, 0), ""); - ena = LLVMBuildICmp(builder, LLVMIntULT, - ac_get_thread_id(&ctx->ac), count, ""); - ac_build_ifcc(&ctx->ac, ena, 6506); - } - - /* Derive arguments for the next part from outputs of the - * previous one. - */ - for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) { - LLVMValueRef param; - LLVMTypeRef param_type; - bool is_sgpr; - unsigned param_size; - LLVMValueRef arg = NULL; - - param = LLVMGetParam(parts[part], param_idx); - param_type = LLVMTypeOf(param); - param_size = ac_get_type_size(param_type) / 4; - is_sgpr = ac_is_sgpr_param(param); - - if (is_sgpr) { - ac_add_function_attr(ctx->ac.context, parts[part], - param_idx + 1, AC_FUNC_ATTR_INREG); - } else if (out_idx < num_out_sgpr) { - /* Skip returned SGPRs the current part doesn't - * declare on the input. */ - out_idx = num_out_sgpr; - } - - assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out)); - - if (param_size == 1) - arg = out[out_idx]; - else - arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size); - - if (LLVMTypeOf(arg) != param_type) { - if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { - if (LLVMGetPointerAddressSpace(param_type) == - AC_ADDR_SPACE_CONST_32BIT) { - arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, ""); - arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); - } else { - arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, ""); - arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); - } - } else { - arg = LLVMBuildBitCast(builder, arg, param_type, ""); - } - } - - in[param_idx] = arg; - out_idx += param_size; - } - - ret = ac_build_call(&ctx->ac, parts[part], in, num_params); - - if (si_is_multi_part_shader(ctx->shader) && - part + 1 == next_shader_first_part) { - ac_build_endif(&ctx->ac, 6506); - - /* The second half of the merged shader should use - * the inputs from the toplevel (wrapper) function, - * not the return value from the last call. 
- * - * That's because the last call was executed condi- - * tionally, so we can't consume it in the main - * block. - */ - memcpy(out, initial, sizeof(initial)); - num_out = initial_num_out; - num_out_sgpr = initial_num_out_sgpr; - continue; - } - - /* Extract the returned GPRs. */ - ret_type = LLVMTypeOf(ret); - num_out = 0; - num_out_sgpr = 0; - - if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { - assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); - - unsigned ret_size = LLVMCountStructElementTypes(ret_type); - - for (unsigned i = 0; i < ret_size; ++i) { - LLVMValueRef val = - LLVMBuildExtractValue(builder, ret, i, ""); - - assert(num_out < ARRAY_SIZE(out)); - out[num_out++] = val; - - if (LLVMTypeOf(val) == ctx->ac.i32) { - assert(num_out_sgpr + 1 == num_out); - num_out_sgpr = num_out; - } - } - } - } - - /* Return the value from the last part. */ - if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) - LLVMBuildRetVoid(builder); - else - LLVMBuildRet(builder, ret); + LLVMBuilderRef builder = ctx->ac.builder; + /* PS epilog has one arg per color component; gfx9 merged shader + * prologs need to forward 40 SGPRs. + */ + LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS]; + LLVMTypeRef function_type; + unsigned num_first_params; + unsigned num_out, initial_num_out; + ASSERTED unsigned num_out_sgpr; /* used in debug checks */ + ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */ + unsigned num_sgprs, num_vgprs; + unsigned gprs; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + for (unsigned i = 0; i < num_parts; ++i) { + ac_add_function_attr(ctx->ac.context, parts[i], -1, AC_FUNC_ATTR_ALWAYSINLINE); + LLVMSetLinkage(parts[i], LLVMPrivateLinkage); + } + + /* The parameters of the wrapper function correspond to those of the + * first part in terms of SGPRs and VGPRs, but we use the types of the + * main part to get the right types. This is relevant for the + * dereferenceable attribute on descriptor table pointers. + */ + num_sgprs = 0; + num_vgprs = 0; + + function_type = LLVMGetElementType(LLVMTypeOf(parts[0])); + num_first_params = LLVMCountParamTypes(function_type); + + for (unsigned i = 0; i < num_first_params; ++i) { + LLVMValueRef param = LLVMGetParam(parts[0], i); + + if (ac_is_sgpr_param(param)) { + assert(num_vgprs == 0); + num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; + } else { + num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4; + } + } + + gprs = 0; + while (gprs < num_sgprs + num_vgprs) { + LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count); + LLVMTypeRef type = LLVMTypeOf(param); + unsigned size = ac_get_type_size(type) / 4; + + /* This is going to get casted anyways, so we don't have to + * have the exact same type. But we do have to preserve the + * pointer-ness so that LLVM knows about it. + */ + enum ac_arg_type arg_type = AC_ARG_INT; + if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) { + type = LLVMGetElementType(type); + + if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) { + if (LLVMGetVectorSize(type) == 4) + arg_type = AC_ARG_CONST_DESC_PTR; + else if (LLVMGetVectorSize(type) == 8) + arg_type = AC_ARG_CONST_IMAGE_PTR; + else + assert(0); + } else if (type == ctx->ac.f32) { + arg_type = AC_ARG_CONST_FLOAT_PTR; + } else { + assert(0); + } + } + + ac_add_arg(&ctx->args, gprs < num_sgprs ? 
AC_ARG_SGPR : AC_ARG_VGPR, size, arg_type, NULL); + + assert(ac_is_sgpr_param(param) == (gprs < num_sgprs)); + assert(gprs + size <= num_sgprs + num_vgprs && + (gprs >= num_sgprs || gprs + size <= num_sgprs)); + + gprs += size; + } + + /* Prepare the return type. */ + unsigned num_returns = 0; + LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type; + + last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1])); + return_type = LLVMGetReturnType(last_func_type); + + switch (LLVMGetTypeKind(return_type)) { + case LLVMStructTypeKind: + num_returns = LLVMCountStructElementTypes(return_type); + assert(num_returns <= ARRAY_SIZE(returns)); + LLVMGetStructElementTypes(return_type, returns); + break; + case LLVMVoidTypeKind: + break; + default: + unreachable("unexpected type"); + } + + si_llvm_create_func(ctx, "wrapper", returns, num_returns, + si_get_max_workgroup_size(ctx->shader)); + + if (si_is_merged_shader(ctx->shader)) + ac_init_exec_full_mask(&ctx->ac); + + /* Record the arguments of the function as if they were an output of + * a previous part. + */ + num_out = 0; + num_out_sgpr = 0; + + for (unsigned i = 0; i < ctx->args.arg_count; ++i) { + LLVMValueRef param = LLVMGetParam(ctx->main_fn, i); + LLVMTypeRef param_type = LLVMTypeOf(param); + LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32; + unsigned size = ac_get_type_size(param_type) / 4; + + if (size == 1) { + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, ""); + param_type = ctx->ac.i32; + } + + if (param_type != out_type) + param = LLVMBuildBitCast(builder, param, out_type, ""); + out[num_out++] = param; + } else { + LLVMTypeRef vector_type = LLVMVectorType(out_type, size); + + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, ""); + param_type = ctx->ac.i64; + } + + if (param_type != vector_type) + param = LLVMBuildBitCast(builder, param, vector_type, ""); + + for (unsigned j = 0; j < size; ++j) + out[num_out++] = + LLVMBuildExtractElement(builder, param, LLVMConstInt(ctx->ac.i32, j, 0), ""); + } + + if (ctx->args.args[i].file == AC_ARG_SGPR) + num_out_sgpr = num_out; + } + + memcpy(initial, out, sizeof(out)); + initial_num_out = num_out; + initial_num_out_sgpr = num_out_sgpr; + + /* Now chain the parts. */ + LLVMValueRef ret = NULL; + for (unsigned part = 0; part < num_parts; ++part) { + LLVMValueRef in[AC_MAX_ARGS]; + LLVMTypeRef ret_type; + unsigned out_idx = 0; + unsigned num_params = LLVMCountParams(parts[part]); + + /* Merged shaders are executed conditionally depending + * on the number of enabled threads passed in the input SGPRs. */ + if (si_is_multi_part_shader(ctx->shader) && part == 0) { + LLVMValueRef ena, count = initial[3]; + + count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), ""); + ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, ""); + ac_build_ifcc(&ctx->ac, ena, 6506); + } + + /* Derive arguments for the next part from outputs of the + * previous one. 
+ */ + for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) { + LLVMValueRef param; + LLVMTypeRef param_type; + bool is_sgpr; + unsigned param_size; + LLVMValueRef arg = NULL; + + param = LLVMGetParam(parts[part], param_idx); + param_type = LLVMTypeOf(param); + param_size = ac_get_type_size(param_type) / 4; + is_sgpr = ac_is_sgpr_param(param); + + if (is_sgpr) { + ac_add_function_attr(ctx->ac.context, parts[part], param_idx + 1, AC_FUNC_ATTR_INREG); + } else if (out_idx < num_out_sgpr) { + /* Skip returned SGPRs the current part doesn't + * declare on the input. */ + out_idx = num_out_sgpr; + } + + assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out)); + + if (param_size == 1) + arg = out[out_idx]; + else + arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size); + + if (LLVMTypeOf(arg) != param_type) { + if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) { + if (LLVMGetPointerAddressSpace(param_type) == AC_ADDR_SPACE_CONST_32BIT) { + arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, ""); + arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); + } else { + arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, ""); + arg = LLVMBuildIntToPtr(builder, arg, param_type, ""); + } + } else { + arg = LLVMBuildBitCast(builder, arg, param_type, ""); + } + } + + in[param_idx] = arg; + out_idx += param_size; + } + + ret = ac_build_call(&ctx->ac, parts[part], in, num_params); + + if (si_is_multi_part_shader(ctx->shader) && part + 1 == next_shader_first_part) { + ac_build_endif(&ctx->ac, 6506); + + /* The second half of the merged shader should use + * the inputs from the toplevel (wrapper) function, + * not the return value from the last call. + * + * That's because the last call was executed condi- + * tionally, so we can't consume it in the main + * block. + */ + memcpy(out, initial, sizeof(initial)); + num_out = initial_num_out; + num_out_sgpr = initial_num_out_sgpr; + continue; + } + + /* Extract the returned GPRs. */ + ret_type = LLVMTypeOf(ret); + num_out = 0; + num_out_sgpr = 0; + + if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { + assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); + + unsigned ret_size = LLVMCountStructElementTypes(ret_type); + + for (unsigned i = 0; i < ret_size; ++i) { + LLVMValueRef val = LLVMBuildExtractValue(builder, ret, i, ""); + + assert(num_out < ARRAY_SIZE(out)); + out[num_out++] = val; + + if (LLVMTypeOf(val) == ctx->ac.i32) { + assert(num_out_sgpr + 1 == num_out); + num_out_sgpr = num_out; + } + } + } + } + + /* Return the value from the last part. */ + if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) + LLVMBuildRetVoid(builder); + else + LLVMBuildRet(builder, ret); } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index 99ffdd2e980..2a609572d84 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -22,759 +22,693 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_shader_internal.h" #include "si_pipe.h" +#include "si_shader_internal.h" #include "sid.h" #include "util/u_memory.h" LLVMValueRef si_is_es_thread(struct si_shader_context *ctx) { - /* Return true if the current thread should execute an ES thread. */ - return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, - ac_get_thread_id(&ctx->ac), - si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), ""); + /* Return true if the current thread should execute an ES thread. 
*/ + return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), + si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), ""); } LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx) { - /* Return true if the current thread should execute a GS thread. */ - return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, - ac_get_thread_id(&ctx->ac), - si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), ""); + /* Return true if the current thread should execute a GS thread. */ + return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), + si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), ""); } -static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, - unsigned input_index, - unsigned vtx_offset_param, - LLVMTypeRef type, - unsigned swizzle) +static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned input_index, + unsigned vtx_offset_param, LLVMTypeRef type, + unsigned swizzle) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader *shader = ctx->shader; - LLVMValueRef vtx_offset, soffset; - struct si_shader_info *info = &shader->selector->info; - unsigned semantic_name = info->input_semantic_name[input_index]; - unsigned semantic_index = info->input_semantic_index[input_index]; - unsigned param; - LLVMValueRef value; - - param = si_shader_io_get_unique_index(semantic_name, semantic_index, false); - - /* GFX9 has the ESGS ring in LDS. */ - if (ctx->screen->info.chip_class >= GFX9) { - unsigned index = vtx_offset_param; - - switch (index / 2) { - case 0: - vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, - index % 2 ? 16 : 0, 16); - break; - case 1: - vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, - index % 2 ? 16 : 0, 16); - break; - case 2: - vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, - index % 2 ? 16 : 0, 16); - break; - default: - assert(0); - return NULL; - } - - unsigned offset = param * 4 + swizzle; - vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset, - LLVMConstInt(ctx->ac.i32, offset, false), ""); - - LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset); - LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, ""); - if (ac_get_type_size(type) == 8) { - ptr = LLVMBuildGEP(ctx->ac.builder, ptr, - &ctx->ac.i32_1, 1, ""); - LLVMValueRef values[2] = { - value, - LLVMBuildLoad(ctx->ac.builder, ptr, "") - }; - value = ac_build_gather_values(&ctx->ac, values, 2); - } - return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); - } - - /* GFX6: input load from the ESGS ring in memory. */ - if (swizzle == ~0) { - LLVMValueRef values[4]; - unsigned chan; - for (chan = 0; chan < 4; chan++) { - values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, - type, chan); - } - return ac_build_gather_values(&ctx->ac, values, 4); - } - - /* Get the vertex offset parameter on GFX6. 
*/ - LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, - ctx->gs_vtx_offset[vtx_offset_param]); - - vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, - LLVMConstInt(ctx->ac.i32, 4, 0), ""); - - soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0); - - value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, - vtx_offset, soffset, 0, ac_glc, true, false); - if (ac_get_type_size(type) == 8) { - LLVMValueRef value2; - soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle + 1) * 256, 0); - - value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, - ctx->ac.i32_0, vtx_offset, soffset, - 0, ac_glc, true, false); - return si_build_gather_64bit(ctx, type, value, value2); - } - return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *shader = ctx->shader; + LLVMValueRef vtx_offset, soffset; + struct si_shader_info *info = &shader->selector->info; + unsigned semantic_name = info->input_semantic_name[input_index]; + unsigned semantic_index = info->input_semantic_index[input_index]; + unsigned param; + LLVMValueRef value; + + param = si_shader_io_get_unique_index(semantic_name, semantic_index, false); + + /* GFX9 has the ESGS ring in LDS. */ + if (ctx->screen->info.chip_class >= GFX9) { + unsigned index = vtx_offset_param; + + switch (index / 2) { + case 0: + vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, index % 2 ? 16 : 0, 16); + break; + case 1: + vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, index % 2 ? 16 : 0, 16); + break; + case 2: + vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, index % 2 ? 16 : 0, 16); + break; + default: + assert(0); + return NULL; + } + + unsigned offset = param * 4 + swizzle; + vtx_offset = + LLVMBuildAdd(ctx->ac.builder, vtx_offset, LLVMConstInt(ctx->ac.i32, offset, false), ""); + + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset); + LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + if (ac_get_type_size(type) == 8) { + ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &ctx->ac.i32_1, 1, ""); + LLVMValueRef values[2] = {value, LLVMBuildLoad(ctx->ac.builder, ptr, "")}; + value = ac_build_gather_values(&ctx->ac, values, 2); + } + return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); + } + + /* GFX6: input load from the ESGS ring in memory. */ + if (swizzle == ~0) { + LLVMValueRef values[4]; + unsigned chan; + for (chan = 0; chan < 4; chan++) { + values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, type, chan); + } + return ac_build_gather_values(&ctx->ac, values, 4); + } + + /* Get the vertex offset parameter on GFX6. 
*/ + LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, ctx->gs_vtx_offset[vtx_offset_param]); + + vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, LLVMConstInt(ctx->ac.i32, 4, 0), ""); + + soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0); + + value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, 0, + ac_glc, true, false); + if (ac_get_type_size(type) == 8) { + LLVMValueRef value2; + soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle + 1) * 256, 0); + + value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, + 0, ac_glc, true, false); + return si_build_gather_64bit(ctx, type, value, value2); + } + return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); } -static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - unsigned vertex_index, - unsigned const_index, - LLVMTypeRef type) +static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, unsigned location, + unsigned driver_location, unsigned component, + unsigned num_components, unsigned vertex_index, + unsigned const_index, LLVMTypeRef type) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef value[4]; - for (unsigned i = 0; i < num_components; i++) { - unsigned offset = i; - if (ac_get_type_size(type) == 8) - offset *= 2; + LLVMValueRef value[4]; + for (unsigned i = 0; i < num_components; i++) { + unsigned offset = i; + if (ac_get_type_size(type) == 8) + offset *= 2; - offset += component; - value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index, - vertex_index, type, offset); - } + offset += component; + value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index, + vertex_index, type, offset); + } - return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); + return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); } /* Pass GS inputs from ES to GS on GFX9. 
*/ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) { - LLVMValueRef ret = ctx->return_value; - - ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0); - ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1); - if (ctx->shader->key.as_ngg) - ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2); - else - ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2); - ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3); - ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5); - - ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, - 8 + SI_SGPR_RW_BUFFERS); - ret = si_insert_input_ptr(ctx, ret, - ctx->bindless_samplers_and_images, - 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); - if (ctx->screen->use_ngg) { - ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, - 8 + SI_SGPR_VS_STATE_BITS); - } - - unsigned vgpr; - if (ctx->type == PIPE_SHADER_VERTEX) - vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR; - else - vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; - - ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++); - ctx->return_value = ret; + LLVMValueRef ret = ctx->return_value; + + ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0); + ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1); + if (ctx->shader->key.as_ngg) + ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2); + else + ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2); + ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3); + ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5); + + ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS); + ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images, + 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); + if (ctx->screen->use_ngg) { + ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS); + } + + unsigned vgpr; + if (ctx->type == PIPE_SHADER_VERTEX) + vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR; + else + vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; + + ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++); + ctx->return_value = ret; } -void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs) +void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader *es = ctx->shader; - struct si_shader_info *info = &es->selector->info; - LLVMValueRef lds_base = NULL; - unsigned chan; - int i; - - if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) { - unsigned itemsize_dw = es->selector->esgs_itemsize / 4; - LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); - LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4); - vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, - 
LLVMBuildMul(ctx->ac.builder, wave_idx, - LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""), ""); - lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx, - LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), ""); - } - - for (i = 0; i < info->num_outputs; i++) { - int param; - - if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX || - info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) - continue; - - param = si_shader_io_get_unique_index(info->output_semantic_name[i], - info->output_semantic_index[i], false); - - for (chan = 0; chan < 4; chan++) { - if (!(info->output_usagemask[i] & (1 << chan))) - continue; - - LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - out_val = ac_to_integer(&ctx->ac, out_val); - - /* GFX9 has the ESGS ring in LDS. */ - if (ctx->screen->info.chip_class >= GFX9) { - LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false); - idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, ""); - ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val); - continue; - } - - ac_build_buffer_store_dword(&ctx->ac, - ctx->esgs_ring, - out_val, 1, NULL, - ac_get_arg(&ctx->ac, ctx->es2gs_offset), - (4 * param + chan) * 4, - ac_glc | ac_slc | ac_swizzled); - } - } - - if (ctx->screen->info.chip_class >= GFX9) - si_set_es_return_value_for_gs(ctx); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *es = ctx->shader; + struct si_shader_info *info = &es->selector->info; + LLVMValueRef lds_base = NULL; + unsigned chan; + int i; + + if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) { + unsigned itemsize_dw = es->selector->esgs_itemsize / 4; + LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); + LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4); + vertex_idx = + LLVMBuildOr(ctx->ac.builder, vertex_idx, + LLVMBuildMul(ctx->ac.builder, wave_idx, + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""), + ""); + lds_base = + LLVMBuildMul(ctx->ac.builder, vertex_idx, LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), ""); + } + + for (i = 0; i < info->num_outputs; i++) { + int param; + + if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX || + info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) + continue; + + param = si_shader_io_get_unique_index(info->output_semantic_name[i], + info->output_semantic_index[i], false); + + for (chan = 0; chan < 4; chan++) { + if (!(info->output_usagemask[i] & (1 << chan))) + continue; + + LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + out_val = ac_to_integer(&ctx->ac, out_val); + + /* GFX9 has the ESGS ring in LDS. 
*/ + if (ctx->screen->info.chip_class >= GFX9) { + LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false); + idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, ""); + ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val); + continue; + } + + ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL, + ac_get_arg(&ctx->ac, ctx->es2gs_offset), + (4 * param + chan) * 4, ac_glc | ac_slc | ac_swizzled); + } + } + + if (ctx->screen->info.chip_class >= GFX9) + si_set_es_return_value_for_gs(ctx); } static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx) { - if (ctx->screen->info.chip_class >= GFX9) - return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8); - else - return ac_get_arg(&ctx->ac, ctx->gs_wave_id); + if (ctx->screen->info.chip_class >= GFX9) + return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8); + else + return ac_get_arg(&ctx->ac, ctx->gs_wave_id); } static void emit_gs_epilogue(struct si_shader_context *ctx) { - if (ctx->shader->key.as_ngg) { - gfx10_ngg_gs_emit_epilogue(ctx); - return; - } + if (ctx->shader->key.as_ngg) { + gfx10_ngg_gs_emit_epilogue(ctx); + return; + } - if (ctx->screen->info.chip_class >= GFX10) - LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, ""); + if (ctx->screen->info.chip_class >= GFX10) + LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, ""); - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, - si_get_gs_wave_id(ctx)); + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx)); - if (ctx->screen->info.chip_class >= GFX9) - ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + if (ctx->screen->info.chip_class >= GFX9) + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); } -static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) +static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info UNUSED *info = &ctx->shader->selector->info; + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info UNUSED *info = &ctx->shader->selector->info; - assert(info->num_outputs <= max_outputs); + assert(info->num_outputs <= max_outputs); - emit_gs_epilogue(ctx); + emit_gs_epilogue(ctx); } /* Emit one vertex from the geometry shader */ -static void si_llvm_emit_vertex(struct ac_shader_abi *abi, - unsigned stream, - LLVMValueRef *addrs) +static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - - if (ctx->shader->key.as_ngg) { - gfx10_ngg_gs_emit_vertex(ctx, stream, addrs); - return; - } - - struct si_shader_info *info = &ctx->shader->selector->info; - struct si_shader *shader = ctx->shader; - LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset); - LLVMValueRef gs_next_vertex; - LLVMValueRef can_emit; - unsigned chan, offset; - int i; - - /* Write vertex attribute values to GSVS ring */ - gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, - ctx->gs_next_vertex[stream], - ""); - - /* If this thread has already emitted the declared maximum number of - * vertices, skip the write: excessive vertex emissions are not - * supposed to have any effect. - * - * If the shader has no writes to memory, kill it instead. 
This skips - * further memory loads and may allow LLVM to skip to the end - * altogether. - */ - can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, - LLVMConstInt(ctx->ac.i32, - shader->selector->gs_max_out_vertices, 0), ""); - - bool use_kill = !info->writes_memory; - if (use_kill) { - ac_build_kill_if_false(&ctx->ac, can_emit); - } else { - ac_build_ifcc(&ctx->ac, can_emit, 6505); - } - - offset = 0; - for (i = 0; i < info->num_outputs; i++) { - for (chan = 0; chan < 4; chan++) { - if (!(info->output_usagemask[i] & (1 << chan)) || - ((info->output_streams[i] >> (2 * chan)) & 3) != stream) - continue; - - LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - LLVMValueRef voffset = - LLVMConstInt(ctx->ac.i32, offset * - shader->selector->gs_max_out_vertices, 0); - offset++; - - voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, ""); - voffset = LLVMBuildMul(ctx->ac.builder, voffset, - LLVMConstInt(ctx->ac.i32, 4, 0), ""); - - out_val = ac_to_integer(&ctx->ac, out_val); - - ac_build_buffer_store_dword(&ctx->ac, - ctx->gsvs_ring[stream], - out_val, 1, - voffset, soffset, 0, - ac_glc | ac_slc | ac_swizzled); - } - } - - gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, ""); - LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); - - /* Signal vertex emission if vertex data was written. */ - if (offset) { - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), - si_get_gs_wave_id(ctx)); - } - - if (!use_kill) - ac_build_endif(&ctx->ac, 6505); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + + if (ctx->shader->key.as_ngg) { + gfx10_ngg_gs_emit_vertex(ctx, stream, addrs); + return; + } + + struct si_shader_info *info = &ctx->shader->selector->info; + struct si_shader *shader = ctx->shader; + LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset); + LLVMValueRef gs_next_vertex; + LLVMValueRef can_emit; + unsigned chan, offset; + int i; + + /* Write vertex attribute values to GSVS ring */ + gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, ctx->gs_next_vertex[stream], ""); + + /* If this thread has already emitted the declared maximum number of + * vertices, skip the write: excessive vertex emissions are not + * supposed to have any effect. + * + * If the shader has no writes to memory, kill it instead. This skips + * further memory loads and may allow LLVM to skip to the end + * altogether. 
+ */ + can_emit = + LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, + LLVMConstInt(ctx->ac.i32, shader->selector->gs_max_out_vertices, 0), ""); + + bool use_kill = !info->writes_memory; + if (use_kill) { + ac_build_kill_if_false(&ctx->ac, can_emit); + } else { + ac_build_ifcc(&ctx->ac, can_emit, 6505); + } + + offset = 0; + for (i = 0; i < info->num_outputs; i++) { + for (chan = 0; chan < 4; chan++) { + if (!(info->output_usagemask[i] & (1 << chan)) || + ((info->output_streams[i] >> (2 * chan)) & 3) != stream) + continue; + + LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + LLVMValueRef voffset = + LLVMConstInt(ctx->ac.i32, offset * shader->selector->gs_max_out_vertices, 0); + offset++; + + voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, ""); + voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, 0), ""); + + out_val = ac_to_integer(&ctx->ac, out_val); + + ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset, soffset, + 0, ac_glc | ac_slc | ac_swizzled); + } + } + + gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, ""); + LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); + + /* Signal vertex emission if vertex data was written. */ + if (offset) { + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), + si_get_gs_wave_id(ctx)); + } + + if (!use_kill) + ac_build_endif(&ctx->ac, 6505); } /* Cut one primitive from the geometry shader */ -static void si_llvm_emit_primitive(struct ac_shader_abi *abi, - unsigned stream) +static void si_llvm_emit_primitive(struct ac_shader_abi *abi, unsigned stream) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); - if (ctx->shader->key.as_ngg) { - LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]); - return; - } + if (ctx->shader->key.as_ngg) { + LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]); + return; + } - /* Signal primitive cut */ - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), - si_get_gs_wave_id(ctx)); + /* Signal primitive cut */ + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), + si_get_gs_wave_id(ctx)); } void si_preload_esgs_ring(struct si_shader_context *ctx) { - if (ctx->screen->info.chip_class <= GFX8) { - unsigned ring = - ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS - : SI_ES_RING_ESGS; - LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0); - LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); - - ctx->esgs_ring = - ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - } else { - if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) { - /* Declare the ESGS ring as an explicit LDS symbol. */ - si_llvm_declare_esgs_ring(ctx); - } else { - ac_declare_lds_as_pointer(&ctx->ac); - ctx->esgs_ring = ctx->ac.lds; - } - } + if (ctx->screen->info.chip_class <= GFX8) { + unsigned ring = ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS : SI_ES_RING_ESGS; + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0); + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + + ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); + } else { + if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) { + /* Declare the ESGS ring as an explicit LDS symbol. 
*/ + si_llvm_declare_esgs_ring(ctx); + } else { + ac_declare_lds_as_pointer(&ctx->ac); + ctx->esgs_ring = ctx->ac.lds; + } + } } void si_preload_gs_rings(struct si_shader_context *ctx) { - const struct si_shader_selector *sel = ctx->shader->selector; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0); - LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); - LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - - /* The conceptual layout of the GSVS ring is - * v0c0 .. vLv0 v0c1 .. vLc1 .. - * but the real memory layout is swizzled across - * threads: - * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL - * t16v0c0 .. - * Override the buffer descriptor accordingly. - */ - LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2); - uint64_t stream_offset = 0; - - for (unsigned stream = 0; stream < 4; ++stream) { - unsigned num_components; - unsigned stride; - unsigned num_records; - LLVMValueRef ring, tmp; - - num_components = sel->info.num_stream_output_components[stream]; - if (!num_components) - continue; - - stride = 4 * num_components * sel->gs_max_out_vertices; - - /* Limit on the stride field for <= GFX7. */ - assert(stride < (1 << 14)); - - num_records = ctx->ac.wave_size; - - ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); - tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, ""); - tmp = LLVMBuildAdd(builder, tmp, - LLVMConstInt(ctx->ac.i64, - stream_offset, 0), ""); - stream_offset += stride * ctx->ac.wave_size; - - ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, ""); - ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, ""); - tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, ""); - tmp = LLVMBuildOr(builder, tmp, - LLVMConstInt(ctx->ac.i32, - S_008F04_STRIDE(stride) | - S_008F04_SWIZZLE_ENABLE(1), 0), ""); - ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, ""); - ring = LLVMBuildInsertElement(builder, ring, - LLVMConstInt(ctx->ac.i32, num_records, 0), - LLVMConstInt(ctx->ac.i32, 2, 0), ""); - - uint32_t rsrc3 = - S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | - S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */ - S_008F0C_ADD_TID_ENABLE(1); - - if (ctx->ac.chip_class >= GFX10) { - rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | - S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */ - } - - ring = LLVMBuildInsertElement(builder, ring, - LLVMConstInt(ctx->ac.i32, rsrc3, false), - LLVMConstInt(ctx->ac.i32, 3, 0), ""); - - ctx->gsvs_ring[stream] = ring; - } + const struct si_shader_selector *sel = ctx->shader->selector; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0); + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); + + /* The conceptual layout of the GSVS ring is + * v0c0 .. vLv0 v0c1 .. vLc1 .. + * but the real memory layout is swizzled across + * threads: + * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL + * t16v0c0 .. + * Override the buffer descriptor accordingly. 
+ */ + LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2); + uint64_t stream_offset = 0; + + for (unsigned stream = 0; stream < 4; ++stream) { + unsigned num_components; + unsigned stride; + unsigned num_records; + LLVMValueRef ring, tmp; + + num_components = sel->info.num_stream_output_components[stream]; + if (!num_components) + continue; + + stride = 4 * num_components * sel->gs_max_out_vertices; + + /* Limit on the stride field for <= GFX7. */ + assert(stride < (1 << 14)); + + num_records = ctx->ac.wave_size; + + ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); + tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, ""); + tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), ""); + stream_offset += stride * ctx->ac.wave_size; + + ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, ""); + ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, ""); + tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, ""); + tmp = LLVMBuildOr( + builder, tmp, + LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(1), 0), ""); + ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, ""); + ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, num_records, 0), + LLVMConstInt(ctx->ac.i32, 2, 0), ""); + + uint32_t rsrc3 = + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */ + S_008F0C_ADD_TID_ENABLE(1); + + if (ctx->ac.chip_class >= GFX10) { + rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1); + } else { + rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */ + } + + ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, rsrc3, false), + LLVMConstInt(ctx->ac.i32, 3, 0), ""); + + ctx->gsvs_ring[stream] = ring; + } } /* Generate code for the hardware VS shader stage to go with a geometry shader */ -struct si_shader * -si_generate_gs_copy_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader_selector *gs_selector, - struct pipe_debug_callback *debug) +struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + struct si_shader_selector *gs_selector, + struct pipe_debug_callback *debug) { - struct si_shader_context ctx; - struct si_shader *shader; - LLVMBuilderRef builder; - struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS]; - struct si_shader_info *gsinfo = &gs_selector->info; - int i; - - - shader = CALLOC_STRUCT(si_shader); - if (!shader) - return NULL; - - /* We can leave the fence as permanently signaled because the GS copy - * shader only becomes visible globally after it has been compiled. 
*/ - util_queue_fence_init(&shader->ready); - - shader->selector = gs_selector; - shader->is_gs_copy_shader = true; + struct si_shader_context ctx; + struct si_shader *shader; + LLVMBuilderRef builder; + struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS]; + struct si_shader_info *gsinfo = &gs_selector->info; + int i; + + shader = CALLOC_STRUCT(si_shader); + if (!shader) + return NULL; + + /* We can leave the fence as permanently signaled because the GS copy + * shader only becomes visible globally after it has been compiled. */ + util_queue_fence_init(&shader->ready); + + shader->selector = gs_selector; + shader->is_gs_copy_shader = true; + + si_llvm_context_init(&ctx, sscreen, compiler, + si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false, false)); + ctx.shader = shader; + ctx.type = PIPE_SHADER_VERTEX; + + builder = ctx.ac.builder; + + si_create_function(&ctx, false); + + LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers); + ctx.gsvs_ring[0] = + ac_build_load_to_sgpr(&ctx.ac, buf_ptr, LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0)); + + LLVMValueRef voffset = + LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, LLVMConstInt(ctx.ac.i32, 4, 0), ""); + + /* Fetch the vertex stream ID.*/ + LLVMValueRef stream_id; + + if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) + stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2); + else + stream_id = ctx.ac.i32_0; + + /* Fill in output information. */ + for (i = 0; i < gsinfo->num_outputs; ++i) { + outputs[i].semantic_name = gsinfo->output_semantic_name[i]; + outputs[i].semantic_index = gsinfo->output_semantic_index[i]; + + for (int chan = 0; chan < 4; chan++) { + outputs[i].vertex_stream[chan] = (gsinfo->output_streams[i] >> (2 * chan)) & 3; + } + } + + LLVMBasicBlockRef end_bb; + LLVMValueRef switch_inst; + + end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end"); + switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4); + + for (int stream = 0; stream < 4; stream++) { + LLVMBasicBlockRef bb; + unsigned offset; + + if (!gsinfo->num_stream_output_components[stream]) + continue; + + if (stream > 0 && !gs_selector->so.num_outputs) + continue; + + bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out"); + LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb); + LLVMPositionBuilderAtEnd(builder, bb); + + /* Fetch vertex data from GSVS ring */ + offset = 0; + for (i = 0; i < gsinfo->num_outputs; ++i) { + for (unsigned chan = 0; chan < 4; chan++) { + if (!(gsinfo->output_usagemask[i] & (1 << chan)) || + outputs[i].vertex_stream[chan] != stream) { + outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32); + continue; + } + + LLVMValueRef soffset = + LLVMConstInt(ctx.ac.i32, offset * gs_selector->gs_max_out_vertices * 16 * 4, 0); + offset++; + + outputs[i].values[chan] = + ac_build_buffer_load(&ctx.ac, ctx.gsvs_ring[0], 1, ctx.ac.i32_0, voffset, soffset, 0, + ac_glc | ac_slc, true, false); + } + } + + /* Streamout and exports. 
*/ + if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) { + si_llvm_emit_streamout(&ctx, outputs, gsinfo->num_outputs, stream); + } + + if (stream == 0) + si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs); + + LLVMBuildBr(builder, end_bb); + } + + LLVMPositionBuilderAtEnd(builder, end_bb); + + LLVMBuildRetVoid(ctx.ac.builder); + + ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */ + si_llvm_optimize_module(&ctx); + + bool ok = false; + if (si_compile_llvm(sscreen, &ctx.shader->binary, &ctx.shader->config, ctx.compiler, &ctx.ac, + debug, PIPE_SHADER_GEOMETRY, "GS Copy Shader", false)) { + if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY)) + fprintf(stderr, "GS Copy Shader:\n"); + si_shader_dump(sscreen, ctx.shader, debug, stderr, true); + + if (!ctx.shader->config.scratch_bytes_per_wave) + ok = si_shader_binary_upload(sscreen, ctx.shader, 0); + else + ok = true; + } - si_llvm_context_init(&ctx, sscreen, compiler, - si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, - false, false, false)); - ctx.shader = shader; - ctx.type = PIPE_SHADER_VERTEX; - - builder = ctx.ac.builder; - - si_create_function(&ctx, false); - - LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers); - ctx.gsvs_ring[0] = ac_build_load_to_sgpr(&ctx.ac, buf_ptr, - LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0)); - - LLVMValueRef voffset = - LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, - LLVMConstInt(ctx.ac.i32, 4, 0), ""); - - /* Fetch the vertex stream ID.*/ - LLVMValueRef stream_id; - - if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) - stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2); - else - stream_id = ctx.ac.i32_0; - - /* Fill in output information. */ - for (i = 0; i < gsinfo->num_outputs; ++i) { - outputs[i].semantic_name = gsinfo->output_semantic_name[i]; - outputs[i].semantic_index = gsinfo->output_semantic_index[i]; - - for (int chan = 0; chan < 4; chan++) { - outputs[i].vertex_stream[chan] = - (gsinfo->output_streams[i] >> (2 * chan)) & 3; - } - } - - LLVMBasicBlockRef end_bb; - LLVMValueRef switch_inst; - - end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end"); - switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4); - - for (int stream = 0; stream < 4; stream++) { - LLVMBasicBlockRef bb; - unsigned offset; - - if (!gsinfo->num_stream_output_components[stream]) - continue; - - if (stream > 0 && !gs_selector->so.num_outputs) - continue; - - bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out"); - LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb); - LLVMPositionBuilderAtEnd(builder, bb); - - /* Fetch vertex data from GSVS ring */ - offset = 0; - for (i = 0; i < gsinfo->num_outputs; ++i) { - for (unsigned chan = 0; chan < 4; chan++) { - if (!(gsinfo->output_usagemask[i] & (1 << chan)) || - outputs[i].vertex_stream[chan] != stream) { - outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32); - continue; - } - - LLVMValueRef soffset = LLVMConstInt(ctx.ac.i32, - offset * gs_selector->gs_max_out_vertices * 16 * 4, 0); - offset++; - - outputs[i].values[chan] = - ac_build_buffer_load(&ctx.ac, - ctx.gsvs_ring[0], 1, - ctx.ac.i32_0, voffset, - soffset, 0, ac_glc | ac_slc, - true, false); - } - } - - /* Streamout and exports. 
*/ - if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) { - si_llvm_emit_streamout(&ctx, outputs, - gsinfo->num_outputs, - stream); - } - - if (stream == 0) - si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs); - - LLVMBuildBr(builder, end_bb); - } - - LLVMPositionBuilderAtEnd(builder, end_bb); - - LLVMBuildRetVoid(ctx.ac.builder); - - ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */ - si_llvm_optimize_module(&ctx); - - bool ok = false; - if (si_compile_llvm(sscreen, &ctx.shader->binary, - &ctx.shader->config, ctx.compiler, &ctx.ac, - debug, PIPE_SHADER_GEOMETRY, - "GS Copy Shader", false)) { - if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY)) - fprintf(stderr, "GS Copy Shader:\n"); - si_shader_dump(sscreen, ctx.shader, debug, stderr, true); - - if (!ctx.shader->config.scratch_bytes_per_wave) - ok = si_shader_binary_upload(sscreen, ctx.shader, 0); - else - ok = true; - } - - si_llvm_dispose(&ctx); - - if (!ok) { - FREE(shader); - shader = NULL; - } else { - si_fix_resource_usage(sscreen, shader); - } - return shader; + si_llvm_dispose(&ctx); + + if (!ok) { + FREE(shader); + shader = NULL; + } else { + si_fix_resource_usage(sscreen, shader); + } + return shader; } /** * Build the GS prolog function. Rotate the input vertices for triangle strips * with adjacency. */ -void si_llvm_build_gs_prolog(struct si_shader_context *ctx, - union si_shader_part_key *key) +void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key) { - unsigned num_sgprs, num_vgprs; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMTypeRef returns[AC_MAX_ARGS]; - LLVMValueRef func, ret; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - if (ctx->screen->info.chip_class >= GFX9) { - if (key->gs_prolog.states.gfx9_prev_is_vs) - num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR; - else - num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR; - num_vgprs = 5; /* ES inputs are not needed by GS */ - } else { - num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; - num_vgprs = 8; - } - - for (unsigned i = 0; i < num_sgprs; ++i) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - returns[i] = ctx->ac.i32; - } - - for (unsigned i = 0; i < num_vgprs; ++i) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); - returns[num_sgprs + i] = ctx->ac.f32; - } - - /* Create the function. */ - si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0); - func = ctx->main_fn; - - /* Set the full EXEC mask for the prolog, because we are only fiddling - * with registers here. The main shader part will set the correct EXEC - * mask. - */ - if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic) - ac_init_exec_full_mask(&ctx->ac); - - /* Copy inputs to outputs. This should be no-op, as the registers match, - * but it will prevent the compiler from overwriting them unintentionally. - */ - ret = ctx->return_value; - for (unsigned i = 0; i < num_sgprs; i++) { - LLVMValueRef p = LLVMGetParam(func, i); - ret = LLVMBuildInsertValue(builder, ret, p, i, ""); - } - for (unsigned i = 0; i < num_vgprs; i++) { - LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); - p = ac_to_float(&ctx->ac, p); - ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); - } - - if (key->gs_prolog.states.tri_strip_adj_fix) { - /* Remap the input vertices for every other primitive. 
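 *
 * A short sketch of the remap, using the names below: "rotate" is the low bit
 * of the primitive ID, and for odd primitives each vertex register is taken
 * four slots ahead, modulo six:
 *
 *    vtx_out[i] = rotate ? vtx_in[(i + 4) % 6] : vtx_in[i];
 *
 *    i         : 0 1 2 3 4 5
 *    (i + 4)%6 : 4 5 0 1 2 3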
*/ - const struct ac_arg gfx6_vtx_params[6] = { - { .used = true, .arg_index = num_sgprs }, - { .used = true, .arg_index = num_sgprs + 1 }, - { .used = true, .arg_index = num_sgprs + 3 }, - { .used = true, .arg_index = num_sgprs + 4 }, - { .used = true, .arg_index = num_sgprs + 5 }, - { .used = true, .arg_index = num_sgprs + 6 }, - }; - const struct ac_arg gfx9_vtx_params[3] = { - { .used = true, .arg_index = num_sgprs }, - { .used = true, .arg_index = num_sgprs + 1 }, - { .used = true, .arg_index = num_sgprs + 4 }, - }; - LLVMValueRef vtx_in[6], vtx_out[6]; - LLVMValueRef prim_id, rotate; - - if (ctx->screen->info.chip_class >= GFX9) { - for (unsigned i = 0; i < 3; i++) { - vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16); - vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16); - } - } else { - for (unsigned i = 0; i < 6; i++) - vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]); - } - - prim_id = LLVMGetParam(func, num_sgprs + 2); - rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, ""); - - for (unsigned i = 0; i < 6; ++i) { - LLVMValueRef base, rotated; - base = vtx_in[i]; - rotated = vtx_in[(i + 4) % 6]; - vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, ""); - } - - if (ctx->screen->info.chip_class >= GFX9) { - for (unsigned i = 0; i < 3; i++) { - LLVMValueRef hi, out; - - hi = LLVMBuildShl(builder, vtx_out[i*2+1], - LLVMConstInt(ctx->ac.i32, 16, 0), ""); - out = LLVMBuildOr(builder, vtx_out[i*2], hi, ""); - out = ac_to_float(&ctx->ac, out); - ret = LLVMBuildInsertValue(builder, ret, out, - gfx9_vtx_params[i].arg_index, ""); - } - } else { - for (unsigned i = 0; i < 6; i++) { - LLVMValueRef out; - - out = ac_to_float(&ctx->ac, vtx_out[i]); - ret = LLVMBuildInsertValue(builder, ret, out, - gfx6_vtx_params[i].arg_index, ""); - } - } - } - - LLVMBuildRet(builder, ret); + unsigned num_sgprs, num_vgprs; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMTypeRef returns[AC_MAX_ARGS]; + LLVMValueRef func, ret; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + if (ctx->screen->info.chip_class >= GFX9) { + if (key->gs_prolog.states.gfx9_prev_is_vs) + num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR; + else + num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR; + num_vgprs = 5; /* ES inputs are not needed by GS */ + } else { + num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; + num_vgprs = 8; + } + + for (unsigned i = 0; i < num_sgprs; ++i) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + returns[i] = ctx->ac.i32; + } + + for (unsigned i = 0; i < num_vgprs; ++i) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); + returns[num_sgprs + i] = ctx->ac.f32; + } + + /* Create the function. */ + si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0); + func = ctx->main_fn; + + /* Set the full EXEC mask for the prolog, because we are only fiddling + * with registers here. The main shader part will set the correct EXEC + * mask. + */ + if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic) + ac_init_exec_full_mask(&ctx->ac); + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. 
+ */ + ret = ctx->return_value; + for (unsigned i = 0; i < num_sgprs; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(builder, ret, p, i, ""); + } + for (unsigned i = 0; i < num_vgprs; i++) { + LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); + p = ac_to_float(&ctx->ac, p); + ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); + } + + if (key->gs_prolog.states.tri_strip_adj_fix) { + /* Remap the input vertices for every other primitive. */ + const struct ac_arg gfx6_vtx_params[6] = { + {.used = true, .arg_index = num_sgprs}, {.used = true, .arg_index = num_sgprs + 1}, + {.used = true, .arg_index = num_sgprs + 3}, {.used = true, .arg_index = num_sgprs + 4}, + {.used = true, .arg_index = num_sgprs + 5}, {.used = true, .arg_index = num_sgprs + 6}, + }; + const struct ac_arg gfx9_vtx_params[3] = { + {.used = true, .arg_index = num_sgprs}, + {.used = true, .arg_index = num_sgprs + 1}, + {.used = true, .arg_index = num_sgprs + 4}, + }; + LLVMValueRef vtx_in[6], vtx_out[6]; + LLVMValueRef prim_id, rotate; + + if (ctx->screen->info.chip_class >= GFX9) { + for (unsigned i = 0; i < 3; i++) { + vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16); + vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16); + } + } else { + for (unsigned i = 0; i < 6; i++) + vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]); + } + + prim_id = LLVMGetParam(func, num_sgprs + 2); + rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, ""); + + for (unsigned i = 0; i < 6; ++i) { + LLVMValueRef base, rotated; + base = vtx_in[i]; + rotated = vtx_in[(i + 4) % 6]; + vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, ""); + } + + if (ctx->screen->info.chip_class >= GFX9) { + for (unsigned i = 0; i < 3; i++) { + LLVMValueRef hi, out; + + hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1], LLVMConstInt(ctx->ac.i32, 16, 0), ""); + out = LLVMBuildOr(builder, vtx_out[i * 2], hi, ""); + out = ac_to_float(&ctx->ac, out); + ret = LLVMBuildInsertValue(builder, ret, out, gfx9_vtx_params[i].arg_index, ""); + } + } else { + for (unsigned i = 0; i < 6; i++) { + LLVMValueRef out; + + out = ac_to_float(&ctx->ac, vtx_out[i]); + ret = LLVMBuildInsertValue(builder, ret, out, gfx6_vtx_params[i].arg_index, ""); + } + } + } + + LLVMBuildRet(builder, ret); } void si_llvm_init_gs_callbacks(struct si_shader_context *ctx) { - ctx->abi.load_inputs = si_nir_load_input_gs; - ctx->abi.emit_vertex = si_llvm_emit_vertex; - ctx->abi.emit_primitive = si_llvm_emit_primitive; - ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue; + ctx->abi.load_inputs = si_nir_load_input_gs; + ctx->abi.emit_vertex = si_llvm_emit_vertex; + ctx->abi.emit_primitive = si_llvm_emit_primitive; + ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue; } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c index c2efcc88e99..6e4d5d429c7 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c @@ -22,117 +22,108 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include "si_shader_internal.h" #include "si_pipe.h" +#include "si_shader_internal.h" #include "sid.h" LLVMValueRef si_get_sample_id(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->args.ancillary, 8, 4); + return si_unpack_param(ctx, ctx->args.ancillary, 8, 4); } static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - return ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.sample_coverage)); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + return ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.sample_coverage)); } static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef desc = ac_get_arg(&ctx->ac, ctx->rw_buffers); - LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_SAMPLE_POSITIONS, 0); - LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index); - - /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ - LLVMValueRef offset0 = LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->ac.i32, 8, 0), ""); - LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->ac.i32, 4, 0), ""); - - LLVMValueRef pos[4] = { - si_buffer_load_const(ctx, resource, offset0), - si_buffer_load_const(ctx, resource, offset1), - LLVMConstReal(ctx->ac.f32, 0), - LLVMConstReal(ctx->ac.f32, 0) - }; - - return ac_build_gather_values(&ctx->ac, pos, 4); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMValueRef desc = ac_get_arg(&ctx->ac, ctx->rw_buffers); + LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_SAMPLE_POSITIONS, 0); + LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index); + + /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ + LLVMValueRef offset0 = + LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->ac.i32, 8, 0), ""); + LLVMValueRef offset1 = + LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->ac.i32, 4, 0), ""); + + LLVMValueRef pos[4] = {si_buffer_load_const(ctx, resource, offset0), + si_buffer_load_const(ctx, resource, offset1), + LLVMConstReal(ctx->ac.f32, 0), LLVMConstReal(ctx->ac.f32, 0)}; + + return ac_build_gather_values(&ctx->ac, pos, 4); } static LLVMValueRef si_nir_emit_fbfetch(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct ac_image_args args = {}; - LLVMValueRef ptr, image, fmask; - - /* Ignore src0, because KHR_blend_func_extended disallows multiple render - * targets. - */ - - /* Load the image descriptor. */ - STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0); - ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); - ptr = LLVMBuildPointerCast(ctx->ac.builder, ptr, - ac_array_in_const32_addr_space(ctx->ac.v8i32), ""); - image = ac_build_load_to_sgpr(&ctx->ac, ptr, - LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0 / 2, 0)); - - unsigned chan = 0; - - args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 0, 16); - - if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D) - args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 16, 16); - - /* Get the current render target layer index. 
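 *
 * For reference, a sketch of how the fbfetch coordinates are assembled here
 * (bit ranges taken from the si_unpack_param calls in this function):
 *
 *    x      = pos_fixed_pt[15:0]
 *    y      = pos_fixed_pt[31:16]   (skipped for 1D targets)
 *    layer  = ancillary[26:16]      (only when fbfetch_layered)
 *    sample = ancillary[11:8]       (only when fbfetch_msaa)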
*/ - if (ctx->shader->key.mono.u.ps.fbfetch_layered) - args.coords[chan++] = si_unpack_param(ctx, ctx->args.ancillary, 16, 11); - - if (ctx->shader->key.mono.u.ps.fbfetch_msaa) - args.coords[chan++] = si_get_sample_id(ctx); - - if (ctx->shader->key.mono.u.ps.fbfetch_msaa && - !(ctx->screen->debug_flags & DBG(NO_FMASK))) { - fmask = ac_build_load_to_sgpr(&ctx->ac, ptr, - LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0)); - - ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords, - ctx->shader->key.mono.u.ps.fbfetch_layered); - } - - args.opcode = ac_image_load; - args.resource = image; - args.dmask = 0xf; - args.attributes = AC_FUNC_ATTR_READNONE; - - if (ctx->shader->key.mono.u.ps.fbfetch_msaa) - args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? - ac_image_2darraymsaa : ac_image_2dmsaa; - else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D) - args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? - ac_image_1darray : ac_image_1d; - else - args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? - ac_image_2darray : ac_image_2d; - - return ac_build_image_opcode(&ctx->ac, &args); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct ac_image_args args = {}; + LLVMValueRef ptr, image, fmask; + + /* Ignore src0, because KHR_blend_func_extended disallows multiple render + * targets. + */ + + /* Load the image descriptor. */ + STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0); + ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + ptr = + LLVMBuildPointerCast(ctx->ac.builder, ptr, ac_array_in_const32_addr_space(ctx->ac.v8i32), ""); + image = + ac_build_load_to_sgpr(&ctx->ac, ptr, LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0 / 2, 0)); + + unsigned chan = 0; + + args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 0, 16); + + if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D) + args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 16, 16); + + /* Get the current render target layer index. */ + if (ctx->shader->key.mono.u.ps.fbfetch_layered) + args.coords[chan++] = si_unpack_param(ctx, ctx->args.ancillary, 16, 11); + + if (ctx->shader->key.mono.u.ps.fbfetch_msaa) + args.coords[chan++] = si_get_sample_id(ctx); + + if (ctx->shader->key.mono.u.ps.fbfetch_msaa && !(ctx->screen->debug_flags & DBG(NO_FMASK))) { + fmask = ac_build_load_to_sgpr(&ctx->ac, ptr, + LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0)); + + ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords, + ctx->shader->key.mono.u.ps.fbfetch_layered); + } + + args.opcode = ac_image_load; + args.resource = image; + args.dmask = 0xf; + args.attributes = AC_FUNC_ATTR_READNONE; + + if (ctx->shader->key.mono.u.ps.fbfetch_msaa) + args.dim = + ctx->shader->key.mono.u.ps.fbfetch_layered ? ac_image_2darraymsaa : ac_image_2dmsaa; + else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D) + args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? ac_image_1darray : ac_image_1d; + else + args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? 
ac_image_2darray : ac_image_2d; + + return ac_build_image_opcode(&ctx->ac, &args); } -static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, - unsigned attr_index, unsigned chan, - LLVMValueRef prim_mask, - LLVMValueRef i, LLVMValueRef j) +static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, unsigned attr_index, + unsigned chan, LLVMValueRef prim_mask, LLVMValueRef i, + LLVMValueRef j) { - if (i || j) { - return ac_build_fs_interp(&ctx->ac, - LLVMConstInt(ctx->ac.i32, chan, 0), - LLVMConstInt(ctx->ac.i32, attr_index, 0), - prim_mask, i, j); - } - return ac_build_fs_interp_mov(&ctx->ac, - LLVMConstInt(ctx->ac.i32, 2, 0), /* P0 */ - LLVMConstInt(ctx->ac.i32, chan, 0), - LLVMConstInt(ctx->ac.i32, attr_index, 0), - prim_mask); + if (i || j) { + return ac_build_fs_interp(&ctx->ac, LLVMConstInt(ctx->ac.i32, chan, 0), + LLVMConstInt(ctx->ac.i32, attr_index, 0), prim_mask, i, j); + } + return ac_build_fs_interp_mov(&ctx->ac, LLVMConstInt(ctx->ac.i32, 2, 0), /* P0 */ + LLVMConstInt(ctx->ac.i32, chan, 0), + LLVMConstInt(ctx->ac.i32, attr_index, 0), prim_mask); } /** @@ -149,345 +140,300 @@ static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, * @param face SI_PARAM_FRONT_FACE * @param result the return value (4 components) */ -static void interp_fs_color(struct si_shader_context *ctx, - unsigned input_index, - unsigned semantic_index, - unsigned num_interp_inputs, - unsigned colors_read_mask, - LLVMValueRef interp_param, - LLVMValueRef prim_mask, - LLVMValueRef face, - LLVMValueRef result[4]) +static void interp_fs_color(struct si_shader_context *ctx, unsigned input_index, + unsigned semantic_index, unsigned num_interp_inputs, + unsigned colors_read_mask, LLVMValueRef interp_param, + LLVMValueRef prim_mask, LLVMValueRef face, LLVMValueRef result[4]) { - LLVMValueRef i = NULL, j = NULL; - unsigned chan; - - /* fs.constant returns the param from the middle vertex, so it's not - * really useful for flat shading. It's meant to be used for custom - * interpolation (but the intrinsic can't fetch from the other two - * vertices). - * - * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state - * to do the right thing. The only reason we use fs.constant is that - * fs.interp cannot be used on integers, because they can be equal - * to NaN. - * - * When interp is false we will use fs.constant or for newer llvm, - * amdgcn.interp.mov. - */ - bool interp = interp_param != NULL; - - if (interp) { - interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param, - LLVMVectorType(ctx->ac.f32, 2), ""); - - i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, - ctx->ac.i32_0, ""); - j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, - ctx->ac.i32_1, ""); - } - - if (ctx->shader->key.part.ps.prolog.color_two_side) { - LLVMValueRef is_face_positive; - - /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1", - * otherwise it's at offset "num_inputs". 
- */ - unsigned back_attr_offset = num_interp_inputs; - if (semantic_index == 1 && colors_read_mask & 0xf) - back_attr_offset += 1; - - is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, - face, ctx->ac.i32_0, ""); - - for (chan = 0; chan < 4; chan++) { - LLVMValueRef front, back; - - front = si_build_fs_interp(ctx, - input_index, chan, - prim_mask, i, j); - back = si_build_fs_interp(ctx, - back_attr_offset, chan, - prim_mask, i, j); - - result[chan] = LLVMBuildSelect(ctx->ac.builder, - is_face_positive, - front, - back, - ""); - } - } else { - for (chan = 0; chan < 4; chan++) { - result[chan] = si_build_fs_interp(ctx, - input_index, chan, - prim_mask, i, j); - } - } + LLVMValueRef i = NULL, j = NULL; + unsigned chan; + + /* fs.constant returns the param from the middle vertex, so it's not + * really useful for flat shading. It's meant to be used for custom + * interpolation (but the intrinsic can't fetch from the other two + * vertices). + * + * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state + * to do the right thing. The only reason we use fs.constant is that + * fs.interp cannot be used on integers, because they can be equal + * to NaN. + * + * When interp is false we will use fs.constant or for newer llvm, + * amdgcn.interp.mov. + */ + bool interp = interp_param != NULL; + + if (interp) { + interp_param = + LLVMBuildBitCast(ctx->ac.builder, interp_param, LLVMVectorType(ctx->ac.f32, 2), ""); + + i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_0, ""); + j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_1, ""); + } + + if (ctx->shader->key.part.ps.prolog.color_two_side) { + LLVMValueRef is_face_positive; + + /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1", + * otherwise it's at offset "num_inputs". 
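 *
 * Equivalently (sketch, same variable names as below):
 *
 *    back_attr_offset = num_interp_inputs +
 *                       (semantic_index == 1 && (colors_read_mask & 0xf) ? 1 : 0);
 *
 * and each channel then selects front vs. back with the FACE input.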
+ */ + unsigned back_attr_offset = num_interp_inputs; + if (semantic_index == 1 && colors_read_mask & 0xf) + back_attr_offset += 1; + + is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, face, ctx->ac.i32_0, ""); + + for (chan = 0; chan < 4; chan++) { + LLVMValueRef front, back; + + front = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j); + back = si_build_fs_interp(ctx, back_attr_offset, chan, prim_mask, i, j); + + result[chan] = LLVMBuildSelect(ctx->ac.builder, is_face_positive, front, back, ""); + } + } else { + for (chan = 0; chan < 4; chan++) { + result[chan] = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j); + } + } } static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha) { - if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { - static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = { - [PIPE_FUNC_LESS] = LLVMRealOLT, - [PIPE_FUNC_EQUAL] = LLVMRealOEQ, - [PIPE_FUNC_LEQUAL] = LLVMRealOLE, - [PIPE_FUNC_GREATER] = LLVMRealOGT, - [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, - [PIPE_FUNC_GEQUAL] = LLVMRealOGE, - }; - LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func]; - assert(cond); - - LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, - SI_PARAM_ALPHA_REF); - LLVMValueRef alpha_pass = - LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, ""); - ac_build_kill_if_false(&ctx->ac, alpha_pass); - } else { - ac_build_kill_if_false(&ctx->ac, ctx->ac.i1false); - } + if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { + static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = { + [PIPE_FUNC_LESS] = LLVMRealOLT, [PIPE_FUNC_EQUAL] = LLVMRealOEQ, + [PIPE_FUNC_LEQUAL] = LLVMRealOLE, [PIPE_FUNC_GREATER] = LLVMRealOGT, + [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, [PIPE_FUNC_GEQUAL] = LLVMRealOGE, + }; + LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func]; + assert(cond); + + LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, SI_PARAM_ALPHA_REF); + LLVMValueRef alpha_pass = LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, ""); + ac_build_kill_if_false(&ctx->ac, alpha_pass); + } else { + ac_build_kill_if_false(&ctx->ac, ctx->ac.i1false); + } } -static LLVMValueRef si_scale_alpha_by_sample_mask(struct si_shader_context *ctx, - LLVMValueRef alpha, - unsigned samplemask_param) +static LLVMValueRef si_scale_alpha_by_sample_mask(struct si_shader_context *ctx, LLVMValueRef alpha, + unsigned samplemask_param) { - LLVMValueRef coverage; + LLVMValueRef coverage; - /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */ - coverage = LLVMGetParam(ctx->main_fn, - samplemask_param); - coverage = ac_to_integer(&ctx->ac, coverage); + /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */ + coverage = LLVMGetParam(ctx->main_fn, samplemask_param); + coverage = ac_to_integer(&ctx->ac, coverage); - coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32", - ctx->ac.i32, - &coverage, 1, AC_FUNC_ATTR_READNONE); + coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32", ctx->ac.i32, &coverage, 1, + AC_FUNC_ATTR_READNONE); - coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage, - ctx->ac.f32, ""); + coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage, ctx->ac.f32, ""); - coverage = LLVMBuildFMul(ctx->ac.builder, coverage, - LLVMConstReal(ctx->ac.f32, - 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), ""); + coverage = LLVMBuildFMul(ctx->ac.builder, coverage, + LLVMConstReal(ctx->ac.f32, 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), ""); - return LLVMBuildFMul(ctx->ac.builder, alpha, 
coverage, ""); + return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, ""); } struct si_ps_exports { - unsigned num; - struct ac_export_args args[10]; + unsigned num; + struct ac_export_args args[10]; }; -static void si_export_mrt_z(struct si_shader_context *ctx, - LLVMValueRef depth, LLVMValueRef stencil, - LLVMValueRef samplemask, struct si_ps_exports *exp) +static void si_export_mrt_z(struct si_shader_context *ctx, LLVMValueRef depth, LLVMValueRef stencil, + LLVMValueRef samplemask, struct si_ps_exports *exp) { - struct ac_export_args args; + struct ac_export_args args; - ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args); + ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args); - memcpy(&exp->args[exp->num++], &args, sizeof(args)); + memcpy(&exp->args[exp->num++], &args, sizeof(args)); } /* Initialize arguments for the shader export intrinsic */ -static void si_llvm_init_ps_export_args(struct si_shader_context *ctx, - LLVMValueRef *values, - unsigned target, - struct ac_export_args *args) +static void si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValueRef *values, + unsigned target, struct ac_export_args *args) { - const struct si_shader_key *key = &ctx->shader->key; - unsigned col_formats = key->part.ps.epilog.spi_shader_col_format; - LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32); - unsigned spi_shader_col_format; - unsigned chan; - bool is_int8, is_int10; - int cbuf = target - V_008DFC_SQ_EXP_MRT; - - assert(cbuf >= 0 && cbuf < 8); - - spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; - is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1; - is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1; - - /* Default is 0xf. Adjusted below depending on the format. */ - args->enabled_channels = 0xf; /* writemask */ - - /* Specify whether the EXEC mask represents the valid mask */ - args->valid_mask = 0; - - /* Specify whether this is the last export */ - args->done = 0; - - /* Specify the target we are exporting */ - args->target = target; - - args->compr = false; - args->out[0] = f32undef; - args->out[1] = f32undef; - args->out[2] = f32undef; - args->out[3] = f32undef; - - LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL; - LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2], - unsigned bits, bool hi) = NULL; - - switch (spi_shader_col_format) { - case V_028714_SPI_SHADER_ZERO: - args->enabled_channels = 0; /* writemask */ - args->target = V_008DFC_SQ_EXP_NULL; - break; - - case V_028714_SPI_SHADER_32_R: - args->enabled_channels = 1; /* writemask */ - args->out[0] = values[0]; - break; - - case V_028714_SPI_SHADER_32_GR: - args->enabled_channels = 0x3; /* writemask */ - args->out[0] = values[0]; - args->out[1] = values[1]; - break; - - case V_028714_SPI_SHADER_32_AR: - if (ctx->screen->info.chip_class >= GFX10) { - args->enabled_channels = 0x3; /* writemask */ - args->out[0] = values[0]; - args->out[1] = values[3]; - } else { - args->enabled_channels = 0x9; /* writemask */ - args->out[0] = values[0]; - args->out[3] = values[3]; - } - break; - - case V_028714_SPI_SHADER_FP16_ABGR: - packf = ac_build_cvt_pkrtz_f16; - break; - - case V_028714_SPI_SHADER_UNORM16_ABGR: - packf = ac_build_cvt_pknorm_u16; - break; - - case V_028714_SPI_SHADER_SNORM16_ABGR: - packf = ac_build_cvt_pknorm_i16; - break; - - case V_028714_SPI_SHADER_UINT16_ABGR: - packi = ac_build_cvt_pk_u16; - break; - - case V_028714_SPI_SHADER_SINT16_ABGR: - packi = ac_build_cvt_pk_i16; - break; - - case 
V_028714_SPI_SHADER_32_ABGR: - memcpy(&args->out[0], values, sizeof(values[0]) * 4); - break; - } - - /* Pack f16 or norm_i16/u16. */ - if (packf) { - for (chan = 0; chan < 2; chan++) { - LLVMValueRef pack_args[2] = { - values[2 * chan], - values[2 * chan + 1] - }; - LLVMValueRef packed; - - packed = packf(&ctx->ac, pack_args); - args->out[chan] = ac_to_float(&ctx->ac, packed); - } - args->compr = 1; /* COMPR flag */ - } - /* Pack i16/u16. */ - if (packi) { - for (chan = 0; chan < 2; chan++) { - LLVMValueRef pack_args[2] = { - ac_to_integer(&ctx->ac, values[2 * chan]), - ac_to_integer(&ctx->ac, values[2 * chan + 1]) - }; - LLVMValueRef packed; - - packed = packi(&ctx->ac, pack_args, - is_int8 ? 8 : is_int10 ? 10 : 16, - chan == 1); - args->out[chan] = ac_to_float(&ctx->ac, packed); - } - args->compr = 1; /* COMPR flag */ - } + const struct si_shader_key *key = &ctx->shader->key; + unsigned col_formats = key->part.ps.epilog.spi_shader_col_format; + LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32); + unsigned spi_shader_col_format; + unsigned chan; + bool is_int8, is_int10; + int cbuf = target - V_008DFC_SQ_EXP_MRT; + + assert(cbuf >= 0 && cbuf < 8); + + spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; + is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1; + is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1; + + /* Default is 0xf. Adjusted below depending on the format. */ + args->enabled_channels = 0xf; /* writemask */ + + /* Specify whether the EXEC mask represents the valid mask */ + args->valid_mask = 0; + + /* Specify whether this is the last export */ + args->done = 0; + + /* Specify the target we are exporting */ + args->target = target; + + args->compr = false; + args->out[0] = f32undef; + args->out[1] = f32undef; + args->out[2] = f32undef; + args->out[3] = f32undef; + + LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL; + LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits, + bool hi) = NULL; + + switch (spi_shader_col_format) { + case V_028714_SPI_SHADER_ZERO: + args->enabled_channels = 0; /* writemask */ + args->target = V_008DFC_SQ_EXP_NULL; + break; + + case V_028714_SPI_SHADER_32_R: + args->enabled_channels = 1; /* writemask */ + args->out[0] = values[0]; + break; + + case V_028714_SPI_SHADER_32_GR: + args->enabled_channels = 0x3; /* writemask */ + args->out[0] = values[0]; + args->out[1] = values[1]; + break; + + case V_028714_SPI_SHADER_32_AR: + if (ctx->screen->info.chip_class >= GFX10) { + args->enabled_channels = 0x3; /* writemask */ + args->out[0] = values[0]; + args->out[1] = values[3]; + } else { + args->enabled_channels = 0x9; /* writemask */ + args->out[0] = values[0]; + args->out[3] = values[3]; + } + break; + + case V_028714_SPI_SHADER_FP16_ABGR: + packf = ac_build_cvt_pkrtz_f16; + break; + + case V_028714_SPI_SHADER_UNORM16_ABGR: + packf = ac_build_cvt_pknorm_u16; + break; + + case V_028714_SPI_SHADER_SNORM16_ABGR: + packf = ac_build_cvt_pknorm_i16; + break; + + case V_028714_SPI_SHADER_UINT16_ABGR: + packi = ac_build_cvt_pk_u16; + break; + + case V_028714_SPI_SHADER_SINT16_ABGR: + packi = ac_build_cvt_pk_i16; + break; + + case V_028714_SPI_SHADER_32_ABGR: + memcpy(&args->out[0], values, sizeof(values[0]) * 4); + break; + } + + /* Pack f16 or norm_i16/u16. 
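 *
 * Sketch of the packed paths below, with the local names: two channels go into
 * each 32-bit export lane and the COMPR flag is set, i.e.
 *
 *    out[0] = pack(values[0], values[1]);    R,G
 *    out[1] = pack(values[2], values[3]);    B,A
 *    compr  = 1;
 *
 * where pack is whichever cvt_pk helper was selected above.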
*/ + if (packf) { + for (chan = 0; chan < 2; chan++) { + LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]}; + LLVMValueRef packed; + + packed = packf(&ctx->ac, pack_args); + args->out[chan] = ac_to_float(&ctx->ac, packed); + } + args->compr = 1; /* COMPR flag */ + } + /* Pack i16/u16. */ + if (packi) { + for (chan = 0; chan < 2; chan++) { + LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]), + ac_to_integer(&ctx->ac, values[2 * chan + 1])}; + LLVMValueRef packed; + + packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1); + args->out[chan] = ac_to_float(&ctx->ac, packed); + } + args->compr = 1; /* COMPR flag */ + } } -static void si_export_mrt_color(struct si_shader_context *ctx, - LLVMValueRef *color, unsigned index, - unsigned samplemask_param, - bool is_last, struct si_ps_exports *exp) +static void si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *color, unsigned index, + unsigned samplemask_param, bool is_last, struct si_ps_exports *exp) { - int i; - - /* Clamp color */ - if (ctx->shader->key.part.ps.epilog.clamp_color) - for (i = 0; i < 4; i++) - color[i] = ac_build_clamp(&ctx->ac, color[i]); - - /* Alpha to one */ - if (ctx->shader->key.part.ps.epilog.alpha_to_one) - color[3] = ctx->ac.f32_1; - - /* Alpha test */ - if (index == 0 && - ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) - si_alpha_test(ctx, color[3]); - - /* Line & polygon smoothing */ - if (ctx->shader->key.part.ps.epilog.poly_line_smoothing) - color[3] = si_scale_alpha_by_sample_mask(ctx, color[3], - samplemask_param); - - /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ - if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) { - struct ac_export_args args[8]; - int c, last = -1; - - /* Get the export arguments, also find out what the last one is. */ - for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { - si_llvm_init_ps_export_args(ctx, color, - V_008DFC_SQ_EXP_MRT + c, &args[c]); - if (args[c].enabled_channels) - last = c; - } - - /* Emit all exports. */ - for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { - if (is_last && last == c) { - args[c].valid_mask = 1; /* whether the EXEC mask is valid */ - args[c].done = 1; /* DONE bit */ - } else if (!args[c].enabled_channels) - continue; /* unnecessary NULL export */ - - memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c])); - } - } else { - struct ac_export_args args; - - /* Export */ - si_llvm_init_ps_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index, - &args); - if (is_last) { - args.valid_mask = 1; /* whether the EXEC mask is valid */ - args.done = 1; /* DONE bit */ - } else if (!args.enabled_channels) - return; /* unnecessary NULL export */ - - memcpy(&exp->args[exp->num++], &args, sizeof(args)); - } + int i; + + /* Clamp color */ + if (ctx->shader->key.part.ps.epilog.clamp_color) + for (i = 0; i < 4; i++) + color[i] = ac_build_clamp(&ctx->ac, color[i]); + + /* Alpha to one */ + if (ctx->shader->key.part.ps.epilog.alpha_to_one) + color[3] = ctx->ac.f32_1; + + /* Alpha test */ + if (index == 0 && ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) + si_alpha_test(ctx, color[3]); + + /* Line & polygon smoothing */ + if (ctx->shader->key.part.ps.epilog.poly_line_smoothing) + color[3] = si_scale_alpha_by_sample_mask(ctx, color[3], samplemask_param); + + /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. 
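 *
 * Sketch of the broadcast below, using this function's names: the same color
 * is packed once per color buffer, and only the last export with any enabled
 * channels gets the DONE and valid-mask bits (when is_last):
 *
 *    for (c = 0; c <= last_cbuf; c++)
 *       si_llvm_init_ps_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + c, &args[c]);
 *    args[last].done = args[last].valid_mask = 1;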
*/ + if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) { + struct ac_export_args args[8]; + int c, last = -1; + + /* Get the export arguments, also find out what the last one is. */ + for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { + si_llvm_init_ps_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + c, &args[c]); + if (args[c].enabled_channels) + last = c; + } + + /* Emit all exports. */ + for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { + if (is_last && last == c) { + args[c].valid_mask = 1; /* whether the EXEC mask is valid */ + args[c].done = 1; /* DONE bit */ + } else if (!args[c].enabled_channels) + continue; /* unnecessary NULL export */ + + memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c])); + } + } else { + struct ac_export_args args; + + /* Export */ + si_llvm_init_ps_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index, &args); + if (is_last) { + args.valid_mask = 1; /* whether the EXEC mask is valid */ + args.done = 1; /* DONE bit */ + } else if (!args.enabled_channels) + return; /* unnecessary NULL export */ + + memcpy(&exp->args[exp->num++], &args, sizeof(args)); + } } -static void si_emit_ps_exports(struct si_shader_context *ctx, - struct si_ps_exports *exp) +static void si_emit_ps_exports(struct si_shader_context *ctx, struct si_ps_exports *exp) { - for (unsigned i = 0; i < exp->num; i++) - ac_build_export(&ctx->ac, &exp->args[i]); + for (unsigned i = 0; i < exp->num; i++) + ac_build_export(&ctx->ac, &exp->args[i]); } /** @@ -503,117 +449,108 @@ static void si_emit_ps_exports(struct si_shader_context *ctx, * * The alpha-ref SGPR is returned via its original location. */ -static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) +static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader *shader = ctx->shader; - struct si_shader_info *info = &shader->selector->info; - LLVMBuilderRef builder = ctx->ac.builder; - unsigned i, j, first_vgpr, vgpr; - - LLVMValueRef color[8][4] = {}; - LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; - LLVMValueRef ret; - - if (ctx->postponed_kill) - ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, "")); - - /* Read the output values. */ - for (i = 0; i < info->num_outputs; i++) { - unsigned semantic_name = info->output_semantic_name[i]; - unsigned semantic_index = info->output_semantic_index[i]; - - switch (semantic_name) { - case TGSI_SEMANTIC_COLOR: - assert(semantic_index < 8); - for (j = 0; j < 4; j++) { - LLVMValueRef ptr = addrs[4 * i + j]; - LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); - color[semantic_index][j] = result; - } - break; - case TGSI_SEMANTIC_POSITION: - depth = LLVMBuildLoad(builder, - addrs[4 * i + 0], ""); - break; - case TGSI_SEMANTIC_STENCIL: - stencil = LLVMBuildLoad(builder, - addrs[4 * i + 0], ""); - break; - case TGSI_SEMANTIC_SAMPLEMASK: - samplemask = LLVMBuildLoad(builder, - addrs[4 * i + 0], ""); - break; - default: - fprintf(stderr, "Warning: GFX6 unhandled fs output type:%d\n", - semantic_name); - } - } - - /* Fill the return structure. */ - ret = ctx->return_value; - - /* Set SGPRs. 
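 *
 * Return-value layout built here (sketch):
 *
 *    [SI_SGPR_ALPHA_REF]       alpha reference (SGPR, returned in place)
 *    [SI_SGPR_ALPHA_REF + 1..] color[i][0..3] for each written color output,
 *                              then depth, stencil, samplemask when present,
 *                              then the input sample coverage for smoothing.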
*/ - ret = LLVMBuildInsertValue(builder, ret, - ac_to_integer(&ctx->ac, - LLVMGetParam(ctx->main_fn, - SI_PARAM_ALPHA_REF)), - SI_SGPR_ALPHA_REF, ""); - - /* Set VGPRs */ - first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; - for (i = 0; i < ARRAY_SIZE(color); i++) { - if (!color[i][0]) - continue; - - for (j = 0; j < 4; j++) - ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); - } - if (depth) - ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); - if (stencil) - ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, ""); - if (samplemask) - ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, ""); - - /* Add the input sample mask for smoothing at the end. */ - if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC) - vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; - ret = LLVMBuildInsertValue(builder, ret, - LLVMGetParam(ctx->main_fn, - SI_PARAM_SAMPLE_COVERAGE), vgpr++, ""); - - ctx->return_value = ret; + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *shader = ctx->shader; + struct si_shader_info *info = &shader->selector->info; + LLVMBuilderRef builder = ctx->ac.builder; + unsigned i, j, first_vgpr, vgpr; + + LLVMValueRef color[8][4] = {}; + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + LLVMValueRef ret; + + if (ctx->postponed_kill) + ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, "")); + + /* Read the output values. */ + for (i = 0; i < info->num_outputs; i++) { + unsigned semantic_name = info->output_semantic_name[i]; + unsigned semantic_index = info->output_semantic_index[i]; + + switch (semantic_name) { + case TGSI_SEMANTIC_COLOR: + assert(semantic_index < 8); + for (j = 0; j < 4; j++) { + LLVMValueRef ptr = addrs[4 * i + j]; + LLVMValueRef result = LLVMBuildLoad(builder, ptr, ""); + color[semantic_index][j] = result; + } + break; + case TGSI_SEMANTIC_POSITION: + depth = LLVMBuildLoad(builder, addrs[4 * i + 0], ""); + break; + case TGSI_SEMANTIC_STENCIL: + stencil = LLVMBuildLoad(builder, addrs[4 * i + 0], ""); + break; + case TGSI_SEMANTIC_SAMPLEMASK: + samplemask = LLVMBuildLoad(builder, addrs[4 * i + 0], ""); + break; + default: + fprintf(stderr, "Warning: GFX6 unhandled fs output type:%d\n", semantic_name); + } + } + + /* Fill the return structure. */ + ret = ctx->return_value; + + /* Set SGPRs. */ + ret = LLVMBuildInsertValue( + builder, ret, ac_to_integer(&ctx->ac, LLVMGetParam(ctx->main_fn, SI_PARAM_ALPHA_REF)), + SI_SGPR_ALPHA_REF, ""); + + /* Set VGPRs */ + first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1; + for (i = 0; i < ARRAY_SIZE(color); i++) { + if (!color[i][0]) + continue; + + for (j = 0; j < 4; j++) + ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); + } + if (depth) + ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, ""); + if (stencil) + ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, ""); + if (samplemask) + ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, ""); + + /* Add the input sample mask for smoothing at the end. 
*/ + if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC) + vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC; + ret = LLVMBuildInsertValue(builder, ret, LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE), + vgpr++, ""); + + ctx->return_value = ret; } static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, - LLVMValueRef param_rw_buffers, - struct ac_arg param_pos_fixed_pt) + LLVMValueRef param_rw_buffers, + struct ac_arg param_pos_fixed_pt) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef slot, desc, offset, row, bit, address[2]; - - /* Use the fixed-point gl_FragCoord input. - * Since the stipple pattern is 32x32 and it repeats, just get 5 bits - * per coordinate to get the repeating effect. - */ - address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5); - address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5); - - /* Load the buffer descriptor. */ - slot = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_POLY_STIPPLE, 0); - desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot); - - /* The stipple pattern is 32x32, each row has 32 bits. */ - offset = LLVMBuildMul(builder, address[1], - LLVMConstInt(ctx->ac.i32, 4, 0), ""); - row = si_buffer_load_const(ctx, desc, offset); - row = ac_to_integer(&ctx->ac, row); - bit = LLVMBuildLShr(builder, row, address[0], ""); - bit = LLVMBuildTrunc(builder, bit, ctx->ac.i1, ""); - ac_build_kill_if_false(&ctx->ac, bit); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef slot, desc, offset, row, bit, address[2]; + + /* Use the fixed-point gl_FragCoord input. + * Since the stipple pattern is 32x32 and it repeats, just get 5 bits + * per coordinate to get the repeating effect. + */ + address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5); + address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5); + + /* Load the buffer descriptor. */ + slot = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_POLY_STIPPLE, 0); + desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot); + + /* The stipple pattern is 32x32, each row has 32 bits. */ + offset = LLVMBuildMul(builder, address[1], LLVMConstInt(ctx->ac.i32, 4, 0), ""); + row = si_buffer_load_const(ctx, desc, offset); + row = ac_to_integer(&ctx->ac, row); + bit = LLVMBuildLShr(builder, row, address[0], ""); + bit = LLVMBuildTrunc(builder, bit, ctx->ac.i1, ""); + ac_build_kill_if_false(&ctx->ac, bit); } /** @@ -626,416 +563,372 @@ static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, * overriden by other states. (e.g. per-sample interpolation) * Interpolated colors are stored after the preloaded VGPRs. */ -void si_llvm_build_ps_prolog(struct si_shader_context *ctx, - union si_shader_part_key *key) +void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part_key *key) { - LLVMValueRef ret, func; - int num_returns, i, num_color_channels; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - /* Declare inputs. 
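 *
 * Interface sketch, using the key fields below: the prolog takes
 * num_input_sgprs integer SGPRs followed by num_input_vgprs float VGPRs, and
 * returns all of them plus one extra f32 per bit set in colors_read:
 *
 *    num_returns = num_input_sgprs + num_input_vgprs
 *                + util_bitcount(colors_read);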
*/ - LLVMTypeRef return_types[AC_MAX_ARGS]; - num_returns = 0; - num_color_channels = util_bitcount(key->ps_prolog.colors_read); - assert(key->ps_prolog.num_input_sgprs + - key->ps_prolog.num_input_vgprs + - num_color_channels <= AC_MAX_ARGS); - for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - return_types[num_returns++] = ctx->ac.i32; - - } - - struct ac_arg pos_fixed_pt; - struct ac_arg ancillary; - struct ac_arg param_sample_mask; - for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) { - struct ac_arg *arg = NULL; - if (i == key->ps_prolog.ancillary_vgpr_index) { - arg = &ancillary; - } else if (i == key->ps_prolog.ancillary_vgpr_index + 1) { - arg = ¶m_sample_mask; - } else if (i == key->ps_prolog.num_input_vgprs - 1) { - /* POS_FIXED_PT is always last. */ - arg = &pos_fixed_pt; - } - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, arg); - return_types[num_returns++] = ctx->ac.f32; - } - - /* Declare outputs (same as inputs + add colors if needed) */ - for (i = 0; i < num_color_channels; i++) - return_types[num_returns++] = ctx->ac.f32; - - /* Create the function. */ - si_llvm_create_func(ctx, "ps_prolog", return_types, num_returns, 0); - func = ctx->main_fn; - - /* Copy inputs to outputs. This should be no-op, as the registers match, - * but it will prevent the compiler from overwriting them unintentionally. - */ - ret = ctx->return_value; - for (i = 0; i < ctx->args.arg_count; i++) { - LLVMValueRef p = LLVMGetParam(func, i); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); - } - - /* Polygon stippling. */ - if (key->ps_prolog.states.poly_stipple) { - LLVMValueRef list = si_prolog_get_rw_buffers(ctx); - - si_llvm_emit_polygon_stipple(ctx, list, pos_fixed_pt); - } - - if (key->ps_prolog.states.bc_optimize_for_persp || - key->ps_prolog.states.bc_optimize_for_linear) { - unsigned i, base = key->ps_prolog.num_input_sgprs; - LLVMValueRef center[2], centroid[2], tmp, bc_optimize; - - /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER; - * The hw doesn't compute CENTROID if the whole wave only - * contains fully-covered quads. - * - * PRIM_MASK is after user SGPRs. - */ - bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); - bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize, - LLVMConstInt(ctx->ac.i32, 31, 0), ""); - bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, - ctx->ac.i1, ""); - - if (key->ps_prolog.states.bc_optimize_for_persp) { - /* Read PERSP_CENTER. */ - for (i = 0; i < 2; i++) - center[i] = LLVMGetParam(func, base + 2 + i); - /* Read PERSP_CENTROID. */ - for (i = 0; i < 2; i++) - centroid[i] = LLVMGetParam(func, base + 4 + i); - /* Select PERSP_CENTROID. */ - for (i = 0; i < 2; i++) { - tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, - center[i], centroid[i], ""); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - tmp, base + 4 + i, ""); - } - } - if (key->ps_prolog.states.bc_optimize_for_linear) { - /* Read LINEAR_CENTER. */ - for (i = 0; i < 2; i++) - center[i] = LLVMGetParam(func, base + 8 + i); - /* Read LINEAR_CENTROID. */ - for (i = 0; i < 2; i++) - centroid[i] = LLVMGetParam(func, base + 10 + i); - /* Select LINEAR_CENTROID. */ - for (i = 0; i < 2; i++) { - tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, - center[i], centroid[i], ""); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - tmp, base + 10 + i, ""); - } - } - } - - /* Force per-sample interpolation. 
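 *
 * For reference, the barycentric VGPRs juggled here sit right after the user
 * SGPRs, two VGPRs (i, j) each, in this order (sketch; base = num_input_sgprs):
 *
 *    base + 0..1    PERSP_SAMPLE
 *    base + 2..3    PERSP_CENTER
 *    base + 4..5    PERSP_CENTROID
 *    base + 6..7    LINEAR_SAMPLE
 *    base + 8..9    LINEAR_CENTER
 *    base + 10..11  LINEAR_CENTROID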
*/ - if (key->ps_prolog.states.force_persp_sample_interp) { - unsigned i, base = key->ps_prolog.num_input_sgprs; - LLVMValueRef persp_sample[2]; - - /* Read PERSP_SAMPLE. */ - for (i = 0; i < 2; i++) - persp_sample[i] = LLVMGetParam(func, base + i); - /* Overwrite PERSP_CENTER. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - persp_sample[i], base + 2 + i, ""); - /* Overwrite PERSP_CENTROID. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - persp_sample[i], base + 4 + i, ""); - } - if (key->ps_prolog.states.force_linear_sample_interp) { - unsigned i, base = key->ps_prolog.num_input_sgprs; - LLVMValueRef linear_sample[2]; - - /* Read LINEAR_SAMPLE. */ - for (i = 0; i < 2; i++) - linear_sample[i] = LLVMGetParam(func, base + 6 + i); - /* Overwrite LINEAR_CENTER. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - linear_sample[i], base + 8 + i, ""); - /* Overwrite LINEAR_CENTROID. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - linear_sample[i], base + 10 + i, ""); - } - - /* Force center interpolation. */ - if (key->ps_prolog.states.force_persp_center_interp) { - unsigned i, base = key->ps_prolog.num_input_sgprs; - LLVMValueRef persp_center[2]; - - /* Read PERSP_CENTER. */ - for (i = 0; i < 2; i++) - persp_center[i] = LLVMGetParam(func, base + 2 + i); - /* Overwrite PERSP_SAMPLE. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - persp_center[i], base + i, ""); - /* Overwrite PERSP_CENTROID. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - persp_center[i], base + 4 + i, ""); - } - if (key->ps_prolog.states.force_linear_center_interp) { - unsigned i, base = key->ps_prolog.num_input_sgprs; - LLVMValueRef linear_center[2]; - - /* Read LINEAR_CENTER. */ - for (i = 0; i < 2; i++) - linear_center[i] = LLVMGetParam(func, base + 8 + i); - /* Overwrite LINEAR_SAMPLE. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - linear_center[i], base + 6 + i, ""); - /* Overwrite LINEAR_CENTROID. */ - for (i = 0; i < 2; i++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - linear_center[i], base + 10 + i, ""); - } - - /* Interpolate colors. */ - unsigned color_out_idx = 0; - for (i = 0; i < 2; i++) { - unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; - unsigned face_vgpr = key->ps_prolog.num_input_sgprs + - key->ps_prolog.face_vgpr_index; - LLVMValueRef interp[2], color[4]; - LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL; - - if (!writemask) - continue; - - /* If the interpolation qualifier is not CONSTANT (-1). */ - if (key->ps_prolog.color_interp_vgpr_index[i] != -1) { - unsigned interp_vgpr = key->ps_prolog.num_input_sgprs + - key->ps_prolog.color_interp_vgpr_index[i]; - - /* Get the (i,j) updated by bc_optimize handling. */ - interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret, - interp_vgpr, ""); - interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret, - interp_vgpr + 1, ""); - interp_ij = ac_build_gather_values(&ctx->ac, interp, 2); - } - - /* Use the absolute location of the input. 
*/ - prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); - - if (key->ps_prolog.states.color_two_side) { - face = LLVMGetParam(func, face_vgpr); - face = ac_to_integer(&ctx->ac, face); - } - - interp_fs_color(ctx, - key->ps_prolog.color_attr_index[i], i, - key->ps_prolog.num_interp_inputs, - key->ps_prolog.colors_read, interp_ij, - prim_mask, face, color); - - while (writemask) { - unsigned chan = u_bit_scan(&writemask); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan], - ctx->args.arg_count + color_out_idx++, ""); - } - } - - /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec - * says: - * - * "When per-sample shading is active due to the use of a fragment - * input qualified by sample or due to the use of the gl_SampleID - * or gl_SamplePosition variables, only the bit for the current - * sample is set in gl_SampleMaskIn. When state specifies multiple - * fragment shader invocations for a given fragment, the sample - * mask for any single fragment shader invocation may specify a - * subset of the covered samples for the fragment. In this case, - * the bit corresponding to each covered sample will be set in - * exactly one fragment shader invocation." - * - * The samplemask loaded by hardware is always the coverage of the - * entire pixel/fragment, so mask bits out based on the sample ID. - */ - if (key->ps_prolog.states.samplemask_log_ps_iter) { - /* The bit pattern matches that used by fixed function fragment - * processing. */ - static const uint16_t ps_iter_masks[] = { - 0xffff, /* not used */ - 0x5555, - 0x1111, - 0x0101, - 0x0001, - }; - assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks)); - - uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter]; - LLVMValueRef sampleid = si_unpack_param(ctx, ancillary, 8, 4); - LLVMValueRef samplemask = ac_get_arg(&ctx->ac, param_sample_mask); - - samplemask = ac_to_integer(&ctx->ac, samplemask); - samplemask = LLVMBuildAnd( - ctx->ac.builder, - samplemask, - LLVMBuildShl(ctx->ac.builder, - LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), - sampleid, ""), - ""); - samplemask = ac_to_float(&ctx->ac, samplemask); - - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask, - param_sample_mask.arg_index, ""); - } - - /* Tell LLVM to insert WQM instruction sequence when needed. */ - if (key->ps_prolog.wqm) { - LLVMAddTargetDependentFunctionAttr(func, - "amdgpu-ps-wqm-outputs", ""); - } - - si_llvm_build_ret(ctx, ret); + LLVMValueRef ret, func; + int num_returns, i, num_color_channels; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + /* Declare inputs. */ + LLVMTypeRef return_types[AC_MAX_ARGS]; + num_returns = 0; + num_color_channels = util_bitcount(key->ps_prolog.colors_read); + assert(key->ps_prolog.num_input_sgprs + key->ps_prolog.num_input_vgprs + num_color_channels <= + AC_MAX_ARGS); + for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + return_types[num_returns++] = ctx->ac.i32; + } + + struct ac_arg pos_fixed_pt; + struct ac_arg ancillary; + struct ac_arg param_sample_mask; + for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) { + struct ac_arg *arg = NULL; + if (i == key->ps_prolog.ancillary_vgpr_index) { + arg = &ancillary; + } else if (i == key->ps_prolog.ancillary_vgpr_index + 1) { + arg = ¶m_sample_mask; + } else if (i == key->ps_prolog.num_input_vgprs - 1) { + /* POS_FIXED_PT is always last. 
*/ + arg = &pos_fixed_pt; + } + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, arg); + return_types[num_returns++] = ctx->ac.f32; + } + + /* Declare outputs (same as inputs + add colors if needed) */ + for (i = 0; i < num_color_channels; i++) + return_types[num_returns++] = ctx->ac.f32; + + /* Create the function. */ + si_llvm_create_func(ctx, "ps_prolog", return_types, num_returns, 0); + func = ctx->main_fn; + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx->return_value; + for (i = 0; i < ctx->args.arg_count; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); + } + + /* Polygon stippling. */ + if (key->ps_prolog.states.poly_stipple) { + LLVMValueRef list = si_prolog_get_rw_buffers(ctx); + + si_llvm_emit_polygon_stipple(ctx, list, pos_fixed_pt); + } + + if (key->ps_prolog.states.bc_optimize_for_persp || + key->ps_prolog.states.bc_optimize_for_linear) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef center[2], centroid[2], tmp, bc_optimize; + + /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER; + * The hw doesn't compute CENTROID if the whole wave only + * contains fully-covered quads. + * + * PRIM_MASK is after user SGPRs. + */ + bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); + bc_optimize = + LLVMBuildLShr(ctx->ac.builder, bc_optimize, LLVMConstInt(ctx->ac.i32, 31, 0), ""); + bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, ctx->ac.i1, ""); + + if (key->ps_prolog.states.bc_optimize_for_persp) { + /* Read PERSP_CENTER. */ + for (i = 0; i < 2; i++) + center[i] = LLVMGetParam(func, base + 2 + i); + /* Read PERSP_CENTROID. */ + for (i = 0; i < 2; i++) + centroid[i] = LLVMGetParam(func, base + 4 + i); + /* Select PERSP_CENTROID. */ + for (i = 0; i < 2; i++) { + tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center[i], centroid[i], ""); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, tmp, base + 4 + i, ""); + } + } + if (key->ps_prolog.states.bc_optimize_for_linear) { + /* Read LINEAR_CENTER. */ + for (i = 0; i < 2; i++) + center[i] = LLVMGetParam(func, base + 8 + i); + /* Read LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) + centroid[i] = LLVMGetParam(func, base + 10 + i); + /* Select LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) { + tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center[i], centroid[i], ""); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, tmp, base + 10 + i, ""); + } + } + } + + /* Force per-sample interpolation. */ + if (key->ps_prolog.states.force_persp_sample_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef persp_sample[2]; + + /* Read PERSP_SAMPLE. */ + for (i = 0; i < 2; i++) + persp_sample[i] = LLVMGetParam(func, base + i); + /* Overwrite PERSP_CENTER. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_sample[i], base + 2 + i, ""); + /* Overwrite PERSP_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_sample[i], base + 4 + i, ""); + } + if (key->ps_prolog.states.force_linear_sample_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef linear_sample[2]; + + /* Read LINEAR_SAMPLE. */ + for (i = 0; i < 2; i++) + linear_sample[i] = LLVMGetParam(func, base + 6 + i); + /* Overwrite LINEAR_CENTER. 
*/ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_sample[i], base + 8 + i, ""); + /* Overwrite LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_sample[i], base + 10 + i, ""); + } + + /* Force center interpolation. */ + if (key->ps_prolog.states.force_persp_center_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef persp_center[2]; + + /* Read PERSP_CENTER. */ + for (i = 0; i < 2; i++) + persp_center[i] = LLVMGetParam(func, base + 2 + i); + /* Overwrite PERSP_SAMPLE. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_center[i], base + i, ""); + /* Overwrite PERSP_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_center[i], base + 4 + i, ""); + } + if (key->ps_prolog.states.force_linear_center_interp) { + unsigned i, base = key->ps_prolog.num_input_sgprs; + LLVMValueRef linear_center[2]; + + /* Read LINEAR_CENTER. */ + for (i = 0; i < 2; i++) + linear_center[i] = LLVMGetParam(func, base + 8 + i); + /* Overwrite LINEAR_SAMPLE. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_center[i], base + 6 + i, ""); + /* Overwrite LINEAR_CENTROID. */ + for (i = 0; i < 2; i++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_center[i], base + 10 + i, ""); + } + + /* Interpolate colors. */ + unsigned color_out_idx = 0; + for (i = 0; i < 2; i++) { + unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf; + unsigned face_vgpr = key->ps_prolog.num_input_sgprs + key->ps_prolog.face_vgpr_index; + LLVMValueRef interp[2], color[4]; + LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL; + + if (!writemask) + continue; + + /* If the interpolation qualifier is not CONSTANT (-1). */ + if (key->ps_prolog.color_interp_vgpr_index[i] != -1) { + unsigned interp_vgpr = + key->ps_prolog.num_input_sgprs + key->ps_prolog.color_interp_vgpr_index[i]; + + /* Get the (i,j) updated by bc_optimize handling. */ + interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret, interp_vgpr, ""); + interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret, interp_vgpr + 1, ""); + interp_ij = ac_build_gather_values(&ctx->ac, interp, 2); + } + + /* Use the absolute location of the input. */ + prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR); + + if (key->ps_prolog.states.color_two_side) { + face = LLVMGetParam(func, face_vgpr); + face = ac_to_integer(&ctx->ac, face); + } + + interp_fs_color(ctx, key->ps_prolog.color_attr_index[i], i, key->ps_prolog.num_interp_inputs, + key->ps_prolog.colors_read, interp_ij, prim_mask, face, color); + + while (writemask) { + unsigned chan = u_bit_scan(&writemask); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan], + ctx->args.arg_count + color_out_idx++, ""); + } + } + + /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec + * says: + * + * "When per-sample shading is active due to the use of a fragment + * input qualified by sample or due to the use of the gl_SampleID + * or gl_SamplePosition variables, only the bit for the current + * sample is set in gl_SampleMaskIn. When state specifies multiple + * fragment shader invocations for a given fragment, the sample + * mask for any single fragment shader invocation may specify a + * subset of the covered samples for the fragment. In this case, + * the bit corresponding to each covered sample will be set in + * exactly one fragment shader invocation." 
+ * + * The samplemask loaded by hardware is always the coverage of the + * entire pixel/fragment, so mask bits out based on the sample ID. + */ + if (key->ps_prolog.states.samplemask_log_ps_iter) { + /* The bit pattern matches that used by fixed function fragment + * processing. */ + static const uint16_t ps_iter_masks[] = { + 0xffff, /* not used */ + 0x5555, 0x1111, 0x0101, 0x0001, + }; + assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks)); + + uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter]; + LLVMValueRef sampleid = si_unpack_param(ctx, ancillary, 8, 4); + LLVMValueRef samplemask = ac_get_arg(&ctx->ac, param_sample_mask); + + samplemask = ac_to_integer(&ctx->ac, samplemask); + samplemask = + LLVMBuildAnd(ctx->ac.builder, samplemask, + LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), + sampleid, ""), + ""); + samplemask = ac_to_float(&ctx->ac, samplemask); + + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask, param_sample_mask.arg_index, ""); + } + + /* Tell LLVM to insert WQM instruction sequence when needed. */ + if (key->ps_prolog.wqm) { + LLVMAddTargetDependentFunctionAttr(func, "amdgpu-ps-wqm-outputs", ""); + } + + si_llvm_build_ret(ctx, ret); } /** * Build the pixel shader epilog function. This handles everything that must be * emulated for pixel shader exports. (alpha-test, format conversions, etc) */ -void si_llvm_build_ps_epilog(struct si_shader_context *ctx, - union si_shader_part_key *key) +void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part_key *key) { - LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; - int i; - struct si_ps_exports exp = {}; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - /* Declare input SGPRs. */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->rw_buffers); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->bindless_samplers_and_images); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->const_and_shader_buffers); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->samplers_and_images); - si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, - NULL, SI_PARAM_ALPHA_REF); - - /* Declare input VGPRs. */ - unsigned required_num_params = - ctx->args.num_sgprs_used + - util_bitcount(key->ps_epilog.colors_written) * 4 + - key->ps_epilog.writes_z + - key->ps_epilog.writes_stencil + - key->ps_epilog.writes_samplemask; - - required_num_params = MAX2(required_num_params, - ctx->args.num_sgprs_used + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); - - while (ctx->args.arg_count < required_num_params) - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); - - /* Create the function. */ - si_llvm_create_func(ctx, "ps_epilog", NULL, 0, 0); - /* Disable elimination of unused inputs. */ - ac_llvm_add_target_dep_function_attr(ctx->main_fn, - "InitialPSInputAddr", 0xffffff); - - /* Process colors. */ - unsigned vgpr = ctx->args.num_sgprs_used; - unsigned colors_written = key->ps_epilog.colors_written; - int last_color_export = -1; - - /* Find the last color export. */ - if (!key->ps_epilog.writes_z && - !key->ps_epilog.writes_stencil && - !key->ps_epilog.writes_samplemask) { - unsigned spi_format = key->ps_epilog.states.spi_shader_col_format; - - /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ - if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { - /* Just set this if any of the colorbuffers are enabled. 
*/ - if (spi_format & - ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) - last_color_export = 0; - } else { - for (i = 0; i < 8; i++) - if (colors_written & (1 << i) && - (spi_format >> (i * 4)) & 0xf) - last_color_export = i; - } - } - - while (colors_written) { - LLVMValueRef color[4]; - int mrt = u_bit_scan(&colors_written); - - for (i = 0; i < 4; i++) - color[i] = LLVMGetParam(ctx->main_fn, vgpr++); - - si_export_mrt_color(ctx, color, mrt, - ctx->args.arg_count - 1, - mrt == last_color_export, &exp); - } - - /* Process depth, stencil, samplemask. */ - if (key->ps_epilog.writes_z) - depth = LLVMGetParam(ctx->main_fn, vgpr++); - if (key->ps_epilog.writes_stencil) - stencil = LLVMGetParam(ctx->main_fn, vgpr++); - if (key->ps_epilog.writes_samplemask) - samplemask = LLVMGetParam(ctx->main_fn, vgpr++); - - if (depth || stencil || samplemask) - si_export_mrt_z(ctx, depth, stencil, samplemask, &exp); - else if (last_color_export == -1) - ac_build_export_null(&ctx->ac); - - if (exp.num) - si_emit_ps_exports(ctx, &exp); - - /* Compile. */ - LLVMBuildRetVoid(ctx->ac.builder); + LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; + int i; + struct si_ps_exports exp = {}; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + /* Declare input SGPRs. */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->rw_buffers); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->bindless_samplers_and_images); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->const_and_shader_buffers); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->samplers_and_images); + si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_ALPHA_REF); + + /* Declare input VGPRs. */ + unsigned required_num_params = + ctx->args.num_sgprs_used + util_bitcount(key->ps_epilog.colors_written) * 4 + + key->ps_epilog.writes_z + key->ps_epilog.writes_stencil + key->ps_epilog.writes_samplemask; + + required_num_params = + MAX2(required_num_params, ctx->args.num_sgprs_used + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); + + while (ctx->args.arg_count < required_num_params) + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); + + /* Create the function. */ + si_llvm_create_func(ctx, "ps_epilog", NULL, 0, 0); + /* Disable elimination of unused inputs. */ + ac_llvm_add_target_dep_function_attr(ctx->main_fn, "InitialPSInputAddr", 0xffffff); + + /* Process colors. */ + unsigned vgpr = ctx->args.num_sgprs_used; + unsigned colors_written = key->ps_epilog.colors_written; + int last_color_export = -1; + + /* Find the last color export. */ + if (!key->ps_epilog.writes_z && !key->ps_epilog.writes_stencil && + !key->ps_epilog.writes_samplemask) { + unsigned spi_format = key->ps_epilog.states.spi_shader_col_format; + + /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ + if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) { + /* Just set this if any of the colorbuffers are enabled. */ + if (spi_format & ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1)) + last_color_export = 0; + } else { + for (i = 0; i < 8; i++) + if (colors_written & (1 << i) && (spi_format >> (i * 4)) & 0xf) + last_color_export = i; + } + } + + while (colors_written) { + LLVMValueRef color[4]; + int mrt = u_bit_scan(&colors_written); + + for (i = 0; i < 4; i++) + color[i] = LLVMGetParam(ctx->main_fn, vgpr++); + + si_export_mrt_color(ctx, color, mrt, ctx->args.arg_count - 1, mrt == last_color_export, &exp); + } + + /* Process depth, stencil, samplemask. 
*/ + if (key->ps_epilog.writes_z) + depth = LLVMGetParam(ctx->main_fn, vgpr++); + if (key->ps_epilog.writes_stencil) + stencil = LLVMGetParam(ctx->main_fn, vgpr++); + if (key->ps_epilog.writes_samplemask) + samplemask = LLVMGetParam(ctx->main_fn, vgpr++); + + if (depth || stencil || samplemask) + si_export_mrt_z(ctx, depth, stencil, samplemask, &exp); + else if (last_color_export == -1) + ac_build_export_null(&ctx->ac); + + if (exp.num) + si_emit_ps_exports(ctx, &exp); + + /* Compile. */ + LLVMBuildRetVoid(ctx->ac.builder); } -void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, - struct si_shader *shader) +void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, struct si_shader *shader) { - LLVMValueRef parts[3]; - unsigned num_parts = 0, main_index; + LLVMValueRef parts[3]; + unsigned num_parts = 0, main_index; - union si_shader_part_key prolog_key; - si_get_ps_prolog_key(shader, &prolog_key, false); + union si_shader_part_key prolog_key; + si_get_ps_prolog_key(shader, &prolog_key, false); - if (si_need_ps_prolog(&prolog_key)) { - si_llvm_build_ps_prolog(ctx, &prolog_key); - parts[num_parts++] = ctx->main_fn; - } + if (si_need_ps_prolog(&prolog_key)) { + si_llvm_build_ps_prolog(ctx, &prolog_key); + parts[num_parts++] = ctx->main_fn; + } - main_index = num_parts; - parts[num_parts++] = ctx->main_fn; + main_index = num_parts; + parts[num_parts++] = ctx->main_fn; - union si_shader_part_key epilog_key; - si_get_ps_epilog_key(shader, &epilog_key); - si_llvm_build_ps_epilog(ctx, &epilog_key); - parts[num_parts++] = ctx->main_fn; + union si_shader_part_key epilog_key; + si_get_ps_epilog_key(shader, &epilog_key); + si_llvm_build_ps_epilog(ctx, &epilog_key); + parts[num_parts++] = ctx->main_fn; - si_build_wrapper_function(ctx, parts, num_parts, main_index, 0); + si_build_wrapper_function(ctx, parts, num_parts, main_index, 0); } void si_llvm_init_ps_callbacks(struct si_shader_context *ctx) { - ctx->abi.emit_outputs = si_llvm_return_fs_outputs; - ctx->abi.load_sample_position = load_sample_position; - ctx->abi.load_sample_mask_in = load_sample_mask_in; - ctx->abi.emit_fbfetch = si_nir_emit_fbfetch; + ctx->abi.emit_outputs = si_llvm_return_fs_outputs; + ctx->abi.load_sample_position = load_sample_position; + ctx->abi.load_sample_mask_in = load_sample_mask_in; + ctx->abi.emit_fbfetch = si_nir_emit_fbfetch; } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c b/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c index cb06aa99ca7..122e6976261 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c @@ -22,111 +22,98 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_shader_internal.h" #include "si_pipe.h" +#include "si_shader_internal.h" #include "sid.h" /** * Return a value that is equal to the given i32 \p index if it lies in [0,num) * or an undefined value in the same interval otherwise. */ -static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, - LLVMValueRef index, - unsigned num) +static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, LLVMValueRef index, + unsigned num) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0); - LLVMValueRef cc; - - if (util_is_power_of_two_or_zero(num)) { - index = LLVMBuildAnd(builder, index, c_max, ""); - } else { - /* In theory, this MAX pattern should result in code that is - * as good as the bit-wise AND above. 
- * - * In practice, LLVM generates worse code (at the time of - * writing), because its value tracking is not strong enough. - */ - cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, ""); - index = LLVMBuildSelect(builder, cc, index, c_max, ""); - } - - return index; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0); + LLVMValueRef cc; + + if (util_is_power_of_two_or_zero(num)) { + index = LLVMBuildAnd(builder, index, c_max, ""); + } else { + /* In theory, this MAX pattern should result in code that is + * as good as the bit-wise AND above. + * + * In practice, LLVM generates worse code (at the time of + * writing), because its value tracking is not strong enough. + */ + cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, ""); + index = LLVMBuildSelect(builder, cc, index, c_max, ""); + } + + return index; } static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx) { - LLVMValueRef ptr = - ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers); - struct si_shader_selector *sel = ctx->shader->selector; - - /* Do the bounds checking with a descriptor, because - * doing computation and manual bounds checking of 64-bit - * addresses generates horrible VALU code with very high - * VGPR usage and very low SIMD occupancy. - */ - ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, ""); - - LLVMValueRef desc0, desc1; - desc0 = ptr; - desc1 = LLVMConstInt(ctx->ac.i32, - S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); - - uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (ctx->screen->info.chip_class >= GFX10) - rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); - else - rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - - LLVMValueRef desc_elems[] = { - desc0, - desc1, - LLVMConstInt(ctx->ac.i32, sel->info.constbuf0_num_slots * 16, 0), - LLVMConstInt(ctx->ac.i32, rsrc3, false) - }; - - return ac_build_gather_values(&ctx->ac, desc_elems, 4); + LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers); + struct si_shader_selector *sel = ctx->shader->selector; + + /* Do the bounds checking with a descriptor, because + * doing computation and manual bounds checking of 64-bit + * addresses generates horrible VALU code with very high + * VGPR usage and very low SIMD occupancy. 
+ */ + ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, ""); + + LLVMValueRef desc0, desc1; + desc0 = ptr; + desc1 = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); + + uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (ctx->screen->info.chip_class >= GFX10) + rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); + else + rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + + LLVMValueRef desc_elems[] = {desc0, desc1, + LLVMConstInt(ctx->ac.i32, sel->info.constbuf0_num_slots * 16, 0), + LLVMConstInt(ctx->ac.i32, rsrc3, false)}; + + return ac_build_gather_values(&ctx->ac, desc_elems, 4); } static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_selector *sel = ctx->shader->selector; + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_selector *sel = ctx->shader->selector; - LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers); + LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers); - if (sel->info.const_buffers_declared == 1 && - sel->info.shader_buffers_declared == 0) { - return load_const_buffer_desc_fast_path(ctx); - } + if (sel->info.const_buffers_declared == 1 && sel->info.shader_buffers_declared == 0) { + return load_const_buffer_desc_fast_path(ctx); + } - index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers); - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0), ""); + index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers); + index = + LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0), ""); - return ac_build_load_to_sgpr(&ctx->ac, ptr, index); + return ac_build_load_to_sgpr(&ctx->ac, ptr, index); } -static LLVMValueRef -load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write) +static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac, - ctx->const_and_shader_buffers); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers); - index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers); - index = LLVMBuildSub(ctx->ac.builder, - LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0), - index, ""); + index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers); + index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0), + index, ""); - return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index); + return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index); } /** @@ -140,181 +127,167 @@ load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write) * nicer: disabling DCC in the shader still leads to undefined results but * avoids the lockup. 
*/ -static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, - LLVMValueRef rsrc) +static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rsrc) { - if (ctx->screen->info.chip_class <= GFX7) { - return rsrc; - } else { - LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0); - LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0); - LLVMValueRef tmp; - - tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, ""); - tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, ""); - return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, ""); - } + if (ctx->screen->info.chip_class <= GFX7) { + return rsrc; + } else { + LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0); + LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0); + LLVMValueRef tmp; + + tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, ""); + tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, ""); + return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, ""); + } } /* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should * adjust "index" to point to FMASK. */ -static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, - LLVMValueRef list, LLVMValueRef index, - enum ac_descriptor_type desc_type, - bool uses_store, bool bindless) +static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, LLVMValueRef list, + LLVMValueRef index, enum ac_descriptor_type desc_type, + bool uses_store, bool bindless) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef rsrc; - - if (desc_type == AC_DESC_BUFFER) { - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), - ctx->ac.i32_1); - list = LLVMBuildPointerCast(builder, list, - ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); - } else { - assert(desc_type == AC_DESC_IMAGE || - desc_type == AC_DESC_FMASK); - } - - if (bindless) - rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index); - else - rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index); - - if (desc_type == AC_DESC_IMAGE && uses_store) - rsrc = force_dcc_off(ctx, rsrc); - return rsrc; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef rsrc; + + if (desc_type == AC_DESC_BUFFER) { + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1); + list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); + } else { + assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_FMASK); + } + + if (bindless) + rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index); + else + rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index); + + if (desc_type == AC_DESC_IMAGE && uses_store) + rsrc = force_dcc_off(ctx, rsrc); + return rsrc; } /** * Load an image view, fmask view. or sampler state descriptor. */ -static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, - LLVMValueRef list, LLVMValueRef index, - enum ac_descriptor_type type) +static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, LLVMValueRef list, + LLVMValueRef index, enum ac_descriptor_type type) { - LLVMBuilderRef builder = ctx->ac.builder; - - switch (type) { - case AC_DESC_IMAGE: - /* The image is at [0:7]. */ - index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), ""); - break; - case AC_DESC_BUFFER: - /* The buffer is in [4:7]. 
*/ - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), - ctx->ac.i32_1); - list = LLVMBuildPointerCast(builder, list, - ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); - break; - case AC_DESC_FMASK: - /* The FMASK is at [8:15]. */ - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), - ctx->ac.i32_1); - break; - case AC_DESC_SAMPLER: - /* The sampler state is at [12:15]. */ - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), - LLVMConstInt(ctx->ac.i32, 3, 0)); - list = LLVMBuildPointerCast(builder, list, - ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); - break; - case AC_DESC_PLANE_0: - case AC_DESC_PLANE_1: - case AC_DESC_PLANE_2: - /* Only used for the multiplane image support for Vulkan. Should - * never be reached in radeonsi. - */ - unreachable("Plane descriptor requested in radeonsi."); - } - - return ac_build_load_to_sgpr(&ctx->ac, list, index); + LLVMBuilderRef builder = ctx->ac.builder; + + switch (type) { + case AC_DESC_IMAGE: + /* The image is at [0:7]. */ + index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), ""); + break; + case AC_DESC_BUFFER: + /* The buffer is in [4:7]. */ + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1); + list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); + break; + case AC_DESC_FMASK: + /* The FMASK is at [8:15]. */ + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1); + break; + case AC_DESC_SAMPLER: + /* The sampler state is at [12:15]. */ + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), + LLVMConstInt(ctx->ac.i32, 3, 0)); + list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); + break; + case AC_DESC_PLANE_0: + case AC_DESC_PLANE_1: + case AC_DESC_PLANE_2: + /* Only used for the multiplane image support for Vulkan. Should + * never be reached in radeonsi. + */ + unreachable("Plane descriptor requested in radeonsi."); + } + + return ac_build_load_to_sgpr(&ctx->ac, list, index); } -static LLVMValueRef -si_nir_load_sampler_desc(struct ac_shader_abi *abi, - unsigned descriptor_set, unsigned base_index, - unsigned constant_index, LLVMValueRef dynamic_index, - enum ac_descriptor_type desc_type, bool image, - bool write, bool bindless) +static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, + unsigned base_index, unsigned constant_index, + LLVMValueRef dynamic_index, + enum ac_descriptor_type desc_type, bool image, + bool write, bool bindless) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMBuilderRef builder = ctx->ac.builder; - unsigned const_index = base_index + constant_index; - - assert(!descriptor_set); - assert(desc_type <= AC_DESC_BUFFER); - - if (bindless) { - LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images); - - /* dynamic_index is the bindless handle */ - if (image) { - /* Bindless image descriptors use 16-dword slots. */ - dynamic_index = LLVMBuildMul(ctx->ac.builder, dynamic_index, - LLVMConstInt(ctx->ac.i64, 2, 0), ""); - /* FMASK is right after the image. 
*/ - if (desc_type == AC_DESC_FMASK) { - dynamic_index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, - ctx->ac.i32_1, ""); - } - - return si_load_image_desc(ctx, list, dynamic_index, desc_type, - write, true); - } - - /* Since bindless handle arithmetic can contain an unsigned integer - * wraparound and si_load_sampler_desc assumes there isn't any, - * use GEP without "inbounds" (inside ac_build_pointer_add) - * to prevent incorrect code generation and hangs. - */ - dynamic_index = LLVMBuildMul(ctx->ac.builder, dynamic_index, - LLVMConstInt(ctx->ac.i64, 2, 0), ""); - list = ac_build_pointer_add(&ctx->ac, list, dynamic_index); - return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type); - } - - unsigned num_slots = image ? ctx->num_images : ctx->num_samplers; - assert(const_index < num_slots || dynamic_index); - - LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images); - LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false); - - if (dynamic_index) { - index = LLVMBuildAdd(builder, index, dynamic_index, ""); - - /* From the GL_ARB_shader_image_load_store extension spec: - * - * If a shader performs an image load, store, or atomic - * operation using an image variable declared as an array, - * and if the index used to select an individual element is - * negative or greater than or equal to the size of the - * array, the results of the operation are undefined but may - * not lead to termination. - */ - index = si_llvm_bound_index(ctx, index, num_slots); - } - - if (image) { - /* FMASKs are separate from images. */ - if (desc_type == AC_DESC_FMASK) { - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), ""); - } - index = LLVMBuildSub(ctx->ac.builder, - LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0), - index, ""); - return si_load_image_desc(ctx, list, index, desc_type, write, false); - } - - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), ""); - return si_load_sampler_desc(ctx, list, index, desc_type); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMBuilderRef builder = ctx->ac.builder; + unsigned const_index = base_index + constant_index; + + assert(!descriptor_set); + assert(desc_type <= AC_DESC_BUFFER); + + if (bindless) { + LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images); + + /* dynamic_index is the bindless handle */ + if (image) { + /* Bindless image descriptors use 16-dword slots. */ + dynamic_index = + LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), ""); + /* FMASK is right after the image. */ + if (desc_type == AC_DESC_FMASK) { + dynamic_index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, ctx->ac.i32_1, ""); + } + + return si_load_image_desc(ctx, list, dynamic_index, desc_type, write, true); + } + + /* Since bindless handle arithmetic can contain an unsigned integer + * wraparound and si_load_sampler_desc assumes there isn't any, + * use GEP without "inbounds" (inside ac_build_pointer_add) + * to prevent incorrect code generation and hangs. + */ + dynamic_index = + LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), ""); + list = ac_build_pointer_add(&ctx->ac, list, dynamic_index); + return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type); + } + + unsigned num_slots = image ? 
ctx->num_images : ctx->num_samplers; + assert(const_index < num_slots || dynamic_index); + + LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images); + LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false); + + if (dynamic_index) { + index = LLVMBuildAdd(builder, index, dynamic_index, ""); + + /* From the GL_ARB_shader_image_load_store extension spec: + * + * If a shader performs an image load, store, or atomic + * operation using an image variable declared as an array, + * and if the index used to select an individual element is + * negative or greater than or equal to the size of the + * array, the results of the operation are undefined but may + * not lead to termination. + */ + index = si_llvm_bound_index(ctx, index, num_slots); + } + + if (image) { + /* FMASKs are separate from images. */ + if (desc_type == AC_DESC_FMASK) { + index = + LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), ""); + } + index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0), + index, ""); + return si_load_image_desc(ctx, list, index, desc_type, write, false); + } + + index = LLVMBuildAdd(ctx->ac.builder, index, + LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), ""); + return si_load_sampler_desc(ctx, list, index, desc_type); } void si_llvm_init_resource_callbacks(struct si_shader_context *ctx) { - ctx->abi.load_ubo = load_ubo; - ctx->abi.load_ssbo = load_ssbo; - ctx->abi.load_sampler_desc = si_nir_load_sampler_desc; + ctx->abi.load_ubo = load_ubo; + ctx->abi.load_ssbo = load_ssbo; + ctx->abi.load_sampler_desc = si_nir_load_sampler_desc; } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index 116e06e5af1..5dba9859988 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -22,23 +22,23 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_shader_internal.h" #include "si_pipe.h" +#include "si_shader_internal.h" #include "sid.h" static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx) { - switch (ctx->type) { - case PIPE_SHADER_TESS_CTRL: - return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8); + switch (ctx->type) { + case PIPE_SHADER_TESS_CTRL: + return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8); - case PIPE_SHADER_TESS_EVAL: - return ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id); + case PIPE_SHADER_TESS_EVAL: + return ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id); - default: - assert(0); - return NULL; - } + default: + assert(0); + return NULL; + } } /* Tessellation shaders pass outputs to the next shader using LDS. @@ -62,151 +62,134 @@ static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx) * All three shaders VS(LS), TCS, TES share the same LDS space. 
*/ -static LLVMValueRef -get_tcs_in_patch_stride(struct si_shader_context *ctx) +static LLVMValueRef get_tcs_in_patch_stride(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->vs_state_bits, 11, 13); + return si_unpack_param(ctx, ctx->vs_state_bits, 11, 13); } static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx) { - assert(ctx->type == PIPE_SHADER_TESS_CTRL); + assert(ctx->type == PIPE_SHADER_TESS_CTRL); - if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) - return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4; + if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) + return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4; - return util_last_bit64(ctx->shader->selector->outputs_written) * 4; + return util_last_bit64(ctx->shader->selector->outputs_written) * 4; } static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx) { - unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx); + unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx); - return LLVMConstInt(ctx->ac.i32, stride, 0); + return LLVMConstInt(ctx->ac.i32, stride, 0); } static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx) { - if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) - return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13); - - const struct si_shader_info *info = &ctx->shader->selector->info; - unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; - unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx); - unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written); - unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + - num_patch_outputs * 4; - return LLVMConstInt(ctx->ac.i32, patch_dw_stride, 0); + if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) + return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13); + + const struct si_shader_info *info = &ctx->shader->selector->info; + unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; + unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx); + unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written); + unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + num_patch_outputs * 4; + return LLVMConstInt(ctx->ac.i32, patch_dw_stride, 0); } -static LLVMValueRef -get_tcs_out_patch0_offset(struct si_shader_context *ctx) +static LLVMValueRef get_tcs_out_patch0_offset(struct si_shader_context *ctx) { - return LLVMBuildMul(ctx->ac.builder, - si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16), - LLVMConstInt(ctx->ac.i32, 4, 0), ""); + return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16), + LLVMConstInt(ctx->ac.i32, 4, 0), ""); } -static LLVMValueRef -get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx) +static LLVMValueRef get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx) { - return LLVMBuildMul(ctx->ac.builder, - si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16), - LLVMConstInt(ctx->ac.i32, 4, 0), ""); + return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16), + LLVMConstInt(ctx->ac.i32, 4, 0), ""); } -static LLVMValueRef -get_tcs_in_current_patch_offset(struct si_shader_context *ctx) +static LLVMValueRef get_tcs_in_current_patch_offset(struct si_shader_context *ctx) { - LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx); - LLVMValueRef rel_patch_id = 
get_rel_patch_id(ctx); + LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, ""); + return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, ""); } -static LLVMValueRef -get_tcs_out_current_patch_offset(struct si_shader_context *ctx) +static LLVMValueRef get_tcs_out_current_patch_offset(struct si_shader_context *ctx) { - LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx); - LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); - LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); + LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx); + LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset); + return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset); } -static LLVMValueRef -get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx) +static LLVMValueRef get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx) { - LLVMValueRef patch0_patch_data_offset = - get_tcs_out_patch0_patch_data_offset(ctx); - LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); - LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); + LLVMValueRef patch0_patch_data_offset = get_tcs_out_patch0_patch_data_offset(ctx); + LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); + LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset); + return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset); } static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx) { - unsigned tcs_out_vertices = - ctx->shader->selector ? - ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0; + unsigned tcs_out_vertices = + ctx->shader->selector ? ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] + : 0; - /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */ - if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices) - return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0); + /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. 
*/ + if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices) + return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0); - return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6); + return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6); } static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) { - unsigned stride; - - switch (ctx->type) { - case PIPE_SHADER_VERTEX: - stride = ctx->shader->selector->lshs_vertex_stride / 4; - return LLVMConstInt(ctx->ac.i32, stride, 0); - - case PIPE_SHADER_TESS_CTRL: - if (ctx->screen->info.chip_class >= GFX9 && - ctx->shader->is_monolithic) { - stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4; - return LLVMConstInt(ctx->ac.i32, stride, 0); - } - return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8); - - default: - assert(0); - return NULL; - } + unsigned stride; + + switch (ctx->type) { + case PIPE_SHADER_VERTEX: + stride = ctx->shader->selector->lshs_vertex_stride / 4; + return LLVMConstInt(ctx->ac.i32, stride, 0); + + case PIPE_SHADER_TESS_CTRL: + if (ctx->screen->info.chip_class >= GFX9 && ctx->shader->is_monolithic) { + stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4; + return LLVMConstInt(ctx->ac.i32, stride, 0); + } + return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8); + + default: + assert(0); + return NULL; + } } -static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx, - LLVMValueRef vertex_dw_stride, - LLVMValueRef base_addr, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - ubyte name, ubyte index) +static LLVMValueRef +get_dw_address_from_generic_indices(struct si_shader_context *ctx, LLVMValueRef vertex_dw_stride, + LLVMValueRef base_addr, LLVMValueRef vertex_index, + LLVMValueRef param_index, ubyte name, ubyte index) { - if (vertex_dw_stride) { - base_addr = ac_build_imad(&ctx->ac, vertex_index, - vertex_dw_stride, base_addr); - } - - if (param_index) { - base_addr = ac_build_imad(&ctx->ac, param_index, - LLVMConstInt(ctx->ac.i32, 4, 0), base_addr); - } - - int param = name == TGSI_SEMANTIC_PATCH || - name == TGSI_SEMANTIC_TESSINNER || - name == TGSI_SEMANTIC_TESSOUTER ? - si_shader_io_get_unique_index_patch(name, index) : - si_shader_io_get_unique_index(name, index, false); - - /* Add the base address of the element. */ - return LLVMBuildAdd(ctx->ac.builder, base_addr, - LLVMConstInt(ctx->ac.i32, param * 4, 0), ""); + if (vertex_dw_stride) { + base_addr = ac_build_imad(&ctx->ac, vertex_index, vertex_dw_stride, base_addr); + } + + if (param_index) { + base_addr = ac_build_imad(&ctx->ac, param_index, LLVMConstInt(ctx->ac.i32, 4, 0), base_addr); + } + + int param = name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER + ? si_shader_io_get_unique_index_patch(name, index) + : si_shader_io_get_unique_index(name, index, false); + + /* Add the base address of the element. */ + return LLVMBuildAdd(ctx->ac.builder, base_addr, LLVMConstInt(ctx->ac.i32, param * 4, 0), ""); } /* The offchip buffer layout for TCS->TES is @@ -228,98 +211,88 @@ static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context * Note that every attribute has 4 components. 
*/ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, - LLVMValueRef rel_patch_id, - LLVMValueRef vertex_index, + LLVMValueRef rel_patch_id, LLVMValueRef vertex_index, LLVMValueRef param_index) { - LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices; - LLVMValueRef param_stride, constant16; - - vertices_per_patch = get_num_tcs_out_vertices(ctx); - num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6); - total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, - num_patches, ""); - - constant16 = LLVMConstInt(ctx->ac.i32, 16, 0); - if (vertex_index) { - base_addr = ac_build_imad(&ctx->ac, rel_patch_id, - vertices_per_patch, vertex_index); - param_stride = total_vertices; - } else { - base_addr = rel_patch_id; - param_stride = num_patches; - } - - base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr); - base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, ""); - - if (!vertex_index) { - LLVMValueRef patch_data_offset = - si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20); - - base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, - patch_data_offset, ""); - } - return base_addr; + LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices; + LLVMValueRef param_stride, constant16; + + vertices_per_patch = get_num_tcs_out_vertices(ctx); + num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6); + total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, num_patches, ""); + + constant16 = LLVMConstInt(ctx->ac.i32, 16, 0); + if (vertex_index) { + base_addr = ac_build_imad(&ctx->ac, rel_patch_id, vertices_per_patch, vertex_index); + param_stride = total_vertices; + } else { + base_addr = rel_patch_id; + param_stride = num_patches; + } + + base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr); + base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, ""); + + if (!vertex_index) { + LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20); + + base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, patch_data_offset, ""); + } + return base_addr; } -static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices( - struct si_shader_context *ctx, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - ubyte name, ubyte index) +static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(struct si_shader_context *ctx, + LLVMValueRef vertex_index, + LLVMValueRef param_index, + ubyte name, ubyte index) { - unsigned param_index_base; - - param_index_base = name == TGSI_SEMANTIC_PATCH || - name == TGSI_SEMANTIC_TESSINNER || - name == TGSI_SEMANTIC_TESSOUTER ? - si_shader_io_get_unique_index_patch(name, index) : - si_shader_io_get_unique_index(name, index, false); - - if (param_index) { - param_index = LLVMBuildAdd(ctx->ac.builder, param_index, - LLVMConstInt(ctx->ac.i32, param_index_base, 0), - ""); - } else { - param_index = LLVMConstInt(ctx->ac.i32, param_index_base, 0); - } - - return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), - vertex_index, param_index); + unsigned param_index_base; + + param_index_base = name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER + ? 
si_shader_io_get_unique_index_patch(name, index) + : si_shader_io_get_unique_index(name, index, false); + + if (param_index) { + param_index = LLVMBuildAdd(ctx->ac.builder, param_index, + LLVMConstInt(ctx->ac.i32, param_index_base, 0), ""); + } else { + param_index = LLVMConstInt(ctx->ac.i32, param_index_base, 0); + } + + return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), vertex_index, param_index); } -static LLVMValueRef buffer_load(struct si_shader_context *ctx, - LLVMTypeRef type, unsigned swizzle, - LLVMValueRef buffer, LLVMValueRef offset, - LLVMValueRef base, bool can_speculate) +static LLVMValueRef buffer_load(struct si_shader_context *ctx, LLVMTypeRef type, unsigned swizzle, + LLVMValueRef buffer, LLVMValueRef offset, LLVMValueRef base, + bool can_speculate) { - LLVMValueRef value, value2; - LLVMTypeRef vec_type = LLVMVectorType(type, 4); + LLVMValueRef value, value2; + LLVMTypeRef vec_type = LLVMVectorType(type, 4); - if (swizzle == ~0) { - value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, - 0, ac_glc, can_speculate, false); + if (swizzle == ~0) { + value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 0, ac_glc, + can_speculate, false); - return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); - } + return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); + } - if (ac_get_type_size(type) != 8) { - value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, - 0, ac_glc, can_speculate, false); + if (ac_get_type_size(type) != 8) { + value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 0, ac_glc, + can_speculate, false); - value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); - return LLVMBuildExtractElement(ctx->ac.builder, value, - LLVMConstInt(ctx->ac.i32, swizzle, 0), ""); - } + value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); + return LLVMBuildExtractElement(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, swizzle, 0), + ""); + } - value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, - swizzle * 4, ac_glc, can_speculate, false); + value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, swizzle * 4, ac_glc, + can_speculate, false); - value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, - swizzle * 4 + 4, ac_glc, can_speculate, false); + value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, swizzle * 4 + 4, ac_glc, + can_speculate, false); - return si_build_gather_64bit(ctx, type, value, value2); + return si_build_gather_64bit(ctx, type, value, value2); } /** @@ -329,36 +302,34 @@ static LLVMValueRef buffer_load(struct si_shader_context *ctx, * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4 * \param dw_addr address in dwords */ -static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx, - LLVMTypeRef type, unsigned swizzle, - LLVMValueRef dw_addr) +static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx, LLVMTypeRef type, unsigned swizzle, + LLVMValueRef dw_addr) { - LLVMValueRef value; + LLVMValueRef value; - if (swizzle == ~0) { - LLVMValueRef values[4]; + if (swizzle == ~0) { + LLVMValueRef values[4]; - for (unsigned chan = 0; chan < 4; chan++) - values[chan] = lshs_lds_load(ctx, type, chan, dw_addr); + for (unsigned chan = 0; chan < 4; chan++) + values[chan] = lshs_lds_load(ctx, type, chan, dw_addr); - return ac_build_gather_values(&ctx->ac, values, 4); - } + return ac_build_gather_values(&ctx->ac, values, 4); + } - /* Split 64-bit loads. 
*/ - if (ac_get_type_size(type) == 8) { - LLVMValueRef lo, hi; + /* Split 64-bit loads. */ + if (ac_get_type_size(type) == 8) { + LLVMValueRef lo, hi; - lo = lshs_lds_load(ctx, ctx->ac.i32, swizzle, dw_addr); - hi = lshs_lds_load(ctx, ctx->ac.i32, swizzle + 1, dw_addr); - return si_build_gather_64bit(ctx, type, lo, hi); - } + lo = lshs_lds_load(ctx, ctx->ac.i32, swizzle, dw_addr); + hi = lshs_lds_load(ctx, ctx->ac.i32, swizzle + 1, dw_addr); + return si_build_gather_64bit(ctx, type, lo, hi); + } - dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, - LLVMConstInt(ctx->ac.i32, swizzle, 0), ""); + dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, swizzle, 0), ""); - value = ac_lds_load(&ctx->ac, dw_addr); + value = ac_lds_load(&ctx->ac, dw_addr); - return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); + return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); } /** @@ -368,423 +339,367 @@ static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx, * \param dw_addr address in dwords * \param value value to store */ -static void lshs_lds_store(struct si_shader_context *ctx, - unsigned dw_offset_imm, LLVMValueRef dw_addr, - LLVMValueRef value) +static void lshs_lds_store(struct si_shader_context *ctx, unsigned dw_offset_imm, + LLVMValueRef dw_addr, LLVMValueRef value) { - dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, - LLVMConstInt(ctx->ac.i32, dw_offset_imm, 0), ""); + dw_addr = + LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, dw_offset_imm, 0), ""); - ac_lds_store(&ctx->ac, dw_addr, value); + ac_lds_store(&ctx->ac, dw_addr, value); } -enum si_tess_ring { - TCS_FACTOR_RING, - TESS_OFFCHIP_RING_TCS, - TESS_OFFCHIP_RING_TES, +enum si_tess_ring +{ + TCS_FACTOR_RING, + TESS_OFFCHIP_RING_TCS, + TESS_OFFCHIP_RING_TES, }; -static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx, - enum si_tess_ring ring) +static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx, enum si_tess_ring ring) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef addr = ac_get_arg(&ctx->ac, - ring == TESS_OFFCHIP_RING_TES ? - ctx->tes_offchip_addr : - ctx->tcs_out_lds_layout); - - /* TCS only receives high 13 bits of the address. */ - if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) { - addr = LLVMBuildAnd(builder, addr, - LLVMConstInt(ctx->ac.i32, 0xfff80000, 0), ""); - } - - if (ring == TCS_FACTOR_RING) { - unsigned tf_offset = ctx->screen->tess_offchip_ring_size; - addr = LLVMBuildAdd(builder, addr, - LLVMConstInt(ctx->ac.i32, tf_offset, 0), ""); - } - - uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - - if (ctx->screen->info.chip_class >= GFX10) - rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | - S_008F0C_RESOURCE_LEVEL(1); - else - rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); - - LLVMValueRef desc[4]; - desc[0] = addr; - desc[1] = LLVMConstInt(ctx->ac.i32, - S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); - desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0); - desc[3] = LLVMConstInt(ctx->ac.i32, rsrc3, false); - - return ac_build_gather_values(&ctx->ac, desc, 4); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef addr = ac_get_arg( + &ctx->ac, ring == TESS_OFFCHIP_RING_TES ? 
ctx->tes_offchip_addr : ctx->tcs_out_lds_layout); + + /* TCS only receives high 13 bits of the address. */ + if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) { + addr = LLVMBuildAnd(builder, addr, LLVMConstInt(ctx->ac.i32, 0xfff80000, 0), ""); + } + + if (ring == TCS_FACTOR_RING) { + unsigned tf_offset = ctx->screen->tess_offchip_ring_size; + addr = LLVMBuildAdd(builder, addr, LLVMConstInt(ctx->ac.i32, tf_offset, 0), ""); + } + + uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (ctx->screen->info.chip_class >= GFX10) + rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); + else + rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + + LLVMValueRef desc[4]; + desc[0] = addr; + desc[1] = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0); + desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0); + desc[3] = LLVMConstInt(ctx->ac.i32, rsrc3, false); + + return ac_build_gather_values(&ctx->ac, desc, 4); } void si_llvm_preload_tes_rings(struct si_shader_context *ctx) { - ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES); + ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES); } -static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, - LLVMTypeRef type, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned const_index, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - bool is_patch, - bool is_compact, - bool load_input) +static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, LLVMTypeRef type, + LLVMValueRef vertex_index, LLVMValueRef param_index, + unsigned const_index, unsigned location, + unsigned driver_location, unsigned component, + unsigned num_components, bool is_patch, + bool is_compact, bool load_input) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info *info = &ctx->shader->selector->info; - LLVMValueRef dw_addr, stride; - ubyte name, index; - - driver_location = driver_location / 4; - - if (load_input) { - name = info->input_semantic_name[driver_location]; - index = info->input_semantic_index[driver_location]; - } else { - name = info->output_semantic_name[driver_location]; - index = info->output_semantic_index[driver_location]; - } - - assert((name == TGSI_SEMANTIC_PATCH || - name == TGSI_SEMANTIC_TESSINNER || - name == TGSI_SEMANTIC_TESSOUTER) == is_patch); - - if (load_input) { - stride = get_tcs_in_vertex_dw_stride(ctx); - dw_addr = get_tcs_in_current_patch_offset(ctx); - } else { - if (is_patch) { - stride = NULL; - dw_addr = get_tcs_out_current_patch_data_offset(ctx); - } else { - stride = get_tcs_out_vertex_dw_stride(ctx); - dw_addr = get_tcs_out_current_patch_offset(ctx); - } - } - - if (!param_index) { - param_index = LLVMConstInt(ctx->ac.i32, const_index, 0); - } - - dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, - vertex_index, param_index, - name, index); - - LLVMValueRef value[4]; - for (unsigned i = 0; i < num_components; i++) { - unsigned offset = i; - if (ac_get_type_size(type) == 8) - offset *= 2; - - offset += component; - value[i + component] = lshs_lds_load(ctx, type, offset, dw_addr); - } - - return 
ac_build_varying_gather_values(&ctx->ac, value, num_components, component); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + LLVMValueRef dw_addr, stride; + ubyte name, index; + + driver_location = driver_location / 4; + + if (load_input) { + name = info->input_semantic_name[driver_location]; + index = info->input_semantic_index[driver_location]; + } else { + name = info->output_semantic_name[driver_location]; + index = info->output_semantic_index[driver_location]; + } + + assert((name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER) == is_patch); + + if (load_input) { + stride = get_tcs_in_vertex_dw_stride(ctx); + dw_addr = get_tcs_in_current_patch_offset(ctx); + } else { + if (is_patch) { + stride = NULL; + dw_addr = get_tcs_out_current_patch_data_offset(ctx); + } else { + stride = get_tcs_out_vertex_dw_stride(ctx); + dw_addr = get_tcs_out_current_patch_offset(ctx); + } + } + + if (!param_index) { + param_index = LLVMConstInt(ctx->ac.i32, const_index, 0); + } + + dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index, + name, index); + + LLVMValueRef value[4]; + for (unsigned i = 0; i < num_components; i++) { + unsigned offset = i; + if (ac_get_type_size(type) == 8) + offset *= 2; + + offset += component; + value[i + component] = lshs_lds_load(ctx, type, offset, dw_addr); + } + + return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); } -static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, - LLVMTypeRef type, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned const_index, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - bool is_patch, - bool is_compact, - bool load_input) +static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, LLVMTypeRef type, + LLVMValueRef vertex_index, LLVMValueRef param_index, + unsigned const_index, unsigned location, + unsigned driver_location, unsigned component, + unsigned num_components, bool is_patch, bool is_compact, + bool load_input) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info *info = &ctx->shader->selector->info; - LLVMValueRef base, addr; - - driver_location = driver_location / 4; - ubyte name = info->input_semantic_name[driver_location]; - ubyte index = info->input_semantic_index[driver_location]; - - assert((name == TGSI_SEMANTIC_PATCH || - name == TGSI_SEMANTIC_TESSINNER || - name == TGSI_SEMANTIC_TESSOUTER) == is_patch); - - base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); - - if (!param_index) { - param_index = LLVMConstInt(ctx->ac.i32, const_index, 0); - } - - addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, - param_index, - name, index); - - /* TODO: This will generate rather ordinary llvm code, although it - * should be easy for the optimiser to fix up. In future we might want - * to refactor buffer_load(). 
- */ - LLVMValueRef value[4]; - for (unsigned i = 0; i < num_components; i++) { - unsigned offset = i; - if (ac_get_type_size(type) == 8) { - offset *= 2; - if (offset == 4) { - ubyte name = info->input_semantic_name[driver_location + 1]; - ubyte index = info->input_semantic_index[driver_location + 1]; - addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, - vertex_index, - param_index, - name, index); - } - - offset = offset % 4; - } - - offset += component; - value[i + component] = buffer_load(ctx, type, offset, - ctx->tess_offchip_ring, base, addr, true); - } - - return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + LLVMValueRef base, addr; + + driver_location = driver_location / 4; + ubyte name = info->input_semantic_name[driver_location]; + ubyte index = info->input_semantic_index[driver_location]; + + assert((name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER) == is_patch); + + base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); + + if (!param_index) { + param_index = LLVMConstInt(ctx->ac.i32, const_index, 0); + } + + addr = + get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, name, index); + + /* TODO: This will generate rather ordinary llvm code, although it + * should be easy for the optimiser to fix up. In future we might want + * to refactor buffer_load(). + */ + LLVMValueRef value[4]; + for (unsigned i = 0; i < num_components; i++) { + unsigned offset = i; + if (ac_get_type_size(type) == 8) { + offset *= 2; + if (offset == 4) { + ubyte name = info->input_semantic_name[driver_location + 1]; + ubyte index = info->input_semantic_index[driver_location + 1]; + addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, + name, index); + } + + offset = offset % 4; + } + + offset += component; + value[i + component] = + buffer_load(ctx, type, offset, ctx->tess_offchip_ring, base, addr, true); + } + + return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); } -static void si_nir_store_output_tcs(struct ac_shader_abi *abi, - const struct nir_variable *var, - LLVMValueRef vertex_index, - LLVMValueRef param_index, - unsigned const_index, - LLVMValueRef src, - unsigned writemask) +static void si_nir_store_output_tcs(struct ac_shader_abi *abi, const struct nir_variable *var, + LLVMValueRef vertex_index, LLVMValueRef param_index, + unsigned const_index, LLVMValueRef src, unsigned writemask) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info *info = &ctx->shader->selector->info; - const unsigned component = var->data.location_frac; - unsigned driver_location = var->data.driver_location; - LLVMValueRef dw_addr, stride; - LLVMValueRef buffer, base, addr; - LLVMValueRef values[8]; - bool skip_lds_store; - bool is_tess_factor = false, is_tess_inner = false; - - driver_location = driver_location / 4; - ubyte name = info->output_semantic_name[driver_location]; - ubyte index = info->output_semantic_index[driver_location]; - - bool is_const = !param_index; - if (!param_index) - param_index = LLVMConstInt(ctx->ac.i32, const_index, 0); - - const bool is_patch = var->data.patch || - var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || - var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER; - - /* Invalid SPIR-V can cause this. 
*/ - if ((name == TGSI_SEMANTIC_PATCH || - name == TGSI_SEMANTIC_TESSINNER || - name == TGSI_SEMANTIC_TESSOUTER) != is_patch) - return; - - if (!is_patch) { - stride = get_tcs_out_vertex_dw_stride(ctx); - dw_addr = get_tcs_out_current_patch_offset(ctx); - dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, - vertex_index, param_index, - name, index); - - skip_lds_store = !info->reads_pervertex_outputs; - } else { - dw_addr = get_tcs_out_current_patch_data_offset(ctx); - dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, - vertex_index, param_index, - name, index); - - skip_lds_store = !info->reads_perpatch_outputs; - - if (is_const && const_index == 0) { - int name = info->output_semantic_name[driver_location]; - - /* Always write tess factors into LDS for the TCS epilog. */ - if (name == TGSI_SEMANTIC_TESSINNER || - name == TGSI_SEMANTIC_TESSOUTER) { - /* The epilog doesn't read LDS if invocation 0 defines tess factors. */ - skip_lds_store = !info->reads_tessfactor_outputs && - ctx->shader->selector->info.tessfactors_are_def_in_all_invocs; - is_tess_factor = true; - is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; - } - } - } - - buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); - - base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); - - addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, - param_index, name, index); - - for (unsigned chan = component; chan < 8; chan++) { - if (!(writemask & (1 << chan))) - continue; - LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); - - unsigned buffer_store_offset = chan % 4; - if (chan == 4) { - ubyte name = info->output_semantic_name[driver_location + 1]; - ubyte index = info->output_semantic_index[driver_location + 1]; - addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, - vertex_index, - param_index, - name, index); - } - - /* Skip LDS stores if there is no LDS read of this output. */ - if (!skip_lds_store) - lshs_lds_store(ctx, chan, dw_addr, value); - - value = ac_to_integer(&ctx->ac, value); - values[chan] = value; - - if (writemask != 0xF && !is_tess_factor) { - ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, - addr, base, - 4 * buffer_store_offset, - ac_glc); - } - - /* Write tess factors into VGPRs for the epilog. 
*/ - if (is_tess_factor && - ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) { - if (!is_tess_inner) { - LLVMBuildStore(ctx->ac.builder, value, /* outer */ - ctx->invoc0_tess_factors[chan]); - } else if (chan < 2) { - LLVMBuildStore(ctx->ac.builder, value, /* inner */ - ctx->invoc0_tess_factors[4 + chan]); - } - } - } - - if (writemask == 0xF && !is_tess_factor) { - LLVMValueRef value = ac_build_gather_values(&ctx->ac, - values, 4); - ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr, - base, 0, ac_glc); - } + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + const unsigned component = var->data.location_frac; + unsigned driver_location = var->data.driver_location; + LLVMValueRef dw_addr, stride; + LLVMValueRef buffer, base, addr; + LLVMValueRef values[8]; + bool skip_lds_store; + bool is_tess_factor = false, is_tess_inner = false; + + driver_location = driver_location / 4; + ubyte name = info->output_semantic_name[driver_location]; + ubyte index = info->output_semantic_index[driver_location]; + + bool is_const = !param_index; + if (!param_index) + param_index = LLVMConstInt(ctx->ac.i32, const_index, 0); + + const bool is_patch = var->data.patch || var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || + var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER; + + /* Invalid SPIR-V can cause this. */ + if ((name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER || + name == TGSI_SEMANTIC_TESSOUTER) != is_patch) + return; + + if (!is_patch) { + stride = get_tcs_out_vertex_dw_stride(ctx); + dw_addr = get_tcs_out_current_patch_offset(ctx); + dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index, + name, index); + + skip_lds_store = !info->reads_pervertex_outputs; + } else { + dw_addr = get_tcs_out_current_patch_data_offset(ctx); + dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, vertex_index, param_index, + name, index); + + skip_lds_store = !info->reads_perpatch_outputs; + + if (is_const && const_index == 0) { + int name = info->output_semantic_name[driver_location]; + + /* Always write tess factors into LDS for the TCS epilog. */ + if (name == TGSI_SEMANTIC_TESSINNER || name == TGSI_SEMANTIC_TESSOUTER) { + /* The epilog doesn't read LDS if invocation 0 defines tess factors. */ + skip_lds_store = !info->reads_tessfactor_outputs && + ctx->shader->selector->info.tessfactors_are_def_in_all_invocs; + is_tess_factor = true; + is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; + } + } + } + + buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); + + base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); + + addr = + get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, name, index); + + for (unsigned chan = component; chan < 8; chan++) { + if (!(writemask & (1 << chan))) + continue; + LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); + + unsigned buffer_store_offset = chan % 4; + if (chan == 4) { + ubyte name = info->output_semantic_name[driver_location + 1]; + ubyte index = info->output_semantic_index[driver_location + 1]; + addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, + name, index); + } + + /* Skip LDS stores if there is no LDS read of this output. 
*/ + if (!skip_lds_store) + lshs_lds_store(ctx, chan, dw_addr, value); + + value = ac_to_integer(&ctx->ac, value); + values[chan] = value; + + if (writemask != 0xF && !is_tess_factor) { + ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, addr, base, + 4 * buffer_store_offset, ac_glc); + } + + /* Write tess factors into VGPRs for the epilog. */ + if (is_tess_factor && ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) { + if (!is_tess_inner) { + LLVMBuildStore(ctx->ac.builder, value, /* outer */ + ctx->invoc0_tess_factors[chan]); + } else if (chan < 2) { + LLVMBuildStore(ctx->ac.builder, value, /* inner */ + ctx->invoc0_tess_factors[4 + chan]); + } + } + } + + if (writemask == 0xF && !is_tess_factor) { + LLVMValueRef value = ac_build_gather_values(&ctx->ac, values, 4); + ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr, base, 0, ac_glc); + } } static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef coord[4] = { - ac_get_arg(&ctx->ac, ctx->tes_u), - ac_get_arg(&ctx->ac, ctx->tes_v), - ctx->ac.f32_0, - ctx->ac.f32_0 - }; - - /* For triangles, the vector should be (u, v, 1-u-v). */ - if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == - PIPE_PRIM_TRIANGLES) { - coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1, - LLVMBuildFAdd(ctx->ac.builder, - coord[0], coord[1], ""), ""); - } - return ac_build_gather_values(&ctx->ac, coord, 4); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMValueRef coord[4] = {ac_get_arg(&ctx->ac, ctx->tes_u), ac_get_arg(&ctx->ac, ctx->tes_v), + ctx->ac.f32_0, ctx->ac.f32_0}; + + /* For triangles, the vector should be (u, v, 1-u-v). */ + if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_TRIANGLES) { + coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1, + LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), ""); + } + return ac_build_gather_values(&ctx->ac, coord, 4); } -static LLVMValueRef load_tess_level(struct si_shader_context *ctx, - unsigned semantic_name) +static LLVMValueRef load_tess_level(struct si_shader_context *ctx, unsigned semantic_name) { - LLVMValueRef base, addr; - - int param = si_shader_io_get_unique_index_patch(semantic_name, 0); + LLVMValueRef base, addr; - base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); - addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL, - LLVMConstInt(ctx->ac.i32, param, 0)); + int param = si_shader_io_get_unique_index_patch(semantic_name, 0); - return buffer_load(ctx, ctx->ac.f32, - ~0, ctx->tess_offchip_ring, base, addr, true); + base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); + addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL, + LLVMConstInt(ctx->ac.i32, param, 0)); + return buffer_load(ctx, ctx->ac.f32, ~0, ctx->tess_offchip_ring, base, addr, true); } -static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx, - unsigned semantic_name) +static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx, unsigned semantic_name) { - LLVMValueRef buf, slot, val[4]; - int i, offset; - - slot = LLVMConstInt(ctx->ac.i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0); - buf = ac_get_arg(&ctx->ac, ctx->rw_buffers); - buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot); - offset = semantic_name == TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL ? 
4 : 0; - - for (i = 0; i < 4; i++) - val[i] = si_buffer_load_const(ctx, buf, - LLVMConstInt(ctx->ac.i32, (offset + i) * 4, 0)); - return ac_build_gather_values(&ctx->ac, val, 4); + LLVMValueRef buf, slot, val[4]; + int i, offset; + + slot = LLVMConstInt(ctx->ac.i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0); + buf = ac_get_arg(&ctx->ac, ctx->rw_buffers); + buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot); + offset = semantic_name == TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL ? 4 : 0; + + for (i = 0; i < 4; i++) + val[i] = si_buffer_load_const(ctx, buf, LLVMConstInt(ctx->ac.i32, (offset + i) * 4, 0)); + return ac_build_gather_values(&ctx->ac, val, 4); } -static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi, - unsigned varying_id, - bool load_default_state) +static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi, unsigned varying_id, + bool load_default_state) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - unsigned semantic_name; - - if (load_default_state) { - switch (varying_id) { - case VARYING_SLOT_TESS_LEVEL_INNER: - semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL; - break; - case VARYING_SLOT_TESS_LEVEL_OUTER: - semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL; - break; - default: - unreachable("unknown tess level"); - } - return load_tess_level_default(ctx, semantic_name); - } - - switch (varying_id) { - case VARYING_SLOT_TESS_LEVEL_INNER: - semantic_name = TGSI_SEMANTIC_TESSINNER; - break; - case VARYING_SLOT_TESS_LEVEL_OUTER: - semantic_name = TGSI_SEMANTIC_TESSOUTER; - break; - default: - unreachable("unknown tess level"); - } - - return load_tess_level(ctx, semantic_name); - + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + unsigned semantic_name; + + if (load_default_state) { + switch (varying_id) { + case VARYING_SLOT_TESS_LEVEL_INNER: + semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL; + break; + case VARYING_SLOT_TESS_LEVEL_OUTER: + semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL; + break; + default: + unreachable("unknown tess level"); + } + return load_tess_level_default(ctx, semantic_name); + } + + switch (varying_id) { + case VARYING_SLOT_TESS_LEVEL_INNER: + semantic_name = TGSI_SEMANTIC_TESSINNER; + break; + case VARYING_SLOT_TESS_LEVEL_OUTER: + semantic_name = TGSI_SEMANTIC_TESSOUTER; + break; + default: + unreachable("unknown tess level"); + } + + return load_tess_level(ctx, semantic_name); } static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - if (ctx->type == PIPE_SHADER_TESS_CTRL) - return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 13, 6); - else if (ctx->type == PIPE_SHADER_TESS_EVAL) - return get_num_tcs_out_vertices(ctx); - else - unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN"); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + if (ctx->type == PIPE_SHADER_TESS_CTRL) + return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 13, 6); + else if (ctx->type == PIPE_SHADER_TESS_EVAL) + return get_num_tcs_out_vertices(ctx); + else + unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN"); } /** @@ -793,503 +708,450 @@ static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi) */ static void si_copy_tcs_inputs(struct si_shader_context *ctx) { - LLVMValueRef invocation_id, buffer, buffer_offset; - LLVMValueRef lds_vertex_stride, lds_base; - uint64_t inputs; + LLVMValueRef invocation_id, buffer, buffer_offset; + LLVMValueRef lds_vertex_stride, 
lds_base; + uint64_t inputs; - invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5); - buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); - buffer_offset = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); + invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5); + buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); + buffer_offset = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); - lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx); - lds_base = get_tcs_in_current_patch_offset(ctx); - lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride, - lds_base); + lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx); + lds_base = get_tcs_in_current_patch_offset(ctx); + lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride, lds_base); - inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy; - while (inputs) { - unsigned i = u_bit_scan64(&inputs); + inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy; + while (inputs) { + unsigned i = u_bit_scan64(&inputs); - LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base, - LLVMConstInt(ctx->ac.i32, 4 * i, 0), - ""); + LLVMValueRef lds_ptr = + LLVMBuildAdd(ctx->ac.builder, lds_base, LLVMConstInt(ctx->ac.i32, 4 * i, 0), ""); - LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx, - get_rel_patch_id(ctx), - invocation_id, - LLVMConstInt(ctx->ac.i32, i, 0)); + LLVMValueRef buffer_addr = get_tcs_tes_buffer_address( + ctx, get_rel_patch_id(ctx), invocation_id, LLVMConstInt(ctx->ac.i32, i, 0)); - LLVMValueRef value = lshs_lds_load(ctx, ctx->ac.i32, ~0, lds_ptr); + LLVMValueRef value = lshs_lds_load(ctx, ctx->ac.i32, ~0, lds_ptr); - ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr, - buffer_offset, 0, ac_glc); - } + ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr, buffer_offset, 0, + ac_glc); + } } -static void si_write_tess_factors(struct si_shader_context *ctx, - LLVMValueRef rel_patch_id, - LLVMValueRef invocation_id, - LLVMValueRef tcs_out_current_patch_data_offset, - LLVMValueRef invoc0_tf_outer[4], - LLVMValueRef invoc0_tf_inner[2]) +static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef rel_patch_id, + LLVMValueRef invocation_id, + LLVMValueRef tcs_out_current_patch_data_offset, + LLVMValueRef invoc0_tf_outer[4], LLVMValueRef invoc0_tf_inner[2]) { - struct si_shader *shader = ctx->shader; - unsigned tess_inner_index, tess_outer_index; - LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer; - LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4]; - unsigned stride, outer_comps, inner_comps, i, offset; - - /* Add a barrier before loading tess factors from LDS. */ - if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) - si_llvm_emit_barrier(ctx); - - /* Do this only for invocation 0, because the tess levels are per-patch, - * not per-vertex. - * - * This can't jump, because invocation 0 executes this. It should - * at least mask out the loads and stores for other invocations. - */ - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, - invocation_id, ctx->ac.i32_0, ""), 6503); - - /* Determine the layout of one tess factor element in the buffer. 
*/ - switch (shader->key.part.tcs.epilog.prim_mode) { - case PIPE_PRIM_LINES: - stride = 2; /* 2 dwords, 1 vec2 store */ - outer_comps = 2; - inner_comps = 0; - break; - case PIPE_PRIM_TRIANGLES: - stride = 4; /* 4 dwords, 1 vec4 store */ - outer_comps = 3; - inner_comps = 1; - break; - case PIPE_PRIM_QUADS: - stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */ - outer_comps = 4; - inner_comps = 2; - break; - default: - assert(0); - return; - } - - for (i = 0; i < 4; i++) { - inner[i] = LLVMGetUndef(ctx->ac.i32); - outer[i] = LLVMGetUndef(ctx->ac.i32); - } - - if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) { - /* Tess factors are in VGPRs. */ - for (i = 0; i < outer_comps; i++) - outer[i] = out[i] = invoc0_tf_outer[i]; - for (i = 0; i < inner_comps; i++) - inner[i] = out[outer_comps+i] = invoc0_tf_inner[i]; - } else { - /* Load tess_inner and tess_outer from LDS. - * Any invocation can write them, so we can't get them from a temporary. - */ - tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0); - tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0); - - lds_base = tcs_out_current_patch_data_offset; - lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base, - LLVMConstInt(ctx->ac.i32, - tess_inner_index * 4, 0), ""); - lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base, - LLVMConstInt(ctx->ac.i32, - tess_outer_index * 4, 0), ""); - - for (i = 0; i < outer_comps; i++) { - outer[i] = out[i] = - lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer); - } - for (i = 0; i < inner_comps; i++) { - inner[i] = out[outer_comps+i] = - lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner); - } - } - - if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { - /* For isolines, the hardware expects tess factors in the - * reverse order from what NIR specifies. - */ - LLVMValueRef tmp = out[0]; - out[0] = out[1]; - out[1] = tmp; - } - - /* Convert the outputs to vectors for stores. */ - vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4)); - vec1 = NULL; - - if (stride > 4) - vec1 = ac_build_gather_values(&ctx->ac, out+4, stride - 4); - - /* Get the buffer. */ - buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING); - - /* Get the offset. */ - tf_base = ac_get_arg(&ctx->ac, - ctx->tcs_factor_offset); - byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id, - LLVMConstInt(ctx->ac.i32, 4 * stride, 0), ""); - - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, - rel_patch_id, ctx->ac.i32_0, ""), 6504); - - /* Store the dynamic HS control word. */ - offset = 0; - if (ctx->screen->info.chip_class <= GFX8) { - ac_build_buffer_store_dword(&ctx->ac, buffer, - LLVMConstInt(ctx->ac.i32, 0x80000000, 0), - 1, ctx->ac.i32_0, tf_base, - offset, ac_glc); - offset += 4; - } - - ac_build_endif(&ctx->ac, 6504); - - /* Store the tessellation factors. */ - ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, - MIN2(stride, 4), byteoffset, tf_base, - offset, ac_glc); - offset += 16; - if (vec1) - ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, - stride - 4, byteoffset, tf_base, - offset, ac_glc); - - /* Store the tess factors into the offchip buffer if TES reads them. 
*/ - if (shader->key.part.tcs.epilog.tes_reads_tess_factors) { - LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset; - LLVMValueRef tf_inner_offset; - unsigned param_outer, param_inner; - - buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); - base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); - - param_outer = si_shader_io_get_unique_index_patch( - TGSI_SEMANTIC_TESSOUTER, 0); - tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, - LLVMConstInt(ctx->ac.i32, param_outer, 0)); - - unsigned outer_vec_size = - ac_has_vec3_support(ctx->screen->info.chip_class, false) ? - outer_comps : util_next_power_of_two(outer_comps); - outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_vec_size); - - ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, - outer_comps, tf_outer_offset, - base, 0, ac_glc); - if (inner_comps) { - param_inner = si_shader_io_get_unique_index_patch( - TGSI_SEMANTIC_TESSINNER, 0); - tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, - LLVMConstInt(ctx->ac.i32, param_inner, 0)); - - inner_vec = inner_comps == 1 ? inner[0] : - ac_build_gather_values(&ctx->ac, inner, inner_comps); - ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, - inner_comps, tf_inner_offset, - base, 0, ac_glc); - } - } - - ac_build_endif(&ctx->ac, 6503); + struct si_shader *shader = ctx->shader; + unsigned tess_inner_index, tess_outer_index; + LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer; + LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4]; + unsigned stride, outer_comps, inner_comps, i, offset; + + /* Add a barrier before loading tess factors from LDS. */ + if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) + si_llvm_emit_barrier(ctx); + + /* Do this only for invocation 0, because the tess levels are per-patch, + * not per-vertex. + * + * This can't jump, because invocation 0 executes this. It should + * at least mask out the loads and stores for other invocations. + */ + ac_build_ifcc(&ctx->ac, + LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, invocation_id, ctx->ac.i32_0, ""), 6503); + + /* Determine the layout of one tess factor element in the buffer. */ + switch (shader->key.part.tcs.epilog.prim_mode) { + case PIPE_PRIM_LINES: + stride = 2; /* 2 dwords, 1 vec2 store */ + outer_comps = 2; + inner_comps = 0; + break; + case PIPE_PRIM_TRIANGLES: + stride = 4; /* 4 dwords, 1 vec4 store */ + outer_comps = 3; + inner_comps = 1; + break; + case PIPE_PRIM_QUADS: + stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */ + outer_comps = 4; + inner_comps = 2; + break; + default: + assert(0); + return; + } + + for (i = 0; i < 4; i++) { + inner[i] = LLVMGetUndef(ctx->ac.i32); + outer[i] = LLVMGetUndef(ctx->ac.i32); + } + + if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) { + /* Tess factors are in VGPRs. */ + for (i = 0; i < outer_comps; i++) + outer[i] = out[i] = invoc0_tf_outer[i]; + for (i = 0; i < inner_comps; i++) + inner[i] = out[outer_comps + i] = invoc0_tf_inner[i]; + } else { + /* Load tess_inner and tess_outer from LDS. + * Any invocation can write them, so we can't get them from a temporary. 
+ */ + tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0); + tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0); + + lds_base = tcs_out_current_patch_data_offset; + lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base, + LLVMConstInt(ctx->ac.i32, tess_inner_index * 4, 0), ""); + lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base, + LLVMConstInt(ctx->ac.i32, tess_outer_index * 4, 0), ""); + + for (i = 0; i < outer_comps; i++) { + outer[i] = out[i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer); + } + for (i = 0; i < inner_comps; i++) { + inner[i] = out[outer_comps + i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner); + } + } + + if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { + /* For isolines, the hardware expects tess factors in the + * reverse order from what NIR specifies. + */ + LLVMValueRef tmp = out[0]; + out[0] = out[1]; + out[1] = tmp; + } + + /* Convert the outputs to vectors for stores. */ + vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4)); + vec1 = NULL; + + if (stride > 4) + vec1 = ac_build_gather_values(&ctx->ac, out + 4, stride - 4); + + /* Get the buffer. */ + buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING); + + /* Get the offset. */ + tf_base = ac_get_arg(&ctx->ac, ctx->tcs_factor_offset); + byteoffset = + LLVMBuildMul(ctx->ac.builder, rel_patch_id, LLVMConstInt(ctx->ac.i32, 4 * stride, 0), ""); + + ac_build_ifcc(&ctx->ac, + LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, rel_patch_id, ctx->ac.i32_0, ""), 6504); + + /* Store the dynamic HS control word. */ + offset = 0; + if (ctx->screen->info.chip_class <= GFX8) { + ac_build_buffer_store_dword(&ctx->ac, buffer, LLVMConstInt(ctx->ac.i32, 0x80000000, 0), 1, + ctx->ac.i32_0, tf_base, offset, ac_glc); + offset += 4; + } + + ac_build_endif(&ctx->ac, 6504); + + /* Store the tessellation factors. */ + ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, MIN2(stride, 4), byteoffset, tf_base, offset, + ac_glc); + offset += 16; + if (vec1) + ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, stride - 4, byteoffset, tf_base, offset, + ac_glc); + + /* Store the tess factors into the offchip buffer if TES reads them. */ + if (shader->key.part.tcs.epilog.tes_reads_tess_factors) { + LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset; + LLVMValueRef tf_inner_offset; + unsigned param_outer, param_inner; + + buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS); + base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset); + + param_outer = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0); + tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, + LLVMConstInt(ctx->ac.i32, param_outer, 0)); + + unsigned outer_vec_size = ac_has_vec3_support(ctx->screen->info.chip_class, false) + ? outer_comps + : util_next_power_of_two(outer_comps); + outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_vec_size); + + ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, outer_comps, tf_outer_offset, base, 0, + ac_glc); + if (inner_comps) { + param_inner = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0); + tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, + LLVMConstInt(ctx->ac.i32, param_inner, 0)); + + inner_vec = + inner_comps == 1 ? 
inner[0] : ac_build_gather_values(&ctx->ac, inner, inner_comps); + ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, inner_comps, tf_inner_offset, base, + 0, ac_glc); + } + } + + ac_build_endif(&ctx->ac, 6503); } /* This only writes the tessellation factor levels. */ -static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) +static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; - - si_copy_tcs_inputs(ctx); - - rel_patch_id = get_rel_patch_id(ctx); - invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5); - tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); - - if (ctx->screen->info.chip_class >= GFX9) { - LLVMBasicBlockRef blocks[2] = { - LLVMGetInsertBlock(builder), - ctx->merged_wrap_if_entry_block - }; - LLVMValueRef values[2]; - - ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); - - values[0] = rel_patch_id; - values[1] = LLVMGetUndef(ctx->ac.i32); - rel_patch_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks); - - values[0] = tf_lds_offset; - values[1] = LLVMGetUndef(ctx->ac.i32); - tf_lds_offset = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks); - - values[0] = invocation_id; - values[1] = ctx->ac.i32_1; /* cause the epilog to skip threads */ - invocation_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks); - } - - /* Return epilog parameters from this function. */ - LLVMValueRef ret = ctx->return_value; - unsigned vgpr; - - if (ctx->screen->info.chip_class >= GFX9) { - ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, - 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, - 8 + GFX9_SGPR_TCS_OUT_LAYOUT); - /* Tess offchip and tess factor offsets are at the beginning. */ - ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4); - vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1; - } else { - ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, - GFX6_SGPR_TCS_OFFCHIP_LAYOUT); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, - GFX6_SGPR_TCS_OUT_LAYOUT); - /* Tess offchip and tess factor offsets are after user SGPRs. */ - ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, - GFX6_TCS_NUM_USER_SGPR); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, - GFX6_TCS_NUM_USER_SGPR + 1); - vgpr = GFX6_TCS_NUM_USER_SGPR + 2; - } - - /* VGPRs */ - rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id); - invocation_id = ac_to_float(&ctx->ac, invocation_id); - tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset); - - /* Leave a hole corresponding to the two input VGPRs. This ensures that - * the invocation_id output does not alias the tcs_rel_ids input, - * which saves a V_MOV on gfx9. 
- */ - vgpr += 2; - - ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); - ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); - - if (ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) { - vgpr++; /* skip the tess factor LDS offset */ - for (unsigned i = 0; i < 6; i++) { - LLVMValueRef value = - LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], ""); - value = ac_to_float(&ctx->ac, value); - ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, ""); - } - } else { - ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); - } - ctx->return_value = ret; + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; + + si_copy_tcs_inputs(ctx); + + rel_patch_id = get_rel_patch_id(ctx); + invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5); + tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); + + if (ctx->screen->info.chip_class >= GFX9) { + LLVMBasicBlockRef blocks[2] = {LLVMGetInsertBlock(builder), ctx->merged_wrap_if_entry_block}; + LLVMValueRef values[2]; + + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + + values[0] = rel_patch_id; + values[1] = LLVMGetUndef(ctx->ac.i32); + rel_patch_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks); + + values[0] = tf_lds_offset; + values[1] = LLVMGetUndef(ctx->ac.i32); + tf_lds_offset = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks); + + values[0] = invocation_id; + values[1] = ctx->ac.i32_1; /* cause the epilog to skip threads */ + invocation_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks); + } + + /* Return epilog parameters from this function. */ + LLVMValueRef ret = ctx->return_value; + unsigned vgpr; + + if (ctx->screen->info.chip_class >= GFX9) { + ret = + si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT); + /* Tess offchip and tess factor offsets are at the beginning. */ + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4); + vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1; + } else { + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, GFX6_SGPR_TCS_OFFCHIP_LAYOUT); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, GFX6_SGPR_TCS_OUT_LAYOUT); + /* Tess offchip and tess factor offsets are after user SGPRs. */ + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, GFX6_TCS_NUM_USER_SGPR); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, GFX6_TCS_NUM_USER_SGPR + 1); + vgpr = GFX6_TCS_NUM_USER_SGPR + 2; + } + + /* VGPRs */ + rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id); + invocation_id = ac_to_float(&ctx->ac, invocation_id); + tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset); + + /* Leave a hole corresponding to the two input VGPRs. This ensures that + * the invocation_id output does not alias the tcs_rel_ids input, + * which saves a V_MOV on gfx9. 
+ */ + vgpr += 2; + + ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); + ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); + + if (ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) { + vgpr++; /* skip the tess factor LDS offset */ + for (unsigned i = 0; i < 6; i++) { + LLVMValueRef value = LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], ""); + value = ac_to_float(&ctx->ac, value); + ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, ""); + } + } else { + ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); + } + ctx->return_value = ret; } /* Pass TCS inputs from LS to TCS on GFX9. */ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx) { - LLVMValueRef ret = ctx->return_value; - - ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0); - ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2); - ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4); - ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5); - - ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, - 8 + SI_SGPR_RW_BUFFERS); - ret = si_insert_input_ptr(ctx, ret, - ctx->bindless_samplers_and_images, - 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); - - ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits, - 8 + SI_SGPR_VS_STATE_BITS); - - ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, - 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets, - 8 + GFX9_SGPR_TCS_OUT_OFFSETS); - ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, - 8 + GFX9_SGPR_TCS_OUT_LAYOUT); - - unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR; - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - ac_to_float(&ctx->ac, - ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)), - vgpr++, ""); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, - ac_to_float(&ctx->ac, - ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)), - vgpr++, ""); - ctx->return_value = ret; + LLVMValueRef ret = ctx->return_value; + + ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0); + ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2); + ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4); + ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5); + + ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS); + ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images, + 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); + + ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS); + + ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets, 8 + GFX9_SGPR_TCS_OUT_OFFSETS); + ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT); + + unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR; + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)), + vgpr++, ""); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, + ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)), + vgpr++, ""); + ctx->return_value = ret; } -void 
si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs) +void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader *shader = ctx->shader; - struct si_shader_info *info = &shader->selector->info; - unsigned i, chan; - LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id); - LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx); - LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, - vertex_dw_stride, ""); - - /* Write outputs to LDS. The next shader (TCS aka HS) will read - * its inputs from it. */ - for (i = 0; i < info->num_outputs; i++) { - unsigned name = info->output_semantic_name[i]; - unsigned index = info->output_semantic_index[i]; - - /* The ARB_shader_viewport_layer_array spec contains the - * following issue: - * - * 2) What happens if gl_ViewportIndex or gl_Layer is - * written in the vertex shader and a geometry shader is - * present? - * - * RESOLVED: The value written by the last vertex processing - * stage is used. If the last vertex processing stage - * (vertex, tessellation evaluation or geometry) does not - * statically assign to gl_ViewportIndex or gl_Layer, index - * or layer zero is assumed. - * - * So writes to those outputs in VS-as-LS are simply ignored. - */ - if (name == TGSI_SEMANTIC_LAYER || - name == TGSI_SEMANTIC_VIEWPORT_INDEX) - continue; - - int param = si_shader_io_get_unique_index(name, index, false); - LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr, - LLVMConstInt(ctx->ac.i32, param * 4, 0), ""); - - for (chan = 0; chan < 4; chan++) { - if (!(info->output_usagemask[i] & (1 << chan))) - continue; - - lshs_lds_store(ctx, chan, dw_addr, - LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "")); - } - } - - if (ctx->screen->info.chip_class >= GFX9) - si_set_ls_return_value_for_tcs(ctx); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *shader = ctx->shader; + struct si_shader_info *info = &shader->selector->info; + unsigned i, chan; + LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id); + LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx); + LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, vertex_dw_stride, ""); + + /* Write outputs to LDS. The next shader (TCS aka HS) will read + * its inputs from it. */ + for (i = 0; i < info->num_outputs; i++) { + unsigned name = info->output_semantic_name[i]; + unsigned index = info->output_semantic_index[i]; + + /* The ARB_shader_viewport_layer_array spec contains the + * following issue: + * + * 2) What happens if gl_ViewportIndex or gl_Layer is + * written in the vertex shader and a geometry shader is + * present? + * + * RESOLVED: The value written by the last vertex processing + * stage is used. If the last vertex processing stage + * (vertex, tessellation evaluation or geometry) does not + * statically assign to gl_ViewportIndex or gl_Layer, index + * or layer zero is assumed. + * + * So writes to those outputs in VS-as-LS are simply ignored. 
+ */ + if (name == TGSI_SEMANTIC_LAYER || name == TGSI_SEMANTIC_VIEWPORT_INDEX) + continue; + + int param = si_shader_io_get_unique_index(name, index, false); + LLVMValueRef dw_addr = + LLVMBuildAdd(ctx->ac.builder, base_dw_addr, LLVMConstInt(ctx->ac.i32, param * 4, 0), ""); + + for (chan = 0; chan < 4; chan++) { + if (!(info->output_usagemask[i] & (1 << chan))) + continue; + + lshs_lds_store(ctx, chan, dw_addr, + LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "")); + } + } + + if (ctx->screen->info.chip_class >= GFX9) + si_set_ls_return_value_for_tcs(ctx); } /** * Compile the TCS epilog function. This writes tesselation factors to memory * based on the output primitive type of the tesselator (determined by TES). */ -void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, - union si_shader_part_key *key) +void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key) { - memset(&ctx->args, 0, sizeof(ctx->args)); - - if (ctx->screen->info.chip_class >= GFX9) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_factor_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_out_lds_layout); - } else { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_offchip_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_out_lds_layout); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &ctx->tcs_factor_offset); - } - - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */ - struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id); - struct ac_arg invocation_id; /* invocation ID within the patch */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id); - struct ac_arg tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, 
AC_ARG_INT, - &tcs_out_current_patch_data_offset); - - struct ac_arg tess_factors[6]; - for (unsigned i = 0; i < 6; i++) - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]); - - /* Create the function. */ - si_llvm_create_func(ctx, "tcs_epilog", NULL, 0, - ctx->screen->info.chip_class >= GFX7 ? 128 : 0); - ac_declare_lds_as_pointer(&ctx->ac); - - LLVMValueRef invoc0_tess_factors[6]; - for (unsigned i = 0; i < 6; i++) - invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]); - - si_write_tess_factors(ctx, - ac_get_arg(&ctx->ac, rel_patch_id), - ac_get_arg(&ctx->ac, invocation_id), - ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset), - invoc0_tess_factors, invoc0_tess_factors + 4); - - LLVMBuildRetVoid(ctx->ac.builder); + memset(&ctx->args, 0, sizeof(ctx->args)); + + if (ctx->screen->info.chip_class >= GFX9) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */ + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); + } else { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset); + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset); + } + + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */ + struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id); + struct ac_arg invocation_id; /* invocation ID within the patch */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id); + struct ac_arg + tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */ + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tcs_out_current_patch_data_offset); + + struct ac_arg tess_factors[6]; + for (unsigned i = 0; i < 6; i++) + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, 
&tess_factors[i]); + + /* Create the function. */ + si_llvm_create_func(ctx, "tcs_epilog", NULL, 0, ctx->screen->info.chip_class >= GFX7 ? 128 : 0); + ac_declare_lds_as_pointer(&ctx->ac); + + LLVMValueRef invoc0_tess_factors[6]; + for (unsigned i = 0; i < 6; i++) + invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]); + + si_write_tess_factors(ctx, ac_get_arg(&ctx->ac, rel_patch_id), + ac_get_arg(&ctx->ac, invocation_id), + ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset), + invoc0_tess_factors, invoc0_tess_factors + 4); + + LLVMBuildRetVoid(ctx->ac.builder); } void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx) { - ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings; - ctx->abi.load_tess_level = si_load_tess_level; - ctx->abi.store_tcs_outputs = si_nir_store_output_tcs; - ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue; - ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; + ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings; + ctx->abi.load_tess_level = si_load_tess_level; + ctx->abi.store_tcs_outputs = si_nir_store_output_tcs; + ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue; + ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; } void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader) { - ctx->abi.load_tess_varyings = si_nir_load_input_tes; - ctx->abi.load_tess_coord = si_load_tess_coord; - ctx->abi.load_tess_level = si_load_tess_level; - ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; - - if (ctx->shader->key.as_es) - ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; - else if (ngg_cull_shader) - ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; - else if (ctx->shader->key.as_ngg) - ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; - else - ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; + ctx->abi.load_tess_varyings = si_nir_load_input_tes; + ctx->abi.load_tess_coord = si_load_tess_coord; + ctx->abi.load_tess_level = si_load_tess_level; + ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; + + if (ctx->shader->key.as_es) + ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; + else if (ngg_cull_shader) + ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; + else if (ctx->shader->key.as_ngg) + ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; + else + ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index 39c06f41ece..8640150b18c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -22,518 +22,463 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include "si_shader_internal.h" #include "si_pipe.h" +#include "si_shader_internal.h" #include "sid.h" #include "util/u_memory.h" -static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, - LLVMValueRef i32, unsigned index) +static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index) { - assert(index <= 1); + assert(index <= 1); - if (index == 1) - return LLVMBuildAShr(ctx->ac.builder, i32, - LLVMConstInt(ctx->ac.i32, 16, 0), ""); + if (index == 1) + return LLVMBuildAShr(ctx->ac.builder, i32, LLVMConstInt(ctx->ac.i32, 16, 0), ""); - return LLVMBuildSExt(ctx->ac.builder, - LLVMBuildTrunc(ctx->ac.builder, i32, - ctx->ac.i16, ""), - ctx->ac.i32, ""); + return LLVMBuildSExt(ctx->ac.builder, LLVMBuildTrunc(ctx->ac.builder, i32, ctx->ac.i16, ""), + ctx->ac.i32, ""); } -static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, - LLVMValueRef out[4]) +static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, LLVMValueRef out[4]) { - const struct si_shader_info *info = &ctx->shader->selector->info; - unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; - - if (vs_blit_property) { - LLVMValueRef vertex_id = ctx->abi.vertex_id; - LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder, - LLVMIntULE, vertex_id, - ctx->ac.i32_1, ""); - /* Use LLVMIntNE, because we have 3 vertices and only - * the middle one should use y2. - */ - LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, - LLVMIntNE, vertex_id, - ctx->ac.i32_1, ""); - - unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index; - if (input_index == 0) { - /* Position: */ - LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs); - LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 1); - - LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0); - LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1); - LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0); - LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1); - - LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, - x1, x2, ""); - LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, - y1, y2, ""); - - out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, ""); - out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, ""); - out[2] = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 2); - out[3] = ctx->ac.f32_1; - return; - } - - /* Color or texture coordinates: */ - assert(input_index == 1); - - if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { - for (int i = 0; i < 4; i++) { - out[i] = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 3 + i); - } - } else { - assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD); - LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 3); - LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 4); - LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 5); - LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 6); - - out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, - x1, x2, ""); - out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, - y1, y2, ""); - out[2] = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 7); - out[3] = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 8); - } - return; - } - - unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; - union si_vs_fix_fetch fix_fetch; - LLVMValueRef vb_desc; - LLVMValueRef vertex_index; - LLVMValueRef tmp; - - if (input_index < num_vbos_in_user_sgprs) { - vb_desc = 
ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]); - } else { - unsigned index= input_index - num_vbos_in_user_sgprs; - vb_desc = ac_build_load_to_sgpr(&ctx->ac, - ac_get_arg(&ctx->ac, ctx->vertex_buffers), - LLVMConstInt(ctx->ac.i32, index, 0)); - } - - vertex_index = LLVMGetParam(ctx->main_fn, - ctx->vertex_index0.arg_index + - input_index); - - /* Use the open-coded implementation for all loads of doubles and - * of dword-sized data that needs fixups. We need to insert conversion - * code anyway, and the amd/common code does it for us. - * - * Note: On LLVM <= 8, we can only open-code formats with - * channel size >= 4 bytes. - */ - bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index); - fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits; - if (opencode || - (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) || - (fix_fetch.u.log_size == 2)) { - tmp = ac_build_opencoded_load_format( - &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1, - fix_fetch.u.format, fix_fetch.u.reverse, !opencode, - vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true); - for (unsigned i = 0; i < 4; ++i) - out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), ""); - return; - } - - /* Do multiple loads for special formats. */ - unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]); - LLVMValueRef fetches[4]; - unsigned num_fetches; - unsigned fetch_stride; - unsigned channels_per_fetch; - - if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) { - num_fetches = MIN2(required_channels, 3); - fetch_stride = 1 << fix_fetch.u.log_size; - channels_per_fetch = 1; - } else { - num_fetches = 1; - fetch_stride = 0; - channels_per_fetch = required_channels; - } - - for (unsigned i = 0; i < num_fetches; ++i) { - LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0); - fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset, - channels_per_fetch, 0, true); - } - - if (num_fetches == 1 && channels_per_fetch > 1) { - LLVMValueRef fetch = fetches[0]; - for (unsigned i = 0; i < channels_per_fetch; ++i) { - tmp = LLVMConstInt(ctx->ac.i32, i, false); - fetches[i] = LLVMBuildExtractElement( - ctx->ac.builder, fetch, tmp, ""); - } - num_fetches = channels_per_fetch; - channels_per_fetch = 1; - } - - for (unsigned i = num_fetches; i < 4; ++i) - fetches[i] = LLVMGetUndef(ctx->ac.f32); - - if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && - required_channels == 4) { - if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT) - fetches[3] = ctx->ac.i32_1; - else - fetches[3] = ctx->ac.f32_1; - } else if (fix_fetch.u.log_size == 3 && - (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM || - fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED || - fix_fetch.u.format == AC_FETCH_FORMAT_SINT) && - required_channels == 4) { - /* For 2_10_10_10, the hardware returns an unsigned value; - * convert it to a signed one. - */ - LLVMValueRef tmp = fetches[3]; - LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0); - - /* First, recover the sign-extended signed integer value. */ - if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) - tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, ""); - else - tmp = ac_to_integer(&ctx->ac, tmp); - - /* For the integer-like cases, do a natural sign extension. 
- * - * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 - * and happen to contain 0, 1, 2, 3 as the two LSBs of the - * exponent. - */ - tmp = LLVMBuildShl(ctx->ac.builder, tmp, - fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? - LLVMConstInt(ctx->ac.i32, 7, 0) : c30, ""); - tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, ""); - - /* Convert back to the right type. */ - if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) { - LLVMValueRef clamp; - LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0); - tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, ""); - clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, ""); - tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, ""); - } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) { - tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, ""); - } - - fetches[3] = tmp; - } - - for (unsigned i = 0; i < 4; ++i) - out[i] = ac_to_float(&ctx->ac, fetches[i]); + const struct si_shader_info *info = &ctx->shader->selector->info; + unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; + + if (vs_blit_property) { + LLVMValueRef vertex_id = ctx->abi.vertex_id; + LLVMValueRef sel_x1 = + LLVMBuildICmp(ctx->ac.builder, LLVMIntULE, vertex_id, ctx->ac.i32_1, ""); + /* Use LLVMIntNE, because we have 3 vertices and only + * the middle one should use y2. + */ + LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, vertex_id, ctx->ac.i32_1, ""); + + unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index; + if (input_index == 0) { + /* Position: */ + LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs); + LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 1); + + LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0); + LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1); + LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0); + LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1); + + LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, ""); + LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, ""); + + out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, ""); + out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, ""); + out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 2); + out[3] = ctx->ac.f32_1; + return; + } + + /* Color or texture coordinates: */ + assert(input_index == 1); + + if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { + for (int i = 0; i < 4; i++) { + out[i] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3 + i); + } + } else { + assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD); + LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3); + LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 4); + LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 5); + LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 6); + + out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, ""); + out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, ""); + out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 7); + out[3] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 8); + } + return; + } + + unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; + union si_vs_fix_fetch fix_fetch; + LLVMValueRef vb_desc; + LLVMValueRef vertex_index; + LLVMValueRef tmp; + + if (input_index < num_vbos_in_user_sgprs) { + vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]); + } else { + unsigned index = input_index - 
num_vbos_in_user_sgprs; + vb_desc = ac_build_load_to_sgpr(&ctx->ac, ac_get_arg(&ctx->ac, ctx->vertex_buffers), + LLVMConstInt(ctx->ac.i32, index, 0)); + } + + vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + input_index); + + /* Use the open-coded implementation for all loads of doubles and + * of dword-sized data that needs fixups. We need to insert conversion + * code anyway, and the amd/common code does it for us. + * + * Note: On LLVM <= 8, we can only open-code formats with + * channel size >= 4 bytes. + */ + bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index); + fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits; + if (opencode || (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) || + (fix_fetch.u.log_size == 2)) { + tmp = ac_build_opencoded_load_format(&ctx->ac, fix_fetch.u.log_size, + fix_fetch.u.num_channels_m1 + 1, fix_fetch.u.format, + fix_fetch.u.reverse, !opencode, vb_desc, vertex_index, + ctx->ac.i32_0, ctx->ac.i32_0, 0, true); + for (unsigned i = 0; i < 4; ++i) + out[i] = + LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), ""); + return; + } + + /* Do multiple loads for special formats. */ + unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]); + LLVMValueRef fetches[4]; + unsigned num_fetches; + unsigned fetch_stride; + unsigned channels_per_fetch; + + if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) { + num_fetches = MIN2(required_channels, 3); + fetch_stride = 1 << fix_fetch.u.log_size; + channels_per_fetch = 1; + } else { + num_fetches = 1; + fetch_stride = 0; + channels_per_fetch = required_channels; + } + + for (unsigned i = 0; i < num_fetches; ++i) { + LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0); + fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset, + channels_per_fetch, 0, true); + } + + if (num_fetches == 1 && channels_per_fetch > 1) { + LLVMValueRef fetch = fetches[0]; + for (unsigned i = 0; i < channels_per_fetch; ++i) { + tmp = LLVMConstInt(ctx->ac.i32, i, false); + fetches[i] = LLVMBuildExtractElement(ctx->ac.builder, fetch, tmp, ""); + } + num_fetches = channels_per_fetch; + channels_per_fetch = 1; + } + + for (unsigned i = num_fetches; i < 4; ++i) + fetches[i] = LLVMGetUndef(ctx->ac.f32); + + if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && required_channels == 4) { + if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT) + fetches[3] = ctx->ac.i32_1; + else + fetches[3] = ctx->ac.f32_1; + } else if (fix_fetch.u.log_size == 3 && + (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM || + fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED || + fix_fetch.u.format == AC_FETCH_FORMAT_SINT) && + required_channels == 4) { + /* For 2_10_10_10, the hardware returns an unsigned value; + * convert it to a signed one. + */ + LLVMValueRef tmp = fetches[3]; + LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0); + + /* First, recover the sign-extended signed integer value. */ + if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) + tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, ""); + else + tmp = ac_to_integer(&ctx->ac, tmp); + + /* For the integer-like cases, do a natural sign extension. + * + * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 + * and happen to contain 0, 1, 2, 3 as the two LSBs of the + * exponent. 
+ */ + tmp = LLVMBuildShl( + ctx->ac.builder, tmp, + fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(ctx->ac.i32, 7, 0) : c30, ""); + tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, ""); + + /* Convert back to the right type. */ + if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) { + LLVMValueRef clamp; + LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0); + tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, ""); + clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, ""); + tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, ""); + } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) { + tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, ""); + } + + fetches[3] = tmp; + } + + for (unsigned i = 0; i < 4; ++i) + out[i] = ac_to_float(&ctx->ac, fetches[i]); } static void declare_input_vs(struct si_shader_context *ctx, unsigned input_index) { - LLVMValueRef input[4]; + LLVMValueRef input[4]; - load_input_vs(ctx, input_index / 4, input); + load_input_vs(ctx, input_index / 4, input); - for (unsigned chan = 0; chan < 4; chan++) { - ctx->inputs[input_index + chan] = - LLVMBuildBitCast(ctx->ac.builder, input[chan], ctx->ac.i32, ""); - } + for (unsigned chan = 0; chan < 4; chan++) { + ctx->inputs[input_index + chan] = + LLVMBuildBitCast(ctx->ac.builder, input[chan], ctx->ac.i32, ""); + } } void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir) { - uint64_t processed_inputs = 0; - - nir_foreach_variable(variable, &nir->inputs) { - unsigned attrib_count = glsl_count_attribute_slots(variable->type, - true); - unsigned input_idx = variable->data.driver_location; - unsigned loc = variable->data.location; - - for (unsigned i = 0; i < attrib_count; i++) { - /* Packed components share the same location so skip - * them if we have already processed the location. - */ - if (processed_inputs & ((uint64_t)1 << (loc + i))) { - input_idx += 4; - continue; - } - - declare_input_vs(ctx, input_idx); - if (glsl_type_is_dual_slot(variable->type)) { - input_idx += 4; - declare_input_vs(ctx, input_idx); - } - - processed_inputs |= ((uint64_t)1 << (loc + i)); - input_idx += 4; - } - } + uint64_t processed_inputs = 0; + + nir_foreach_variable (variable, &nir->inputs) { + unsigned attrib_count = glsl_count_attribute_slots(variable->type, true); + unsigned input_idx = variable->data.driver_location; + unsigned loc = variable->data.location; + + for (unsigned i = 0; i < attrib_count; i++) { + /* Packed components share the same location so skip + * them if we have already processed the location. 
+ */ + if (processed_inputs & ((uint64_t)1 << (loc + i))) { + input_idx += 4; + continue; + } + + declare_input_vs(ctx, input_idx); + if (glsl_type_is_dual_slot(variable->type)) { + input_idx += 4; + declare_input_vs(ctx, input_idx); + } + + processed_inputs |= ((uint64_t)1 << (loc + i)); + input_idx += 4; + } + } } -void si_llvm_streamout_store_output(struct si_shader_context *ctx, - LLVMValueRef const *so_buffers, - LLVMValueRef const *so_write_offsets, - struct pipe_stream_output *stream_out, - struct si_shader_output_values *shader_out) +void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers, + LLVMValueRef const *so_write_offsets, + struct pipe_stream_output *stream_out, + struct si_shader_output_values *shader_out) { - unsigned buf_idx = stream_out->output_buffer; - unsigned start = stream_out->start_component; - unsigned num_comps = stream_out->num_components; - LLVMValueRef out[4]; - - assert(num_comps && num_comps <= 4); - if (!num_comps || num_comps > 4) - return; - - /* Load the output as int. */ - for (int j = 0; j < num_comps; j++) { - assert(stream_out->stream == shader_out->vertex_stream[start + j]); - - out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]); - } - - /* Pack the output. */ - LLVMValueRef vdata = NULL; - - switch (num_comps) { - case 1: /* as i32 */ - vdata = out[0]; - break; - case 2: /* as v2i32 */ - case 3: /* as v3i32 */ - if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) { - vdata = ac_build_gather_values(&ctx->ac, out, num_comps); - break; - } - /* as v4i32 (aligned to 4) */ - out[3] = LLVMGetUndef(ctx->ac.i32); - /* fall through */ - case 4: /* as v4i32 */ - vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps)); - break; - } - - ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], - vdata, num_comps, - so_write_offsets[buf_idx], - ctx->ac.i32_0, - stream_out->dst_offset * 4, ac_glc | ac_slc); + unsigned buf_idx = stream_out->output_buffer; + unsigned start = stream_out->start_component; + unsigned num_comps = stream_out->num_components; + LLVMValueRef out[4]; + + assert(num_comps && num_comps <= 4); + if (!num_comps || num_comps > 4) + return; + + /* Load the output as int. */ + for (int j = 0; j < num_comps; j++) { + assert(stream_out->stream == shader_out->vertex_stream[start + j]); + + out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]); + } + + /* Pack the output. */ + LLVMValueRef vdata = NULL; + + switch (num_comps) { + case 1: /* as i32 */ + vdata = out[0]; + break; + case 2: /* as v2i32 */ + case 3: /* as v3i32 */ + if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) { + vdata = ac_build_gather_values(&ctx->ac, out, num_comps); + break; + } + /* as v4i32 (aligned to 4) */ + out[3] = LLVMGetUndef(ctx->ac.i32); + /* fall through */ + case 4: /* as v4i32 */ + vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps)); + break; + } + + ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], vdata, num_comps, + so_write_offsets[buf_idx], ctx->ac.i32_0, stream_out->dst_offset * 4, + ac_glc | ac_slc); } /** * Write streamout data to buffers for vertex stream @p stream (different * vertex streams can occur for GS copy shaders). 
*/ -void si_llvm_emit_streamout(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput, unsigned stream) +void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs, + unsigned noutput, unsigned stream) { - struct si_shader_selector *sel = ctx->shader->selector; - struct pipe_stream_output_info *so = &sel->so; - LLVMBuilderRef builder = ctx->ac.builder; - int i; - - /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ - LLVMValueRef so_vtx_count = - si_unpack_param(ctx, ctx->streamout_config, 16, 7); - - LLVMValueRef tid = ac_get_thread_id(&ctx->ac); - - /* can_emit = tid < so_vtx_count; */ - LLVMValueRef can_emit = - LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); - - /* Emit the streamout code conditionally. This actually avoids - * out-of-bounds buffer access. The hw tells us via the SGPR - * (so_vtx_count) which threads are allowed to emit streamout data. */ - ac_build_ifcc(&ctx->ac, can_emit, 6501); - { - /* The buffer offset is computed as follows: - * ByteOffset = streamout_offset[buffer_id]*4 + - * (streamout_write_index + thread_id)*stride[buffer_id] + - * attrib_offset - */ - - LLVMValueRef so_write_index = - ac_get_arg(&ctx->ac, - ctx->streamout_write_index); - - /* Compute (streamout_write_index + thread_id). */ - so_write_index = LLVMBuildAdd(builder, so_write_index, tid, ""); - - /* Load the descriptor and compute the write offset for each - * enabled buffer. */ - LLVMValueRef so_write_offset[4] = {}; - LLVMValueRef so_buffers[4]; - LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, - ctx->rw_buffers); - - for (i = 0; i < 4; i++) { - if (!so->stride[i]) - continue; - - LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, - SI_VS_STREAMOUT_BUF0 + i, 0); - - so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - - LLVMValueRef so_offset = ac_get_arg(&ctx->ac, - ctx->streamout_offset[i]); - so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), ""); - - so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index, - LLVMConstInt(ctx->ac.i32, so->stride[i]*4, 0), - so_offset); - } - - /* Write streamout data. */ - for (i = 0; i < so->num_outputs; i++) { - unsigned reg = so->output[i].register_index; - - if (reg >= noutput) - continue; - - if (stream != so->output[i].stream) - continue; - - si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset, - &so->output[i], &outputs[reg]); - } - } - ac_build_endif(&ctx->ac, 6501); + struct si_shader_selector *sel = ctx->shader->selector; + struct pipe_stream_output_info *so = &sel->so; + LLVMBuilderRef builder = ctx->ac.builder; + int i; + + /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ + LLVMValueRef so_vtx_count = si_unpack_param(ctx, ctx->streamout_config, 16, 7); + + LLVMValueRef tid = ac_get_thread_id(&ctx->ac); + + /* can_emit = tid < so_vtx_count; */ + LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); + + /* Emit the streamout code conditionally. This actually avoids + * out-of-bounds buffer access. The hw tells us via the SGPR + * (so_vtx_count) which threads are allowed to emit streamout data. */ + ac_build_ifcc(&ctx->ac, can_emit, 6501); + { + /* The buffer offset is computed as follows: + * ByteOffset = streamout_offset[buffer_id]*4 + + * (streamout_write_index + thread_id)*stride[buffer_id] + + * attrib_offset + */ + + LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->streamout_write_index); + + /* Compute (streamout_write_index + thread_id). 
*/ + so_write_index = LLVMBuildAdd(builder, so_write_index, tid, ""); + + /* Load the descriptor and compute the write offset for each + * enabled buffer. */ + LLVMValueRef so_write_offset[4] = {}; + LLVMValueRef so_buffers[4]; + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + + for (i = 0; i < 4; i++) { + if (!so->stride[i]) + continue; + + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + i, 0); + + so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); + + LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->streamout_offset[i]); + so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), ""); + + so_write_offset[i] = ac_build_imad( + &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, so->stride[i] * 4, 0), so_offset); + } + + /* Write streamout data. */ + for (i = 0; i < so->num_outputs; i++) { + unsigned reg = so->output[i].register_index; + + if (reg >= noutput) + continue; + + if (stream != so->output[i].stream) + continue; + + si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset, &so->output[i], + &outputs[reg]); + } + } + ac_build_endif(&ctx->ac, 6501); } -static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, - struct ac_export_args *pos, LLVMValueRef *out_elts) +static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, struct ac_export_args *pos, + LLVMValueRef *out_elts) { - unsigned reg_index; - unsigned chan; - unsigned const_chan; - LLVMValueRef base_elt; - LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); - LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, - SI_VS_CONST_CLIP_PLANES, 0); - LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index); - - for (reg_index = 0; reg_index < 2; reg_index ++) { - struct ac_export_args *args = &pos[2 + reg_index]; - - args->out[0] = - args->out[1] = - args->out[2] = - args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f); - - /* Compute dot products of position and user clip plane vectors */ - for (chan = 0; chan < 4; chan++) { - for (const_chan = 0; const_chan < 4; const_chan++) { - LLVMValueRef addr = - LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + - const_chan) * 4, 0); - base_elt = si_buffer_load_const(ctx, const_resource, - addr); - args->out[chan] = ac_build_fmad(&ctx->ac, base_elt, - out_elts[const_chan], args->out[chan]); - } - } - - args->enabled_channels = 0xf; - args->valid_mask = 0; - args->done = 0; - args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index; - args->compr = 0; - } + unsigned reg_index; + unsigned chan; + unsigned const_chan; + LLVMValueRef base_elt; + LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_CLIP_PLANES, 0); + LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index); + + for (reg_index = 0; reg_index < 2; reg_index++) { + struct ac_export_args *args = &pos[2 + reg_index]; + + args->out[0] = args->out[1] = args->out[2] = args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f); + + /* Compute dot products of position and user clip plane vectors */ + for (chan = 0; chan < 4; chan++) { + for (const_chan = 0; const_chan < 4; const_chan++) { + LLVMValueRef addr = + LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + const_chan) * 4, 0); + base_elt = si_buffer_load_const(ctx, const_resource, addr); + args->out[chan] = + ac_build_fmad(&ctx->ac, base_elt, out_elts[const_chan], args->out[chan]); + } + } + + args->enabled_channels = 0xf; + args->valid_mask = 0; + args->done = 
0; + args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index; + args->compr = 0; + } } /* Initialize arguments for the shader export intrinsic */ -static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, - LLVMValueRef *values, - unsigned target, - struct ac_export_args *args) +static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, LLVMValueRef *values, + unsigned target, struct ac_export_args *args) { - args->enabled_channels = 0xf; /* writemask - default is 0xf */ - args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */ - args->done = 0; /* Specify whether this is the last export */ - args->target = target; /* Specify the target we are exporting */ - args->compr = false; + args->enabled_channels = 0xf; /* writemask - default is 0xf */ + args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */ + args->done = 0; /* Specify whether this is the last export */ + args->target = target; /* Specify the target we are exporting */ + args->compr = false; - memcpy(&args->out[0], values, sizeof(values[0]) * 4); + memcpy(&args->out[0], values, sizeof(values[0]) * 4); } -static void si_export_param(struct si_shader_context *ctx, unsigned index, - LLVMValueRef *values) +static void si_export_param(struct si_shader_context *ctx, unsigned index, LLVMValueRef *values) { - struct ac_export_args args; + struct ac_export_args args; - si_llvm_init_vs_export_args(ctx, values, - V_008DFC_SQ_EXP_PARAM + index, &args); - ac_build_export(&ctx->ac, &args); + si_llvm_init_vs_export_args(ctx, values, V_008DFC_SQ_EXP_PARAM + index, &args); + ac_build_export(&ctx->ac, &args); } static void si_build_param_exports(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput) + struct si_shader_output_values *outputs, unsigned noutput) { - struct si_shader *shader = ctx->shader; - unsigned param_count = 0; - - for (unsigned i = 0; i < noutput; i++) { - unsigned semantic_name = outputs[i].semantic_name; - unsigned semantic_index = outputs[i].semantic_index; - - if (outputs[i].vertex_stream[0] != 0 && - outputs[i].vertex_stream[1] != 0 && - outputs[i].vertex_stream[2] != 0 && - outputs[i].vertex_stream[3] != 0) - continue; - - switch (semantic_name) { - case TGSI_SEMANTIC_LAYER: - case TGSI_SEMANTIC_VIEWPORT_INDEX: - case TGSI_SEMANTIC_CLIPDIST: - case TGSI_SEMANTIC_COLOR: - case TGSI_SEMANTIC_BCOLOR: - case TGSI_SEMANTIC_PRIMID: - case TGSI_SEMANTIC_FOG: - case TGSI_SEMANTIC_TEXCOORD: - case TGSI_SEMANTIC_GENERIC: - break; - default: - continue; - } - - if ((semantic_name != TGSI_SEMANTIC_GENERIC || - semantic_index < SI_MAX_IO_GENERIC) && - shader->key.opt.kill_outputs & - (1ull << si_shader_io_get_unique_index(semantic_name, - semantic_index, true))) - continue; - - si_export_param(ctx, param_count, outputs[i].values); - - assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); - shader->info.vs_output_param_offset[i] = param_count++; - } - - shader->info.nr_param_exports = param_count; + struct si_shader *shader = ctx->shader; + unsigned param_count = 0; + + for (unsigned i = 0; i < noutput; i++) { + unsigned semantic_name = outputs[i].semantic_name; + unsigned semantic_index = outputs[i].semantic_index; + + if (outputs[i].vertex_stream[0] != 0 && outputs[i].vertex_stream[1] != 0 && + outputs[i].vertex_stream[2] != 0 && outputs[i].vertex_stream[3] != 0) + continue; + + switch (semantic_name) { + case TGSI_SEMANTIC_LAYER: + case TGSI_SEMANTIC_VIEWPORT_INDEX: + case TGSI_SEMANTIC_CLIPDIST: + case 
TGSI_SEMANTIC_COLOR: + case TGSI_SEMANTIC_BCOLOR: + case TGSI_SEMANTIC_PRIMID: + case TGSI_SEMANTIC_FOG: + case TGSI_SEMANTIC_TEXCOORD: + case TGSI_SEMANTIC_GENERIC: + break; + default: + continue; + } + + if ((semantic_name != TGSI_SEMANTIC_GENERIC || semantic_index < SI_MAX_IO_GENERIC) && + shader->key.opt.kill_outputs & + (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index, true))) + continue; + + si_export_param(ctx, param_count, outputs[i].values); + + assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); + shader->info.vs_output_param_offset[i] = param_count++; + } + + shader->info.nr_param_exports = param_count; } /** @@ -544,296 +489,272 @@ static void si_build_param_exports(struct si_shader_context *ctx, * is true. */ static void si_vertex_color_clamping(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput) + struct si_shader_output_values *outputs, unsigned noutput) { - LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4]; - bool has_colors = false; - - /* Store original colors to alloca variables. */ - for (unsigned i = 0; i < noutput; i++) { - if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && - outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) - continue; - - for (unsigned j = 0; j < 4; j++) { - addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, ""); - LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]); - } - has_colors = true; - } - - if (!has_colors) - return; - - /* The state is in the first bit of the user SGPR. */ - LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits); - cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, ""); - - ac_build_ifcc(&ctx->ac, cond, 6502); - - /* Store clamped colors to alloca variables within the conditional block. */ - for (unsigned i = 0; i < noutput; i++) { - if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && - outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) - continue; - - for (unsigned j = 0; j < 4; j++) { - LLVMBuildStore(ctx->ac.builder, - ac_build_clamp(&ctx->ac, outputs[i].values[j]), - addr[i][j]); - } - } - ac_build_endif(&ctx->ac, 6502); - - /* Load clamped colors */ - for (unsigned i = 0; i < noutput; i++) { - if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && - outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) - continue; - - for (unsigned j = 0; j < 4; j++) { - outputs[i].values[j] = - LLVMBuildLoad(ctx->ac.builder, addr[i][j], ""); - } - } + LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4]; + bool has_colors = false; + + /* Store original colors to alloca variables. */ + for (unsigned i = 0; i < noutput; i++) { + if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && + outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) + continue; + + for (unsigned j = 0; j < 4; j++) { + addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, ""); + LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]); + } + has_colors = true; + } + + if (!has_colors) + return; + + /* The state is in the first bit of the user SGPR. */ + LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits); + cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, ""); + + ac_build_ifcc(&ctx->ac, cond, 6502); + + /* Store clamped colors to alloca variables within the conditional block. 
*/ + for (unsigned i = 0; i < noutput; i++) { + if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && + outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) + continue; + + for (unsigned j = 0; j < 4; j++) { + LLVMBuildStore(ctx->ac.builder, ac_build_clamp(&ctx->ac, outputs[i].values[j]), + addr[i][j]); + } + } + ac_build_endif(&ctx->ac, 6502); + + /* Load clamped colors */ + for (unsigned i = 0; i < noutput; i++) { + if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && + outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) + continue; + + for (unsigned j = 0; j < 4; j++) { + outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addr[i][j], ""); + } + } } /* Generate export instructions for hardware VS shader stage or NGG GS stage * (position and parameter data only). */ void si_llvm_build_vs_exports(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput) + struct si_shader_output_values *outputs, unsigned noutput) { - struct si_shader *shader = ctx->shader; - struct ac_export_args pos_args[4] = {}; - LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; - unsigned pos_idx; - int i; - - si_vertex_color_clamping(ctx, outputs, noutput); - - /* Build position exports. */ - for (i = 0; i < noutput; i++) { - switch (outputs[i].semantic_name) { - case TGSI_SEMANTIC_POSITION: - si_llvm_init_vs_export_args(ctx, outputs[i].values, - V_008DFC_SQ_EXP_POS, &pos_args[0]); - break; - case TGSI_SEMANTIC_PSIZE: - psize_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_LAYER: - layer_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_VIEWPORT_INDEX: - viewport_index_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_EDGEFLAG: - edgeflag_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_CLIPDIST: - if (!shader->key.opt.clip_disable) { - unsigned index = 2 + outputs[i].semantic_index; - si_llvm_init_vs_export_args(ctx, outputs[i].values, - V_008DFC_SQ_EXP_POS + index, - &pos_args[index]); - } - break; - case TGSI_SEMANTIC_CLIPVERTEX: - if (!shader->key.opt.clip_disable) { - si_llvm_emit_clipvertex(ctx, pos_args, - outputs[i].values); - } - break; - } - } - - /* We need to add the position output manually if it's missing. */ - if (!pos_args[0].out[0]) { - pos_args[0].enabled_channels = 0xf; /* writemask */ - pos_args[0].valid_mask = 0; /* EXEC mask */ - pos_args[0].done = 0; /* last export? */ - pos_args[0].target = V_008DFC_SQ_EXP_POS; - pos_args[0].compr = 0; /* COMPR flag */ - pos_args[0].out[0] = ctx->ac.f32_0; /* X */ - pos_args[0].out[1] = ctx->ac.f32_0; /* Y */ - pos_args[0].out[2] = ctx->ac.f32_0; /* Z */ - pos_args[0].out[3] = ctx->ac.f32_1; /* W */ - } - - bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && - !shader->key.as_ngg; - - /* Write the misc vector (point size, edgeflag, layer, viewport). */ - if (shader->selector->info.writes_psize || - pos_writes_edgeflag || - shader->selector->info.writes_viewport_index || - shader->selector->info.writes_layer) { - pos_args[1].enabled_channels = shader->selector->info.writes_psize | - (pos_writes_edgeflag << 1) | - (shader->selector->info.writes_layer << 2); - - pos_args[1].valid_mask = 0; /* EXEC mask */ - pos_args[1].done = 0; /* last export? 
*/ - pos_args[1].target = V_008DFC_SQ_EXP_POS + 1; - pos_args[1].compr = 0; /* COMPR flag */ - pos_args[1].out[0] = ctx->ac.f32_0; /* X */ - pos_args[1].out[1] = ctx->ac.f32_0; /* Y */ - pos_args[1].out[2] = ctx->ac.f32_0; /* Z */ - pos_args[1].out[3] = ctx->ac.f32_0; /* W */ - - if (shader->selector->info.writes_psize) - pos_args[1].out[0] = psize_value; - - if (pos_writes_edgeflag) { - /* The output is a float, but the hw expects an integer - * with the first bit containing the edge flag. */ - edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, - edgeflag_value, - ctx->ac.i32, ""); - edgeflag_value = ac_build_umin(&ctx->ac, - edgeflag_value, - ctx->ac.i32_1); - - /* The LLVM intrinsic expects a float. */ - pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value); - } - - if (ctx->screen->info.chip_class >= GFX9) { - /* GFX9 has the layer in out.z[10:0] and the viewport - * index in out.z[19:16]. - */ - if (shader->selector->info.writes_layer) - pos_args[1].out[2] = layer_value; - - if (shader->selector->info.writes_viewport_index) { - LLVMValueRef v = viewport_index_value; - - v = ac_to_integer(&ctx->ac, v); - v = LLVMBuildShl(ctx->ac.builder, v, - LLVMConstInt(ctx->ac.i32, 16, 0), ""); - v = LLVMBuildOr(ctx->ac.builder, v, - ac_to_integer(&ctx->ac, pos_args[1].out[2]), ""); - pos_args[1].out[2] = ac_to_float(&ctx->ac, v); - pos_args[1].enabled_channels |= 1 << 2; - } - } else { - if (shader->selector->info.writes_layer) - pos_args[1].out[2] = layer_value; - - if (shader->selector->info.writes_viewport_index) { - pos_args[1].out[3] = viewport_index_value; - pos_args[1].enabled_channels |= 1 << 3; - } - } - } - - for (i = 0; i < 4; i++) - if (pos_args[i].out[0]) - shader->info.nr_pos_exports++; - - /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. - * Setting valid_mask=1 prevents it and has no other effect. - */ - if (ctx->screen->info.family == CHIP_NAVI10 || - ctx->screen->info.family == CHIP_NAVI12 || - ctx->screen->info.family == CHIP_NAVI14) - pos_args[0].valid_mask = 1; - - pos_idx = 0; - for (i = 0; i < 4; i++) { - if (!pos_args[i].out[0]) - continue; - - /* Specify the target we are exporting */ - pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++; - - if (pos_idx == shader->info.nr_pos_exports) - /* Specify that this is the last export */ - pos_args[i].done = 1; - - ac_build_export(&ctx->ac, &pos_args[i]); - } - - /* Build parameter exports. */ - si_build_param_exports(ctx, outputs, noutput); + struct si_shader *shader = ctx->shader; + struct ac_export_args pos_args[4] = {}; + LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, + viewport_index_value = NULL; + unsigned pos_idx; + int i; + + si_vertex_color_clamping(ctx, outputs, noutput); + + /* Build position exports. 
*/ + for (i = 0; i < noutput; i++) { + switch (outputs[i].semantic_name) { + case TGSI_SEMANTIC_POSITION: + si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS, &pos_args[0]); + break; + case TGSI_SEMANTIC_PSIZE: + psize_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_LAYER: + layer_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_VIEWPORT_INDEX: + viewport_index_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_EDGEFLAG: + edgeflag_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_CLIPDIST: + if (!shader->key.opt.clip_disable) { + unsigned index = 2 + outputs[i].semantic_index; + si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS + index, + &pos_args[index]); + } + break; + case TGSI_SEMANTIC_CLIPVERTEX: + if (!shader->key.opt.clip_disable) { + si_llvm_emit_clipvertex(ctx, pos_args, outputs[i].values); + } + break; + } + } + + /* We need to add the position output manually if it's missing. */ + if (!pos_args[0].out[0]) { + pos_args[0].enabled_channels = 0xf; /* writemask */ + pos_args[0].valid_mask = 0; /* EXEC mask */ + pos_args[0].done = 0; /* last export? */ + pos_args[0].target = V_008DFC_SQ_EXP_POS; + pos_args[0].compr = 0; /* COMPR flag */ + pos_args[0].out[0] = ctx->ac.f32_0; /* X */ + pos_args[0].out[1] = ctx->ac.f32_0; /* Y */ + pos_args[0].out[2] = ctx->ac.f32_0; /* Z */ + pos_args[0].out[3] = ctx->ac.f32_1; /* W */ + } + + bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && !shader->key.as_ngg; + + /* Write the misc vector (point size, edgeflag, layer, viewport). */ + if (shader->selector->info.writes_psize || pos_writes_edgeflag || + shader->selector->info.writes_viewport_index || shader->selector->info.writes_layer) { + pos_args[1].enabled_channels = shader->selector->info.writes_psize | + (pos_writes_edgeflag << 1) | + (shader->selector->info.writes_layer << 2); + + pos_args[1].valid_mask = 0; /* EXEC mask */ + pos_args[1].done = 0; /* last export? */ + pos_args[1].target = V_008DFC_SQ_EXP_POS + 1; + pos_args[1].compr = 0; /* COMPR flag */ + pos_args[1].out[0] = ctx->ac.f32_0; /* X */ + pos_args[1].out[1] = ctx->ac.f32_0; /* Y */ + pos_args[1].out[2] = ctx->ac.f32_0; /* Z */ + pos_args[1].out[3] = ctx->ac.f32_0; /* W */ + + if (shader->selector->info.writes_psize) + pos_args[1].out[0] = psize_value; + + if (pos_writes_edgeflag) { + /* The output is a float, but the hw expects an integer + * with the first bit containing the edge flag. */ + edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, edgeflag_value, ctx->ac.i32, ""); + edgeflag_value = ac_build_umin(&ctx->ac, edgeflag_value, ctx->ac.i32_1); + + /* The LLVM intrinsic expects a float. */ + pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value); + } + + if (ctx->screen->info.chip_class >= GFX9) { + /* GFX9 has the layer in out.z[10:0] and the viewport + * index in out.z[19:16]. 
+ */ + if (shader->selector->info.writes_layer) + pos_args[1].out[2] = layer_value; + + if (shader->selector->info.writes_viewport_index) { + LLVMValueRef v = viewport_index_value; + + v = ac_to_integer(&ctx->ac, v); + v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, 0), ""); + v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), ""); + pos_args[1].out[2] = ac_to_float(&ctx->ac, v); + pos_args[1].enabled_channels |= 1 << 2; + } + } else { + if (shader->selector->info.writes_layer) + pos_args[1].out[2] = layer_value; + + if (shader->selector->info.writes_viewport_index) { + pos_args[1].out[3] = viewport_index_value; + pos_args[1].enabled_channels |= 1 << 3; + } + } + } + + for (i = 0; i < 4; i++) + if (pos_args[i].out[0]) + shader->info.nr_pos_exports++; + + /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. + * Setting valid_mask=1 prevents it and has no other effect. + */ + if (ctx->screen->info.family == CHIP_NAVI10 || ctx->screen->info.family == CHIP_NAVI12 || + ctx->screen->info.family == CHIP_NAVI14) + pos_args[0].valid_mask = 1; + + pos_idx = 0; + for (i = 0; i < 4; i++) { + if (!pos_args[i].out[0]) + continue; + + /* Specify the target we are exporting */ + pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++; + + if (pos_idx == shader->info.nr_pos_exports) + /* Specify that this is the last export */ + pos_args[i].done = 1; + + ac_build_export(&ctx->ac, &pos_args[i]); + } + + /* Build parameter exports. */ + si_build_param_exports(ctx, outputs, noutput); } -void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs) +void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info *info = &ctx->shader->selector->info; - struct si_shader_output_values *outputs = NULL; - int i,j; - - assert(!ctx->shader->is_gs_copy_shader); - assert(info->num_outputs <= max_outputs); - - outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); - - for (i = 0; i < info->num_outputs; i++) { - outputs[i].semantic_name = info->output_semantic_name[i]; - outputs[i].semantic_index = info->output_semantic_index[i]; - - for (j = 0; j < 4; j++) { - outputs[i].values[j] = - LLVMBuildLoad(ctx->ac.builder, - addrs[4 * i + j], - ""); - outputs[i].vertex_stream[j] = - (info->output_streams[i] >> (2 * j)) & 3; - } - } - - if (!ctx->screen->use_ngg_streamout && - ctx->shader->selector->so.num_outputs) - si_llvm_emit_streamout(ctx, outputs, i, 0); - - /* Export PrimitiveID. 
*/ - if (ctx->shader->key.mono.u.vs_export_prim_id) { - outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; - outputs[i].semantic_index = 0; - outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0)); - for (j = 1; j < 4; j++) - outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0); - - memset(outputs[i].vertex_stream, 0, - sizeof(outputs[i].vertex_stream)); - i++; - } - - si_llvm_build_vs_exports(ctx, outputs, i); - FREE(outputs); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + struct si_shader_output_values *outputs = NULL; + int i, j; + + assert(!ctx->shader->is_gs_copy_shader); + assert(info->num_outputs <= max_outputs); + + outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); + + for (i = 0; i < info->num_outputs; i++) { + outputs[i].semantic_name = info->output_semantic_name[i]; + outputs[i].semantic_index = info->output_semantic_index[i]; + + for (j = 0; j < 4; j++) { + outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], ""); + outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3; + } + } + + if (!ctx->screen->use_ngg_streamout && ctx->shader->selector->so.num_outputs) + si_llvm_emit_streamout(ctx, outputs, i, 0); + + /* Export PrimitiveID. */ + if (ctx->shader->key.mono.u.vs_export_prim_id) { + outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; + outputs[i].semantic_index = 0; + outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0)); + for (j = 1; j < 4; j++) + outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0); + + memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream)); + i++; + } + + si_llvm_build_vs_exports(ctx, outputs, i); + FREE(outputs); } -static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) +static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info *info = &ctx->shader->selector->info; - LLVMValueRef pos[4] = {}; - - assert(info->num_outputs <= max_outputs); - - for (unsigned i = 0; i < info->num_outputs; i++) { - if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION) - continue; - - for (unsigned chan = 0; chan < 4; chan++) - pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - break; - } - assert(pos[0] != NULL); - - /* Return the position output. */ - LLVMValueRef ret = ctx->return_value; - for (unsigned chan = 0; chan < 4; chan++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, ""); - ctx->return_value = ret; + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + LLVMValueRef pos[4] = {}; + + assert(info->num_outputs <= max_outputs); + + for (unsigned i = 0; i < info->num_outputs; i++) { + if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION) + continue; + + for (unsigned chan = 0; chan < 4; chan++) + pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + break; + } + assert(pos[0] != NULL); + + /* Return the position output. 
*/ + LLVMValueRef ret = ctx->return_value; + for (unsigned chan = 0; chan < 4; chan++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, ""); + ctx->return_value = ret; } /** @@ -852,280 +773,252 @@ static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, * (InstanceID + StartInstance), * (InstanceID / 2 + StartInstance) */ -void si_llvm_build_vs_prolog(struct si_shader_context *ctx, - union si_shader_part_key *key) +void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key) { - LLVMTypeRef *returns; - LLVMValueRef ret, func; - int num_returns, i; - unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs; - unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4 + - (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0); - struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs]; - struct ac_arg input_vgpr_param[10]; - LLVMValueRef input_vgprs[10]; - unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + - num_input_vgprs; - unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ - returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * - sizeof(LLVMTypeRef)); - num_returns = 0; - - /* Declare input and output SGPRs. */ - for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &input_sgpr_param[i]); - returns[num_returns++] = ctx->ac.i32; - } - - struct ac_arg merged_wave_info = input_sgpr_param[3]; - - /* Preloaded VGPRs (outputs must be floats) */ - for (i = 0; i < num_input_vgprs; i++) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]); - returns[num_returns++] = ctx->ac.f32; - } - - /* Vertex load indices. */ - for (i = 0; i < key->vs_prolog.num_inputs; i++) - returns[num_returns++] = ctx->ac.f32; - - /* Create the function. */ - si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0); - func = ctx->main_fn; - - for (i = 0; i < num_input_vgprs; i++) { - input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]); - } - - if (key->vs_prolog.num_merged_next_stage_vgprs) { - if (!key->vs_prolog.is_monolithic) - si_init_exec_from_input(ctx, merged_wave_info, 0); - - if (key->vs_prolog.as_ls && - ctx->screen->info.has_ls_vgpr_init_bug) { - /* If there are no HS threads, SPI loads the LS VGPRs - * starting at VGPR 0. Shift them back to where they - * belong. - */ - LLVMValueRef has_hs_threads = - LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, - si_unpack_param(ctx, input_sgpr_param[3], 8, 8), - ctx->ac.i32_0, ""); - - for (i = 4; i > 0; --i) { - input_vgprs[i + 1] = - LLVMBuildSelect(ctx->ac.builder, has_hs_threads, - input_vgprs[i + 1], - input_vgprs[i - 1], ""); - } - } - } - - if (key->vs_prolog.gs_fast_launch_tri_list || - key->vs_prolog.gs_fast_launch_tri_strip) { - LLVMValueRef wave_id, thread_id_in_tg; - - wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4); - thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id, - LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), - ac_get_thread_id(&ctx->ac)); - - /* The GS fast launch initializes all VGPRs to the value of - * the first thread, so we have to add the thread ID. - * - * Only these are initialized by the hw: - * VGPR2: Base Primitive ID - * VGPR5: Base Vertex ID - * VGPR6: Instance ID - */ - - /* Put the vertex thread IDs into VGPRs as-is instead of packing them. 
- * The NGG cull shader will read them from there. - */ - if (key->vs_prolog.gs_fast_launch_tri_list) { - input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */ - LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */ - LLVMConstInt(ctx->ac.i32, 0, 0)); - input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */ - LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */ - LLVMConstInt(ctx->ac.i32, 1, 0)); - input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */ - LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */ - LLVMConstInt(ctx->ac.i32, 2, 0)); - } else { - assert(key->vs_prolog.gs_fast_launch_tri_strip); - LLVMBuilderRef builder = ctx->ac.builder; - /* Triangle indices: */ - LLVMValueRef index[3] = { - thread_id_in_tg, - LLVMBuildAdd(builder, thread_id_in_tg, - LLVMConstInt(ctx->ac.i32, 1, 0), ""), - LLVMBuildAdd(builder, thread_id_in_tg, - LLVMConstInt(ctx->ac.i32, 2, 0), ""), - }; - LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, - thread_id_in_tg, ctx->ac.i1, ""); - LLVMValueRef flatshade_first = - LLVMBuildICmp(builder, LLVMIntEQ, - si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), - ctx->ac.i32_0, ""); - - ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, - flatshade_first, index); - input_vgprs[0] = index[0]; - input_vgprs[1] = index[1]; - input_vgprs[4] = index[2]; - } - - /* Triangles always have all edge flags set initially. */ - input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0); - - input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], - thread_id_in_tg, ""); /* PrimID */ - input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], - thread_id_in_tg, ""); /* VertexID */ - input_vgprs[8] = input_vgprs[6]; /* InstanceID */ - } - - unsigned vertex_id_vgpr = first_vs_vgpr; - unsigned instance_id_vgpr = - ctx->screen->info.chip_class >= GFX10 ? - first_vs_vgpr + 3 : - first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1); - - ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr]; - ctx->abi.instance_id = input_vgprs[instance_id_vgpr]; - - /* InstanceID = VertexID >> 16; - * VertexID = VertexID & 0xffff; - */ - if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) { - ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id, - LLVMConstInt(ctx->ac.i32, 16, 0), ""); - ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id, - LLVMConstInt(ctx->ac.i32, 0xffff, 0), ""); - } - - /* Copy inputs to outputs. This should be no-op, as the registers match, - * but it will prevent the compiler from overwriting them unintentionally. - */ - ret = ctx->return_value; - for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { - LLVMValueRef p = LLVMGetParam(func, i); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); - } - for (i = 0; i < num_input_vgprs; i++) { - LLVMValueRef p = input_vgprs[i]; - - if (i == vertex_id_vgpr) - p = ctx->abi.vertex_id; - else if (i == instance_id_vgpr) - p = ctx->abi.instance_id; - - p = ac_to_float(&ctx->ac, p); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, - key->vs_prolog.num_input_sgprs + i, ""); - } - - /* Compute vertex load indices from instance divisors. 
*/ - LLVMValueRef instance_divisor_constbuf = NULL; - - if (key->vs_prolog.states.instance_divisor_is_fetched) { - LLVMValueRef list = si_prolog_get_rw_buffers(ctx); - LLVMValueRef buf_index = - LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0); - instance_divisor_constbuf = - ac_build_load_to_sgpr(&ctx->ac, list, buf_index); - } - - for (i = 0; i < key->vs_prolog.num_inputs; i++) { - bool divisor_is_one = - key->vs_prolog.states.instance_divisor_is_one & (1u << i); - bool divisor_is_fetched = - key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); - LLVMValueRef index = NULL; - - if (divisor_is_one) { - index = ctx->abi.instance_id; - } else if (divisor_is_fetched) { - LLVMValueRef udiv_factors[4]; - - for (unsigned j = 0; j < 4; j++) { - udiv_factors[j] = - si_buffer_load_const(ctx, instance_divisor_constbuf, - LLVMConstInt(ctx->ac.i32, i*16 + j*4, 0)); - udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]); - } - /* The faster NUW version doesn't work when InstanceID == UINT_MAX. - * Such InstanceID might not be achievable in a reasonable time though. - */ - index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, - udiv_factors[0], udiv_factors[1], - udiv_factors[2], udiv_factors[3]); - } - - if (divisor_is_one || divisor_is_fetched) { - /* Add StartInstance. */ - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMGetParam(ctx->main_fn, user_sgpr_base + - SI_SGPR_START_INSTANCE), ""); - } else { - /* VertexID + BaseVertex */ - index = LLVMBuildAdd(ctx->ac.builder, - ctx->abi.vertex_id, - LLVMGetParam(func, user_sgpr_base + - SI_SGPR_BASE_VERTEX), ""); - } - - index = ac_to_float(&ctx->ac, index); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, - ctx->args.arg_count + i, ""); - } - - si_llvm_build_ret(ctx, ret); + LLVMTypeRef *returns; + LLVMValueRef ret, func; + int num_returns, i; + unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs; + unsigned num_input_vgprs = + key->vs_prolog.num_merged_next_stage_vgprs + 4 + (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0); + struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs]; + struct ac_arg input_vgpr_param[10]; + LLVMValueRef input_vgprs[10]; + unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + num_input_vgprs; + unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ + returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * sizeof(LLVMTypeRef)); + num_returns = 0; + + /* Declare input and output SGPRs. */ + for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &input_sgpr_param[i]); + returns[num_returns++] = ctx->ac.i32; + } + + struct ac_arg merged_wave_info = input_sgpr_param[3]; + + /* Preloaded VGPRs (outputs must be floats) */ + for (i = 0; i < num_input_vgprs; i++) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]); + returns[num_returns++] = ctx->ac.f32; + } + + /* Vertex load indices. */ + for (i = 0; i < key->vs_prolog.num_inputs; i++) + returns[num_returns++] = ctx->ac.f32; + + /* Create the function. 
*/ + si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0); + func = ctx->main_fn; + + for (i = 0; i < num_input_vgprs; i++) { + input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]); + } + + if (key->vs_prolog.num_merged_next_stage_vgprs) { + if (!key->vs_prolog.is_monolithic) + si_init_exec_from_input(ctx, merged_wave_info, 0); + + if (key->vs_prolog.as_ls && ctx->screen->info.has_ls_vgpr_init_bug) { + /* If there are no HS threads, SPI loads the LS VGPRs + * starting at VGPR 0. Shift them back to where they + * belong. + */ + LLVMValueRef has_hs_threads = + LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, + si_unpack_param(ctx, input_sgpr_param[3], 8, 8), ctx->ac.i32_0, ""); + + for (i = 4; i > 0; --i) { + input_vgprs[i + 1] = LLVMBuildSelect(ctx->ac.builder, has_hs_threads, + input_vgprs[i + 1], input_vgprs[i - 1], ""); + } + } + } + + if (key->vs_prolog.gs_fast_launch_tri_list || key->vs_prolog.gs_fast_launch_tri_strip) { + LLVMValueRef wave_id, thread_id_in_tg; + + wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4); + thread_id_in_tg = + ac_build_imad(&ctx->ac, wave_id, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), + ac_get_thread_id(&ctx->ac)); + + /* The GS fast launch initializes all VGPRs to the value of + * the first thread, so we have to add the thread ID. + * + * Only these are initialized by the hw: + * VGPR2: Base Primitive ID + * VGPR5: Base Vertex ID + * VGPR6: Instance ID + */ + + /* Put the vertex thread IDs into VGPRs as-is instead of packing them. + * The NGG cull shader will read them from there. + */ + if (key->vs_prolog.gs_fast_launch_tri_list) { + input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */ + LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */ + LLVMConstInt(ctx->ac.i32, 0, 0)); + input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */ + LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */ + LLVMConstInt(ctx->ac.i32, 1, 0)); + input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */ + LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */ + LLVMConstInt(ctx->ac.i32, 2, 0)); + } else { + assert(key->vs_prolog.gs_fast_launch_tri_strip); + LLVMBuilderRef builder = ctx->ac.builder; + /* Triangle indices: */ + LLVMValueRef index[3] = { + thread_id_in_tg, + LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 1, 0), ""), + LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 2, 0), ""), + }; + LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, thread_id_in_tg, ctx->ac.i1, ""); + LLVMValueRef flatshade_first = LLVMBuildICmp( + builder, LLVMIntEQ, si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), ctx->ac.i32_0, ""); + + ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, index); + input_vgprs[0] = index[0]; + input_vgprs[1] = index[1]; + input_vgprs[4] = index[2]; + } + + /* Triangles always have all edge flags set initially. */ + input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0); + + input_vgprs[2] = + LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], thread_id_in_tg, ""); /* PrimID */ + input_vgprs[5] = + LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], thread_id_in_tg, ""); /* VertexID */ + input_vgprs[8] = input_vgprs[6]; /* InstanceID */ + } + + unsigned vertex_id_vgpr = first_vs_vgpr; + unsigned instance_id_vgpr = ctx->screen->info.chip_class >= GFX10 + ? first_vs_vgpr + 3 + : first_vs_vgpr + (key->vs_prolog.as_ls ? 
2 : 1); + + ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr]; + ctx->abi.instance_id = input_vgprs[instance_id_vgpr]; + + /* InstanceID = VertexID >> 16; + * VertexID = VertexID & 0xffff; + */ + if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) { + ctx->abi.instance_id = + LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id, LLVMConstInt(ctx->ac.i32, 16, 0), ""); + ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id, + LLVMConstInt(ctx->ac.i32, 0xffff, 0), ""); + } + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx->return_value; + for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); + } + for (i = 0; i < num_input_vgprs; i++) { + LLVMValueRef p = input_vgprs[i]; + + if (i == vertex_id_vgpr) + p = ctx->abi.vertex_id; + else if (i == instance_id_vgpr) + p = ctx->abi.instance_id; + + p = ac_to_float(&ctx->ac, p); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, key->vs_prolog.num_input_sgprs + i, ""); + } + + /* Compute vertex load indices from instance divisors. */ + LLVMValueRef instance_divisor_constbuf = NULL; + + if (key->vs_prolog.states.instance_divisor_is_fetched) { + LLVMValueRef list = si_prolog_get_rw_buffers(ctx); + LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0); + instance_divisor_constbuf = ac_build_load_to_sgpr(&ctx->ac, list, buf_index); + } + + for (i = 0; i < key->vs_prolog.num_inputs; i++) { + bool divisor_is_one = key->vs_prolog.states.instance_divisor_is_one & (1u << i); + bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); + LLVMValueRef index = NULL; + + if (divisor_is_one) { + index = ctx->abi.instance_id; + } else if (divisor_is_fetched) { + LLVMValueRef udiv_factors[4]; + + for (unsigned j = 0; j < 4; j++) { + udiv_factors[j] = si_buffer_load_const(ctx, instance_divisor_constbuf, + LLVMConstInt(ctx->ac.i32, i * 16 + j * 4, 0)); + udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]); + } + /* The faster NUW version doesn't work when InstanceID == UINT_MAX. + * Such InstanceID might not be achievable in a reasonable time though. + */ + index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, udiv_factors[0], + udiv_factors[1], udiv_factors[2], udiv_factors[3]); + } + + if (divisor_is_one || divisor_is_fetched) { + /* Add StartInstance. */ + index = + LLVMBuildAdd(ctx->ac.builder, index, + LLVMGetParam(ctx->main_fn, user_sgpr_base + SI_SGPR_START_INSTANCE), ""); + } else { + /* VertexID + BaseVertex */ + index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id, + LLVMGetParam(func, user_sgpr_base + SI_SGPR_BASE_VERTEX), ""); + } + + index = ac_to_float(&ctx->ac, index); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, ctx->args.arg_count + i, ""); + } + + si_llvm_build_ret(ctx, ret); } static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi) { - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - - /* For non-indexed draws, the base vertex set by the driver - * (for direct draws) or the CP (for indirect draws) is the - * first vertex ID, but GLSL expects 0 to be returned. 
- */ - LLVMValueRef vs_state = ac_get_arg(&ctx->ac, - ctx->vs_state_bits); - LLVMValueRef indexed; - - indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, ""); - indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, ""); - - return LLVMBuildSelect(ctx->ac.builder, indexed, - ac_get_arg(&ctx->ac, ctx->args.base_vertex), - ctx->ac.i32_0, ""); + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + + /* For non-indexed draws, the base vertex set by the driver + * (for direct draws) or the CP (for indirect draws) is the + * first vertex ID, but GLSL expects 0 to be returned. + */ + LLVMValueRef vs_state = ac_get_arg(&ctx->ac, ctx->vs_state_bits); + LLVMValueRef indexed; + + indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, ""); + indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, ""); + + return LLVMBuildSelect(ctx->ac.builder, indexed, ac_get_arg(&ctx->ac, ctx->args.base_vertex), + ctx->ac.i32_0, ""); } void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader) { - struct si_shader *shader = ctx->shader; - - if (shader->key.as_ls) - ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; - else if (shader->key.as_es) - ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; - else if (shader->key.opt.vs_as_prim_discard_cs) - ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue; - else if (ngg_cull_shader) - ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; - else if (shader->key.as_ngg) - ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; - else - ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; - - ctx->abi.load_base_vertex = get_base_vertex; + struct si_shader *shader = ctx->shader; + + if (shader->key.as_ls) + ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; + else if (shader->key.as_es) + ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; + else if (shader->key.opt.vs_as_prim_discard_cs) + ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue; + else if (ngg_cull_shader) + ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; + else if (shader->key.as_ngg) + ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; + else + ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; + + ctx->abi.load_base_vertex = get_base_vertex; } diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 49393af3abd..ddbb5c5c9c7 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -22,914 +22,865 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include "si_shader_internal.h" -#include "si_pipe.h" - #include "ac_nir_to_llvm.h" - -#include "tgsi/tgsi_from_mesa.h" - #include "compiler/nir/nir.h" -#include "compiler/nir_types.h" #include "compiler/nir/nir_builder.h" #include "compiler/nir/nir_deref.h" +#include "compiler/nir_types.h" +#include "si_pipe.h" +#include "si_shader_internal.h" +#include "tgsi/tgsi_from_mesa.h" static const nir_deref_instr *tex_get_texture_deref(nir_tex_instr *instr) { - for (unsigned i = 0; i < instr->num_srcs; i++) { - switch (instr->src[i].src_type) { - case nir_tex_src_texture_deref: - return nir_src_as_deref(instr->src[i].src); - default: - break; - } - } - - return NULL; + for (unsigned i = 0; i < instr->num_srcs; i++) { + switch (instr->src[i].src_type) { + case nir_tex_src_texture_deref: + return nir_src_as_deref(instr->src[i].src); + default: + break; + } + } + + return NULL; } -static nir_variable* intrinsic_get_var(nir_intrinsic_instr *instr) +static nir_variable *intrinsic_get_var(nir_intrinsic_instr *instr) { - return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[0])); + return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[0])); } -static void gather_usage_helper(const nir_deref_instr **deref_ptr, - unsigned location, - uint8_t mask, - uint8_t *usage_mask) +static void gather_usage_helper(const nir_deref_instr **deref_ptr, unsigned location, uint8_t mask, + uint8_t *usage_mask) { - for (; *deref_ptr; deref_ptr++) { - const nir_deref_instr *deref = *deref_ptr; - switch (deref->deref_type) { - case nir_deref_type_array: { - unsigned elem_size = - glsl_count_attribute_slots(deref->type, false); - if (nir_src_is_const(deref->arr.index)) { - location += elem_size * nir_src_as_uint(deref->arr.index); - } else { - unsigned array_elems = - glsl_get_length(deref_ptr[-1]->type); - for (unsigned i = 0; i < array_elems; i++) { - gather_usage_helper(deref_ptr + 1, - location + elem_size * i, - mask, usage_mask); - } - return; - } - break; - } - case nir_deref_type_struct: { - const struct glsl_type *parent_type = - deref_ptr[-1]->type; - unsigned index = deref->strct.index; - for (unsigned i = 0; i < index; i++) { - const struct glsl_type *ft = glsl_get_struct_field(parent_type, i); - location += glsl_count_attribute_slots(ft, false); - } - break; - } - default: - unreachable("Unhandled deref type in gather_components_used_helper"); - } - } - - usage_mask[location] |= mask & 0xf; - if (mask & 0xf0) - usage_mask[location + 1] |= (mask >> 4) & 0xf; + for (; *deref_ptr; deref_ptr++) { + const nir_deref_instr *deref = *deref_ptr; + switch (deref->deref_type) { + case nir_deref_type_array: { + unsigned elem_size = glsl_count_attribute_slots(deref->type, false); + if (nir_src_is_const(deref->arr.index)) { + location += elem_size * nir_src_as_uint(deref->arr.index); + } else { + unsigned array_elems = glsl_get_length(deref_ptr[-1]->type); + for (unsigned i = 0; i < array_elems; i++) { + gather_usage_helper(deref_ptr + 1, location + elem_size * i, mask, usage_mask); + } + return; + } + break; + } + case nir_deref_type_struct: { + const struct glsl_type *parent_type = deref_ptr[-1]->type; + unsigned index = deref->strct.index; + for (unsigned i = 0; i < index; i++) { + const struct glsl_type *ft = glsl_get_struct_field(parent_type, i); + location += glsl_count_attribute_slots(ft, false); + } + break; + } + default: + unreachable("Unhandled deref type in gather_components_used_helper"); + } + } + + usage_mask[location] |= mask & 0xf; + if (mask & 0xf0) + usage_mask[location + 1] |= (mask >> 
4) & 0xf; } -static void gather_usage(const nir_deref_instr *deref, - uint8_t mask, - uint8_t *usage_mask) +static void gather_usage(const nir_deref_instr *deref, uint8_t mask, uint8_t *usage_mask) { - nir_deref_path path; - nir_deref_path_init(&path, (nir_deref_instr *)deref, NULL); - - unsigned location_frac = path.path[0]->var->data.location_frac; - if (glsl_type_is_64bit(deref->type)) { - uint8_t new_mask = 0; - for (unsigned i = 0; i < 4; i++) { - if (mask & (1 << i)) - new_mask |= 0x3 << (2 * i); - } - mask = new_mask << location_frac; - } else { - mask <<= location_frac; - mask &= 0xf; - } - - gather_usage_helper((const nir_deref_instr **)&path.path[1], - path.path[0]->var->data.driver_location, - mask, usage_mask); - - nir_deref_path_finish(&path); + nir_deref_path path; + nir_deref_path_init(&path, (nir_deref_instr *)deref, NULL); + + unsigned location_frac = path.path[0]->var->data.location_frac; + if (glsl_type_is_64bit(deref->type)) { + uint8_t new_mask = 0; + for (unsigned i = 0; i < 4; i++) { + if (mask & (1 << i)) + new_mask |= 0x3 << (2 * i); + } + mask = new_mask << location_frac; + } else { + mask <<= location_frac; + mask &= 0xf; + } + + gather_usage_helper((const nir_deref_instr **)&path.path[1], + path.path[0]->var->data.driver_location, mask, usage_mask); + + nir_deref_path_finish(&path); } static void gather_intrinsic_load_deref_input_info(const nir_shader *nir, - const nir_intrinsic_instr *instr, - const nir_deref_instr *deref, - struct si_shader_info *info) + const nir_intrinsic_instr *instr, + const nir_deref_instr *deref, + struct si_shader_info *info) { - switch (nir->info.stage) { - case MESA_SHADER_VERTEX: - gather_usage(deref, nir_ssa_def_components_read(&instr->dest.ssa), - info->input_usage_mask); - default:; - } + switch (nir->info.stage) { + case MESA_SHADER_VERTEX: + gather_usage(deref, nir_ssa_def_components_read(&instr->dest.ssa), info->input_usage_mask); + default:; + } } static void gather_intrinsic_load_deref_output_info(const nir_shader *nir, - const nir_intrinsic_instr *instr, - nir_variable *var, - struct si_shader_info *info) + const nir_intrinsic_instr *instr, + nir_variable *var, struct si_shader_info *info) { - assert(var && var->data.mode == nir_var_shader_out); - - switch (nir->info.stage) { - case MESA_SHADER_TESS_CTRL: - if (var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || - var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER) - info->reads_tessfactor_outputs = true; - else if (var->data.patch) - info->reads_perpatch_outputs = true; - else - info->reads_pervertex_outputs = true; - break; - - case MESA_SHADER_FRAGMENT: - if (var->data.fb_fetch_output) - info->uses_fbfetch = true; - break; - default:; - } + assert(var && var->data.mode == nir_var_shader_out); + + switch (nir->info.stage) { + case MESA_SHADER_TESS_CTRL: + if (var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || + var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER) + info->reads_tessfactor_outputs = true; + else if (var->data.patch) + info->reads_perpatch_outputs = true; + else + info->reads_pervertex_outputs = true; + break; + + case MESA_SHADER_FRAGMENT: + if (var->data.fb_fetch_output) + info->uses_fbfetch = true; + break; + default:; + } } static void gather_intrinsic_store_deref_output_info(const nir_shader *nir, - const nir_intrinsic_instr *instr, - const nir_deref_instr *deref, - struct si_shader_info *info) + const nir_intrinsic_instr *instr, + const nir_deref_instr *deref, + struct si_shader_info *info) { - switch (nir->info.stage) { - case 
MESA_SHADER_VERTEX: /* needed by LS, ES */ - case MESA_SHADER_TESS_EVAL: /* needed by ES */ - case MESA_SHADER_GEOMETRY: - gather_usage(deref, nir_intrinsic_write_mask(instr), - info->output_usagemask); - break; - default:; - } + switch (nir->info.stage) { + case MESA_SHADER_VERTEX: /* needed by LS, ES */ + case MESA_SHADER_TESS_EVAL: /* needed by ES */ + case MESA_SHADER_GEOMETRY: + gather_usage(deref, nir_intrinsic_write_mask(instr), info->output_usagemask); + break; + default:; + } } -static void scan_instruction(const struct nir_shader *nir, - struct si_shader_info *info, - nir_instr *instr) +static void scan_instruction(const struct nir_shader *nir, struct si_shader_info *info, + nir_instr *instr) { - if (instr->type == nir_instr_type_alu) { - nir_alu_instr *alu = nir_instr_as_alu(instr); - - switch (alu->op) { - case nir_op_fddx: - case nir_op_fddy: - case nir_op_fddx_fine: - case nir_op_fddy_fine: - case nir_op_fddx_coarse: - case nir_op_fddy_coarse: - info->uses_derivatives = true; - break; - default: - break; - } - } else if (instr->type == nir_instr_type_tex) { - nir_tex_instr *tex = nir_instr_as_tex(instr); - const nir_deref_instr *deref = tex_get_texture_deref(tex); - nir_variable *var = deref ? nir_deref_instr_get_variable(deref) : NULL; - - if (!var) { - info->samplers_declared |= - u_bit_consecutive(tex->sampler_index, 1); - } else { - if (deref->mode != nir_var_uniform || var->data.bindless) - info->uses_bindless_samplers = true; - } - - switch (tex->op) { - case nir_texop_tex: - case nir_texop_txb: - case nir_texop_lod: - info->uses_derivatives = true; - break; - default: - break; - } - } else if (instr->type == nir_instr_type_intrinsic) { - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_load_front_face: - info->uses_frontface = 1; - break; - case nir_intrinsic_load_instance_id: - info->uses_instanceid = 1; - break; - case nir_intrinsic_load_invocation_id: - info->uses_invocationid = true; - break; - case nir_intrinsic_load_num_work_groups: - info->uses_grid_size = true; - break; - case nir_intrinsic_load_local_invocation_index: - case nir_intrinsic_load_subgroup_id: - case nir_intrinsic_load_num_subgroups: - info->uses_subgroup_info = true; - break; - case nir_intrinsic_load_local_group_size: - /* The block size is translated to IMM with a fixed block size. 
*/ - if (info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0) - info->uses_block_size = true; - break; - case nir_intrinsic_load_local_invocation_id: - case nir_intrinsic_load_work_group_id: { - unsigned mask = nir_ssa_def_components_read(&intr->dest.ssa); - while (mask) { - unsigned i = u_bit_scan(&mask); - - if (intr->intrinsic == nir_intrinsic_load_work_group_id) - info->uses_block_id[i] = true; - else - info->uses_thread_id[i] = true; - } - break; - } - case nir_intrinsic_load_vertex_id: - info->uses_vertexid = 1; - break; - case nir_intrinsic_load_vertex_id_zero_base: - info->uses_vertexid_nobase = 1; - break; - case nir_intrinsic_load_base_vertex: - info->uses_basevertex = 1; - break; - case nir_intrinsic_load_draw_id: - info->uses_drawid = 1; - break; - case nir_intrinsic_load_primitive_id: - info->uses_primid = 1; - break; - case nir_intrinsic_load_sample_mask_in: - info->reads_samplemask = true; - break; - case nir_intrinsic_load_tess_level_inner: - case nir_intrinsic_load_tess_level_outer: - info->reads_tess_factors = true; - break; - case nir_intrinsic_bindless_image_load: - case nir_intrinsic_bindless_image_size: - case nir_intrinsic_bindless_image_samples: - info->uses_bindless_images = true; - break; - case nir_intrinsic_bindless_image_store: - info->uses_bindless_images = true; - info->writes_memory = true; - info->num_memory_instructions++; /* we only care about stores */ - break; - case nir_intrinsic_image_deref_store: - info->writes_memory = true; - info->num_memory_instructions++; /* we only care about stores */ - break; - case nir_intrinsic_bindless_image_atomic_add: - case nir_intrinsic_bindless_image_atomic_imin: - case nir_intrinsic_bindless_image_atomic_umin: - case nir_intrinsic_bindless_image_atomic_imax: - case nir_intrinsic_bindless_image_atomic_umax: - case nir_intrinsic_bindless_image_atomic_and: - case nir_intrinsic_bindless_image_atomic_or: - case nir_intrinsic_bindless_image_atomic_xor: - case nir_intrinsic_bindless_image_atomic_exchange: - case nir_intrinsic_bindless_image_atomic_comp_swap: - info->uses_bindless_images = true; - info->writes_memory = true; - info->num_memory_instructions++; /* we only care about stores */ - break; - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_imin: - case nir_intrinsic_image_deref_atomic_umin: - case nir_intrinsic_image_deref_atomic_imax: - case nir_intrinsic_image_deref_atomic_umax: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: - case nir_intrinsic_image_deref_atomic_inc_wrap: - case nir_intrinsic_image_deref_atomic_dec_wrap: - info->writes_memory = true; - info->num_memory_instructions++; /* we only care about stores */ - break; - case nir_intrinsic_store_ssbo: - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: - info->writes_memory = true; - info->num_memory_instructions++; /* we only care about stores */ - break; - case nir_intrinsic_load_color0: - case nir_intrinsic_load_color1: { - unsigned index = intr->intrinsic == nir_intrinsic_load_color1; - 
uint8_t mask = nir_ssa_def_components_read(&intr->dest.ssa); - info->colors_read |= mask << (index * 4); - break; - } - case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_sample: - case nir_intrinsic_load_barycentric_at_offset: /* uses center */ - case nir_intrinsic_load_barycentric_at_sample: { /* uses center */ - unsigned mode = nir_intrinsic_interp_mode(intr); - - if (mode == INTERP_MODE_FLAT) - break; - - if (mode == INTERP_MODE_NOPERSPECTIVE) { - if (intr->intrinsic == nir_intrinsic_load_barycentric_sample) - info->uses_linear_sample = true; - else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid) - info->uses_linear_centroid = true; - else - info->uses_linear_center = true; - - if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample) - info->uses_linear_opcode_interp_sample = true; - } else { - if (intr->intrinsic == nir_intrinsic_load_barycentric_sample) - info->uses_persp_sample = true; - else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid) - info->uses_persp_centroid = true; - else - info->uses_persp_center = true; - - if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample) - info->uses_persp_opcode_interp_sample = true; - } - break; - } - case nir_intrinsic_load_deref: { - nir_variable *var = intrinsic_get_var(intr); - nir_variable_mode mode = var->data.mode; - - if (mode == nir_var_shader_in) { - /* PS inputs use the interpolated load intrinsics. */ - assert(nir->info.stage != MESA_SHADER_FRAGMENT); - gather_intrinsic_load_deref_input_info(nir, intr, - nir_src_as_deref(intr->src[0]), info); - } else if (mode == nir_var_shader_out) { - gather_intrinsic_load_deref_output_info(nir, intr, var, info); - } - break; - } - case nir_intrinsic_store_deref: { - nir_variable *var = intrinsic_get_var(intr); - - if (var->data.mode == nir_var_shader_out) - gather_intrinsic_store_deref_output_info(nir, intr, - nir_src_as_deref(intr->src[0]), info); - break; - } - case nir_intrinsic_interp_deref_at_centroid: - case nir_intrinsic_interp_deref_at_sample: - case nir_intrinsic_interp_deref_at_offset: - unreachable("interp opcodes should have been lowered"); - break; - default: - break; - } - } + if (instr->type == nir_instr_type_alu) { + nir_alu_instr *alu = nir_instr_as_alu(instr); + + switch (alu->op) { + case nir_op_fddx: + case nir_op_fddy: + case nir_op_fddx_fine: + case nir_op_fddy_fine: + case nir_op_fddx_coarse: + case nir_op_fddy_coarse: + info->uses_derivatives = true; + break; + default: + break; + } + } else if (instr->type == nir_instr_type_tex) { + nir_tex_instr *tex = nir_instr_as_tex(instr); + const nir_deref_instr *deref = tex_get_texture_deref(tex); + nir_variable *var = deref ? 
nir_deref_instr_get_variable(deref) : NULL; + + if (!var) { + info->samplers_declared |= u_bit_consecutive(tex->sampler_index, 1); + } else { + if (deref->mode != nir_var_uniform || var->data.bindless) + info->uses_bindless_samplers = true; + } + + switch (tex->op) { + case nir_texop_tex: + case nir_texop_txb: + case nir_texop_lod: + info->uses_derivatives = true; + break; + default: + break; + } + } else if (instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + switch (intr->intrinsic) { + case nir_intrinsic_load_front_face: + info->uses_frontface = 1; + break; + case nir_intrinsic_load_instance_id: + info->uses_instanceid = 1; + break; + case nir_intrinsic_load_invocation_id: + info->uses_invocationid = true; + break; + case nir_intrinsic_load_num_work_groups: + info->uses_grid_size = true; + break; + case nir_intrinsic_load_local_invocation_index: + case nir_intrinsic_load_subgroup_id: + case nir_intrinsic_load_num_subgroups: + info->uses_subgroup_info = true; + break; + case nir_intrinsic_load_local_group_size: + /* The block size is translated to IMM with a fixed block size. */ + if (info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0) + info->uses_block_size = true; + break; + case nir_intrinsic_load_local_invocation_id: + case nir_intrinsic_load_work_group_id: { + unsigned mask = nir_ssa_def_components_read(&intr->dest.ssa); + while (mask) { + unsigned i = u_bit_scan(&mask); + + if (intr->intrinsic == nir_intrinsic_load_work_group_id) + info->uses_block_id[i] = true; + else + info->uses_thread_id[i] = true; + } + break; + } + case nir_intrinsic_load_vertex_id: + info->uses_vertexid = 1; + break; + case nir_intrinsic_load_vertex_id_zero_base: + info->uses_vertexid_nobase = 1; + break; + case nir_intrinsic_load_base_vertex: + info->uses_basevertex = 1; + break; + case nir_intrinsic_load_draw_id: + info->uses_drawid = 1; + break; + case nir_intrinsic_load_primitive_id: + info->uses_primid = 1; + break; + case nir_intrinsic_load_sample_mask_in: + info->reads_samplemask = true; + break; + case nir_intrinsic_load_tess_level_inner: + case nir_intrinsic_load_tess_level_outer: + info->reads_tess_factors = true; + break; + case nir_intrinsic_bindless_image_load: + case nir_intrinsic_bindless_image_size: + case nir_intrinsic_bindless_image_samples: + info->uses_bindless_images = true; + break; + case nir_intrinsic_bindless_image_store: + info->uses_bindless_images = true; + info->writes_memory = true; + info->num_memory_instructions++; /* we only care about stores */ + break; + case nir_intrinsic_image_deref_store: + info->writes_memory = true; + info->num_memory_instructions++; /* we only care about stores */ + break; + case nir_intrinsic_bindless_image_atomic_add: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_bindless_image_atomic_and: + case nir_intrinsic_bindless_image_atomic_or: + case nir_intrinsic_bindless_image_atomic_xor: + case nir_intrinsic_bindless_image_atomic_exchange: + case nir_intrinsic_bindless_image_atomic_comp_swap: + info->uses_bindless_images = true; + info->writes_memory = true; + info->num_memory_instructions++; /* we only care about stores */ + break; + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case 
nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_inc_wrap: + case nir_intrinsic_image_deref_atomic_dec_wrap: + info->writes_memory = true; + info->num_memory_instructions++; /* we only care about stores */ + break; + case nir_intrinsic_store_ssbo: + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + info->writes_memory = true; + info->num_memory_instructions++; /* we only care about stores */ + break; + case nir_intrinsic_load_color0: + case nir_intrinsic_load_color1: { + unsigned index = intr->intrinsic == nir_intrinsic_load_color1; + uint8_t mask = nir_ssa_def_components_read(&intr->dest.ssa); + info->colors_read |= mask << (index * 4); + break; + } + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_offset: /* uses center */ + case nir_intrinsic_load_barycentric_at_sample: { /* uses center */ + unsigned mode = nir_intrinsic_interp_mode(intr); + + if (mode == INTERP_MODE_FLAT) + break; + + if (mode == INTERP_MODE_NOPERSPECTIVE) { + if (intr->intrinsic == nir_intrinsic_load_barycentric_sample) + info->uses_linear_sample = true; + else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid) + info->uses_linear_centroid = true; + else + info->uses_linear_center = true; + + if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample) + info->uses_linear_opcode_interp_sample = true; + } else { + if (intr->intrinsic == nir_intrinsic_load_barycentric_sample) + info->uses_persp_sample = true; + else if (intr->intrinsic == nir_intrinsic_load_barycentric_centroid) + info->uses_persp_centroid = true; + else + info->uses_persp_center = true; + + if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample) + info->uses_persp_opcode_interp_sample = true; + } + break; + } + case nir_intrinsic_load_deref: { + nir_variable *var = intrinsic_get_var(intr); + nir_variable_mode mode = var->data.mode; + + if (mode == nir_var_shader_in) { + /* PS inputs use the interpolated load intrinsics. 
*/ + assert(nir->info.stage != MESA_SHADER_FRAGMENT); + gather_intrinsic_load_deref_input_info(nir, intr, nir_src_as_deref(intr->src[0]), info); + } else if (mode == nir_var_shader_out) { + gather_intrinsic_load_deref_output_info(nir, intr, var, info); + } + break; + } + case nir_intrinsic_store_deref: { + nir_variable *var = intrinsic_get_var(intr); + + if (var->data.mode == nir_var_shader_out) + gather_intrinsic_store_deref_output_info(nir, intr, nir_src_as_deref(intr->src[0]), + info); + break; + } + case nir_intrinsic_interp_deref_at_centroid: + case nir_intrinsic_interp_deref_at_sample: + case nir_intrinsic_interp_deref_at_offset: + unreachable("interp opcodes should have been lowered"); + break; + default: + break; + } + } } -static void scan_output_slot(const nir_variable *var, - unsigned var_idx, - unsigned component, unsigned num_components, - struct si_shader_info *info) +static void scan_output_slot(const nir_variable *var, unsigned var_idx, unsigned component, + unsigned num_components, struct si_shader_info *info) { - assert(component + num_components <= 4); - assert(component < 4); - - unsigned semantic_name, semantic_index; - - unsigned location = var->data.location + var_idx; - unsigned drv_location = var->data.driver_location + var_idx; - - if (info->processor == PIPE_SHADER_FRAGMENT) { - tgsi_get_gl_frag_result_semantic(location, - &semantic_name, &semantic_index); - - /* Adjust for dual source blending */ - if (var->data.index > 0) { - semantic_index++; - } - } else { - tgsi_get_gl_varying_semantic(location, true, - &semantic_name, &semantic_index); - } - - ubyte usagemask = ((1 << num_components) - 1) << component; - - unsigned gs_out_streams; - if (var->data.stream & NIR_STREAM_PACKED) { - gs_out_streams = var->data.stream & ~NIR_STREAM_PACKED; - } else { - assert(var->data.stream < 4); - gs_out_streams = 0; - for (unsigned j = 0; j < num_components; ++j) - gs_out_streams |= var->data.stream << (2 * (component + j)); - } - - unsigned streamx = gs_out_streams & 3; - unsigned streamy = (gs_out_streams >> 2) & 3; - unsigned streamz = (gs_out_streams >> 4) & 3; - unsigned streamw = (gs_out_streams >> 6) & 3; - - if (usagemask & TGSI_WRITEMASK_X) { - info->output_streams[drv_location] |= streamx; - info->num_stream_output_components[streamx]++; - } - if (usagemask & TGSI_WRITEMASK_Y) { - info->output_streams[drv_location] |= streamy << 2; - info->num_stream_output_components[streamy]++; - } - if (usagemask & TGSI_WRITEMASK_Z) { - info->output_streams[drv_location] |= streamz << 4; - info->num_stream_output_components[streamz]++; - } - if (usagemask & TGSI_WRITEMASK_W) { - info->output_streams[drv_location] |= streamw << 6; - info->num_stream_output_components[streamw]++; - } - - info->output_semantic_name[drv_location] = semantic_name; - info->output_semantic_index[drv_location] = semantic_index; - - switch (semantic_name) { - case TGSI_SEMANTIC_PRIMID: - info->writes_primid = true; - break; - case TGSI_SEMANTIC_VIEWPORT_INDEX: - info->writes_viewport_index = true; - break; - case TGSI_SEMANTIC_LAYER: - info->writes_layer = true; - break; - case TGSI_SEMANTIC_PSIZE: - info->writes_psize = true; - break; - case TGSI_SEMANTIC_CLIPVERTEX: - info->writes_clipvertex = true; - break; - case TGSI_SEMANTIC_COLOR: - info->colors_written |= 1 << semantic_index; - break; - case TGSI_SEMANTIC_STENCIL: - info->writes_stencil = true; - break; - case TGSI_SEMANTIC_SAMPLEMASK: - info->writes_samplemask = true; - break; - case TGSI_SEMANTIC_EDGEFLAG: - info->writes_edgeflag = true; - break; 
- case TGSI_SEMANTIC_POSITION: - if (info->processor == PIPE_SHADER_FRAGMENT) - info->writes_z = true; - else - info->writes_position = true; - break; - } + assert(component + num_components <= 4); + assert(component < 4); + + unsigned semantic_name, semantic_index; + + unsigned location = var->data.location + var_idx; + unsigned drv_location = var->data.driver_location + var_idx; + + if (info->processor == PIPE_SHADER_FRAGMENT) { + tgsi_get_gl_frag_result_semantic(location, &semantic_name, &semantic_index); + + /* Adjust for dual source blending */ + if (var->data.index > 0) { + semantic_index++; + } + } else { + tgsi_get_gl_varying_semantic(location, true, &semantic_name, &semantic_index); + } + + ubyte usagemask = ((1 << num_components) - 1) << component; + + unsigned gs_out_streams; + if (var->data.stream & NIR_STREAM_PACKED) { + gs_out_streams = var->data.stream & ~NIR_STREAM_PACKED; + } else { + assert(var->data.stream < 4); + gs_out_streams = 0; + for (unsigned j = 0; j < num_components; ++j) + gs_out_streams |= var->data.stream << (2 * (component + j)); + } + + unsigned streamx = gs_out_streams & 3; + unsigned streamy = (gs_out_streams >> 2) & 3; + unsigned streamz = (gs_out_streams >> 4) & 3; + unsigned streamw = (gs_out_streams >> 6) & 3; + + if (usagemask & TGSI_WRITEMASK_X) { + info->output_streams[drv_location] |= streamx; + info->num_stream_output_components[streamx]++; + } + if (usagemask & TGSI_WRITEMASK_Y) { + info->output_streams[drv_location] |= streamy << 2; + info->num_stream_output_components[streamy]++; + } + if (usagemask & TGSI_WRITEMASK_Z) { + info->output_streams[drv_location] |= streamz << 4; + info->num_stream_output_components[streamz]++; + } + if (usagemask & TGSI_WRITEMASK_W) { + info->output_streams[drv_location] |= streamw << 6; + info->num_stream_output_components[streamw]++; + } + + info->output_semantic_name[drv_location] = semantic_name; + info->output_semantic_index[drv_location] = semantic_index; + + switch (semantic_name) { + case TGSI_SEMANTIC_PRIMID: + info->writes_primid = true; + break; + case TGSI_SEMANTIC_VIEWPORT_INDEX: + info->writes_viewport_index = true; + break; + case TGSI_SEMANTIC_LAYER: + info->writes_layer = true; + break; + case TGSI_SEMANTIC_PSIZE: + info->writes_psize = true; + break; + case TGSI_SEMANTIC_CLIPVERTEX: + info->writes_clipvertex = true; + break; + case TGSI_SEMANTIC_COLOR: + info->colors_written |= 1 << semantic_index; + break; + case TGSI_SEMANTIC_STENCIL: + info->writes_stencil = true; + break; + case TGSI_SEMANTIC_SAMPLEMASK: + info->writes_samplemask = true; + break; + case TGSI_SEMANTIC_EDGEFLAG: + info->writes_edgeflag = true; + break; + case TGSI_SEMANTIC_POSITION: + if (info->processor == PIPE_SHADER_FRAGMENT) + info->writes_z = true; + else + info->writes_position = true; + break; + } } -static void scan_output_helper(const nir_variable *var, - unsigned location, - const struct glsl_type *type, - struct si_shader_info *info) +static void scan_output_helper(const nir_variable *var, unsigned location, + const struct glsl_type *type, struct si_shader_info *info) { - if (glsl_type_is_struct(type) || glsl_type_is_interface(type)) { - for (unsigned i = 0; i < glsl_get_length(type); i++) { - const struct glsl_type *ft = glsl_get_struct_field(type, i); - scan_output_helper(var, location, ft, info); - location += glsl_count_attribute_slots(ft, false); - } - } else if (glsl_type_is_array_or_matrix(type)) { - const struct glsl_type *elem_type = - glsl_get_array_element(type); - unsigned num_elems = 
glsl_get_length(type); - if (var->data.compact) { - assert(glsl_type_is_scalar(elem_type)); - assert(glsl_get_bit_size(elem_type) == 32); - unsigned component = var->data.location_frac; - scan_output_slot(var, location, component, - MIN2(num_elems, 4 - component), info); - if (component + num_elems > 4) { - scan_output_slot(var, location + 1, 0, - component + num_elems - 4, info); - } - - } else { - unsigned elem_count = glsl_count_attribute_slots(elem_type, false); - for (unsigned i = 0; i < num_elems; i++) { - scan_output_helper(var, location, elem_type, info); - location += elem_count; - } - } - } else if (glsl_type_is_dual_slot(type)) { - unsigned component = var->data.location_frac; - scan_output_slot(var, location, component, 4 - component, info); - scan_output_slot(var, location + 1, 0, component + 2 * glsl_get_components(type) - 4, - info); - } else { - unsigned component = var->data.location_frac; - assert(glsl_type_is_vector_or_scalar(type)); - unsigned num_components = glsl_get_components(type); - if (glsl_type_is_64bit(type)) - num_components *= 2; - scan_output_slot(var, location, component, num_components, info); - } + if (glsl_type_is_struct(type) || glsl_type_is_interface(type)) { + for (unsigned i = 0; i < glsl_get_length(type); i++) { + const struct glsl_type *ft = glsl_get_struct_field(type, i); + scan_output_helper(var, location, ft, info); + location += glsl_count_attribute_slots(ft, false); + } + } else if (glsl_type_is_array_or_matrix(type)) { + const struct glsl_type *elem_type = glsl_get_array_element(type); + unsigned num_elems = glsl_get_length(type); + if (var->data.compact) { + assert(glsl_type_is_scalar(elem_type)); + assert(glsl_get_bit_size(elem_type) == 32); + unsigned component = var->data.location_frac; + scan_output_slot(var, location, component, MIN2(num_elems, 4 - component), info); + if (component + num_elems > 4) { + scan_output_slot(var, location + 1, 0, component + num_elems - 4, info); + } + + } else { + unsigned elem_count = glsl_count_attribute_slots(elem_type, false); + for (unsigned i = 0; i < num_elems; i++) { + scan_output_helper(var, location, elem_type, info); + location += elem_count; + } + } + } else if (glsl_type_is_dual_slot(type)) { + unsigned component = var->data.location_frac; + scan_output_slot(var, location, component, 4 - component, info); + scan_output_slot(var, location + 1, 0, component + 2 * glsl_get_components(type) - 4, info); + } else { + unsigned component = var->data.location_frac; + assert(glsl_type_is_vector_or_scalar(type)); + unsigned num_components = glsl_get_components(type); + if (glsl_type_is_64bit(type)) + num_components *= 2; + scan_output_slot(var, location, component, num_components, info); + } } -void si_nir_scan_shader(const struct nir_shader *nir, - struct si_shader_info *info) +void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info) { - nir_function *func; - unsigned i; - - info->processor = pipe_shader_type_from_mesa(nir->info.stage); - - info->properties[TGSI_PROPERTY_NEXT_SHADER] = - pipe_shader_type_from_mesa(nir->info.next_stage); - - if (nir->info.stage == MESA_SHADER_VERTEX) { - info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] = - nir->info.vs.window_space_position; - info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] = - nir->info.vs.blit_sgprs_amd; - } - - if (nir->info.stage == MESA_SHADER_TESS_CTRL) { - info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT] = - nir->info.tess.tcs_vertices_out; - } - - if (nir->info.stage == MESA_SHADER_TESS_EVAL) { - if 
(nir->info.tess.primitive_mode == GL_ISOLINES) - info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = PIPE_PRIM_LINES; - else - info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = nir->info.tess.primitive_mode; - - STATIC_ASSERT((TESS_SPACING_EQUAL + 1) % 3 == PIPE_TESS_SPACING_EQUAL); - STATIC_ASSERT((TESS_SPACING_FRACTIONAL_ODD + 1) % 3 == - PIPE_TESS_SPACING_FRACTIONAL_ODD); - STATIC_ASSERT((TESS_SPACING_FRACTIONAL_EVEN + 1) % 3 == - PIPE_TESS_SPACING_FRACTIONAL_EVEN); - - info->properties[TGSI_PROPERTY_TES_SPACING] = (nir->info.tess.spacing + 1) % 3; - info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW] = !nir->info.tess.ccw; - info->properties[TGSI_PROPERTY_TES_POINT_MODE] = nir->info.tess.point_mode; - } - - if (nir->info.stage == MESA_SHADER_GEOMETRY) { - info->properties[TGSI_PROPERTY_GS_INPUT_PRIM] = nir->info.gs.input_primitive; - info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM] = nir->info.gs.output_primitive; - info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES] = nir->info.gs.vertices_out; - info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = nir->info.gs.invocations; - } - - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] = - nir->info.fs.early_fragment_tests | nir->info.fs.post_depth_coverage; - info->properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE] = nir->info.fs.post_depth_coverage; - - if (nir->info.fs.pixel_center_integer) { - info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] = - TGSI_FS_COORD_PIXEL_CENTER_INTEGER; - } - - if (nir->info.fs.depth_layout != FRAG_DEPTH_LAYOUT_NONE) { - switch (nir->info.fs.depth_layout) { - case FRAG_DEPTH_LAYOUT_ANY: - info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_ANY; - break; - case FRAG_DEPTH_LAYOUT_GREATER: - info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_GREATER; - break; - case FRAG_DEPTH_LAYOUT_LESS: - info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_LESS; - break; - case FRAG_DEPTH_LAYOUT_UNCHANGED: - info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_UNCHANGED; - break; - default: - unreachable("Unknow depth layout"); - } - } - } - - if (gl_shader_stage_is_compute(nir->info.stage)) { - info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] = nir->info.cs.local_size[0]; - info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] = nir->info.cs.local_size[1]; - info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] = nir->info.cs.local_size[2]; - info->properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD] = nir->info.cs.user_data_components_amd; - } - - i = 0; - uint64_t processed_inputs = 0; - nir_foreach_variable(variable, &nir->inputs) { - unsigned semantic_name, semantic_index; - - const struct glsl_type *type = variable->type; - if (nir_is_per_vertex_io(variable, nir->info.stage)) { - assert(glsl_type_is_array(type)); - type = glsl_get_array_element(type); - } - - unsigned attrib_count = glsl_count_attribute_slots(type, - nir->info.stage == MESA_SHADER_VERTEX); - - i = variable->data.driver_location; - - /* Vertex shader inputs don't have semantics. The state - * tracker has already mapped them to attributes via - * variable->data.driver_location. 
- */ - if (nir->info.stage == MESA_SHADER_VERTEX) - continue; - - for (unsigned j = 0; j < attrib_count; j++, i++) { - - if (processed_inputs & ((uint64_t)1 << i)) - continue; - - processed_inputs |= ((uint64_t)1 << i); - - tgsi_get_gl_varying_semantic(variable->data.location + j, true, - &semantic_name, &semantic_index); - - info->input_semantic_name[i] = semantic_name; - info->input_semantic_index[i] = semantic_index; - - if (semantic_name == TGSI_SEMANTIC_PRIMID) - info->uses_primid = true; - - if (semantic_name == TGSI_SEMANTIC_COLOR) { - /* We only need this for color inputs. */ - if (variable->data.sample) - info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_SAMPLE; - else if (variable->data.centroid) - info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTROID; - else - info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTER; - } - - enum glsl_base_type base_type = - glsl_get_base_type(glsl_without_array(variable->type)); - - switch (variable->data.interpolation) { - case INTERP_MODE_NONE: - if (glsl_base_type_is_integer(base_type)) { - info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT; - break; - } - - if (semantic_name == TGSI_SEMANTIC_COLOR) { - info->input_interpolate[i] = TGSI_INTERPOLATE_COLOR; - break; - } - /* fall-through */ - - case INTERP_MODE_SMOOTH: - assert(!glsl_base_type_is_integer(base_type)); - - info->input_interpolate[i] = TGSI_INTERPOLATE_PERSPECTIVE; - break; - - case INTERP_MODE_NOPERSPECTIVE: - assert(!glsl_base_type_is_integer(base_type)); - - info->input_interpolate[i] = TGSI_INTERPOLATE_LINEAR; - break; - - case INTERP_MODE_FLAT: - info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT; - break; - } - } - } - - nir_foreach_variable(variable, &nir->outputs) { - const struct glsl_type *type = variable->type; - if (nir_is_per_vertex_io(variable, nir->info.stage)) { - assert(glsl_type_is_array(type)); - type = glsl_get_array_element(type); - } - - ASSERTED unsigned attrib_count = glsl_count_attribute_slots(type, false); - scan_output_helper(variable, 0, type, info); - - unsigned loc = variable->data.location; - if (nir->info.stage == MESA_SHADER_FRAGMENT && - loc == FRAG_RESULT_COLOR && - nir->info.outputs_written & (1ull << loc)) { - assert(attrib_count == 1); - info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] = true; - } - } - - info->num_inputs = nir->num_inputs; - info->num_outputs = nir->num_outputs; - - info->constbuf0_num_slots = nir->num_uniforms; - info->shader_buffers_declared = u_bit_consecutive(0, nir->info.num_ssbos); - info->const_buffers_declared = u_bit_consecutive(1, nir->info.num_ubos); - if (nir->num_uniforms > 0) - info->const_buffers_declared |= 1; - info->images_declared = u_bit_consecutive(0, nir->info.num_images); - info->msaa_images_declared = u_bit_consecutive(0, nir->info.last_msaa_image + 1); - info->samplers_declared = nir->info.textures_used; - - info->num_written_clipdistance = nir->info.clip_distance_array_size; - info->num_written_culldistance = nir->info.cull_distance_array_size; - info->clipdist_writemask = u_bit_consecutive(0, info->num_written_clipdistance); - info->culldist_writemask = u_bit_consecutive(0, info->num_written_culldistance); - - if (info->processor == PIPE_SHADER_FRAGMENT) - info->uses_kill = nir->info.fs.uses_discard; - - if (nir->info.stage == MESA_SHADER_TESS_CTRL) { - info->tessfactors_are_def_in_all_invocs = - ac_are_tessfactors_def_in_all_invocs(nir); - } - - func = (struct nir_function *)exec_list_get_head_const(&nir->functions); - nir_foreach_block(block, func->impl) { - 
nir_foreach_instr(instr, block) - scan_instruction(nir, info, instr); - } + nir_function *func; + unsigned i; + + info->processor = pipe_shader_type_from_mesa(nir->info.stage); + + info->properties[TGSI_PROPERTY_NEXT_SHADER] = pipe_shader_type_from_mesa(nir->info.next_stage); + + if (nir->info.stage == MESA_SHADER_VERTEX) { + info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] = nir->info.vs.window_space_position; + info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] = nir->info.vs.blit_sgprs_amd; + } + + if (nir->info.stage == MESA_SHADER_TESS_CTRL) { + info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT] = nir->info.tess.tcs_vertices_out; + } + + if (nir->info.stage == MESA_SHADER_TESS_EVAL) { + if (nir->info.tess.primitive_mode == GL_ISOLINES) + info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = PIPE_PRIM_LINES; + else + info->properties[TGSI_PROPERTY_TES_PRIM_MODE] = nir->info.tess.primitive_mode; + + STATIC_ASSERT((TESS_SPACING_EQUAL + 1) % 3 == PIPE_TESS_SPACING_EQUAL); + STATIC_ASSERT((TESS_SPACING_FRACTIONAL_ODD + 1) % 3 == PIPE_TESS_SPACING_FRACTIONAL_ODD); + STATIC_ASSERT((TESS_SPACING_FRACTIONAL_EVEN + 1) % 3 == PIPE_TESS_SPACING_FRACTIONAL_EVEN); + + info->properties[TGSI_PROPERTY_TES_SPACING] = (nir->info.tess.spacing + 1) % 3; + info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW] = !nir->info.tess.ccw; + info->properties[TGSI_PROPERTY_TES_POINT_MODE] = nir->info.tess.point_mode; + } + + if (nir->info.stage == MESA_SHADER_GEOMETRY) { + info->properties[TGSI_PROPERTY_GS_INPUT_PRIM] = nir->info.gs.input_primitive; + info->properties[TGSI_PROPERTY_GS_OUTPUT_PRIM] = nir->info.gs.output_primitive; + info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES] = nir->info.gs.vertices_out; + info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = nir->info.gs.invocations; + } + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] = + nir->info.fs.early_fragment_tests | nir->info.fs.post_depth_coverage; + info->properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE] = nir->info.fs.post_depth_coverage; + + if (nir->info.fs.pixel_center_integer) { + info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] = TGSI_FS_COORD_PIXEL_CENTER_INTEGER; + } + + if (nir->info.fs.depth_layout != FRAG_DEPTH_LAYOUT_NONE) { + switch (nir->info.fs.depth_layout) { + case FRAG_DEPTH_LAYOUT_ANY: + info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_ANY; + break; + case FRAG_DEPTH_LAYOUT_GREATER: + info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_GREATER; + break; + case FRAG_DEPTH_LAYOUT_LESS: + info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_LESS; + break; + case FRAG_DEPTH_LAYOUT_UNCHANGED: + info->properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT] = TGSI_FS_DEPTH_LAYOUT_UNCHANGED; + break; + default: + unreachable("Unknow depth layout"); + } + } + } + + if (gl_shader_stage_is_compute(nir->info.stage)) { + info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] = nir->info.cs.local_size[0]; + info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] = nir->info.cs.local_size[1]; + info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] = nir->info.cs.local_size[2]; + info->properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD] = + nir->info.cs.user_data_components_amd; + } + + i = 0; + uint64_t processed_inputs = 0; + nir_foreach_variable (variable, &nir->inputs) { + unsigned semantic_name, semantic_index; + + const struct glsl_type *type = variable->type; + if (nir_is_per_vertex_io(variable, nir->info.stage)) { + 
assert(glsl_type_is_array(type)); + type = glsl_get_array_element(type); + } + + unsigned attrib_count = + glsl_count_attribute_slots(type, nir->info.stage == MESA_SHADER_VERTEX); + + i = variable->data.driver_location; + + /* Vertex shader inputs don't have semantics. The state + * tracker has already mapped them to attributes via + * variable->data.driver_location. + */ + if (nir->info.stage == MESA_SHADER_VERTEX) + continue; + + for (unsigned j = 0; j < attrib_count; j++, i++) { + + if (processed_inputs & ((uint64_t)1 << i)) + continue; + + processed_inputs |= ((uint64_t)1 << i); + + tgsi_get_gl_varying_semantic(variable->data.location + j, true, &semantic_name, + &semantic_index); + + info->input_semantic_name[i] = semantic_name; + info->input_semantic_index[i] = semantic_index; + + if (semantic_name == TGSI_SEMANTIC_PRIMID) + info->uses_primid = true; + + if (semantic_name == TGSI_SEMANTIC_COLOR) { + /* We only need this for color inputs. */ + if (variable->data.sample) + info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_SAMPLE; + else if (variable->data.centroid) + info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTROID; + else + info->input_interpolate_loc[i] = TGSI_INTERPOLATE_LOC_CENTER; + } + + enum glsl_base_type base_type = glsl_get_base_type(glsl_without_array(variable->type)); + + switch (variable->data.interpolation) { + case INTERP_MODE_NONE: + if (glsl_base_type_is_integer(base_type)) { + info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT; + break; + } + + if (semantic_name == TGSI_SEMANTIC_COLOR) { + info->input_interpolate[i] = TGSI_INTERPOLATE_COLOR; + break; + } + /* fall-through */ + + case INTERP_MODE_SMOOTH: + assert(!glsl_base_type_is_integer(base_type)); + + info->input_interpolate[i] = TGSI_INTERPOLATE_PERSPECTIVE; + break; + + case INTERP_MODE_NOPERSPECTIVE: + assert(!glsl_base_type_is_integer(base_type)); + + info->input_interpolate[i] = TGSI_INTERPOLATE_LINEAR; + break; + + case INTERP_MODE_FLAT: + info->input_interpolate[i] = TGSI_INTERPOLATE_CONSTANT; + break; + } + } + } + + nir_foreach_variable (variable, &nir->outputs) { + const struct glsl_type *type = variable->type; + if (nir_is_per_vertex_io(variable, nir->info.stage)) { + assert(glsl_type_is_array(type)); + type = glsl_get_array_element(type); + } + + ASSERTED unsigned attrib_count = glsl_count_attribute_slots(type, false); + scan_output_helper(variable, 0, type, info); + + unsigned loc = variable->data.location; + if (nir->info.stage == MESA_SHADER_FRAGMENT && loc == FRAG_RESULT_COLOR && + nir->info.outputs_written & (1ull << loc)) { + assert(attrib_count == 1); + info->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] = true; + } + } + + info->num_inputs = nir->num_inputs; + info->num_outputs = nir->num_outputs; + + info->constbuf0_num_slots = nir->num_uniforms; + info->shader_buffers_declared = u_bit_consecutive(0, nir->info.num_ssbos); + info->const_buffers_declared = u_bit_consecutive(1, nir->info.num_ubos); + if (nir->num_uniforms > 0) + info->const_buffers_declared |= 1; + info->images_declared = u_bit_consecutive(0, nir->info.num_images); + info->msaa_images_declared = u_bit_consecutive(0, nir->info.last_msaa_image + 1); + info->samplers_declared = nir->info.textures_used; + + info->num_written_clipdistance = nir->info.clip_distance_array_size; + info->num_written_culldistance = nir->info.cull_distance_array_size; + info->clipdist_writemask = u_bit_consecutive(0, info->num_written_clipdistance); + info->culldist_writemask = u_bit_consecutive(0, 
info->num_written_culldistance); + + if (info->processor == PIPE_SHADER_FRAGMENT) + info->uses_kill = nir->info.fs.uses_discard; + + if (nir->info.stage == MESA_SHADER_TESS_CTRL) { + info->tessfactors_are_def_in_all_invocs = ac_are_tessfactors_def_in_all_invocs(nir); + } + + func = (struct nir_function *)exec_list_get_head_const(&nir->functions); + nir_foreach_block (block, func->impl) { + nir_foreach_instr (instr, block) + scan_instruction(nir, info, instr); + } } -static void -si_nir_opts(struct nir_shader *nir) +static void si_nir_opts(struct nir_shader *nir) { - bool progress; - - do { - progress = false; - - NIR_PASS_V(nir, nir_lower_vars_to_ssa); - - NIR_PASS(progress, nir, nir_opt_copy_prop_vars); - NIR_PASS(progress, nir, nir_opt_dead_write_vars); - - NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); - NIR_PASS_V(nir, nir_lower_phis_to_scalar); - - /* (Constant) copy propagation is needed for txf with offsets. */ - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_remove_phis); - NIR_PASS(progress, nir, nir_opt_dce); - if (nir_opt_trivial_continues(nir)) { - progress = true; - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_dce); - } - NIR_PASS(progress, nir, nir_opt_if, true); - NIR_PASS(progress, nir, nir_opt_dead_cf); - NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true); - - /* Needed for algebraic lowering */ - NIR_PASS(progress, nir, nir_opt_algebraic); - NIR_PASS(progress, nir, nir_opt_constant_folding); - - if (!nir->info.flrp_lowered) { - unsigned lower_flrp = - (nir->options->lower_flrp16 ? 16 : 0) | - (nir->options->lower_flrp32 ? 32 : 0) | - (nir->options->lower_flrp64 ? 64 : 0); - assert(lower_flrp); - bool lower_flrp_progress = false; - - NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, - lower_flrp, - false /* always_precise */, - nir->options->lower_ffma); - if (lower_flrp_progress) { - NIR_PASS(progress, nir, - nir_opt_constant_folding); - progress = true; - } - - /* Nothing should rematerialize any flrps, so we only - * need to do this lowering once. - */ - nir->info.flrp_lowered = true; - } - - NIR_PASS(progress, nir, nir_opt_undef); - NIR_PASS(progress, nir, nir_opt_conditional_discard); - if (nir->options->max_unroll_iterations) { - NIR_PASS(progress, nir, nir_opt_loop_unroll, 0); - } - } while (progress); + bool progress; + + do { + progress = false; + + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + + NIR_PASS(progress, nir, nir_opt_copy_prop_vars); + NIR_PASS(progress, nir, nir_opt_dead_write_vars); + + NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); + NIR_PASS_V(nir, nir_lower_phis_to_scalar); + + /* (Constant) copy propagation is needed for txf with offsets. */ + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_remove_phis); + NIR_PASS(progress, nir, nir_opt_dce); + if (nir_opt_trivial_continues(nir)) { + progress = true; + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + } + NIR_PASS(progress, nir, nir_opt_if, true); + NIR_PASS(progress, nir, nir_opt_dead_cf); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true); + + /* Needed for algebraic lowering */ + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); + + if (!nir->info.flrp_lowered) { + unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) | + (nir->options->lower_flrp32 ? 32 : 0) | + (nir->options->lower_flrp64 ? 
64 : 0); + assert(lower_flrp); + bool lower_flrp_progress = false; + + NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, false /* always_precise */, + nir->options->lower_ffma); + if (lower_flrp_progress) { + NIR_PASS(progress, nir, nir_opt_constant_folding); + progress = true; + } + + /* Nothing should rematerialize any flrps, so we only + * need to do this lowering once. + */ + nir->info.flrp_lowered = true; + } + + NIR_PASS(progress, nir, nir_opt_undef); + NIR_PASS(progress, nir, nir_opt_conditional_discard); + if (nir->options->max_unroll_iterations) { + NIR_PASS(progress, nir, nir_opt_loop_unroll, 0); + } + } while (progress); } -static int -type_size_vec4(const struct glsl_type *type, bool bindless) +static int type_size_vec4(const struct glsl_type *type, bool bindless) { - return glsl_count_attribute_slots(type, false); + return glsl_count_attribute_slots(type, false); } -static void -si_nir_lower_color(nir_shader *nir) +static void si_nir_lower_color(nir_shader *nir) { - nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir); - - nir_builder b; - nir_builder_init(&b, entrypoint); - - nir_foreach_block(block, entrypoint) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intrin = - nir_instr_as_intrinsic(instr); - - if (intrin->intrinsic != nir_intrinsic_load_deref) - continue; - - nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); - if (deref->mode != nir_var_shader_in) - continue; - - b.cursor = nir_before_instr(instr); - nir_variable *var = nir_deref_instr_get_variable(deref); - nir_ssa_def *def; - - if (var->data.location == VARYING_SLOT_COL0) { - def = nir_load_color0(&b); - } else if (var->data.location == VARYING_SLOT_COL1) { - def = nir_load_color1(&b); - } else { - continue; - } - - nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(def)); - nir_instr_remove(instr); - } - } + nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir); + + nir_builder b; + nir_builder_init(&b, entrypoint); + + nir_foreach_block (block, entrypoint) { + nir_foreach_instr_safe (instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + if (intrin->intrinsic != nir_intrinsic_load_deref) + continue; + + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + if (deref->mode != nir_var_shader_in) + continue; + + b.cursor = nir_before_instr(instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + nir_ssa_def *def; + + if (var->data.location == VARYING_SLOT_COL0) { + def = nir_load_color0(&b); + } else if (var->data.location == VARYING_SLOT_COL1) { + def = nir_load_color1(&b); + } else { + continue; + } + + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(def)); + nir_instr_remove(instr); + } + } } static void si_nir_lower_ps_inputs(struct nir_shader *nir) { - if (nir->info.stage != MESA_SHADER_FRAGMENT) - return; - - NIR_PASS_V(nir, nir_lower_io_to_temporaries, - nir_shader_get_entrypoint(nir), false, true); - - /* Since we're doing nir_lower_io_to_temporaries late, we need - * to lower all the copy_deref's introduced by - * lower_io_to_temporaries before calling nir_lower_io. 
- */ - NIR_PASS_V(nir, nir_split_var_copies); - NIR_PASS_V(nir, nir_lower_var_copies); - NIR_PASS_V(nir, nir_lower_global_vars_to_local); - - si_nir_lower_color(nir); - NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0); - - /* This pass needs actual constants */ - NIR_PASS_V(nir, nir_opt_constant_folding); - NIR_PASS_V(nir, nir_io_add_const_offset_to_base, - nir_var_shader_in); + if (nir->info.stage != MESA_SHADER_FRAGMENT) + return; + + NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), false, true); + + /* Since we're doing nir_lower_io_to_temporaries late, we need + * to lower all the copy_deref's introduced by + * lower_io_to_temporaries before calling nir_lower_io. + */ + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_global_vars_to_local); + + si_nir_lower_color(nir); + NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0); + + /* This pass needs actual constants */ + NIR_PASS_V(nir, nir_opt_constant_folding); + NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in); } void si_nir_adjust_driver_locations(struct nir_shader *nir) { - /* Adjust the driver location of inputs and outputs. The state tracker - * interprets them as slots, while the ac/nir backend interprets them - * as individual components. - */ - if (nir->info.stage != MESA_SHADER_FRAGMENT) { - nir_foreach_variable(variable, &nir->inputs) - variable->data.driver_location *= 4; - } - - nir_foreach_variable(variable, &nir->outputs) - variable->data.driver_location *= 4; + /* Adjust the driver location of inputs and outputs. The state tracker + * interprets them as slots, while the ac/nir backend interprets them + * as individual components. + */ + if (nir->info.stage != MESA_SHADER_FRAGMENT) { + nir_foreach_variable (variable, &nir->inputs) + variable->data.driver_location *= 4; + } + + nir_foreach_variable (variable, &nir->outputs) + variable->data.driver_location *= 4; } /** @@ -938,65 +889,64 @@ void si_nir_adjust_driver_locations(struct nir_shader *nir) */ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir) { - /* Perform lowerings (and optimizations) of code. - * - * Performance considerations aside, we must: - * - lower certain ALU operations - * - ensure constant offsets for texture instructions are folded - * and copy-propagated - */ - - static const struct nir_lower_tex_options lower_tex_options = { - .lower_txp = ~0u, - }; - NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options); - - const nir_lower_subgroups_options subgroups_options = { - .subgroup_size = 64, - .ballot_bit_size = 64, - .lower_to_scalar = true, - .lower_subgroup_masks = true, - .lower_vote_trivial = false, - .lower_vote_eq_to_ballot = true, - }; - NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options); - - /* Lower load constants to scalar and then clean up the mess */ - NIR_PASS_V(nir, nir_lower_load_const_to_scalar); - NIR_PASS_V(nir, nir_lower_var_copies); - NIR_PASS_V(nir, nir_lower_pack); - NIR_PASS_V(nir, nir_opt_access); - si_nir_opts(nir); - - /* Lower large variables that are always constant with load_constant - * intrinsics, which get turned into PC-relative loads from a data - * section next to the shader. - * - * st/mesa calls finalize_nir twice, but we can't call this pass twice. 
- */ - bool changed = false; - if (!nir->constant_data) { - NIR_PASS(changed, nir, nir_opt_large_constants, - glsl_get_natural_size_align_bytes, 16); - } - - changed |= ac_lower_indirect_derefs(nir, sscreen->info.chip_class); - if (changed) - si_nir_opts(nir); - - NIR_PASS_V(nir, nir_lower_bool_to_int32); - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp); - - if (sscreen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) - NIR_PASS_V(nir, nir_lower_discard_to_demote); + /* Perform lowerings (and optimizations) of code. + * + * Performance considerations aside, we must: + * - lower certain ALU operations + * - ensure constant offsets for texture instructions are folded + * and copy-propagated + */ + + static const struct nir_lower_tex_options lower_tex_options = { + .lower_txp = ~0u, + }; + NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options); + + const nir_lower_subgroups_options subgroups_options = { + .subgroup_size = 64, + .ballot_bit_size = 64, + .lower_to_scalar = true, + .lower_subgroup_masks = true, + .lower_vote_trivial = false, + .lower_vote_eq_to_ballot = true, + }; + NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options); + + /* Lower load constants to scalar and then clean up the mess */ + NIR_PASS_V(nir, nir_lower_load_const_to_scalar); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_pack); + NIR_PASS_V(nir, nir_opt_access); + si_nir_opts(nir); + + /* Lower large variables that are always constant with load_constant + * intrinsics, which get turned into PC-relative loads from a data + * section next to the shader. + * + * st/mesa calls finalize_nir twice, but we can't call this pass twice. + */ + bool changed = false; + if (!nir->constant_data) { + NIR_PASS(changed, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16); + } + + changed |= ac_lower_indirect_derefs(nir, sscreen->info.chip_class); + if (changed) + si_nir_opts(nir); + + NIR_PASS_V(nir, nir_lower_bool_to_int32); + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp); + + if (sscreen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) + NIR_PASS_V(nir, nir_lower_discard_to_demote); } void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize) { - struct si_screen *sscreen = (struct si_screen *)screen; - struct nir_shader *nir = (struct nir_shader *)nirptr; + struct si_screen *sscreen = (struct si_screen *)screen; + struct nir_shader *nir = (struct nir_shader *)nirptr; - nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); - si_nir_lower_ps_inputs(nir); - si_lower_nir(sscreen, nir); + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + si_nir_lower_ps_inputs(nir); + si_lower_nir(sscreen, nir); } diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c index 30cca361ac4..e5fd089b59f 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c @@ -26,68 +26,59 @@ #include "tgsi/tgsi_text.h" #include "tgsi/tgsi_ureg.h" -void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, - unsigned num_layers) +void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers) { - unsigned vs_blit_property; - void **vs; - - switch (type) { - case UTIL_BLITTER_ATTRIB_NONE: - vs = num_layers > 1 ? &sctx->vs_blit_pos_layered : - &sctx->vs_blit_pos; - vs_blit_property = SI_VS_BLIT_SGPRS_POS; - break; - case UTIL_BLITTER_ATTRIB_COLOR: - vs = num_layers > 1 ? 
&sctx->vs_blit_color_layered : - &sctx->vs_blit_color; - vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR; - break; - case UTIL_BLITTER_ATTRIB_TEXCOORD_XY: - case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW: - assert(num_layers == 1); - vs = &sctx->vs_blit_texcoord; - vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD; - break; - default: - assert(0); - return NULL; - } - if (*vs) - return *vs; - - struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX); - if (!ureg) - return NULL; - - /* Tell the shader to load VS inputs from SGPRs: */ - ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS_AMD, vs_blit_property); - ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true); - - /* This is just a pass-through shader with 1-3 MOV instructions. */ - ureg_MOV(ureg, - ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), - ureg_DECL_vs_input(ureg, 0)); - - if (type != UTIL_BLITTER_ATTRIB_NONE) { - ureg_MOV(ureg, - ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0), - ureg_DECL_vs_input(ureg, 1)); - } - - if (num_layers > 1) { - struct ureg_src instance_id = - ureg_DECL_system_value(ureg, TGSI_SEMANTIC_INSTANCEID, 0); - struct ureg_dst layer = - ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0); - - ureg_MOV(ureg, ureg_writemask(layer, TGSI_WRITEMASK_X), - ureg_scalar(instance_id, TGSI_SWIZZLE_X)); - } - ureg_END(ureg); - - *vs = ureg_create_shader_and_destroy(ureg, &sctx->b); - return *vs; + unsigned vs_blit_property; + void **vs; + + switch (type) { + case UTIL_BLITTER_ATTRIB_NONE: + vs = num_layers > 1 ? &sctx->vs_blit_pos_layered : &sctx->vs_blit_pos; + vs_blit_property = SI_VS_BLIT_SGPRS_POS; + break; + case UTIL_BLITTER_ATTRIB_COLOR: + vs = num_layers > 1 ? &sctx->vs_blit_color_layered : &sctx->vs_blit_color; + vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR; + break; + case UTIL_BLITTER_ATTRIB_TEXCOORD_XY: + case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW: + assert(num_layers == 1); + vs = &sctx->vs_blit_texcoord; + vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD; + break; + default: + assert(0); + return NULL; + } + if (*vs) + return *vs; + + struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX); + if (!ureg) + return NULL; + + /* Tell the shader to load VS inputs from SGPRs: */ + ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS_AMD, vs_blit_property); + ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true); + + /* This is just a pass-through shader with 1-3 MOV instructions. 
*/ + ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), ureg_DECL_vs_input(ureg, 0)); + + if (type != UTIL_BLITTER_ATTRIB_NONE) { + ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0), ureg_DECL_vs_input(ureg, 1)); + } + + if (num_layers > 1) { + struct ureg_src instance_id = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_INSTANCEID, 0); + struct ureg_dst layer = ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0); + + ureg_MOV(ureg, ureg_writemask(layer, TGSI_WRITEMASK_X), + ureg_scalar(instance_id, TGSI_SWIZZLE_X)); + } + ureg_END(ureg); + + *vs = ureg_create_shader_and_destroy(ureg, &sctx->b); + return *vs; } /** @@ -97,137 +88,128 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, */ void *si_create_fixed_func_tcs(struct si_context *sctx) { - struct ureg_src outer, inner; - struct ureg_dst tessouter, tessinner; - struct ureg_program *ureg = ureg_create(PIPE_SHADER_TESS_CTRL); + struct ureg_src outer, inner; + struct ureg_dst tessouter, tessinner; + struct ureg_program *ureg = ureg_create(PIPE_SHADER_TESS_CTRL); - if (!ureg) - return NULL; + if (!ureg) + return NULL; - outer = ureg_DECL_system_value(ureg, - TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL, 0); - inner = ureg_DECL_system_value(ureg, - TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL, 0); + outer = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL, 0); + inner = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL, 0); - tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0); - tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0); + tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0); + tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0); - ureg_MOV(ureg, tessouter, outer); - ureg_MOV(ureg, tessinner, inner); - ureg_END(ureg); + ureg_MOV(ureg, tessouter, outer); + ureg_MOV(ureg, tessinner, inner); + ureg_END(ureg); - return ureg_create_shader_and_destroy(ureg, &sctx->b); + return ureg_create_shader_and_destroy(ureg, &sctx->b); } /* Create a compute shader implementing clear_buffer or copy_buffer. */ -void *si_create_dma_compute_shader(struct pipe_context *ctx, - unsigned num_dwords_per_thread, - bool dst_stream_cache_policy, bool is_copy) +void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread, + bool dst_stream_cache_policy, bool is_copy) { - struct si_screen *sscreen = (struct si_screen *)ctx->screen; - assert(util_is_power_of_two_nonzero(num_dwords_per_thread)); - - unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT; - if (dst_stream_cache_policy) - store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY; - - /* Don't cache loads, because there is no reuse. 
*/ - unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY; - - unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4); - unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned)); - - for (unsigned i = 0; i < num_mem_ops; i++) { - if (i*4 < num_dwords_per_thread) - inst_dwords[i] = MIN2(4, num_dwords_per_thread - i*4); - } - - struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); - if (!ureg) - return NULL; - - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, sscreen->compute_wave_size); - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1); - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); - - struct ureg_src value; - if (!is_copy) { - ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]); - value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0); - } - - struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); - struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); - struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); - struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); - struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false)); - struct ureg_src srcbuf; - struct ureg_src *values = NULL; - - if (is_copy) { - srcbuf = ureg_DECL_buffer(ureg, 1, false); - values = malloc(num_mem_ops * sizeof(struct ureg_src)); - } - - /* If there are multiple stores, the first store writes into 0*wavesize+tid, - * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc. - */ - ureg_UMAD(ureg, store_addr, blk, - ureg_imm1u(ureg, sscreen->compute_wave_size * num_mem_ops), tid); - /* Convert from a "store size unit" into bytes. */ - ureg_UMUL(ureg, store_addr, ureg_src(store_addr), - ureg_imm1u(ureg, 4 * inst_dwords[0])); - ureg_MOV(ureg, load_addr, ureg_src(store_addr)); - - /* Distance between a load and a store for latency hiding. */ - unsigned load_store_distance = is_copy ? 8 : 0; - - for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) { - int d = i - load_store_distance; - - if (is_copy && i < num_mem_ops) { - if (i) { - ureg_UADD(ureg, load_addr, ureg_src(load_addr), - ureg_imm1u(ureg, 4 * inst_dwords[i] * - sscreen->compute_wave_size)); - } - - values[i] = ureg_src(ureg_DECL_temporary(ureg)); - struct ureg_dst dst = - ureg_writemask(ureg_dst(values[i]), - u_bit_consecutive(0, inst_dwords[i])); - struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)}; - ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, - load_qualifier, TGSI_TEXTURE_BUFFER, 0); - } - - if (d >= 0) { - if (d) { - ureg_UADD(ureg, store_addr, ureg_src(store_addr), - ureg_imm1u(ureg, 4 * inst_dwords[d] * - sscreen->compute_wave_size)); - } - - struct ureg_dst dst = - ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d])); - struct ureg_src srcs[] = - {ureg_src(store_addr), is_copy ? 
values[d] : value}; - ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, - store_qualifier, TGSI_TEXTURE_BUFFER, 0); - } - } - ureg_END(ureg); - - struct pipe_compute_state state = {}; - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = ureg_get_tokens(ureg, NULL); - - void *cs = ctx->create_compute_state(ctx, &state); - ureg_destroy(ureg); - ureg_free_tokens(state.prog); - - free(values); - return cs; + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + assert(util_is_power_of_two_nonzero(num_dwords_per_thread)); + + unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT; + if (dst_stream_cache_policy) + store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY; + + /* Don't cache loads, because there is no reuse. */ + unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY; + + unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4); + unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned)); + + for (unsigned i = 0; i < num_mem_ops; i++) { + if (i * 4 < num_dwords_per_thread) + inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4); + } + + struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); + if (!ureg) + return NULL; + + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, sscreen->compute_wave_size); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); + + struct ureg_src value; + if (!is_copy) { + ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]); + value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0); + } + + struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); + struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); + struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); + struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); + struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false)); + struct ureg_src srcbuf; + struct ureg_src *values = NULL; + + if (is_copy) { + srcbuf = ureg_DECL_buffer(ureg, 1, false); + values = malloc(num_mem_ops * sizeof(struct ureg_src)); + } + + /* If there are multiple stores, the first store writes into 0*wavesize+tid, + * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc. + */ + ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, sscreen->compute_wave_size * num_mem_ops), + tid); + /* Convert from a "store size unit" into bytes. */ + ureg_UMUL(ureg, store_addr, ureg_src(store_addr), ureg_imm1u(ureg, 4 * inst_dwords[0])); + ureg_MOV(ureg, load_addr, ureg_src(store_addr)); + + /* Distance between a load and a store for latency hiding. */ + unsigned load_store_distance = is_copy ? 
8 : 0; + + for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) { + int d = i - load_store_distance; + + if (is_copy && i < num_mem_ops) { + if (i) { + ureg_UADD(ureg, load_addr, ureg_src(load_addr), + ureg_imm1u(ureg, 4 * inst_dwords[i] * sscreen->compute_wave_size)); + } + + values[i] = ureg_src(ureg_DECL_temporary(ureg)); + struct ureg_dst dst = + ureg_writemask(ureg_dst(values[i]), u_bit_consecutive(0, inst_dwords[i])); + struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)}; + ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, load_qualifier, + TGSI_TEXTURE_BUFFER, 0); + } + + if (d >= 0) { + if (d) { + ureg_UADD(ureg, store_addr, ureg_src(store_addr), + ureg_imm1u(ureg, 4 * inst_dwords[d] * sscreen->compute_wave_size)); + } + + struct ureg_dst dst = ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d])); + struct ureg_src srcs[] = {ureg_src(store_addr), is_copy ? values[d] : value}; + ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, store_qualifier, + TGSI_TEXTURE_BUFFER, 0); + } + } + ureg_END(ureg); + + struct pipe_compute_state state = {}; + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = ureg_get_tokens(ureg, NULL); + + void *cs = ctx->create_compute_state(ctx, &state); + ureg_destroy(ureg); + ureg_free_tokens(state.prog); + + free(values); + return cs; } /* Create a compute shader that copies DCC from one buffer to another @@ -240,67 +222,63 @@ void *si_create_dma_compute_shader(struct pipe_context *ctx, */ void *si_create_dcc_retile_cs(struct pipe_context *ctx) { - struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); - if (!ureg) - return NULL; - - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64); - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1); - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); - - /* Compute the global thread ID (in idx). */ - struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); - struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); - struct ureg_dst idx = ureg_writemask(ureg_DECL_temporary(ureg), - TGSI_WRITEMASK_X); - ureg_UMAD(ureg, idx, blk, ureg_imm1u(ureg, 64), tid); - - /* Load 2 pairs of offsets for DCC load & store. 
*/ - struct ureg_src map = ureg_DECL_image(ureg, 0, TGSI_TEXTURE_BUFFER, 0, false, false); - struct ureg_dst offsets = ureg_DECL_temporary(ureg); - struct ureg_src map_load_args[] = {map, ureg_src(idx)}; - - ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &offsets, 1, map_load_args, 2, - TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0); - - struct ureg_src dcc_src = ureg_DECL_image(ureg, 1, TGSI_TEXTURE_BUFFER, - 0, false, false); - struct ureg_dst dcc_dst = ureg_dst(ureg_DECL_image(ureg, 2, TGSI_TEXTURE_BUFFER, - 0, true, false)); - struct ureg_dst dcc_value[2]; - - /* Copy DCC values: - * dst[offsets.y] = src[offsets.x]; - * dst[offsets.w] = src[offsets.z]; - */ - for (unsigned i = 0; i < 2; i++) { - dcc_value[i] = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); - - struct ureg_src load_args[] = - {dcc_src, ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_X + i*2)}; - ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dcc_value[i], 1, load_args, 2, - TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0); - } - - dcc_dst = ureg_writemask(dcc_dst, TGSI_WRITEMASK_X); - - for (unsigned i = 0; i < 2; i++) { - struct ureg_src store_args[] = { - ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_Y + i*2), - ureg_src(dcc_value[i]) - }; - ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dcc_dst, 1, store_args, 2, - TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0); - } - ureg_END(ureg); - - struct pipe_compute_state state = {}; - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = ureg_get_tokens(ureg, NULL); - - void *cs = ctx->create_compute_state(ctx, &state); - ureg_destroy(ureg); - return cs; + struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); + if (!ureg) + return NULL; + + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); + + /* Compute the global thread ID (in idx). */ + struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); + struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); + struct ureg_dst idx = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); + ureg_UMAD(ureg, idx, blk, ureg_imm1u(ureg, 64), tid); + + /* Load 2 pairs of offsets for DCC load & store. 
*/ + struct ureg_src map = ureg_DECL_image(ureg, 0, TGSI_TEXTURE_BUFFER, 0, false, false); + struct ureg_dst offsets = ureg_DECL_temporary(ureg); + struct ureg_src map_load_args[] = {map, ureg_src(idx)}; + + ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &offsets, 1, map_load_args, 2, TGSI_MEMORY_RESTRICT, + TGSI_TEXTURE_BUFFER, 0); + + struct ureg_src dcc_src = ureg_DECL_image(ureg, 1, TGSI_TEXTURE_BUFFER, 0, false, false); + struct ureg_dst dcc_dst = + ureg_dst(ureg_DECL_image(ureg, 2, TGSI_TEXTURE_BUFFER, 0, true, false)); + struct ureg_dst dcc_value[2]; + + /* Copy DCC values: + * dst[offsets.y] = src[offsets.x]; + * dst[offsets.w] = src[offsets.z]; + */ + for (unsigned i = 0; i < 2; i++) { + dcc_value[i] = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); + + struct ureg_src load_args[] = {dcc_src, + ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_X + i * 2)}; + ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dcc_value[i], 1, load_args, 2, TGSI_MEMORY_RESTRICT, + TGSI_TEXTURE_BUFFER, 0); + } + + dcc_dst = ureg_writemask(dcc_dst, TGSI_WRITEMASK_X); + + for (unsigned i = 0; i < 2; i++) { + struct ureg_src store_args[] = {ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_Y + i * 2), + ureg_src(dcc_value[i])}; + ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dcc_dst, 1, store_args, 2, TGSI_MEMORY_RESTRICT, + TGSI_TEXTURE_BUFFER, 0); + } + ureg_END(ureg); + + struct pipe_compute_state state = {}; + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = ureg_get_tokens(ureg, NULL); + + void *cs = ctx->create_compute_state(ctx, &state); + ureg_destroy(ureg); + return cs; } /* Create the compute shader that is used to collect the results. @@ -337,186 +315,185 @@ void *si_create_dcc_retile_cs(struct pipe_context *ctx) */ void *si_create_query_result_cs(struct si_context *sctx) { - /* TEMP[0].xy = accumulated result so far - * TEMP[0].z = result not available - * - * TEMP[1].x = current result index - * TEMP[1].y = current pair index - */ - static const char text_tmpl[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" - "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL BUFFER[0]\n" - "DCL BUFFER[1]\n" - "DCL BUFFER[2]\n" - "DCL CONST[0][0..1]\n" - "DCL TEMP[0..5]\n" - "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n" - "IMM[1] UINT32 {1, 2, 4, 8}\n" - "IMM[2] UINT32 {16, 32, 64, 128}\n" - "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */ - "IMM[4] UINT32 {256, 0, 0, 0}\n" - - "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n" - "UIF TEMP[5]\n" - /* Check result availability. */ - "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n" - "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n" - "MOV TEMP[1], TEMP[0].zzzz\n" - "NOT TEMP[0].z, TEMP[0].zzzz\n" - - /* Load result if available. */ - "UIF TEMP[1]\n" - "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n" - "ENDIF\n" - "ELSE\n" - /* Load previously accumulated result if requested. */ - "MOV TEMP[0], IMM[0].xxxx\n" - "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n" - "UIF TEMP[4]\n" - "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n" - "ENDIF\n" - - "MOV TEMP[1].x, IMM[0].xxxx\n" - "BGNLOOP\n" - /* Break if accumulated result so far is not available. */ - "UIF TEMP[0].zzzz\n" - "BRK\n" - "ENDIF\n" - - /* Break if result_index >= result_count. 
*/ - "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n" - "UIF TEMP[5]\n" - "BRK\n" - "ENDIF\n" - - /* Load fence and check result availability */ - "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n" - "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" - "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n" - "NOT TEMP[0].z, TEMP[0].zzzz\n" - "UIF TEMP[0].zzzz\n" - "BRK\n" - "ENDIF\n" - - "MOV TEMP[1].y, IMM[0].xxxx\n" - "BGNLOOP\n" - /* Load start and end. */ - "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n" - "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n" - "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" - - "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n" - "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n" - - "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n" - - "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n" - "UIF TEMP[5].zzzz\n" - /* Load second start/end half-pair and - * take the difference - */ - "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n" - "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" - "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n" - - "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n" - "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n" - "ENDIF\n" - - "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n" - - /* Increment pair index */ - "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n" - "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n" - "UIF TEMP[5]\n" - "BRK\n" - "ENDIF\n" - "ENDLOOP\n" - - /* Increment result index */ - "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n" - "ENDLOOP\n" - "ENDIF\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n" - "UIF TEMP[4]\n" - /* Store accumulated data for chaining. */ - "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n" - "ELSE\n" - "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n" - "UIF TEMP[4]\n" - /* Store result availability. */ - "NOT TEMP[0].z, TEMP[0]\n" - "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n" - "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" - "UIF TEMP[4]\n" - "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n" - "ENDIF\n" - "ELSE\n" - /* Store result if it is available. */ - "NOT TEMP[4], TEMP[0].zzzz\n" - "UIF TEMP[4]\n" - /* Apply timestamp conversion */ - "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n" - "UIF TEMP[4]\n" - "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n" - "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n" - "ENDIF\n" - - /* Convert to boolean */ - "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n" - "UIF TEMP[4]\n" - "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n" - "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n" - "MOV TEMP[0].y, IMM[0].xxxx\n" - "ENDIF\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" - "UIF TEMP[4]\n" - "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n" - "ELSE\n" - /* Clamping */ - "UIF TEMP[0].yyyy\n" - "MOV TEMP[0].x, IMM[0].wwww\n" - "ENDIF\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n" - "UIF TEMP[4]\n" - "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n" - "ENDIF\n" - - "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" - "ENDIF\n" - "ENDIF\n" - "ENDIF\n" - "ENDIF\n" - - "END\n"; - - char text[sizeof(text_tmpl) + 32]; - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {}; - - /* Hard code the frequency into the shader so that the backend can - * use the full range of optimizations for divide-by-constant. 
- */ - snprintf(text, sizeof(text), text_tmpl, - sctx->screen->info.clock_crystal_freq); - - if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return sctx->b.create_compute_state(&sctx->b, &state); + /* TEMP[0].xy = accumulated result so far + * TEMP[0].z = result not available + * + * TEMP[1].x = current result index + * TEMP[1].y = current pair index + */ + static const char text_tmpl[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL BUFFER[0]\n" + "DCL BUFFER[1]\n" + "DCL BUFFER[2]\n" + "DCL CONST[0][0..1]\n" + "DCL TEMP[0..5]\n" + "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n" + "IMM[1] UINT32 {1, 2, 4, 8}\n" + "IMM[2] UINT32 {16, 32, 64, 128}\n" + "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */ + "IMM[4] UINT32 {256, 0, 0, 0}\n" + + "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n" + "UIF TEMP[5]\n" + /* Check result availability. */ + "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n" + "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n" + "MOV TEMP[1], TEMP[0].zzzz\n" + "NOT TEMP[0].z, TEMP[0].zzzz\n" + + /* Load result if available. */ + "UIF TEMP[1]\n" + "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n" + "ENDIF\n" + "ELSE\n" + /* Load previously accumulated result if requested. */ + "MOV TEMP[0], IMM[0].xxxx\n" + "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n" + "UIF TEMP[4]\n" + "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n" + "ENDIF\n" + + "MOV TEMP[1].x, IMM[0].xxxx\n" + "BGNLOOP\n" + /* Break if accumulated result so far is not available. */ + "UIF TEMP[0].zzzz\n" + "BRK\n" + "ENDIF\n" + + /* Break if result_index >= result_count. */ + "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + + /* Load fence and check result availability */ + "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n" + "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" + "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n" + "NOT TEMP[0].z, TEMP[0].zzzz\n" + "UIF TEMP[0].zzzz\n" + "BRK\n" + "ENDIF\n" + + "MOV TEMP[1].y, IMM[0].xxxx\n" + "BGNLOOP\n" + /* Load start and end. */ + "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n" + "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n" + "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" + + "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n" + "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n" + + "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n" + + "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n" + "UIF TEMP[5].zzzz\n" + /* Load second start/end half-pair and + * take the difference + */ + "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n" + "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" + "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n" + + "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n" + "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n" + "ENDIF\n" + + "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n" + + /* Increment pair index */ + "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n" + "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + "ENDLOOP\n" + + /* Increment result index */ + "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n" + "ENDLOOP\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n" + "UIF TEMP[4]\n" + /* Store accumulated data for chaining. */ + "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n" + "ELSE\n" + "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n" + "UIF TEMP[4]\n" + /* Store result availability. 
*/ + "NOT TEMP[0].z, TEMP[0]\n" + "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n" + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n" + + "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" + "UIF TEMP[4]\n" + "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n" + "ENDIF\n" + "ELSE\n" + /* Store result if it is available. */ + "NOT TEMP[4], TEMP[0].zzzz\n" + "UIF TEMP[4]\n" + /* Apply timestamp conversion */ + "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n" + "UIF TEMP[4]\n" + "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n" + "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n" + "ENDIF\n" + + /* Convert to boolean */ + "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n" + "UIF TEMP[4]\n" + "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n" + "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n" + "MOV TEMP[0].y, IMM[0].xxxx\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" + "UIF TEMP[4]\n" + "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n" + "ELSE\n" + /* Clamping */ + "UIF TEMP[0].yyyy\n" + "MOV TEMP[0].x, IMM[0].wwww\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n" + "UIF TEMP[4]\n" + "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n" + "ENDIF\n" + + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + + "END\n"; + + char text[sizeof(text_tmpl) + 32]; + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {}; + + /* Hard code the frequency into the shader so that the backend can + * use the full range of optimizations for divide-by-constant. + */ + snprintf(text, sizeof(text), text_tmpl, sctx->screen->info.clock_crystal_freq); + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return sctx->b.create_compute_state(&sctx->b, &state); } /* Create a compute shader implementing copy_image. 
@@ -524,247 +501,238 @@ void *si_create_query_result_cs(struct si_context *sctx) */ void *si_create_copy_image_compute_shader(struct pipe_context *ctx) { - static const char text[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n" - "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL SV[0], THREAD_ID\n" - "DCL SV[1], BLOCK_ID\n" - "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" - "DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" - "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw - "DCL TEMP[0..4], LOCAL\n" - "IMM[0] UINT32 {8, 1, 0, 0}\n" - "MOV TEMP[0].xyz, CONST[0][0].xyzw\n" - "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n" - "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n" - "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" - "MOV TEMP[4].xyz, CONST[0][1].xyzw\n" - "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[4].xyzx\n" - "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" - "END\n"; - - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {0}; - - if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return ctx->create_compute_state(ctx, &state); + static const char text[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw + "DCL TEMP[0..4], LOCAL\n" + "IMM[0] UINT32 {8, 1, 0, 0}\n" + "MOV TEMP[0].xyz, CONST[0][0].xyzw\n" + "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n" + "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n" + "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "MOV TEMP[4].xyz, CONST[0][1].xyzw\n" + "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[4].xyzx\n" + "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); } void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx) { - static const char text[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" - "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL SV[0], THREAD_ID\n" - "DCL SV[1], BLOCK_ID\n" - "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" - "DCL IMAGE[1], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" - "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw - "DCL TEMP[0..4], LOCAL\n" - "IMM[0] UINT32 {64, 1, 0, 0}\n" - "MOV TEMP[0].xy, CONST[0][0].xzzw\n" - "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n" - "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n" - "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" - "MOV TEMP[4].xy, CONST[0][1].xzzw\n" - "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[4].xyzx\n" - "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" - "END\n"; - - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {0}; - - if (!tgsi_text_translate(text, tokens, 
ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return ctx->create_compute_state(ctx, &state); + static const char text[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL IMAGE[1], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw + "DCL TEMP[0..4], LOCAL\n" + "IMM[0] UINT32 {64, 1, 0, 0}\n" + "MOV TEMP[0].xy, CONST[0][0].xzzw\n" + "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n" + "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n" + "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "MOV TEMP[4].xy, CONST[0][1].xzzw\n" + "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[4].xyzx\n" + "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); } void *si_clear_render_target_shader(struct pipe_context *ctx) { - static const char text[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n" - "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL SV[0], THREAD_ID\n" - "DCL SV[1], BLOCK_ID\n" - "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" - "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw - "DCL TEMP[0..3], LOCAL\n" - "IMM[0] UINT32 {8, 1, 0, 0}\n" - "MOV TEMP[0].xyz, CONST[0][0].xyzw\n" - "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n" - "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n" - "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n" - "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" - "END\n"; - - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {0}; - - if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return ctx->create_compute_state(ctx, &state); + static const char text[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw + "DCL TEMP[0..3], LOCAL\n" + "IMM[0] UINT32 {8, 1, 0, 0}\n" + "MOV TEMP[0].xyz, CONST[0][0].xyzw\n" + "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n" + "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n" + "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n" + "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); } /* TODO: Didn't really test 1D_ARRAY */ void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx) { - static const char text[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" - 
"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL SV[0], THREAD_ID\n" - "DCL SV[1], BLOCK_ID\n" - "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" - "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw - "DCL TEMP[0..3], LOCAL\n" - "IMM[0] UINT32 {64, 1, 0, 0}\n" - "MOV TEMP[0].xy, CONST[0][0].xzzw\n" - "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n" - "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n" - "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n" - "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" - "END\n"; - - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {0}; - - if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return ctx->create_compute_state(ctx, &state); + static const char text[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw + "DCL TEMP[0..3], LOCAL\n" + "IMM[0] UINT32 {64, 1, 0, 0}\n" + "MOV TEMP[0].xy, CONST[0][0].xzzw\n" + "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n" + "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n" + "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n" + "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); } void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx) { - static const char text[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" - "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL SV[0], THREAD_ID\n" - "DCL SV[1], BLOCK_ID\n" - "DCL BUFFER[0]\n" - "DCL CONST[0][0..0]\n" // 0:xyzw - "DCL TEMP[0..0]\n" - "IMM[0] UINT32 {64, 1, 12, 0}\n" - "UMAD TEMP[0].x, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n" - "UMUL TEMP[0].x, TEMP[0].xyzz, IMM[0].zzzz\n" //12 bytes - "STORE BUFFER[0].xyz, TEMP[0].xxxx, CONST[0][0].xyzw\n" - "END\n"; - - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {0}; - - if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return ctx->create_compute_state(ctx, &state); + static const char text[] = "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL BUFFER[0]\n" + "DCL CONST[0][0..0]\n" // 0:xyzw + "DCL TEMP[0..0]\n" + "IMM[0] UINT32 {64, 1, 12, 0}\n" + "UMAD TEMP[0].x, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n" + "UMUL TEMP[0].x, TEMP[0].xyzz, IMM[0].zzzz\n" // 12 bytes + "STORE BUFFER[0].xyz, TEMP[0].xxxx, CONST[0][0].xyzw\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); } - /* Load samples from the image, and copy them to the same image. 
This looks like * a no-op, but it's not. Loads use FMASK, while stores don't, so samples are * reordered to match expanded FMASK. * * After the shader finishes, FMASK should be cleared to identity. */ -void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, - bool is_array) +void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, bool is_array) { - enum tgsi_texture_type target = is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : - TGSI_TEXTURE_2D_MSAA; - struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); - if (!ureg) - return NULL; - - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 8); - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 8); - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); - - /* Compute the image coordinates. */ - struct ureg_src image = ureg_DECL_image(ureg, 0, target, 0, true, false); - struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); - struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); - struct ureg_dst coord = ureg_writemask(ureg_DECL_temporary(ureg), - TGSI_WRITEMASK_XYZW); - ureg_UMAD(ureg, ureg_writemask(coord, TGSI_WRITEMASK_XY), - ureg_swizzle(blk, 0, 1, 1, 1), ureg_imm2u(ureg, 8, 8), - ureg_swizzle(tid, 0, 1, 1, 1)); - if (is_array) { - ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_Z), - ureg_scalar(blk, TGSI_SWIZZLE_Z)); - } - - /* Load samples, resolving FMASK. */ - struct ureg_dst sample[8]; - assert(num_samples <= ARRAY_SIZE(sample)); - - for (unsigned i = 0; i < num_samples; i++) { - sample[i] = ureg_DECL_temporary(ureg); - - ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), - ureg_imm1u(ureg, i)); - - struct ureg_src srcs[] = {image, ureg_src(coord)}; - ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &sample[i], 1, srcs, 2, - TGSI_MEMORY_RESTRICT, target, 0); - } - - /* Store samples, ignoring FMASK. */ - for (unsigned i = 0; i < num_samples; i++) { - ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), - ureg_imm1u(ureg, i)); - - struct ureg_dst dst_image = ureg_dst(image); - struct ureg_src srcs[] = {ureg_src(coord), ureg_src(sample[i])}; - ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst_image, 1, srcs, 2, - TGSI_MEMORY_RESTRICT, target, 0); - } - ureg_END(ureg); - - struct pipe_compute_state state = {}; - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = ureg_get_tokens(ureg, NULL); - - void *cs = ctx->create_compute_state(ctx, &state); - ureg_destroy(ureg); - return cs; + enum tgsi_texture_type target = is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA; + struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); + if (!ureg) + return NULL; + + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 8); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 8); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); + + /* Compute the image coordinates. */ + struct ureg_src image = ureg_DECL_image(ureg, 0, target, 0, true, false); + struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); + struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); + struct ureg_dst coord = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZW); + ureg_UMAD(ureg, ureg_writemask(coord, TGSI_WRITEMASK_XY), ureg_swizzle(blk, 0, 1, 1, 1), + ureg_imm2u(ureg, 8, 8), ureg_swizzle(tid, 0, 1, 1, 1)); + if (is_array) { + ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_Z), ureg_scalar(blk, TGSI_SWIZZLE_Z)); + } + + /* Load samples, resolving FMASK. 
*/ + struct ureg_dst sample[8]; + assert(num_samples <= ARRAY_SIZE(sample)); + + for (unsigned i = 0; i < num_samples; i++) { + sample[i] = ureg_DECL_temporary(ureg); + + ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), ureg_imm1u(ureg, i)); + + struct ureg_src srcs[] = {image, ureg_src(coord)}; + ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &sample[i], 1, srcs, 2, TGSI_MEMORY_RESTRICT, target, + 0); + } + + /* Store samples, ignoring FMASK. */ + for (unsigned i = 0; i < num_samples; i++) { + ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), ureg_imm1u(ureg, i)); + + struct ureg_dst dst_image = ureg_dst(image); + struct ureg_src srcs[] = {ureg_src(coord), ureg_src(sample[i])}; + ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst_image, 1, srcs, 2, TGSI_MEMORY_RESTRICT, + target, 0); + } + ureg_END(ureg); + + struct pipe_compute_state state = {}; + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = ureg_get_tokens(ureg, NULL); + + void *cs = ctx->create_compute_state(ctx, &state); + ureg_destroy(ureg); + return cs; } /* Create the compute shader that is used to collect the results of gfx10+ @@ -798,196 +766,192 @@ void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, */ void *gfx10_create_sh_query_result_cs(struct si_context *sctx) { - /* TEMP[0].x = accumulated result so far - * TEMP[0].y = result missing - * TEMP[0].z = whether we're in overflow mode - */ - static const char text_tmpl[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" - "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL BUFFER[0]\n" - "DCL BUFFER[1]\n" - "DCL BUFFER[2]\n" - "DCL CONST[0][0..0]\n" - "DCL TEMP[0..5]\n" - "IMM[0] UINT32 {0, 7, 0, 4294967295}\n" - "IMM[1] UINT32 {1, 2, 4, 8}\n" - "IMM[2] UINT32 {16, 32, 64, 128}\n" - - /* - acc_result = 0; - acc_missing = 0; - if (chain & 1) { - acc_result = buffer[1][0]; - acc_missing = buffer[1][1]; - } - */ - "MOV TEMP[0].xy, IMM[0].xxxx\n" - "AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n" - "UIF TEMP[5]\n" - "LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n" - "ENDIF\n" - - /* - is_overflow (TEMP[0].z) = (config & 7) >= 2; - result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 0 : result_count; - base_offset (TEMP[1].y) = 0; - for (;;) { - if (!result_remaining) - break; - result_remaining--; - */ - "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" - "USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n" - - "AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n" - "UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n" - "MOV TEMP[1].y, IMM[0].xxxx\n" - - "BGNLOOP\n" - "USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n" - "UIF TEMP[5]\n" - "BRK\n" - "ENDIF\n" - "UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n" - - /* - fence = buffer[0]@(base_offset + 32); - if (!fence) { - acc_missing = ~0u; - break; - } - */ - "UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].yyyy\n" - "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" - "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n" - "UIF TEMP[5]\n" - "MOV TEMP[0].y, TEMP[5].xxxx\n" - "BRK\n" - "ENDIF\n" - - /* - stream_offset (TEMP[2].x) = base_offset + offset; - - if (!(config & 7)) { - acc_result += buffer[0]@stream_offset; - } - */ - "UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n" - - "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" - "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n" - "UIF TEMP[5]\n" - "LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n" - "UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n" - "ENDIF\n" - - /* - if ((config & 7) >= 2) { - count (TEMP[2].y) = (config & 1) ? 
4 : 1; - */ - "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" - "USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n" - "UIF TEMP[5]\n" - "AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n" - "UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n" - - /* - do { - generated = buffer[0]@stream_offset; - emitted = buffer[0]@(stream_offset + 16); - if (generated != emitted) { - acc_result = 1; - result_remaining = 0; - break; - } - - stream_offset += 4; - } while (--count); - */ - "BGNLOOP\n" - "UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n" - "LOAD TEMP[4].x, BUFFER[0], TEMP[2].xxxx\n" - "LOAD TEMP[4].y, BUFFER[0], TEMP[5].xxxx\n" - "USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n" - "UIF TEMP[5]\n" - "MOV TEMP[0].x, IMM[1].xxxx\n" - "MOV TEMP[1].y, IMM[0].xxxx\n" - "BRK\n" - "ENDIF\n" - - "UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n" - "USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n" - "UIF TEMP[5]\n" - "BRK\n" - "ENDIF\n" - "UADD TEMP[2].x, TEMP[2].xxxx, IMM[1].zzzz\n" - "ENDLOOP\n" - "ENDIF\n" - - /* - base_offset += 64; - } // end outer loop - */ - "UADD TEMP[1].y, TEMP[1].yyyy, IMM[2].zzzz\n" - "ENDLOOP\n" - - /* - if (chain & 2) { - buffer[2][0] = acc_result; - buffer[2][1] = acc_missing; - } else { - */ - "AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n" - "UIF TEMP[5]\n" - "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n" - "ELSE\n" - - /* - if ((config & 7) == 1) { - acc_result = acc_missing ? 0 : 1; - acc_missing = 0; - } - */ - "AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n" - "USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n" - "UIF TEMP[5]\n" - "UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n" - "MOV TEMP[0].y, IMM[0].xxxx\n" - "ENDIF\n" - - /* - if (!acc_missing) { - buffer[2][0] = acc_result; - if (config & 8) - buffer[2][1] = 0; - } - */ - "USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n" - "UIF TEMP[5]\n" - "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" - - "AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n" - "UIF TEMP[5]\n" - "STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n" - "ENDIF\n" - "ENDIF\n" - "ENDIF\n" - - "END\n"; - - struct tgsi_token tokens[1024]; - struct pipe_compute_state state = {}; - - if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return NULL; - } - - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; - - return sctx->b.create_compute_state(&sctx->b, &state); + /* TEMP[0].x = accumulated result so far + * TEMP[0].y = result missing + * TEMP[0].z = whether we're in overflow mode + */ + static const char text_tmpl[] = "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL BUFFER[0]\n" + "DCL BUFFER[1]\n" + "DCL BUFFER[2]\n" + "DCL CONST[0][0..0]\n" + "DCL TEMP[0..5]\n" + "IMM[0] UINT32 {0, 7, 0, 4294967295}\n" + "IMM[1] UINT32 {1, 2, 4, 8}\n" + "IMM[2] UINT32 {16, 32, 64, 128}\n" + + /* + acc_result = 0; + acc_missing = 0; + if (chain & 1) { + acc_result = buffer[1][0]; + acc_missing = buffer[1][1]; + } + */ + "MOV TEMP[0].xy, IMM[0].xxxx\n" + "AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n" + "UIF TEMP[5]\n" + "LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n" + "ENDIF\n" + + /* + is_overflow (TEMP[0].z) = (config & 7) >= 2; + result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 
0 : + result_count; base_offset (TEMP[1].y) = 0; for (;;) { if + (!result_remaining) break; result_remaining--; + */ + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" + "USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n" + + "AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n" + "UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n" + "MOV TEMP[1].y, IMM[0].xxxx\n" + + "BGNLOOP\n" + "USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + "UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n" + + /* + fence = buffer[0]@(base_offset + 32); + if (!fence) { + acc_missing = ~0u; + break; + } + */ + "UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].yyyy\n" + "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" + "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "MOV TEMP[0].y, TEMP[5].xxxx\n" + "BRK\n" + "ENDIF\n" + + /* + stream_offset (TEMP[2].x) = base_offset + offset; + + if (!(config & 7)) { + acc_result += buffer[0]@stream_offset; + } + */ + "UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n" + + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" + "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n" + "UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n" + "ENDIF\n" + + /* + if ((config & 7) >= 2) { + count (TEMP[2].y) = (config & 1) ? 4 : 1; + */ + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" + "USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n" + "UIF TEMP[5]\n" + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n" + "UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n" + + /* + do { + generated = buffer[0]@stream_offset; + emitted = buffer[0]@(stream_offset + 16); + if (generated != emitted) { + acc_result = 1; + result_remaining = 0; + break; + } + + stream_offset += 4; + } while (--count); + */ + "BGNLOOP\n" + "UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n" + "LOAD TEMP[4].x, BUFFER[0], TEMP[2].xxxx\n" + "LOAD TEMP[4].y, BUFFER[0], TEMP[5].xxxx\n" + "USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n" + "UIF TEMP[5]\n" + "MOV TEMP[0].x, IMM[1].xxxx\n" + "MOV TEMP[1].y, IMM[0].xxxx\n" + "BRK\n" + "ENDIF\n" + + "UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n" + "USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + "UADD TEMP[2].x, TEMP[2].xxxx, IMM[1].zzzz\n" + "ENDLOOP\n" + "ENDIF\n" + + /* + base_offset += 64; + } // end outer loop + */ + "UADD TEMP[1].y, TEMP[1].yyyy, IMM[2].zzzz\n" + "ENDLOOP\n" + + /* + if (chain & 2) { + buffer[2][0] = acc_result; + buffer[2][1] = acc_missing; + } else { + */ + "AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n" + "UIF TEMP[5]\n" + "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n" + "ELSE\n" + + /* + if ((config & 7) == 1) { + acc_result = acc_missing ? 
0 : 1; + acc_missing = 0; + } + */ + "AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n" + "USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n" + "UIF TEMP[5]\n" + "UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n" + "MOV TEMP[0].y, IMM[0].xxxx\n" + "ENDIF\n" + + /* + if (!acc_missing) { + buffer[2][0] = acc_result; + if (config & 8) + buffer[2][1] = 0; + } + */ + "USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" + + "AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n" + "UIF TEMP[5]\n" + "STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {}; + + if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return sctx->b.create_compute_state(&sctx->b, &state); } diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 46d7c71b2de..60aa0865502 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -23,51 +23,49 @@ */ #include "si_build_pm4.h" -#include "sid.h" #include "si_query.h" - -#include "util/u_dual_blend.h" +#include "sid.h" +#include "util/fast_idiv_by_const.h" #include "util/format/u_format.h" #include "util/format/u_format_s3tc.h" +#include "util/u_dual_blend.h" #include "util/u_memory.h" #include "util/u_resource.h" #include "util/u_upload_mgr.h" -#include "util/fast_idiv_by_const.h" struct gfx10_format { - unsigned img_format:9; + unsigned img_format : 9; - /* Various formats are only supported with workarounds for vertex fetch, - * and some 32_32_32 formats are supported natively, but only for buffers - * (possibly with some image support, actually, but no filtering). */ - bool buffers_only:1; + /* Various formats are only supported with workarounds for vertex fetch, + * and some 32_32_32 formats are supported natively, but only for buffers + * (possibly with some image support, actually, but no filtering). */ + bool buffers_only : 1; }; #include "gfx10_format_table.h" static unsigned si_map_swizzle(unsigned swizzle) { - switch (swizzle) { - case PIPE_SWIZZLE_Y: - return V_008F0C_SQ_SEL_Y; - case PIPE_SWIZZLE_Z: - return V_008F0C_SQ_SEL_Z; - case PIPE_SWIZZLE_W: - return V_008F0C_SQ_SEL_W; - case PIPE_SWIZZLE_0: - return V_008F0C_SQ_SEL_0; - case PIPE_SWIZZLE_1: - return V_008F0C_SQ_SEL_1; - default: /* PIPE_SWIZZLE_X */ - return V_008F0C_SQ_SEL_X; - } + switch (swizzle) { + case PIPE_SWIZZLE_Y: + return V_008F0C_SQ_SEL_Y; + case PIPE_SWIZZLE_Z: + return V_008F0C_SQ_SEL_Z; + case PIPE_SWIZZLE_W: + return V_008F0C_SQ_SEL_W; + case PIPE_SWIZZLE_0: + return V_008F0C_SQ_SEL_0; + case PIPE_SWIZZLE_1: + return V_008F0C_SQ_SEL_1; + default: /* PIPE_SWIZZLE_X */ + return V_008F0C_SQ_SEL_X; + } } /* 12.4 fixed-point */ static unsigned si_pack_float_12p4(float x) { - return x <= 0 ? 0 : - x >= 4096 ? 0xffff : x * 16; + return x <= 0 ? 0 : x >= 4096 ? 0xffff : x * 16; } /* @@ -78,202 +76,191 @@ static unsigned si_pack_float_12p4(float x) */ static void si_emit_cb_render_state(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct si_state_blend *blend = sctx->queued.named.blend; - /* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers, - * but you never know. 
*/ - uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit & - blend->cb_target_mask; - unsigned i; - - /* Avoid a hang that happens when dual source blending is enabled - * but there is not enough color outputs. This is undefined behavior, - * so disable color writes completely. - * - * Reproducible with Unigine Heaven 4.0 and drirc missing. - */ - if (blend->dual_src_blend && - sctx->ps_shader.cso && - (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3) - cb_target_mask = 0; - - /* GFX9: Flush DFSM when CB_TARGET_MASK changes. - * I think we don't have to do anything between IBs. - */ - if (sctx->screen->dpbb_allowed && - sctx->last_cb_target_mask != cb_target_mask) { - sctx->last_cb_target_mask = cb_target_mask; - - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); - } - - unsigned initial_cdw = cs->current.cdw; - radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, - SI_TRACKED_CB_TARGET_MASK, cb_target_mask); - - if (sctx->chip_class >= GFX8) { - /* DCC MSAA workaround. - * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_- - * COMBINER_DISABLE, but that would be more complicated. - */ - bool oc_disable = blend->dcc_msaa_corruption_4bit & cb_target_mask && - sctx->framebuffer.nr_samples >= 2; - unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark; - - radeon_opt_set_context_reg( - sctx, R_028424_CB_DCC_CONTROL, - SI_TRACKED_CB_DCC_CONTROL, - S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) | - S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) | - S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) | - S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->info.has_dcc_constant_encode)); - } - - /* RB+ register settings. */ - if (sctx->screen->info.rbplus_allowed) { - unsigned spi_shader_col_format = - sctx->ps_shader.cso ? - sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format : 0; - unsigned sx_ps_downconvert = 0; - unsigned sx_blend_opt_epsilon = 0; - unsigned sx_blend_opt_control = 0; - - for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - struct si_surface *surf = - (struct si_surface*)sctx->framebuffer.state.cbufs[i]; - unsigned format, swap, spi_format, colormask; - bool has_alpha, has_rgb; - - if (!surf) { - /* If the color buffer is not set, the driver sets 32_R - * as the SPI color format, because the hw doesn't allow - * holes between color outputs, so also set this to - * enable RB+. - */ - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4); - continue; - } - - format = G_028C70_FORMAT(surf->cb_color_info); - swap = G_028C70_COMP_SWAP(surf->cb_color_info); - spi_format = (spi_shader_col_format >> (i * 4)) & 0xf; - colormask = (cb_target_mask >> (i * 4)) & 0xf; - - /* Set if RGB and A are present. */ - has_alpha = !G_028C74_FORCE_DST_ALPHA_1(surf->cb_color_attrib); - - if (format == V_028C70_COLOR_8 || - format == V_028C70_COLOR_16 || - format == V_028C70_COLOR_32) - has_rgb = !has_alpha; - else - has_rgb = true; - - /* Check the colormask and export format. */ - if (!(colormask & (PIPE_MASK_RGBA & ~PIPE_MASK_A))) - has_rgb = false; - if (!(colormask & PIPE_MASK_A)) - has_alpha = false; - - if (spi_format == V_028714_SPI_SHADER_ZERO) { - has_rgb = false; - has_alpha = false; - } - - /* Disable value checking for disabled channels. 
*/ - if (!has_rgb) - sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4); - if (!has_alpha) - sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4); - - /* Enable down-conversion for 32bpp and smaller formats. */ - switch (format) { - case V_028C70_COLOR_8: - case V_028C70_COLOR_8_8: - case V_028C70_COLOR_8_8_8_8: - /* For 1 and 2-channel formats, use the superset thereof. */ - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR || - spi_format == V_028714_SPI_SHADER_UINT16_ABGR || - spi_format == V_028714_SPI_SHADER_SINT16_ABGR) { - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4); - sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4); - } - break; - - case V_028C70_COLOR_5_6_5: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4); - sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4); - } - break; - - case V_028C70_COLOR_1_5_5_5: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4); - sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4); - } - break; - - case V_028C70_COLOR_4_4_4_4: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4); - sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4); - } - break; - - case V_028C70_COLOR_32: - if (swap == V_028C70_SWAP_STD && - spi_format == V_028714_SPI_SHADER_32_R) - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4); - else if (swap == V_028C70_SWAP_ALT_REV && - spi_format == V_028714_SPI_SHADER_32_AR) - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4); - break; - - case V_028C70_COLOR_16: - case V_028C70_COLOR_16_16: - /* For 1-channel formats, use the superset thereof. */ - if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR || - spi_format == V_028714_SPI_SHADER_SNORM16_ABGR || - spi_format == V_028714_SPI_SHADER_UINT16_ABGR || - spi_format == V_028714_SPI_SHADER_SINT16_ABGR) { - if (swap == V_028C70_SWAP_STD || - swap == V_028C70_SWAP_STD_REV) - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4); - else - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4); - } - break; - - case V_028C70_COLOR_10_11_11: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4); - break; - - case V_028C70_COLOR_2_10_10_10: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { - sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4); - sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4); - } - break; - } - } - - /* If there are no color outputs, the first color export is - * always enabled as 32_R, so also set this to enable RB+. - */ - if (!sx_ps_downconvert) - sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R; - - /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */ - radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, - SI_TRACKED_SX_PS_DOWNCONVERT, - sx_ps_downconvert, sx_blend_opt_epsilon, - sx_blend_opt_control); - } - if (initial_cdw != cs->current.cdw) - sctx->context_roll = true; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct si_state_blend *blend = sctx->queued.named.blend; + /* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers, + * but you never know. 
*/ + uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_mask; + unsigned i; + + /* Avoid a hang that happens when dual source blending is enabled + * but there is not enough color outputs. This is undefined behavior, + * so disable color writes completely. + * + * Reproducible with Unigine Heaven 4.0 and drirc missing. + */ + if (blend->dual_src_blend && sctx->ps_shader.cso && + (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3) + cb_target_mask = 0; + + /* GFX9: Flush DFSM when CB_TARGET_MASK changes. + * I think we don't have to do anything between IBs. + */ + if (sctx->screen->dpbb_allowed && sctx->last_cb_target_mask != cb_target_mask) { + sctx->last_cb_target_mask = cb_target_mask; + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + } + + unsigned initial_cdw = cs->current.cdw; + radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK, + cb_target_mask); + + if (sctx->chip_class >= GFX8) { + /* DCC MSAA workaround. + * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_- + * COMBINER_DISABLE, but that would be more complicated. + */ + bool oc_disable = + blend->dcc_msaa_corruption_4bit & cb_target_mask && sctx->framebuffer.nr_samples >= 2; + unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark; + + radeon_opt_set_context_reg( + sctx, R_028424_CB_DCC_CONTROL, SI_TRACKED_CB_DCC_CONTROL, + S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) | + S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) | + S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) | + S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->info.has_dcc_constant_encode)); + } + + /* RB+ register settings. */ + if (sctx->screen->info.rbplus_allowed) { + unsigned spi_shader_col_format = + sctx->ps_shader.cso ? sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format + : 0; + unsigned sx_ps_downconvert = 0; + unsigned sx_blend_opt_epsilon = 0; + unsigned sx_blend_opt_control = 0; + + for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + struct si_surface *surf = (struct si_surface *)sctx->framebuffer.state.cbufs[i]; + unsigned format, swap, spi_format, colormask; + bool has_alpha, has_rgb; + + if (!surf) { + /* If the color buffer is not set, the driver sets 32_R + * as the SPI color format, because the hw doesn't allow + * holes between color outputs, so also set this to + * enable RB+. + */ + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4); + continue; + } + + format = G_028C70_FORMAT(surf->cb_color_info); + swap = G_028C70_COMP_SWAP(surf->cb_color_info); + spi_format = (spi_shader_col_format >> (i * 4)) & 0xf; + colormask = (cb_target_mask >> (i * 4)) & 0xf; + + /* Set if RGB and A are present. */ + has_alpha = !G_028C74_FORCE_DST_ALPHA_1(surf->cb_color_attrib); + + if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || + format == V_028C70_COLOR_32) + has_rgb = !has_alpha; + else + has_rgb = true; + + /* Check the colormask and export format. */ + if (!(colormask & (PIPE_MASK_RGBA & ~PIPE_MASK_A))) + has_rgb = false; + if (!(colormask & PIPE_MASK_A)) + has_alpha = false; + + if (spi_format == V_028714_SPI_SHADER_ZERO) { + has_rgb = false; + has_alpha = false; + } + + /* Disable value checking for disabled channels. 
*/ + if (!has_rgb) + sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4); + if (!has_alpha) + sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4); + + /* Enable down-conversion for 32bpp and smaller formats. */ + switch (format) { + case V_028C70_COLOR_8: + case V_028C70_COLOR_8_8: + case V_028C70_COLOR_8_8_8_8: + /* For 1 and 2-channel formats, use the superset thereof. */ + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR || + spi_format == V_028714_SPI_SHADER_UINT16_ABGR || + spi_format == V_028714_SPI_SHADER_SINT16_ABGR) { + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4); + sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4); + } + break; + + case V_028C70_COLOR_5_6_5: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4); + sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4); + } + break; + + case V_028C70_COLOR_1_5_5_5: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4); + sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4); + } + break; + + case V_028C70_COLOR_4_4_4_4: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4); + sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4); + } + break; + + case V_028C70_COLOR_32: + if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R) + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4); + else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR) + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4); + break; + + case V_028C70_COLOR_16: + case V_028C70_COLOR_16_16: + /* For 1-channel formats, use the superset thereof. */ + if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR || + spi_format == V_028714_SPI_SHADER_SNORM16_ABGR || + spi_format == V_028714_SPI_SHADER_UINT16_ABGR || + spi_format == V_028714_SPI_SHADER_SINT16_ABGR) { + if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV) + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4); + else + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4); + } + break; + + case V_028C70_COLOR_10_11_11: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4); + break; + + case V_028C70_COLOR_2_10_10_10: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4); + sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4); + } + break; + } + } + + /* If there are no color outputs, the first color export is + * always enabled as 32_R, so also set this to enable RB+. 
+ */ + if (!sx_ps_downconvert) + sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R; + + /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */ + radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT, + sx_ps_downconvert, sx_blend_opt_epsilon, sx_blend_opt_control); + } + if (initial_cdw != cs->current.cdw) + sctx->context_roll = true; } /* @@ -282,551 +269,507 @@ static void si_emit_cb_render_state(struct si_context *sctx) static uint32_t si_translate_blend_function(int blend_func) { - switch (blend_func) { - case PIPE_BLEND_ADD: - return V_028780_COMB_DST_PLUS_SRC; - case PIPE_BLEND_SUBTRACT: - return V_028780_COMB_SRC_MINUS_DST; - case PIPE_BLEND_REVERSE_SUBTRACT: - return V_028780_COMB_DST_MINUS_SRC; - case PIPE_BLEND_MIN: - return V_028780_COMB_MIN_DST_SRC; - case PIPE_BLEND_MAX: - return V_028780_COMB_MAX_DST_SRC; - default: - PRINT_ERR("Unknown blend function %d\n", blend_func); - assert(0); - break; - } - return 0; + switch (blend_func) { + case PIPE_BLEND_ADD: + return V_028780_COMB_DST_PLUS_SRC; + case PIPE_BLEND_SUBTRACT: + return V_028780_COMB_SRC_MINUS_DST; + case PIPE_BLEND_REVERSE_SUBTRACT: + return V_028780_COMB_DST_MINUS_SRC; + case PIPE_BLEND_MIN: + return V_028780_COMB_MIN_DST_SRC; + case PIPE_BLEND_MAX: + return V_028780_COMB_MAX_DST_SRC; + default: + PRINT_ERR("Unknown blend function %d\n", blend_func); + assert(0); + break; + } + return 0; } static uint32_t si_translate_blend_factor(int blend_fact) { - switch (blend_fact) { - case PIPE_BLENDFACTOR_ONE: - return V_028780_BLEND_ONE; - case PIPE_BLENDFACTOR_SRC_COLOR: - return V_028780_BLEND_SRC_COLOR; - case PIPE_BLENDFACTOR_SRC_ALPHA: - return V_028780_BLEND_SRC_ALPHA; - case PIPE_BLENDFACTOR_DST_ALPHA: - return V_028780_BLEND_DST_ALPHA; - case PIPE_BLENDFACTOR_DST_COLOR: - return V_028780_BLEND_DST_COLOR; - case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: - return V_028780_BLEND_SRC_ALPHA_SATURATE; - case PIPE_BLENDFACTOR_CONST_COLOR: - return V_028780_BLEND_CONSTANT_COLOR; - case PIPE_BLENDFACTOR_CONST_ALPHA: - return V_028780_BLEND_CONSTANT_ALPHA; - case PIPE_BLENDFACTOR_ZERO: - return V_028780_BLEND_ZERO; - case PIPE_BLENDFACTOR_INV_SRC_COLOR: - return V_028780_BLEND_ONE_MINUS_SRC_COLOR; - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - return V_028780_BLEND_ONE_MINUS_SRC_ALPHA; - case PIPE_BLENDFACTOR_INV_DST_ALPHA: - return V_028780_BLEND_ONE_MINUS_DST_ALPHA; - case PIPE_BLENDFACTOR_INV_DST_COLOR: - return V_028780_BLEND_ONE_MINUS_DST_COLOR; - case PIPE_BLENDFACTOR_INV_CONST_COLOR: - return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR; - case PIPE_BLENDFACTOR_INV_CONST_ALPHA: - return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA; - case PIPE_BLENDFACTOR_SRC1_COLOR: - return V_028780_BLEND_SRC1_COLOR; - case PIPE_BLENDFACTOR_SRC1_ALPHA: - return V_028780_BLEND_SRC1_ALPHA; - case PIPE_BLENDFACTOR_INV_SRC1_COLOR: - return V_028780_BLEND_INV_SRC1_COLOR; - case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: - return V_028780_BLEND_INV_SRC1_ALPHA; - default: - PRINT_ERR("Bad blend factor %d not supported!\n", blend_fact); - assert(0); - break; - } - return 0; + switch (blend_fact) { + case PIPE_BLENDFACTOR_ONE: + return V_028780_BLEND_ONE; + case PIPE_BLENDFACTOR_SRC_COLOR: + return V_028780_BLEND_SRC_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return V_028780_BLEND_SRC_ALPHA; + case PIPE_BLENDFACTOR_DST_ALPHA: + return V_028780_BLEND_DST_ALPHA; + case PIPE_BLENDFACTOR_DST_COLOR: + return V_028780_BLEND_DST_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return V_028780_BLEND_SRC_ALPHA_SATURATE; + case 
PIPE_BLENDFACTOR_CONST_COLOR: + return V_028780_BLEND_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return V_028780_BLEND_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_ZERO: + return V_028780_BLEND_ZERO; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return V_028780_BLEND_ONE_MINUS_SRC_COLOR; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return V_028780_BLEND_ONE_MINUS_SRC_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return V_028780_BLEND_ONE_MINUS_DST_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return V_028780_BLEND_ONE_MINUS_DST_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_SRC1_COLOR: + return V_028780_BLEND_SRC1_COLOR; + case PIPE_BLENDFACTOR_SRC1_ALPHA: + return V_028780_BLEND_SRC1_ALPHA; + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + return V_028780_BLEND_INV_SRC1_COLOR; + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + return V_028780_BLEND_INV_SRC1_ALPHA; + default: + PRINT_ERR("Bad blend factor %d not supported!\n", blend_fact); + assert(0); + break; + } + return 0; } static uint32_t si_translate_blend_opt_function(int blend_func) { - switch (blend_func) { - case PIPE_BLEND_ADD: - return V_028760_OPT_COMB_ADD; - case PIPE_BLEND_SUBTRACT: - return V_028760_OPT_COMB_SUBTRACT; - case PIPE_BLEND_REVERSE_SUBTRACT: - return V_028760_OPT_COMB_REVSUBTRACT; - case PIPE_BLEND_MIN: - return V_028760_OPT_COMB_MIN; - case PIPE_BLEND_MAX: - return V_028760_OPT_COMB_MAX; - default: - return V_028760_OPT_COMB_BLEND_DISABLED; - } + switch (blend_func) { + case PIPE_BLEND_ADD: + return V_028760_OPT_COMB_ADD; + case PIPE_BLEND_SUBTRACT: + return V_028760_OPT_COMB_SUBTRACT; + case PIPE_BLEND_REVERSE_SUBTRACT: + return V_028760_OPT_COMB_REVSUBTRACT; + case PIPE_BLEND_MIN: + return V_028760_OPT_COMB_MIN; + case PIPE_BLEND_MAX: + return V_028760_OPT_COMB_MAX; + default: + return V_028760_OPT_COMB_BLEND_DISABLED; + } } static uint32_t si_translate_blend_opt_factor(int blend_fact, bool is_alpha) { - switch (blend_fact) { - case PIPE_BLENDFACTOR_ZERO: - return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL; - case PIPE_BLENDFACTOR_ONE: - return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE; - case PIPE_BLENDFACTOR_SRC_COLOR: - return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0 - : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0; - case PIPE_BLENDFACTOR_INV_SRC_COLOR: - return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1 - : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1; - case PIPE_BLENDFACTOR_SRC_ALPHA: - return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0; - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1; - case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: - return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE - : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0; - default: - return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; - } + switch (blend_fact) { + case PIPE_BLENDFACTOR_ZERO: + return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL; + case PIPE_BLENDFACTOR_ONE: + return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE; + case PIPE_BLENDFACTOR_SRC_COLOR: + return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0 + : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return is_alpha ? 
V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1 + : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE + : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0; + default: + return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; + } } -static void si_blend_check_commutativity(struct si_screen *sscreen, - struct si_state_blend *blend, - enum pipe_blend_func func, - enum pipe_blendfactor src, - enum pipe_blendfactor dst, - unsigned chanmask) +static void si_blend_check_commutativity(struct si_screen *sscreen, struct si_state_blend *blend, + enum pipe_blend_func func, enum pipe_blendfactor src, + enum pipe_blendfactor dst, unsigned chanmask) { - /* Src factor is allowed when it does not depend on Dst */ - static const uint32_t src_allowed = - (1u << PIPE_BLENDFACTOR_ONE) | - (1u << PIPE_BLENDFACTOR_SRC_COLOR) | - (1u << PIPE_BLENDFACTOR_SRC_ALPHA) | - (1u << PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) | - (1u << PIPE_BLENDFACTOR_CONST_COLOR) | - (1u << PIPE_BLENDFACTOR_CONST_ALPHA) | - (1u << PIPE_BLENDFACTOR_SRC1_COLOR) | - (1u << PIPE_BLENDFACTOR_SRC1_ALPHA) | - (1u << PIPE_BLENDFACTOR_ZERO) | - (1u << PIPE_BLENDFACTOR_INV_SRC_COLOR) | - (1u << PIPE_BLENDFACTOR_INV_SRC_ALPHA) | - (1u << PIPE_BLENDFACTOR_INV_CONST_COLOR) | - (1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) | - (1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) | - (1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA); - - if (dst == PIPE_BLENDFACTOR_ONE && - (src_allowed & (1u << src))) { - /* Addition is commutative, but floating point addition isn't - * associative: subtle changes can be introduced via different - * rounding. - * - * Out-of-order is also non-deterministic, which means that - * this breaks OpenGL invariance requirements. So only enable - * out-of-order additive blending if explicitly allowed by a - * setting. - */ - if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN || - (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add)) - blend->commutative_4bit |= chanmask; - } + /* Src factor is allowed when it does not depend on Dst */ + static const uint32_t src_allowed = + (1u << PIPE_BLENDFACTOR_ONE) | (1u << PIPE_BLENDFACTOR_SRC_COLOR) | + (1u << PIPE_BLENDFACTOR_SRC_ALPHA) | (1u << PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) | + (1u << PIPE_BLENDFACTOR_CONST_COLOR) | (1u << PIPE_BLENDFACTOR_CONST_ALPHA) | + (1u << PIPE_BLENDFACTOR_SRC1_COLOR) | (1u << PIPE_BLENDFACTOR_SRC1_ALPHA) | + (1u << PIPE_BLENDFACTOR_ZERO) | (1u << PIPE_BLENDFACTOR_INV_SRC_COLOR) | + (1u << PIPE_BLENDFACTOR_INV_SRC_ALPHA) | (1u << PIPE_BLENDFACTOR_INV_CONST_COLOR) | + (1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) | (1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) | + (1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA); + + if (dst == PIPE_BLENDFACTOR_ONE && (src_allowed & (1u << src))) { + /* Addition is commutative, but floating point addition isn't + * associative: subtle changes can be introduced via different + * rounding. + * + * Out-of-order is also non-deterministic, which means that + * this breaks OpenGL invariance requirements. So only enable + * out-of-order additive blending if explicitly allowed by a + * setting. 
+ */ + if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN || + (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add)) + blend->commutative_4bit |= chanmask; + } } /** * Get rid of DST in the blend factors by commuting the operands: * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC) */ -static void si_blend_remove_dst(unsigned *func, unsigned *src_factor, - unsigned *dst_factor, unsigned expected_dst, - unsigned replacement_src) +static void si_blend_remove_dst(unsigned *func, unsigned *src_factor, unsigned *dst_factor, + unsigned expected_dst, unsigned replacement_src) { - if (*src_factor == expected_dst && - *dst_factor == PIPE_BLENDFACTOR_ZERO) { - *src_factor = PIPE_BLENDFACTOR_ZERO; - *dst_factor = replacement_src; - - /* Commuting the operands requires reversing subtractions. */ - if (*func == PIPE_BLEND_SUBTRACT) - *func = PIPE_BLEND_REVERSE_SUBTRACT; - else if (*func == PIPE_BLEND_REVERSE_SUBTRACT) - *func = PIPE_BLEND_SUBTRACT; - } + if (*src_factor == expected_dst && *dst_factor == PIPE_BLENDFACTOR_ZERO) { + *src_factor = PIPE_BLENDFACTOR_ZERO; + *dst_factor = replacement_src; + + /* Commuting the operands requires reversing subtractions. */ + if (*func == PIPE_BLEND_SUBTRACT) + *func = PIPE_BLEND_REVERSE_SUBTRACT; + else if (*func == PIPE_BLEND_REVERSE_SUBTRACT) + *func = PIPE_BLEND_SUBTRACT; + } } static bool si_blend_factor_uses_dst(unsigned factor) { - return factor == PIPE_BLENDFACTOR_DST_COLOR || - factor == PIPE_BLENDFACTOR_DST_ALPHA || - factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || - factor == PIPE_BLENDFACTOR_INV_DST_ALPHA || - factor == PIPE_BLENDFACTOR_INV_DST_COLOR; + return factor == PIPE_BLENDFACTOR_DST_COLOR || factor == PIPE_BLENDFACTOR_DST_ALPHA || + factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || + factor == PIPE_BLENDFACTOR_INV_DST_ALPHA || factor == PIPE_BLENDFACTOR_INV_DST_COLOR; } static void *si_create_blend_state_mode(struct pipe_context *ctx, - const struct pipe_blend_state *state, - unsigned mode) + const struct pipe_blend_state *state, unsigned mode) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_state_blend *blend = CALLOC_STRUCT(si_state_blend); - struct si_pm4_state *pm4 = &blend->pm4; - uint32_t sx_mrt_blend_opt[8] = {0}; - uint32_t color_control = 0; - bool logicop_enable = state->logicop_enable && - state->logicop_func != PIPE_LOGICOP_COPY; - - if (!blend) - return NULL; - - blend->alpha_to_coverage = state->alpha_to_coverage; - blend->alpha_to_one = state->alpha_to_one; - blend->dual_src_blend = util_blend_state_is_dual(state, 0); - blend->logicop_enable = logicop_enable; - - if (logicop_enable) { - color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4)); - } else { - color_control |= S_028808_ROP3(0xcc); - } - - si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK, - S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) | - S_028B70_ALPHA_TO_MASK_OFFSET0(3) | - S_028B70_ALPHA_TO_MASK_OFFSET1(1) | - S_028B70_ALPHA_TO_MASK_OFFSET2(0) | - S_028B70_ALPHA_TO_MASK_OFFSET3(2) | - S_028B70_OFFSET_ROUND(1)); - - if (state->alpha_to_coverage) - blend->need_src_alpha_4bit |= 0xf; - - blend->cb_target_mask = 0; - blend->cb_target_enabled_4bit = 0; - - for (int i = 0; i < 8; i++) { - /* state->rt entries > 0 only written if independent blending */ - const int j = state->independent_blend_enable ? 
i : 0; - - unsigned eqRGB = state->rt[j].rgb_func; - unsigned srcRGB = state->rt[j].rgb_src_factor; - unsigned dstRGB = state->rt[j].rgb_dst_factor; - unsigned eqA = state->rt[j].alpha_func; - unsigned srcA = state->rt[j].alpha_src_factor; - unsigned dstA = state->rt[j].alpha_dst_factor; - - unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt; - unsigned blend_cntl = 0; - - sx_mrt_blend_opt[i] = - S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | - S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED); - - /* Only set dual source blending for MRT0 to avoid a hang. */ - if (i >= 1 && blend->dual_src_blend) { - /* Vulkan does this for dual source blending. */ - if (i == 1) - blend_cntl |= S_028780_ENABLE(1); - - si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); - continue; - } - - /* Only addition and subtraction equations are supported with - * dual source blending. - */ - if (blend->dual_src_blend && - (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX || - eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX)) { - assert(!"Unsupported equation for dual source blending"); - si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); - continue; - } - - /* cb_render_state will disable unused ones */ - blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 * i); - if (state->rt[j].colormask) - blend->cb_target_enabled_4bit |= 0xf << (4 * i); - - if (!state->rt[j].colormask || !state->rt[j].blend_enable) { - si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); - continue; - } - - si_blend_check_commutativity(sctx->screen, blend, - eqRGB, srcRGB, dstRGB, 0x7 << (4 * i)); - si_blend_check_commutativity(sctx->screen, blend, - eqA, srcA, dstA, 0x8 << (4 * i)); - - /* Blending optimizations for RB+. - * These transformations don't change the behavior. - * - * First, get rid of DST in the blend factors: - * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC) - */ - si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, - PIPE_BLENDFACTOR_DST_COLOR, - PIPE_BLENDFACTOR_SRC_COLOR); - si_blend_remove_dst(&eqA, &srcA, &dstA, - PIPE_BLENDFACTOR_DST_COLOR, - PIPE_BLENDFACTOR_SRC_COLOR); - si_blend_remove_dst(&eqA, &srcA, &dstA, - PIPE_BLENDFACTOR_DST_ALPHA, - PIPE_BLENDFACTOR_SRC_ALPHA); - - /* Look up the ideal settings from tables. */ - srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false); - dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false); - srcA_opt = si_translate_blend_opt_factor(srcA, true); - dstA_opt = si_translate_blend_opt_factor(dstA, true); - - /* Handle interdependencies. */ - if (si_blend_factor_uses_dst(srcRGB)) - dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; - if (si_blend_factor_uses_dst(srcA)) - dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; - - if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE && - (dstRGB == PIPE_BLENDFACTOR_ZERO || - dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA || - dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE)) - dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0; - - /* Set the final value. */ - sx_mrt_blend_opt[i] = - S_028760_COLOR_SRC_OPT(srcRGB_opt) | - S_028760_COLOR_DST_OPT(dstRGB_opt) | - S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) | - S_028760_ALPHA_SRC_OPT(srcA_opt) | - S_028760_ALPHA_DST_OPT(dstA_opt) | - S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA)); - - /* Set blend state. 
*/ - blend_cntl |= S_028780_ENABLE(1); - blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB)); - blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB)); - blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB)); - - if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) { - blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1); - blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA)); - blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA)); - blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA)); - } - si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); - - blend->blend_enable_4bit |= 0xfu << (i * 4); - - if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14) - blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4); - - /* This is only important for formats without alpha. */ - if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA || - dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA || - srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || - dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || - srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA || - dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA) - blend->need_src_alpha_4bit |= 0xfu << (i * 4); - } - - if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14 && logicop_enable) - blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit; - - if (blend->cb_target_mask) { - color_control |= S_028808_MODE(mode); - } else { - color_control |= S_028808_MODE(V_028808_CB_DISABLE); - } - - if (sctx->screen->info.rbplus_allowed) { - /* Disable RB+ blend optimizations for dual source blending. - * Vulkan does this. - */ - if (blend->dual_src_blend) { - for (int i = 0; i < 8; i++) { - sx_mrt_blend_opt[i] = - S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | - S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE); - } - } - - for (int i = 0; i < 8; i++) - si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4, - sx_mrt_blend_opt[i]); - - /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. 
*/ - if (blend->dual_src_blend || logicop_enable || - mode == V_028808_CB_RESOLVE) - color_control |= S_028808_DISABLE_DUAL_QUAD(1); - } - - si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control); - return blend; + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_blend *blend = CALLOC_STRUCT(si_state_blend); + struct si_pm4_state *pm4 = &blend->pm4; + uint32_t sx_mrt_blend_opt[8] = {0}; + uint32_t color_control = 0; + bool logicop_enable = state->logicop_enable && state->logicop_func != PIPE_LOGICOP_COPY; + + if (!blend) + return NULL; + + blend->alpha_to_coverage = state->alpha_to_coverage; + blend->alpha_to_one = state->alpha_to_one; + blend->dual_src_blend = util_blend_state_is_dual(state, 0); + blend->logicop_enable = logicop_enable; + + if (logicop_enable) { + color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4)); + } else { + color_control |= S_028808_ROP3(0xcc); + } + + si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK, + S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) | + S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) | + S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) | + S_028B70_OFFSET_ROUND(1)); + + if (state->alpha_to_coverage) + blend->need_src_alpha_4bit |= 0xf; + + blend->cb_target_mask = 0; + blend->cb_target_enabled_4bit = 0; + + for (int i = 0; i < 8; i++) { + /* state->rt entries > 0 only written if independent blending */ + const int j = state->independent_blend_enable ? i : 0; + + unsigned eqRGB = state->rt[j].rgb_func; + unsigned srcRGB = state->rt[j].rgb_src_factor; + unsigned dstRGB = state->rt[j].rgb_dst_factor; + unsigned eqA = state->rt[j].alpha_func; + unsigned srcA = state->rt[j].alpha_src_factor; + unsigned dstA = state->rt[j].alpha_dst_factor; + + unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt; + unsigned blend_cntl = 0; + + sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | + S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED); + + /* Only set dual source blending for MRT0 to avoid a hang. */ + if (i >= 1 && blend->dual_src_blend) { + /* Vulkan does this for dual source blending. */ + if (i == 1) + blend_cntl |= S_028780_ENABLE(1); + + si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); + continue; + } + + /* Only addition and subtraction equations are supported with + * dual source blending. + */ + if (blend->dual_src_blend && (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX || + eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX)) { + assert(!"Unsupported equation for dual source blending"); + si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); + continue; + } + + /* cb_render_state will disable unused ones */ + blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 * i); + if (state->rt[j].colormask) + blend->cb_target_enabled_4bit |= 0xf << (4 * i); + + if (!state->rt[j].colormask || !state->rt[j].blend_enable) { + si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); + continue; + } + + si_blend_check_commutativity(sctx->screen, blend, eqRGB, srcRGB, dstRGB, 0x7 << (4 * i)); + si_blend_check_commutativity(sctx->screen, blend, eqA, srcA, dstA, 0x8 << (4 * i)); + + /* Blending optimizations for RB+. + * These transformations don't change the behavior. 
+ * + * First, get rid of DST in the blend factors: + * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC) + */ + si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, PIPE_BLENDFACTOR_DST_COLOR, + PIPE_BLENDFACTOR_SRC_COLOR); + si_blend_remove_dst(&eqA, &srcA, &dstA, PIPE_BLENDFACTOR_DST_COLOR, + PIPE_BLENDFACTOR_SRC_COLOR); + si_blend_remove_dst(&eqA, &srcA, &dstA, PIPE_BLENDFACTOR_DST_ALPHA, + PIPE_BLENDFACTOR_SRC_ALPHA); + + /* Look up the ideal settings from tables. */ + srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false); + dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false); + srcA_opt = si_translate_blend_opt_factor(srcA, true); + dstA_opt = si_translate_blend_opt_factor(dstA, true); + + /* Handle interdependencies. */ + if (si_blend_factor_uses_dst(srcRGB)) + dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; + if (si_blend_factor_uses_dst(srcA)) + dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; + + if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE && + (dstRGB == PIPE_BLENDFACTOR_ZERO || dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA || + dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE)) + dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0; + + /* Set the final value. */ + sx_mrt_blend_opt[i] = S_028760_COLOR_SRC_OPT(srcRGB_opt) | + S_028760_COLOR_DST_OPT(dstRGB_opt) | + S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) | + S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) | + S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA)); + + /* Set blend state. */ + blend_cntl |= S_028780_ENABLE(1); + blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB)); + blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB)); + blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB)); + + if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) { + blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1); + blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA)); + blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA)); + blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA)); + } + si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); + + blend->blend_enable_4bit |= 0xfu << (i * 4); + + if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14) + blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4); + + /* This is only important for formats without alpha. */ + if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA || dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA || + srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || + dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || + srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA || dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA) + blend->need_src_alpha_4bit |= 0xfu << (i * 4); + } + + if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14 && logicop_enable) + blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit; + + if (blend->cb_target_mask) { + color_control |= S_028808_MODE(mode); + } else { + color_control |= S_028808_MODE(V_028808_CB_DISABLE); + } + + if (sctx->screen->info.rbplus_allowed) { + /* Disable RB+ blend optimizations for dual source blending. + * Vulkan does this. 
+ */ + if (blend->dual_src_blend) { + for (int i = 0; i < 8; i++) { + sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | + S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE); + } + } + + for (int i = 0; i < 8; i++) + si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4, sx_mrt_blend_opt[i]); + + /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. */ + if (blend->dual_src_blend || logicop_enable || mode == V_028808_CB_RESOLVE) + color_control |= S_028808_DISABLE_DUAL_QUAD(1); + } + + si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control); + return blend; } -static void *si_create_blend_state(struct pipe_context *ctx, - const struct pipe_blend_state *state) +static void *si_create_blend_state(struct pipe_context *ctx, const struct pipe_blend_state *state) { - return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL); + return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL); } static void si_bind_blend_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_blend *old_blend = sctx->queued.named.blend; - struct si_state_blend *blend = (struct si_state_blend *)state; - - if (!blend) - blend = (struct si_state_blend *)sctx->noop_blend; - - si_pm4_bind_state(sctx, blend, blend); - - if (old_blend->cb_target_mask != blend->cb_target_mask || - old_blend->dual_src_blend != blend->dual_src_blend || - (old_blend->dcc_msaa_corruption_4bit != blend->dcc_msaa_corruption_4bit && - sctx->framebuffer.nr_samples >= 2 && - sctx->screen->dcc_msaa_allowed)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - - if (old_blend->cb_target_mask != blend->cb_target_mask || - old_blend->alpha_to_coverage != blend->alpha_to_coverage || - old_blend->alpha_to_one != blend->alpha_to_one || - old_blend->dual_src_blend != blend->dual_src_blend || - old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) - sctx->do_update_shaders = true; - - if (sctx->screen->dpbb_allowed && - (old_blend->alpha_to_coverage != blend->alpha_to_coverage || - old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - - if (sctx->screen->has_out_of_order_rast && - ((old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit || - old_blend->commutative_4bit != blend->commutative_4bit || - old_blend->logicop_enable != blend->logicop_enable))) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_blend *old_blend = sctx->queued.named.blend; + struct si_state_blend *blend = (struct si_state_blend *)state; + + if (!blend) + blend = (struct si_state_blend *)sctx->noop_blend; + + si_pm4_bind_state(sctx, blend, blend); + + if (old_blend->cb_target_mask != blend->cb_target_mask || + old_blend->dual_src_blend != blend->dual_src_blend || + (old_blend->dcc_msaa_corruption_4bit != blend->dcc_msaa_corruption_4bit && + sctx->framebuffer.nr_samples >= 2 && sctx->screen->dcc_msaa_allowed)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); + + if (old_blend->cb_target_mask != blend->cb_target_mask || + old_blend->alpha_to_coverage != blend->alpha_to_coverage || + old_blend->alpha_to_one != blend->alpha_to_one || + old_blend->dual_src_blend != blend->dual_src_blend || + 
old_blend->blend_enable_4bit != blend->blend_enable_4bit || + old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) + sctx->do_update_shaders = true; + + if (sctx->screen->dpbb_allowed && + (old_blend->alpha_to_coverage != blend->alpha_to_coverage || + old_blend->blend_enable_4bit != blend->blend_enable_4bit || + old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + + if (sctx->screen->has_out_of_order_rast && + ((old_blend->blend_enable_4bit != blend->blend_enable_4bit || + old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit || + old_blend->commutative_4bit != blend->commutative_4bit || + old_blend->logicop_enable != blend->logicop_enable))) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); } static void si_delete_blend_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (sctx->queued.named.blend == state) - si_bind_blend_state(ctx, sctx->noop_blend); + if (sctx->queued.named.blend == state) + si_bind_blend_state(ctx, sctx->noop_blend); - si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state); + si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state); } -static void si_set_blend_color(struct pipe_context *ctx, - const struct pipe_blend_color *state) +static void si_set_blend_color(struct pipe_context *ctx, const struct pipe_blend_color *state) { - struct si_context *sctx = (struct si_context *)ctx; - static const struct pipe_blend_color zeros; + struct si_context *sctx = (struct si_context *)ctx; + static const struct pipe_blend_color zeros; - sctx->blend_color.state = *state; - sctx->blend_color.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0; - si_mark_atom_dirty(sctx, &sctx->atoms.s.blend_color); + sctx->blend_color.state = *state; + sctx->blend_color.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0; + si_mark_atom_dirty(sctx, &sctx->atoms.s.blend_color); } static void si_emit_blend_color(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); - radeon_emit_array(cs, (uint32_t*)sctx->blend_color.state.color, 4); + radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); + radeon_emit_array(cs, (uint32_t *)sctx->blend_color.state.color, 4); } /* * Clipping */ -static void si_set_clip_state(struct pipe_context *ctx, - const struct pipe_clip_state *state) +static void si_set_clip_state(struct pipe_context *ctx, const struct pipe_clip_state *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_constant_buffer cb; - static const struct pipe_clip_state zeros; - - if (memcmp(&sctx->clip_state.state, state, sizeof(*state)) == 0) - return; - - sctx->clip_state.state = *state; - sctx->clip_state.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0; - si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_state); - - cb.buffer = NULL; - cb.user_buffer = state->ucp; - cb.buffer_offset = 0; - cb.buffer_size = 4*4*8; - si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &cb); - pipe_resource_reference(&cb.buffer, NULL); + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_constant_buffer cb; + static const struct pipe_clip_state zeros; + + if (memcmp(&sctx->clip_state.state, state, sizeof(*state)) == 0) + return; + + sctx->clip_state.state = *state; + sctx->clip_state.any_nonzeros = memcmp(state, &zeros, 
sizeof(*state)) != 0; + si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_state); + + cb.buffer = NULL; + cb.user_buffer = state->ucp; + cb.buffer_offset = 0; + cb.buffer_size = 4 * 4 * 8; + si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &cb); + pipe_resource_reference(&cb.buffer, NULL); } static void si_emit_clip_state(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6*4); - radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4); + radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6 * 4); + radeon_emit_array(cs, (uint32_t *)sctx->clip_state.state.ucp, 6 * 4); } static void si_emit_clip_regs(struct si_context *sctx) { - struct si_shader *vs = si_get_vs_state(sctx); - struct si_shader_selector *vs_sel = vs->selector; - struct si_shader_info *info = &vs_sel->info; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned window_space = - info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; - unsigned clipdist_mask = vs_sel->clipdist_mask; - unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS; - unsigned culldist_mask = vs_sel->culldist_mask; - unsigned total_mask; - - if (vs->key.opt.clip_disable) { - assert(!info->culldist_writemask); - clipdist_mask = 0; - culldist_mask = 0; - } - total_mask = clipdist_mask | culldist_mask; - - /* Clip distances on points have no effect, so need to be implemented - * as cull distances. This applies for the clipvertex case as well. - * - * Setting this for primitives other than points should have no adverse - * effects. - */ - clipdist_mask &= rs->clip_plane_enable; - culldist_mask |= clipdist_mask; - - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) | - S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | - clipdist_mask | (culldist_mask << 8); - - if (sctx->chip_class >= GFX10) { - radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, - pa_cl_cntl, - ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); - } else { - radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, - vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl); - } - radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, - SI_TRACKED_PA_CL_CLIP_CNTL, - rs->pa_cl_clip_cntl | - ucp_mask | - S_028810_CLIP_DISABLE(window_space)); - - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; + struct si_shader *vs = si_get_vs_state(sctx); + struct si_shader_selector *vs_sel = vs->selector; + struct si_shader_info *info = &vs_sel->info; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + unsigned clipdist_mask = vs_sel->clipdist_mask; + unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS; + unsigned culldist_mask = vs_sel->culldist_mask; + unsigned total_mask; + + if (vs->key.opt.clip_disable) { + assert(!info->culldist_writemask); + clipdist_mask = 0; + culldist_mask = 0; + } + total_mask = clipdist_mask | culldist_mask; + + /* Clip distances on points have no effect, so need to be implemented + * as cull distances. This applies for the clipvertex case as well. + * + * Setting this for primitives other than points should have no adverse + * effects. 
+ */ + clipdist_mask &= rs->clip_plane_enable; + culldist_mask |= clipdist_mask; + + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) | + S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | clipdist_mask | + (culldist_mask << 8); + + if (sctx->chip_class >= GFX10) { + radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, + SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, pa_cl_cntl, + ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); + } else { + radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, + vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl); + } + radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, + rs->pa_cl_clip_cntl | ucp_mask | S_028810_CLIP_DISABLE(window_space)); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; } /* @@ -834,28 +777,28 @@ static void si_emit_clip_regs(struct si_context *sctx) */ static void si_update_poly_offset_state(struct si_context *sctx) { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - - if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) { - si_pm4_bind_state(sctx, poly_offset, NULL); - return; - } - - /* Use the user format, not db_render_format, so that the polygon - * offset behaves as expected by applications. - */ - switch (sctx->framebuffer.state.zsbuf->texture->format) { - case PIPE_FORMAT_Z16_UNORM: - si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]); - break; - default: /* 24-bit */ - si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]); - break; - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]); - break; - } + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) { + si_pm4_bind_state(sctx, poly_offset, NULL); + return; + } + + /* Use the user format, not db_render_format, so that the polygon + * offset behaves as expected by applications. 
+ */ + switch (sctx->framebuffer.state.zsbuf->texture->format) { + case PIPE_FORMAT_Z16_UNORM: + si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]); + break; + default: /* 24-bit */ + si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]); + break; + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]); + break; + } } /* @@ -864,245 +807,228 @@ static void si_update_poly_offset_state(struct si_context *sctx) static uint32_t si_translate_fill(uint32_t func) { - switch(func) { - case PIPE_POLYGON_MODE_FILL: - return V_028814_X_DRAW_TRIANGLES; - case PIPE_POLYGON_MODE_LINE: - return V_028814_X_DRAW_LINES; - case PIPE_POLYGON_MODE_POINT: - return V_028814_X_DRAW_POINTS; - default: - assert(0); - return V_028814_X_DRAW_POINTS; - } + switch (func) { + case PIPE_POLYGON_MODE_FILL: + return V_028814_X_DRAW_TRIANGLES; + case PIPE_POLYGON_MODE_LINE: + return V_028814_X_DRAW_LINES; + case PIPE_POLYGON_MODE_POINT: + return V_028814_X_DRAW_POINTS; + default: + assert(0); + return V_028814_X_DRAW_POINTS; + } } -static void *si_create_rs_state(struct pipe_context *ctx, - const struct pipe_rasterizer_state *state) +static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rasterizer_state *state) { - struct si_screen *sscreen = ((struct si_context *)ctx)->screen; - struct si_state_rasterizer *rs = CALLOC_STRUCT(si_state_rasterizer); - struct si_pm4_state *pm4 = &rs->pm4; - unsigned tmp, i; - float psize_min, psize_max; - - if (!rs) { - return NULL; - } - - if (!state->front_ccw) { - rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT); - rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK); - } else { - rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT); - rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK); - } - rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far; - rs->provoking_vertex_first = state->flatshade_first; - rs->scissor_enable = state->scissor; - rs->clip_halfz = state->clip_halfz; - rs->two_side = state->light_twoside; - rs->multisample_enable = state->multisample; - rs->force_persample_interp = state->force_persample_interp; - rs->clip_plane_enable = state->clip_plane_enable; - rs->half_pixel_center = state->half_pixel_center; - rs->line_stipple_enable = state->line_stipple_enable; - rs->poly_stipple_enable = state->poly_stipple_enable; - rs->line_smooth = state->line_smooth; - rs->line_width = state->line_width; - rs->poly_smooth = state->poly_smooth; - rs->uses_poly_offset = state->offset_point || state->offset_line || - state->offset_tri; - rs->clamp_fragment_color = state->clamp_fragment_color; - rs->clamp_vertex_color = state->clamp_vertex_color; - rs->flatshade = state->flatshade; - rs->flatshade_first = state->flatshade_first; - rs->sprite_coord_enable = state->sprite_coord_enable; - rs->rasterizer_discard = state->rasterizer_discard; - rs->polygon_mode_enabled = (state->fill_front != PIPE_POLYGON_MODE_FILL && - !(state->cull_face & PIPE_FACE_FRONT)) || - (state->fill_back != PIPE_POLYGON_MODE_FILL && - !(state->cull_face & PIPE_FACE_BACK)); - rs->polygon_mode_is_lines = (state->fill_front == PIPE_POLYGON_MODE_LINE && - !(state->cull_face & PIPE_FACE_FRONT)) || - (state->fill_back == PIPE_POLYGON_MODE_LINE && - !(state->cull_face & PIPE_FACE_BACK)); - rs->pa_sc_line_stipple = state->line_stipple_enable ? 
- S_028A0C_LINE_PATTERN(state->line_stipple_pattern) | - S_028A0C_REPEAT_COUNT(state->line_stipple_factor) : 0; - rs->pa_cl_clip_cntl = - S_028810_DX_CLIP_SPACE_DEF(state->clip_halfz) | - S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip_near) | - S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip_far) | - S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) | - S_028810_DX_LINEAR_ATTR_CLIP_ENA(1); - - si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0, - S_0286D4_FLAT_SHADE_ENA(1) | - S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) | - S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) | - S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) | - S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) | - S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) | - S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT)); - - /* point size 12.4 fixed point */ - tmp = (unsigned)(state->point_size * 8.0); - si_pm4_set_reg(pm4, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp)); - - if (state->point_size_per_vertex) { - psize_min = util_get_min_point_size(state); - psize_max = SI_MAX_POINT_SIZE; - } else { - /* Force the point size to be as if the vertex output was disabled. */ - psize_min = state->point_size; - psize_max = state->point_size; - } - rs->max_point_size = psize_max; - - /* Divide by two, because 0.5 = 1 pixel. */ - si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX, - S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min/2)) | - S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max/2))); - - si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL, - S_028A08_WIDTH(si_pack_float_12p4(state->line_width/2))); - si_pm4_set_reg(pm4, R_028A48_PA_SC_MODE_CNTL_0, - S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) | - S_028A48_MSAA_ENABLE(state->multisample || - state->poly_smooth || - state->line_smooth) | - S_028A48_VPORT_SCISSOR_ENABLE(1) | - S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9)); - - si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp)); - si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL, - S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) | - S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | - S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) | - S_028814_FACE(!state->front_ccw) | - S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) | - S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) | - S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) | - S_028814_POLY_MODE(rs->polygon_mode_enabled) | - S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | - S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back))); - - if (!rs->uses_poly_offset) - return rs; - - rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state)); - if (!rs->pm4_poly_offset) { - FREE(rs); - return NULL; - } - - /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. 
*/ - for (i = 0; i < 3; i++) { - struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i]; - float offset_units = state->offset_units; - float offset_scale = state->offset_scale * 16.0f; - uint32_t pa_su_poly_offset_db_fmt_cntl = 0; - - if (!state->offset_units_unscaled) { - switch (i) { - case 0: /* 16-bit zbuffer */ - offset_units *= 4.0f; - pa_su_poly_offset_db_fmt_cntl = - S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16); - break; - case 1: /* 24-bit zbuffer */ - offset_units *= 2.0f; - pa_su_poly_offset_db_fmt_cntl = - S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24); - break; - case 2: /* 32-bit zbuffer */ - offset_units *= 1.0f; - pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | - S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1); - break; - } - } - - si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, - fui(offset_scale)); - si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, - fui(offset_units)); - si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, - fui(offset_scale)); - si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, - fui(offset_units)); - si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, - pa_su_poly_offset_db_fmt_cntl); - } - - return rs; + struct si_screen *sscreen = ((struct si_context *)ctx)->screen; + struct si_state_rasterizer *rs = CALLOC_STRUCT(si_state_rasterizer); + struct si_pm4_state *pm4 = &rs->pm4; + unsigned tmp, i; + float psize_min, psize_max; + + if (!rs) { + return NULL; + } + + if (!state->front_ccw) { + rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT); + rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK); + } else { + rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT); + rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK); + } + rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far; + rs->provoking_vertex_first = state->flatshade_first; + rs->scissor_enable = state->scissor; + rs->clip_halfz = state->clip_halfz; + rs->two_side = state->light_twoside; + rs->multisample_enable = state->multisample; + rs->force_persample_interp = state->force_persample_interp; + rs->clip_plane_enable = state->clip_plane_enable; + rs->half_pixel_center = state->half_pixel_center; + rs->line_stipple_enable = state->line_stipple_enable; + rs->poly_stipple_enable = state->poly_stipple_enable; + rs->line_smooth = state->line_smooth; + rs->line_width = state->line_width; + rs->poly_smooth = state->poly_smooth; + rs->uses_poly_offset = state->offset_point || state->offset_line || state->offset_tri; + rs->clamp_fragment_color = state->clamp_fragment_color; + rs->clamp_vertex_color = state->clamp_vertex_color; + rs->flatshade = state->flatshade; + rs->flatshade_first = state->flatshade_first; + rs->sprite_coord_enable = state->sprite_coord_enable; + rs->rasterizer_discard = state->rasterizer_discard; + rs->polygon_mode_enabled = + (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) || + (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK)); + rs->polygon_mode_is_lines = + (state->fill_front == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_FRONT)) || + (state->fill_back == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_BACK)); + rs->pa_sc_line_stipple = state->line_stipple_enable + ? 
S_028A0C_LINE_PATTERN(state->line_stipple_pattern) | + S_028A0C_REPEAT_COUNT(state->line_stipple_factor) + : 0; + rs->pa_cl_clip_cntl = S_028810_DX_CLIP_SPACE_DEF(state->clip_halfz) | + S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip_near) | + S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip_far) | + S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) | + S_028810_DX_LINEAR_ATTR_CLIP_ENA(1); + + si_pm4_set_reg( + pm4, R_0286D4_SPI_INTERP_CONTROL_0, + S_0286D4_FLAT_SHADE_ENA(1) | S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) | + S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) | + S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) | + S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) | + S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) | + S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT)); + + /* point size 12.4 fixed point */ + tmp = (unsigned)(state->point_size * 8.0); + si_pm4_set_reg(pm4, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp)); + + if (state->point_size_per_vertex) { + psize_min = util_get_min_point_size(state); + psize_max = SI_MAX_POINT_SIZE; + } else { + /* Force the point size to be as if the vertex output was disabled. */ + psize_min = state->point_size; + psize_max = state->point_size; + } + rs->max_point_size = psize_max; + + /* Divide by two, because 0.5 = 1 pixel. */ + si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX, + S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min / 2)) | + S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max / 2))); + + si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL, + S_028A08_WIDTH(si_pack_float_12p4(state->line_width / 2))); + si_pm4_set_reg( + pm4, R_028A48_PA_SC_MODE_CNTL_0, + S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) | + S_028A48_MSAA_ENABLE(state->multisample || state->poly_smooth || state->line_smooth) | + S_028A48_VPORT_SCISSOR_ENABLE(1) | + S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9)); + + si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp)); + si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL, + S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) | + S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | + S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) | + S_028814_FACE(!state->front_ccw) | + S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) | + S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) | + S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) | + S_028814_POLY_MODE(rs->polygon_mode_enabled) | + S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | + S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back))); + + if (!rs->uses_poly_offset) + return rs; + + rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state)); + if (!rs->pm4_poly_offset) { + FREE(rs); + return NULL; + } + + /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. 
*/ + for (i = 0; i < 3; i++) { + struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i]; + float offset_units = state->offset_units; + float offset_scale = state->offset_scale * 16.0f; + uint32_t pa_su_poly_offset_db_fmt_cntl = 0; + + if (!state->offset_units_unscaled) { + switch (i) { + case 0: /* 16-bit zbuffer */ + offset_units *= 4.0f; + pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16); + break; + case 1: /* 24-bit zbuffer */ + offset_units *= 2.0f; + pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24); + break; + case 2: /* 32-bit zbuffer */ + offset_units *= 1.0f; + pa_su_poly_offset_db_fmt_cntl = + S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1); + break; + } + } + + si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, fui(offset_scale)); + si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, fui(offset_units)); + si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, fui(offset_scale)); + si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, fui(offset_units)); + si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl); + } + + return rs; } static void si_bind_rs_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_rasterizer *old_rs = - (struct si_state_rasterizer*)sctx->queued.named.rasterizer; - struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state; - - if (!rs) - rs = (struct si_state_rasterizer *)sctx->discard_rasterizer_state; - - if (old_rs->multisample_enable != rs->multisample_enable) { - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - - /* Update the small primitive filter workaround if necessary. */ - if (sctx->screen->info.has_msaa_sample_loc_bug && - sctx->framebuffer.nr_samples > 1) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); - } - - sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR; - sctx->current_vs_state |= S_VS_STATE_CLAMP_VERTEX_COLOR(rs->clamp_vertex_color); - - si_pm4_bind_state(sctx, rasterizer, rs); - si_update_poly_offset_state(sctx); - - if (old_rs->scissor_enable != rs->scissor_enable) - si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors); - - if (old_rs->line_width != rs->line_width || - old_rs->max_point_size != rs->max_point_size || - old_rs->half_pixel_center != rs->half_pixel_center) - si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); - - if (old_rs->clip_halfz != rs->clip_halfz) - si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports); - - if (old_rs->clip_plane_enable != rs->clip_plane_enable || - old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl) - si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); - - if (old_rs->clip_plane_enable != rs->clip_plane_enable || - old_rs->rasterizer_discard != rs->rasterizer_discard || - old_rs->sprite_coord_enable != rs->sprite_coord_enable || - old_rs->flatshade != rs->flatshade || - old_rs->two_side != rs->two_side || - old_rs->multisample_enable != rs->multisample_enable || - old_rs->poly_stipple_enable != rs->poly_stipple_enable || - old_rs->poly_smooth != rs->poly_smooth || - old_rs->line_smooth != rs->line_smooth || - old_rs->clamp_fragment_color != rs->clamp_fragment_color || - old_rs->force_persample_interp != rs->force_persample_interp) - sctx->do_update_shaders = true; + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_rasterizer *old_rs = (struct si_state_rasterizer *)sctx->queued.named.rasterizer; + struct si_state_rasterizer *rs = (struct 
si_state_rasterizer *)state; + + if (!rs) + rs = (struct si_state_rasterizer *)sctx->discard_rasterizer_state; + + if (old_rs->multisample_enable != rs->multisample_enable) { + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + + /* Update the small primitive filter workaround if necessary. */ + if (sctx->screen->info.has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); + } + + sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR; + sctx->current_vs_state |= S_VS_STATE_CLAMP_VERTEX_COLOR(rs->clamp_vertex_color); + + si_pm4_bind_state(sctx, rasterizer, rs); + si_update_poly_offset_state(sctx); + + if (old_rs->scissor_enable != rs->scissor_enable) + si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors); + + if (old_rs->line_width != rs->line_width || old_rs->max_point_size != rs->max_point_size || + old_rs->half_pixel_center != rs->half_pixel_center) + si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); + + if (old_rs->clip_halfz != rs->clip_halfz) + si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports); + + if (old_rs->clip_plane_enable != rs->clip_plane_enable || + old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl) + si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); + + if (old_rs->clip_plane_enable != rs->clip_plane_enable || + old_rs->rasterizer_discard != rs->rasterizer_discard || + old_rs->sprite_coord_enable != rs->sprite_coord_enable || + old_rs->flatshade != rs->flatshade || old_rs->two_side != rs->two_side || + old_rs->multisample_enable != rs->multisample_enable || + old_rs->poly_stipple_enable != rs->poly_stipple_enable || + old_rs->poly_smooth != rs->poly_smooth || old_rs->line_smooth != rs->line_smooth || + old_rs->clamp_fragment_color != rs->clamp_fragment_color || + old_rs->force_persample_interp != rs->force_persample_interp) + sctx->do_update_shaders = true; } static void si_delete_rs_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state; + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state; - if (sctx->queued.named.rasterizer == state) - si_bind_rs_state(ctx, sctx->discard_rasterizer_state); + if (sctx->queued.named.rasterizer == state) + si_bind_rs_state(ctx, sctx->discard_rasterizer_state); - FREE(rs->pm4_poly_offset); - si_pm4_delete_state(sctx, rasterizer, rs); + FREE(rs->pm4_poly_offset); + si_pm4_delete_state(sctx, rasterizer, rs); } /* @@ -1110,81 +1036,75 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state) */ static void si_emit_stencil_ref(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct pipe_stencil_ref *ref = &sctx->stencil_ref.state; - struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; - - radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2); - radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) | - S_028430_STENCILMASK(dsa->valuemask[0]) | - S_028430_STENCILWRITEMASK(dsa->writemask[0]) | - S_028430_STENCILOPVAL(1)); - radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | - S_028434_STENCILMASK_BF(dsa->valuemask[1]) | - S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | - S_028434_STENCILOPVAL_BF(1)); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct pipe_stencil_ref *ref = &sctx->stencil_ref.state; + struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; + + radeon_set_context_reg_seq(cs, 
R_028430_DB_STENCILREFMASK, 2); + radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) | + S_028430_STENCILMASK(dsa->valuemask[0]) | + S_028430_STENCILWRITEMASK(dsa->writemask[0]) | S_028430_STENCILOPVAL(1)); + radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | + S_028434_STENCILMASK_BF(dsa->valuemask[1]) | + S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | + S_028434_STENCILOPVAL_BF(1)); } -static void si_set_stencil_ref(struct pipe_context *ctx, - const struct pipe_stencil_ref *state) +static void si_set_stencil_ref(struct pipe_context *ctx, const struct pipe_stencil_ref *state) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (memcmp(&sctx->stencil_ref.state, state, sizeof(*state)) == 0) - return; + if (memcmp(&sctx->stencil_ref.state, state, sizeof(*state)) == 0) + return; - sctx->stencil_ref.state = *state; - si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); + sctx->stencil_ref.state = *state; + si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); } - /* * DSA */ static uint32_t si_translate_stencil_op(int s_op) { - switch (s_op) { - case PIPE_STENCIL_OP_KEEP: - return V_02842C_STENCIL_KEEP; - case PIPE_STENCIL_OP_ZERO: - return V_02842C_STENCIL_ZERO; - case PIPE_STENCIL_OP_REPLACE: - return V_02842C_STENCIL_REPLACE_TEST; - case PIPE_STENCIL_OP_INCR: - return V_02842C_STENCIL_ADD_CLAMP; - case PIPE_STENCIL_OP_DECR: - return V_02842C_STENCIL_SUB_CLAMP; - case PIPE_STENCIL_OP_INCR_WRAP: - return V_02842C_STENCIL_ADD_WRAP; - case PIPE_STENCIL_OP_DECR_WRAP: - return V_02842C_STENCIL_SUB_WRAP; - case PIPE_STENCIL_OP_INVERT: - return V_02842C_STENCIL_INVERT; - default: - PRINT_ERR("Unknown stencil op %d", s_op); - assert(0); - break; - } - return 0; + switch (s_op) { + case PIPE_STENCIL_OP_KEEP: + return V_02842C_STENCIL_KEEP; + case PIPE_STENCIL_OP_ZERO: + return V_02842C_STENCIL_ZERO; + case PIPE_STENCIL_OP_REPLACE: + return V_02842C_STENCIL_REPLACE_TEST; + case PIPE_STENCIL_OP_INCR: + return V_02842C_STENCIL_ADD_CLAMP; + case PIPE_STENCIL_OP_DECR: + return V_02842C_STENCIL_SUB_CLAMP; + case PIPE_STENCIL_OP_INCR_WRAP: + return V_02842C_STENCIL_ADD_WRAP; + case PIPE_STENCIL_OP_DECR_WRAP: + return V_02842C_STENCIL_SUB_WRAP; + case PIPE_STENCIL_OP_INVERT: + return V_02842C_STENCIL_INVERT; + default: + PRINT_ERR("Unknown stencil op %d", s_op); + assert(0); + break; + } + return 0; } static bool si_dsa_writes_stencil(const struct pipe_stencil_state *s) { - return s->enabled && s->writemask && - (s->fail_op != PIPE_STENCIL_OP_KEEP || - s->zfail_op != PIPE_STENCIL_OP_KEEP || - s->zpass_op != PIPE_STENCIL_OP_KEEP); + return s->enabled && s->writemask && + (s->fail_op != PIPE_STENCIL_OP_KEEP || s->zfail_op != PIPE_STENCIL_OP_KEEP || + s->zpass_op != PIPE_STENCIL_OP_KEEP); } static bool si_order_invariant_stencil_op(enum pipe_stencil_op op) { - /* REPLACE is normally order invariant, except when the stencil - * reference value is written by the fragment shader. Tracking this - * interaction does not seem worth the effort, so be conservative. */ - return op != PIPE_STENCIL_OP_INCR && - op != PIPE_STENCIL_OP_DECR && - op != PIPE_STENCIL_OP_REPLACE; + /* REPLACE is normally order invariant, except when the stencil + * reference value is written by the fragment shader. Tracking this + * interaction does not seem worth the effort, so be conservative. 
*/ + return op != PIPE_STENCIL_OP_INCR && op != PIPE_STENCIL_OP_DECR && op != PIPE_STENCIL_OP_REPLACE; } /* Compute whether, assuming Z writes are disabled, this stencil state is order @@ -1192,325 +1112,304 @@ static bool si_order_invariant_stencil_op(enum pipe_stencil_op op) * final stencil buffer result does not depend on the order of fragments. */ static bool si_order_invariant_stencil_state(const struct pipe_stencil_state *state) { - return !state->enabled || !state->writemask || - /* The following assumes that Z writes are disabled. */ - (state->func == PIPE_FUNC_ALWAYS && - si_order_invariant_stencil_op(state->zpass_op) && - si_order_invariant_stencil_op(state->zfail_op)) || - (state->func == PIPE_FUNC_NEVER && - si_order_invariant_stencil_op(state->fail_op)); + return !state->enabled || !state->writemask || + /* The following assumes that Z writes are disabled. */ + (state->func == PIPE_FUNC_ALWAYS && si_order_invariant_stencil_op(state->zpass_op) && + si_order_invariant_stencil_op(state->zfail_op)) || + (state->func == PIPE_FUNC_NEVER && si_order_invariant_stencil_op(state->fail_op)); } static void *si_create_dsa_state(struct pipe_context *ctx, - const struct pipe_depth_stencil_alpha_state *state) + const struct pipe_depth_stencil_alpha_state *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa); - struct si_pm4_state *pm4 = &dsa->pm4; - unsigned db_depth_control; - uint32_t db_stencil_control = 0; - - if (!dsa) { - return NULL; - } - - dsa->stencil_ref.valuemask[0] = state->stencil[0].valuemask; - dsa->stencil_ref.valuemask[1] = state->stencil[1].valuemask; - dsa->stencil_ref.writemask[0] = state->stencil[0].writemask; - dsa->stencil_ref.writemask[1] = state->stencil[1].writemask; - - db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) | - S_028800_Z_WRITE_ENABLE(state->depth.writemask) | - S_028800_ZFUNC(state->depth.func) | - S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test); - - /* stencil */ - if (state->stencil[0].enabled) { - db_depth_control |= S_028800_STENCIL_ENABLE(1); - db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func); - db_stencil_control |= S_02842C_STENCILFAIL(si_translate_stencil_op(state->stencil[0].fail_op)); - db_stencil_control |= S_02842C_STENCILZPASS(si_translate_stencil_op(state->stencil[0].zpass_op)); - db_stencil_control |= S_02842C_STENCILZFAIL(si_translate_stencil_op(state->stencil[0].zfail_op)); - - if (state->stencil[1].enabled) { - db_depth_control |= S_028800_BACKFACE_ENABLE(1); - db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func); - db_stencil_control |= S_02842C_STENCILFAIL_BF(si_translate_stencil_op(state->stencil[1].fail_op)); - db_stencil_control |= S_02842C_STENCILZPASS_BF(si_translate_stencil_op(state->stencil[1].zpass_op)); - db_stencil_control |= S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(state->stencil[1].zfail_op)); - } - } - - /* alpha */ - if (state->alpha.enabled) { - dsa->alpha_func = state->alpha.func; - - si_pm4_set_reg(pm4, R_00B030_SPI_SHADER_USER_DATA_PS_0 + - SI_SGPR_ALPHA_REF * 4, fui(state->alpha.ref_value)); - } else { - dsa->alpha_func = PIPE_FUNC_ALWAYS; - } - - si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control); - if (state->stencil[0].enabled) - si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control); - if (state->depth.bounds_test) { - si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min)); - si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, 
fui(state->depth.bounds_max)); - } - - dsa->depth_enabled = state->depth.enabled; - dsa->depth_write_enabled = state->depth.enabled && - state->depth.writemask; - dsa->stencil_enabled = state->stencil[0].enabled; - dsa->stencil_write_enabled = state->stencil[0].enabled && - (si_dsa_writes_stencil(&state->stencil[0]) || - si_dsa_writes_stencil(&state->stencil[1])); - dsa->db_can_write = dsa->depth_write_enabled || - dsa->stencil_write_enabled; - - bool zfunc_is_ordered = - state->depth.func == PIPE_FUNC_NEVER || - state->depth.func == PIPE_FUNC_LESS || - state->depth.func == PIPE_FUNC_LEQUAL || - state->depth.func == PIPE_FUNC_GREATER || - state->depth.func == PIPE_FUNC_GEQUAL; - - bool nozwrite_and_order_invariant_stencil = - !dsa->db_can_write || - (!dsa->depth_write_enabled && - si_order_invariant_stencil_state(&state->stencil[0]) && - si_order_invariant_stencil_state(&state->stencil[1])); - - dsa->order_invariance[1].zs = - nozwrite_and_order_invariant_stencil || - (!dsa->stencil_write_enabled && zfunc_is_ordered); - dsa->order_invariance[0].zs = !dsa->depth_write_enabled || zfunc_is_ordered; - - dsa->order_invariance[1].pass_set = - nozwrite_and_order_invariant_stencil || - (!dsa->stencil_write_enabled && - (state->depth.func == PIPE_FUNC_ALWAYS || - state->depth.func == PIPE_FUNC_NEVER)); - dsa->order_invariance[0].pass_set = - !dsa->depth_write_enabled || - (state->depth.func == PIPE_FUNC_ALWAYS || - state->depth.func == PIPE_FUNC_NEVER); - - dsa->order_invariance[1].pass_last = - sctx->screen->assume_no_z_fights && - !dsa->stencil_write_enabled && - dsa->depth_write_enabled && zfunc_is_ordered; - dsa->order_invariance[0].pass_last = - sctx->screen->assume_no_z_fights && - dsa->depth_write_enabled && zfunc_is_ordered; - - return dsa; + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa); + struct si_pm4_state *pm4 = &dsa->pm4; + unsigned db_depth_control; + uint32_t db_stencil_control = 0; + + if (!dsa) { + return NULL; + } + + dsa->stencil_ref.valuemask[0] = state->stencil[0].valuemask; + dsa->stencil_ref.valuemask[1] = state->stencil[1].valuemask; + dsa->stencil_ref.writemask[0] = state->stencil[0].writemask; + dsa->stencil_ref.writemask[1] = state->stencil[1].writemask; + + db_depth_control = + S_028800_Z_ENABLE(state->depth.enabled) | S_028800_Z_WRITE_ENABLE(state->depth.writemask) | + S_028800_ZFUNC(state->depth.func) | S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test); + + /* stencil */ + if (state->stencil[0].enabled) { + db_depth_control |= S_028800_STENCIL_ENABLE(1); + db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func); + db_stencil_control |= + S_02842C_STENCILFAIL(si_translate_stencil_op(state->stencil[0].fail_op)); + db_stencil_control |= + S_02842C_STENCILZPASS(si_translate_stencil_op(state->stencil[0].zpass_op)); + db_stencil_control |= + S_02842C_STENCILZFAIL(si_translate_stencil_op(state->stencil[0].zfail_op)); + + if (state->stencil[1].enabled) { + db_depth_control |= S_028800_BACKFACE_ENABLE(1); + db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func); + db_stencil_control |= + S_02842C_STENCILFAIL_BF(si_translate_stencil_op(state->stencil[1].fail_op)); + db_stencil_control |= + S_02842C_STENCILZPASS_BF(si_translate_stencil_op(state->stencil[1].zpass_op)); + db_stencil_control |= + S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(state->stencil[1].zfail_op)); + } + } + + /* alpha */ + if (state->alpha.enabled) { + dsa->alpha_func = state->alpha.func; + + si_pm4_set_reg(pm4, 
R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4, + fui(state->alpha.ref_value)); + } else { + dsa->alpha_func = PIPE_FUNC_ALWAYS; + } + + si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control); + if (state->stencil[0].enabled) + si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control); + if (state->depth.bounds_test) { + si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min)); + si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max)); + } + + dsa->depth_enabled = state->depth.enabled; + dsa->depth_write_enabled = state->depth.enabled && state->depth.writemask; + dsa->stencil_enabled = state->stencil[0].enabled; + dsa->stencil_write_enabled = + state->stencil[0].enabled && + (si_dsa_writes_stencil(&state->stencil[0]) || si_dsa_writes_stencil(&state->stencil[1])); + dsa->db_can_write = dsa->depth_write_enabled || dsa->stencil_write_enabled; + + bool zfunc_is_ordered = + state->depth.func == PIPE_FUNC_NEVER || state->depth.func == PIPE_FUNC_LESS || + state->depth.func == PIPE_FUNC_LEQUAL || state->depth.func == PIPE_FUNC_GREATER || + state->depth.func == PIPE_FUNC_GEQUAL; + + bool nozwrite_and_order_invariant_stencil = + !dsa->db_can_write || + (!dsa->depth_write_enabled && si_order_invariant_stencil_state(&state->stencil[0]) && + si_order_invariant_stencil_state(&state->stencil[1])); + + dsa->order_invariance[1].zs = + nozwrite_and_order_invariant_stencil || (!dsa->stencil_write_enabled && zfunc_is_ordered); + dsa->order_invariance[0].zs = !dsa->depth_write_enabled || zfunc_is_ordered; + + dsa->order_invariance[1].pass_set = + nozwrite_and_order_invariant_stencil || + (!dsa->stencil_write_enabled && + (state->depth.func == PIPE_FUNC_ALWAYS || state->depth.func == PIPE_FUNC_NEVER)); + dsa->order_invariance[0].pass_set = + !dsa->depth_write_enabled || + (state->depth.func == PIPE_FUNC_ALWAYS || state->depth.func == PIPE_FUNC_NEVER); + + dsa->order_invariance[1].pass_last = sctx->screen->assume_no_z_fights && + !dsa->stencil_write_enabled && dsa->depth_write_enabled && + zfunc_is_ordered; + dsa->order_invariance[0].pass_last = + sctx->screen->assume_no_z_fights && dsa->depth_write_enabled && zfunc_is_ordered; + + return dsa; } static void si_bind_dsa_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_dsa *old_dsa = sctx->queued.named.dsa; - struct si_state_dsa *dsa = state; - - if (!dsa) - dsa = (struct si_state_dsa *)sctx->noop_dsa; - - si_pm4_bind_state(sctx, dsa, dsa); - - if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part, - sizeof(struct si_dsa_stencil_ref_part)) != 0) { - sctx->stencil_ref.dsa_part = dsa->stencil_ref; - si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); - } - - if (old_dsa->alpha_func != dsa->alpha_func) - sctx->do_update_shaders = true; - - if (sctx->screen->dpbb_allowed && - ((old_dsa->depth_enabled != dsa->depth_enabled || - old_dsa->stencil_enabled != dsa->stencil_enabled || - old_dsa->db_can_write != dsa->db_can_write))) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - - if (sctx->screen->has_out_of_order_rast && - (memcmp(old_dsa->order_invariance, dsa->order_invariance, - sizeof(old_dsa->order_invariance)))) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_dsa *old_dsa = sctx->queued.named.dsa; + struct si_state_dsa *dsa = state; + + if (!dsa) + dsa = (struct si_state_dsa *)sctx->noop_dsa; + + si_pm4_bind_state(sctx, dsa, 
dsa); + + if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part, + sizeof(struct si_dsa_stencil_ref_part)) != 0) { + sctx->stencil_ref.dsa_part = dsa->stencil_ref; + si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); + } + + if (old_dsa->alpha_func != dsa->alpha_func) + sctx->do_update_shaders = true; + + if (sctx->screen->dpbb_allowed && ((old_dsa->depth_enabled != dsa->depth_enabled || + old_dsa->stencil_enabled != dsa->stencil_enabled || + old_dsa->db_can_write != dsa->db_can_write))) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + + if (sctx->screen->has_out_of_order_rast && + (memcmp(old_dsa->order_invariance, dsa->order_invariance, + sizeof(old_dsa->order_invariance)))) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); } static void si_delete_dsa_state(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (sctx->queued.named.dsa == state) - si_bind_dsa_state(ctx, sctx->noop_dsa); + if (sctx->queued.named.dsa == state) + si_bind_dsa_state(ctx, sctx->noop_dsa); - si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state); + si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state); } static void *si_create_db_flush_dsa(struct si_context *sctx) { - struct pipe_depth_stencil_alpha_state dsa = {}; + struct pipe_depth_stencil_alpha_state dsa = {}; - return sctx->b.create_depth_stencil_alpha_state(&sctx->b, &dsa); + return sctx->b.create_depth_stencil_alpha_state(&sctx->b, &dsa); } /* DB RENDER STATE */ static void si_set_active_query_state(struct pipe_context *ctx, bool enable) { - struct si_context *sctx = (struct si_context*)ctx; - - /* Pipeline stat & streamout queries. */ - if (enable) { - sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; - sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; - } else { - sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; - sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; - } - - /* Occlusion queries. */ - if (sctx->occlusion_queries_disabled != !enable) { - sctx->occlusion_queries_disabled = !enable; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - } + struct si_context *sctx = (struct si_context *)ctx; + + /* Pipeline stat & streamout queries. */ + if (enable) { + sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS; + sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; + } else { + sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; + sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; + } + + /* Occlusion queries. 
*/ + if (sctx->occlusion_queries_disabled != !enable) { + sctx->occlusion_queries_disabled = !enable; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } } -void si_set_occlusion_query_state(struct si_context *sctx, - bool old_perfect_enable) +void si_set_occlusion_query_state(struct si_context *sctx, bool old_perfect_enable) { - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - bool perfect_enable = sctx->num_perfect_occlusion_queries != 0; + bool perfect_enable = sctx->num_perfect_occlusion_queries != 0; - if (perfect_enable != old_perfect_enable) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + if (perfect_enable != old_perfect_enable) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); } void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st) { - st->saved_compute = sctx->cs_shader_state.program; + st->saved_compute = sctx->cs_shader_state.program; - si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); - si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo); + si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); + si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo); - st->saved_ssbo_writable_mask = 0; + st->saved_ssbo_writable_mask = 0; - for (unsigned i = 0; i < 3; i++) { - if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & - (1u << si_get_shaderbuf_slot(i))) - st->saved_ssbo_writable_mask |= 1 << i; - } + for (unsigned i = 0; i < 3; i++) { + if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & + (1u << si_get_shaderbuf_slot(i))) + st->saved_ssbo_writable_mask |= 1 << i; + } } void si_restore_qbo_state(struct si_context *sctx, struct si_qbo_state *st) { - sctx->b.bind_compute_state(&sctx->b, st->saved_compute); + sctx->b.bind_compute_state(&sctx->b, st->saved_compute); - sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); - pipe_resource_reference(&st->saved_const0.buffer, NULL); + sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); + pipe_resource_reference(&st->saved_const0.buffer, NULL); - sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo, - st->saved_ssbo_writable_mask); - for (unsigned i = 0; i < 3; ++i) - pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL); + sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo, + st->saved_ssbo_writable_mask); + for (unsigned i = 0; i < 3; ++i) + pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL); } static void si_emit_db_render_state(struct si_context *sctx) { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned db_shader_control, db_render_control, db_count_control; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - - /* DB_RENDER_CONTROL */ - if (sctx->dbcb_depth_copy_enabled || - sctx->dbcb_stencil_copy_enabled) { - db_render_control = - S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) | - S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) | - S_028000_COPY_CENTROID(1) | - S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample); - } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) { - db_render_control = - S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) | - S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace); - } else { - db_render_control = - S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) | - 
S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear); - } - - /* DB_COUNT_CONTROL (occlusion queries) */ - if (sctx->num_occlusion_queries > 0 && - !sctx->occlusion_queries_disabled) { - bool perfect = sctx->num_perfect_occlusion_queries > 0; - bool gfx10_perfect = sctx->chip_class >= GFX10 && perfect; - - if (sctx->chip_class >= GFX7) { - unsigned log_sample_rate = sctx->framebuffer.log_samples; - - /* Stoney doesn't increment occlusion query counters - * if the sample rate is 16x. Use 8x sample rate instead. - */ - if (sctx->family == CHIP_STONEY) - log_sample_rate = MIN2(log_sample_rate, 3); - - db_count_control = - S_028004_PERFECT_ZPASS_COUNTS(perfect) | - S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) | - S_028004_SAMPLE_RATE(log_sample_rate) | - S_028004_ZPASS_ENABLE(1) | - S_028004_SLICE_EVEN_ENABLE(1) | - S_028004_SLICE_ODD_ENABLE(1); - } else { - db_count_control = - S_028004_PERFECT_ZPASS_COUNTS(perfect) | - S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples); - } - } else { - /* Disable occlusion queries. */ - if (sctx->chip_class >= GFX7) { - db_count_control = 0; - } else { - db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1); - } - } - - radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, - SI_TRACKED_DB_RENDER_CONTROL, db_render_control, - db_count_control); - - /* DB_RENDER_OVERRIDE2 */ - radeon_opt_set_context_reg(sctx, R_028010_DB_RENDER_OVERRIDE2, - SI_TRACKED_DB_RENDER_OVERRIDE2, - S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) | - S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) | - S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4)); - - db_shader_control = sctx->ps_db_shader_control; - - /* Bug workaround for smoothing (overrasterization) on GFX6. */ - if (sctx->chip_class == GFX6 && sctx->smoothing_enabled) { - db_shader_control &= C_02880C_Z_ORDER; - db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z); - } - - /* Disable the gl_SampleMask fragment shader output if MSAA is disabled. 
*/ - if (!rs->multisample_enable) - db_shader_control &= C_02880C_MASK_EXPORT_ENABLE; - - if (sctx->screen->info.has_rbplus && - !sctx->screen->info.rbplus_allowed) - db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1); - - radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, - SI_TRACKED_DB_SHADER_CONTROL, db_shader_control); - - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + unsigned db_shader_control, db_render_control, db_count_control; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + + /* DB_RENDER_CONTROL */ + if (sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled) { + db_render_control = S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) | + S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) | + S_028000_COPY_CENTROID(1) | S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample); + } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) { + db_render_control = S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) | + S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace); + } else { + db_render_control = S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) | + S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear); + } + + /* DB_COUNT_CONTROL (occlusion queries) */ + if (sctx->num_occlusion_queries > 0 && !sctx->occlusion_queries_disabled) { + bool perfect = sctx->num_perfect_occlusion_queries > 0; + bool gfx10_perfect = sctx->chip_class >= GFX10 && perfect; + + if (sctx->chip_class >= GFX7) { + unsigned log_sample_rate = sctx->framebuffer.log_samples; + + /* Stoney doesn't increment occlusion query counters + * if the sample rate is 16x. Use 8x sample rate instead. + */ + if (sctx->family == CHIP_STONEY) + log_sample_rate = MIN2(log_sample_rate, 3); + + db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) | + S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) | + S_028004_SAMPLE_RATE(log_sample_rate) | S_028004_ZPASS_ENABLE(1) | + S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1); + } else { + db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) | + S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples); + } + } else { + /* Disable occlusion queries. */ + if (sctx->chip_class >= GFX7) { + db_count_control = 0; + } else { + db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1); + } + } + + radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, SI_TRACKED_DB_RENDER_CONTROL, + db_render_control, db_count_control); + + /* DB_RENDER_OVERRIDE2 */ + radeon_opt_set_context_reg( + sctx, R_028010_DB_RENDER_OVERRIDE2, SI_TRACKED_DB_RENDER_OVERRIDE2, + S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) | + S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) | + S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4)); + + db_shader_control = sctx->ps_db_shader_control; + + /* Bug workaround for smoothing (overrasterization) on GFX6. */ + if (sctx->chip_class == GFX6 && sctx->smoothing_enabled) { + db_shader_control &= C_02880C_Z_ORDER; + db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z); + } + + /* Disable the gl_SampleMask fragment shader output if MSAA is disabled. 
*/ + if (!rs->multisample_enable) + db_shader_control &= C_02880C_MASK_EXPORT_ENABLE; + + if (sctx->screen->info.has_rbplus && !sctx->screen->info.rbplus_allowed) + db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1); + + radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, SI_TRACKED_DB_SHADER_CONTROL, + db_shader_control); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; } /* @@ -1518,514 +1417,500 @@ static void si_emit_db_render_state(struct si_context *sctx) */ static uint32_t si_translate_colorformat(enum pipe_format format) { - const struct util_format_description *desc = util_format_description(format); - if (!desc) - return V_028C70_COLOR_INVALID; - -#define HAS_SIZE(x,y,z,w) \ - (desc->channel[0].size == (x) && desc->channel[1].size == (y) && \ - desc->channel[2].size == (z) && desc->channel[3].size == (w)) - - if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ - return V_028C70_COLOR_10_11_11; - - if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) - return V_028C70_COLOR_INVALID; - - /* hw cannot support mixed formats (except depth/stencil, since - * stencil is not written to). */ - if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) - return V_028C70_COLOR_INVALID; - - switch (desc->nr_channels) { - case 1: - switch (desc->channel[0].size) { - case 8: - return V_028C70_COLOR_8; - case 16: - return V_028C70_COLOR_16; - case 32: - return V_028C70_COLOR_32; - } - break; - case 2: - if (desc->channel[0].size == desc->channel[1].size) { - switch (desc->channel[0].size) { - case 8: - return V_028C70_COLOR_8_8; - case 16: - return V_028C70_COLOR_16_16; - case 32: - return V_028C70_COLOR_32_32; - } - } else if (HAS_SIZE(8,24,0,0)) { - return V_028C70_COLOR_24_8; - } else if (HAS_SIZE(24,8,0,0)) { - return V_028C70_COLOR_8_24; - } - break; - case 3: - if (HAS_SIZE(5,6,5,0)) { - return V_028C70_COLOR_5_6_5; - } else if (HAS_SIZE(32,8,24,0)) { - return V_028C70_COLOR_X24_8_32_FLOAT; - } - break; - case 4: - if (desc->channel[0].size == desc->channel[1].size && - desc->channel[0].size == desc->channel[2].size && - desc->channel[0].size == desc->channel[3].size) { - switch (desc->channel[0].size) { - case 4: - return V_028C70_COLOR_4_4_4_4; - case 8: - return V_028C70_COLOR_8_8_8_8; - case 16: - return V_028C70_COLOR_16_16_16_16; - case 32: - return V_028C70_COLOR_32_32_32_32; - } - } else if (HAS_SIZE(5,5,5,1)) { - return V_028C70_COLOR_1_5_5_5; - } else if (HAS_SIZE(1,5,5,5)) { - return V_028C70_COLOR_5_5_5_1; - } else if (HAS_SIZE(10,10,10,2)) { - return V_028C70_COLOR_2_10_10_10; - } - break; - } - return V_028C70_COLOR_INVALID; + const struct util_format_description *desc = util_format_description(format); + if (!desc) + return V_028C70_COLOR_INVALID; + +#define HAS_SIZE(x, y, z, w) \ + (desc->channel[0].size == (x) && desc->channel[1].size == (y) && \ + desc->channel[2].size == (z) && desc->channel[3].size == (w)) + + if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ + return V_028C70_COLOR_10_11_11; + + if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) + return V_028C70_COLOR_INVALID; + + /* hw cannot support mixed formats (except depth/stencil, since + * stencil is not written to). 
*/ + if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) + return V_028C70_COLOR_INVALID; + + switch (desc->nr_channels) { + case 1: + switch (desc->channel[0].size) { + case 8: + return V_028C70_COLOR_8; + case 16: + return V_028C70_COLOR_16; + case 32: + return V_028C70_COLOR_32; + } + break; + case 2: + if (desc->channel[0].size == desc->channel[1].size) { + switch (desc->channel[0].size) { + case 8: + return V_028C70_COLOR_8_8; + case 16: + return V_028C70_COLOR_16_16; + case 32: + return V_028C70_COLOR_32_32; + } + } else if (HAS_SIZE(8, 24, 0, 0)) { + return V_028C70_COLOR_24_8; + } else if (HAS_SIZE(24, 8, 0, 0)) { + return V_028C70_COLOR_8_24; + } + break; + case 3: + if (HAS_SIZE(5, 6, 5, 0)) { + return V_028C70_COLOR_5_6_5; + } else if (HAS_SIZE(32, 8, 24, 0)) { + return V_028C70_COLOR_X24_8_32_FLOAT; + } + break; + case 4: + if (desc->channel[0].size == desc->channel[1].size && + desc->channel[0].size == desc->channel[2].size && + desc->channel[0].size == desc->channel[3].size) { + switch (desc->channel[0].size) { + case 4: + return V_028C70_COLOR_4_4_4_4; + case 8: + return V_028C70_COLOR_8_8_8_8; + case 16: + return V_028C70_COLOR_16_16_16_16; + case 32: + return V_028C70_COLOR_32_32_32_32; + } + } else if (HAS_SIZE(5, 5, 5, 1)) { + return V_028C70_COLOR_1_5_5_5; + } else if (HAS_SIZE(1, 5, 5, 5)) { + return V_028C70_COLOR_5_5_5_1; + } else if (HAS_SIZE(10, 10, 10, 2)) { + return V_028C70_COLOR_2_10_10_10; + } + break; + } + return V_028C70_COLOR_INVALID; } static uint32_t si_colorformat_endian_swap(uint32_t colorformat) { - if (SI_BIG_ENDIAN) { - switch(colorformat) { - /* 8-bit buffers. */ - case V_028C70_COLOR_8: - return V_028C70_ENDIAN_NONE; - - /* 16-bit buffers. */ - case V_028C70_COLOR_5_6_5: - case V_028C70_COLOR_1_5_5_5: - case V_028C70_COLOR_4_4_4_4: - case V_028C70_COLOR_16: - case V_028C70_COLOR_8_8: - return V_028C70_ENDIAN_8IN16; - - /* 32-bit buffers. */ - case V_028C70_COLOR_8_8_8_8: - case V_028C70_COLOR_2_10_10_10: - case V_028C70_COLOR_8_24: - case V_028C70_COLOR_24_8: - case V_028C70_COLOR_16_16: - return V_028C70_ENDIAN_8IN32; - - /* 64-bit buffers. */ - case V_028C70_COLOR_16_16_16_16: - return V_028C70_ENDIAN_8IN16; - - case V_028C70_COLOR_32_32: - return V_028C70_ENDIAN_8IN32; - - /* 128-bit buffers. */ - case V_028C70_COLOR_32_32_32_32: - return V_028C70_ENDIAN_8IN32; - default: - return V_028C70_ENDIAN_NONE; /* Unsupported. */ - } - } else { - return V_028C70_ENDIAN_NONE; - } + if (SI_BIG_ENDIAN) { + switch (colorformat) { + /* 8-bit buffers. */ + case V_028C70_COLOR_8: + return V_028C70_ENDIAN_NONE; + + /* 16-bit buffers. */ + case V_028C70_COLOR_5_6_5: + case V_028C70_COLOR_1_5_5_5: + case V_028C70_COLOR_4_4_4_4: + case V_028C70_COLOR_16: + case V_028C70_COLOR_8_8: + return V_028C70_ENDIAN_8IN16; + + /* 32-bit buffers. */ + case V_028C70_COLOR_8_8_8_8: + case V_028C70_COLOR_2_10_10_10: + case V_028C70_COLOR_8_24: + case V_028C70_COLOR_24_8: + case V_028C70_COLOR_16_16: + return V_028C70_ENDIAN_8IN32; + + /* 64-bit buffers. */ + case V_028C70_COLOR_16_16_16_16: + return V_028C70_ENDIAN_8IN16; + + case V_028C70_COLOR_32_32: + return V_028C70_ENDIAN_8IN32; + + /* 128-bit buffers. */ + case V_028C70_COLOR_32_32_32_32: + return V_028C70_ENDIAN_8IN32; + default: + return V_028C70_ENDIAN_NONE; /* Unsupported. 
*/ + } + } else { + return V_028C70_ENDIAN_NONE; + } } static uint32_t si_translate_dbformat(enum pipe_format format) { - switch (format) { - case PIPE_FORMAT_Z16_UNORM: - return V_028040_Z_16; - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - case PIPE_FORMAT_X8Z24_UNORM: - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - return V_028040_Z_24; /* deprecated on AMD GCN */ - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return V_028040_Z_32_FLOAT; - default: - return V_028040_Z_INVALID; - } + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + return V_028040_Z_16; + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return V_028040_Z_24; /* deprecated on AMD GCN */ + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return V_028040_Z_32_FLOAT; + default: + return V_028040_Z_INVALID; + } } /* * Texture translation */ -static uint32_t si_translate_texformat(struct pipe_screen *screen, - enum pipe_format format, - const struct util_format_description *desc, - int first_non_void) +static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_format format, + const struct util_format_description *desc, + int first_non_void) { - struct si_screen *sscreen = (struct si_screen*)screen; - bool uniform = true; - int i; - - assert(sscreen->info.chip_class <= GFX9); - - /* Colorspace (return non-RGB formats directly). */ - switch (desc->colorspace) { - /* Depth stencil formats */ - case UTIL_FORMAT_COLORSPACE_ZS: - switch (format) { - case PIPE_FORMAT_Z16_UNORM: - return V_008F14_IMG_DATA_FORMAT_16; - case PIPE_FORMAT_X24S8_UINT: - case PIPE_FORMAT_S8X24_UINT: - /* - * Implemented as an 8_8_8_8 data format to fix texture - * gathers in stencil sampling. This affects at least - * GL45-CTS.texture_cube_map_array.sampling on GFX8. 
- */ - if (sscreen->info.chip_class <= GFX8) - return V_008F14_IMG_DATA_FORMAT_8_8_8_8; - - if (format == PIPE_FORMAT_X24S8_UINT) - return V_008F14_IMG_DATA_FORMAT_8_24; - else - return V_008F14_IMG_DATA_FORMAT_24_8; - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - return V_008F14_IMG_DATA_FORMAT_8_24; - case PIPE_FORMAT_X8Z24_UNORM: - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - return V_008F14_IMG_DATA_FORMAT_24_8; - case PIPE_FORMAT_S8_UINT: - return V_008F14_IMG_DATA_FORMAT_8; - case PIPE_FORMAT_Z32_FLOAT: - return V_008F14_IMG_DATA_FORMAT_32; - case PIPE_FORMAT_X32_S8X24_UINT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return V_008F14_IMG_DATA_FORMAT_X24_8_32; - default: - goto out_unknown; - } - - case UTIL_FORMAT_COLORSPACE_YUV: - goto out_unknown; /* TODO */ - - case UTIL_FORMAT_COLORSPACE_SRGB: - if (desc->nr_channels != 4 && desc->nr_channels != 1) - goto out_unknown; - break; - - default: - break; - } - - if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) { - if (!sscreen->info.has_format_bc1_through_bc7) - goto out_unknown; - - switch (format) { - case PIPE_FORMAT_RGTC1_SNORM: - case PIPE_FORMAT_LATC1_SNORM: - case PIPE_FORMAT_RGTC1_UNORM: - case PIPE_FORMAT_LATC1_UNORM: - return V_008F14_IMG_DATA_FORMAT_BC4; - case PIPE_FORMAT_RGTC2_SNORM: - case PIPE_FORMAT_LATC2_SNORM: - case PIPE_FORMAT_RGTC2_UNORM: - case PIPE_FORMAT_LATC2_UNORM: - return V_008F14_IMG_DATA_FORMAT_BC5; - default: - goto out_unknown; - } - } - - if (desc->layout == UTIL_FORMAT_LAYOUT_ETC && - (sscreen->info.family == CHIP_STONEY || - sscreen->info.family == CHIP_VEGA10 || - sscreen->info.family == CHIP_RAVEN)) { - switch (format) { - case PIPE_FORMAT_ETC1_RGB8: - case PIPE_FORMAT_ETC2_RGB8: - case PIPE_FORMAT_ETC2_SRGB8: - return V_008F14_IMG_DATA_FORMAT_ETC2_RGB; - case PIPE_FORMAT_ETC2_RGB8A1: - case PIPE_FORMAT_ETC2_SRGB8A1: - return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA1; - case PIPE_FORMAT_ETC2_RGBA8: - case PIPE_FORMAT_ETC2_SRGBA8: - return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA; - case PIPE_FORMAT_ETC2_R11_UNORM: - case PIPE_FORMAT_ETC2_R11_SNORM: - return V_008F14_IMG_DATA_FORMAT_ETC2_R; - case PIPE_FORMAT_ETC2_RG11_UNORM: - case PIPE_FORMAT_ETC2_RG11_SNORM: - return V_008F14_IMG_DATA_FORMAT_ETC2_RG; - default: - goto out_unknown; - } - } - - if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) { - if (!sscreen->info.has_format_bc1_through_bc7) - goto out_unknown; - - switch (format) { - case PIPE_FORMAT_BPTC_RGBA_UNORM: - case PIPE_FORMAT_BPTC_SRGBA: - return V_008F14_IMG_DATA_FORMAT_BC7; - case PIPE_FORMAT_BPTC_RGB_FLOAT: - case PIPE_FORMAT_BPTC_RGB_UFLOAT: - return V_008F14_IMG_DATA_FORMAT_BC6; - default: - goto out_unknown; - } - } - - if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { - switch (format) { - case PIPE_FORMAT_R8G8_B8G8_UNORM: - case PIPE_FORMAT_G8R8_B8R8_UNORM: - return V_008F14_IMG_DATA_FORMAT_GB_GR; - case PIPE_FORMAT_G8R8_G8B8_UNORM: - case PIPE_FORMAT_R8G8_R8B8_UNORM: - return V_008F14_IMG_DATA_FORMAT_BG_RG; - default: - goto out_unknown; - } - } - - if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { - if (!sscreen->info.has_format_bc1_through_bc7) - goto out_unknown; - - switch (format) { - case PIPE_FORMAT_DXT1_RGB: - case PIPE_FORMAT_DXT1_RGBA: - case PIPE_FORMAT_DXT1_SRGB: - case PIPE_FORMAT_DXT1_SRGBA: - return V_008F14_IMG_DATA_FORMAT_BC1; - case PIPE_FORMAT_DXT3_RGBA: - case PIPE_FORMAT_DXT3_SRGBA: - return V_008F14_IMG_DATA_FORMAT_BC2; - case PIPE_FORMAT_DXT5_RGBA: - case PIPE_FORMAT_DXT5_SRGBA: - return V_008F14_IMG_DATA_FORMAT_BC3; - default: - goto out_unknown; - } - } - - 
if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) { - return V_008F14_IMG_DATA_FORMAT_5_9_9_9; - } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) { - return V_008F14_IMG_DATA_FORMAT_10_11_11; - } - - /* R8G8Bx_SNORM - TODO CxV8U8 */ - - /* hw cannot support mixed formats (except depth/stencil, since only - * depth is read).*/ - if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) - goto out_unknown; - - /* See whether the components are of the same size. */ - for (i = 1; i < desc->nr_channels; i++) { - uniform = uniform && desc->channel[0].size == desc->channel[i].size; - } - - /* Non-uniform formats. */ - if (!uniform) { - switch(desc->nr_channels) { - case 3: - if (desc->channel[0].size == 5 && - desc->channel[1].size == 6 && - desc->channel[2].size == 5) { - return V_008F14_IMG_DATA_FORMAT_5_6_5; - } - goto out_unknown; - case 4: - if (desc->channel[0].size == 5 && - desc->channel[1].size == 5 && - desc->channel[2].size == 5 && - desc->channel[3].size == 1) { - return V_008F14_IMG_DATA_FORMAT_1_5_5_5; - } - if (desc->channel[0].size == 1 && - desc->channel[1].size == 5 && - desc->channel[2].size == 5 && - desc->channel[3].size == 5) { - return V_008F14_IMG_DATA_FORMAT_5_5_5_1; - } - if (desc->channel[0].size == 10 && - desc->channel[1].size == 10 && - desc->channel[2].size == 10 && - desc->channel[3].size == 2) { - return V_008F14_IMG_DATA_FORMAT_2_10_10_10; - } - goto out_unknown; - } - goto out_unknown; - } - - if (first_non_void < 0 || first_non_void > 3) - goto out_unknown; - - /* uniform formats */ - switch (desc->channel[first_non_void].size) { - case 4: - switch (desc->nr_channels) { + struct si_screen *sscreen = (struct si_screen *)screen; + bool uniform = true; + int i; + + assert(sscreen->info.chip_class <= GFX9); + + /* Colorspace (return non-RGB formats directly). */ + switch (desc->colorspace) { + /* Depth stencil formats */ + case UTIL_FORMAT_COLORSPACE_ZS: + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + return V_008F14_IMG_DATA_FORMAT_16; + case PIPE_FORMAT_X24S8_UINT: + case PIPE_FORMAT_S8X24_UINT: + /* + * Implemented as an 8_8_8_8 data format to fix texture + * gathers in stencil sampling. This affects at least + * GL45-CTS.texture_cube_map_array.sampling on GFX8. 
+ */ + if (sscreen->info.chip_class <= GFX8) + return V_008F14_IMG_DATA_FORMAT_8_8_8_8; + + if (format == PIPE_FORMAT_X24S8_UINT) + return V_008F14_IMG_DATA_FORMAT_8_24; + else + return V_008F14_IMG_DATA_FORMAT_24_8; + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return V_008F14_IMG_DATA_FORMAT_8_24; + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + return V_008F14_IMG_DATA_FORMAT_24_8; + case PIPE_FORMAT_S8_UINT: + return V_008F14_IMG_DATA_FORMAT_8; + case PIPE_FORMAT_Z32_FLOAT: + return V_008F14_IMG_DATA_FORMAT_32; + case PIPE_FORMAT_X32_S8X24_UINT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return V_008F14_IMG_DATA_FORMAT_X24_8_32; + default: + goto out_unknown; + } + + case UTIL_FORMAT_COLORSPACE_YUV: + goto out_unknown; /* TODO */ + + case UTIL_FORMAT_COLORSPACE_SRGB: + if (desc->nr_channels != 4 && desc->nr_channels != 1) + goto out_unknown; + break; + + default: + break; + } + + if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) { + if (!sscreen->info.has_format_bc1_through_bc7) + goto out_unknown; + + switch (format) { + case PIPE_FORMAT_RGTC1_SNORM: + case PIPE_FORMAT_LATC1_SNORM: + case PIPE_FORMAT_RGTC1_UNORM: + case PIPE_FORMAT_LATC1_UNORM: + return V_008F14_IMG_DATA_FORMAT_BC4; + case PIPE_FORMAT_RGTC2_SNORM: + case PIPE_FORMAT_LATC2_SNORM: + case PIPE_FORMAT_RGTC2_UNORM: + case PIPE_FORMAT_LATC2_UNORM: + return V_008F14_IMG_DATA_FORMAT_BC5; + default: + goto out_unknown; + } + } + + if (desc->layout == UTIL_FORMAT_LAYOUT_ETC && + (sscreen->info.family == CHIP_STONEY || sscreen->info.family == CHIP_VEGA10 || + sscreen->info.family == CHIP_RAVEN)) { + switch (format) { + case PIPE_FORMAT_ETC1_RGB8: + case PIPE_FORMAT_ETC2_RGB8: + case PIPE_FORMAT_ETC2_SRGB8: + return V_008F14_IMG_DATA_FORMAT_ETC2_RGB; + case PIPE_FORMAT_ETC2_RGB8A1: + case PIPE_FORMAT_ETC2_SRGB8A1: + return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA1; + case PIPE_FORMAT_ETC2_RGBA8: + case PIPE_FORMAT_ETC2_SRGBA8: + return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA; + case PIPE_FORMAT_ETC2_R11_UNORM: + case PIPE_FORMAT_ETC2_R11_SNORM: + return V_008F14_IMG_DATA_FORMAT_ETC2_R; + case PIPE_FORMAT_ETC2_RG11_UNORM: + case PIPE_FORMAT_ETC2_RG11_SNORM: + return V_008F14_IMG_DATA_FORMAT_ETC2_RG; + default: + goto out_unknown; + } + } + + if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) { + if (!sscreen->info.has_format_bc1_through_bc7) + goto out_unknown; + + switch (format) { + case PIPE_FORMAT_BPTC_RGBA_UNORM: + case PIPE_FORMAT_BPTC_SRGBA: + return V_008F14_IMG_DATA_FORMAT_BC7; + case PIPE_FORMAT_BPTC_RGB_FLOAT: + case PIPE_FORMAT_BPTC_RGB_UFLOAT: + return V_008F14_IMG_DATA_FORMAT_BC6; + default: + goto out_unknown; + } + } + + if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { + switch (format) { + case PIPE_FORMAT_R8G8_B8G8_UNORM: + case PIPE_FORMAT_G8R8_B8R8_UNORM: + return V_008F14_IMG_DATA_FORMAT_GB_GR; + case PIPE_FORMAT_G8R8_G8B8_UNORM: + case PIPE_FORMAT_R8G8_R8B8_UNORM: + return V_008F14_IMG_DATA_FORMAT_BG_RG; + default: + goto out_unknown; + } + } + + if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { + if (!sscreen->info.has_format_bc1_through_bc7) + goto out_unknown; + + switch (format) { + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_RGBA: + case PIPE_FORMAT_DXT1_SRGB: + case PIPE_FORMAT_DXT1_SRGBA: + return V_008F14_IMG_DATA_FORMAT_BC1; + case PIPE_FORMAT_DXT3_RGBA: + case PIPE_FORMAT_DXT3_SRGBA: + return V_008F14_IMG_DATA_FORMAT_BC2; + case PIPE_FORMAT_DXT5_RGBA: + case PIPE_FORMAT_DXT5_SRGBA: + return V_008F14_IMG_DATA_FORMAT_BC3; + default: + goto out_unknown; + } + } + + if 
(format == PIPE_FORMAT_R9G9B9E5_FLOAT) { + return V_008F14_IMG_DATA_FORMAT_5_9_9_9; + } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) { + return V_008F14_IMG_DATA_FORMAT_10_11_11; + } + + /* R8G8Bx_SNORM - TODO CxV8U8 */ + + /* hw cannot support mixed formats (except depth/stencil, since only + * depth is read).*/ + if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) + goto out_unknown; + + /* See whether the components are of the same size. */ + for (i = 1; i < desc->nr_channels; i++) { + uniform = uniform && desc->channel[0].size == desc->channel[i].size; + } + + /* Non-uniform formats. */ + if (!uniform) { + switch (desc->nr_channels) { + case 3: + if (desc->channel[0].size == 5 && desc->channel[1].size == 6 && + desc->channel[2].size == 5) { + return V_008F14_IMG_DATA_FORMAT_5_6_5; + } + goto out_unknown; + case 4: + if (desc->channel[0].size == 5 && desc->channel[1].size == 5 && + desc->channel[2].size == 5 && desc->channel[3].size == 1) { + return V_008F14_IMG_DATA_FORMAT_1_5_5_5; + } + if (desc->channel[0].size == 1 && desc->channel[1].size == 5 && + desc->channel[2].size == 5 && desc->channel[3].size == 5) { + return V_008F14_IMG_DATA_FORMAT_5_5_5_1; + } + if (desc->channel[0].size == 10 && desc->channel[1].size == 10 && + desc->channel[2].size == 10 && desc->channel[3].size == 2) { + return V_008F14_IMG_DATA_FORMAT_2_10_10_10; + } + goto out_unknown; + } + goto out_unknown; + } + + if (first_non_void < 0 || first_non_void > 3) + goto out_unknown; + + /* uniform formats */ + switch (desc->channel[first_non_void].size) { + case 4: + switch (desc->nr_channels) { #if 0 /* Not supported for render targets */ case 2: return V_008F14_IMG_DATA_FORMAT_4_4; #endif - case 4: - return V_008F14_IMG_DATA_FORMAT_4_4_4_4; - } - break; - case 8: - switch (desc->nr_channels) { - case 1: - return V_008F14_IMG_DATA_FORMAT_8; - case 2: - return V_008F14_IMG_DATA_FORMAT_8_8; - case 4: - return V_008F14_IMG_DATA_FORMAT_8_8_8_8; - } - break; - case 16: - switch (desc->nr_channels) { - case 1: - return V_008F14_IMG_DATA_FORMAT_16; - case 2: - return V_008F14_IMG_DATA_FORMAT_16_16; - case 4: - return V_008F14_IMG_DATA_FORMAT_16_16_16_16; - } - break; - case 32: - switch (desc->nr_channels) { - case 1: - return V_008F14_IMG_DATA_FORMAT_32; - case 2: - return V_008F14_IMG_DATA_FORMAT_32_32; + case 4: + return V_008F14_IMG_DATA_FORMAT_4_4_4_4; + } + break; + case 8: + switch (desc->nr_channels) { + case 1: + return V_008F14_IMG_DATA_FORMAT_8; + case 2: + return V_008F14_IMG_DATA_FORMAT_8_8; + case 4: + return V_008F14_IMG_DATA_FORMAT_8_8_8_8; + } + break; + case 16: + switch (desc->nr_channels) { + case 1: + return V_008F14_IMG_DATA_FORMAT_16; + case 2: + return V_008F14_IMG_DATA_FORMAT_16_16; + case 4: + return V_008F14_IMG_DATA_FORMAT_16_16_16_16; + } + break; + case 32: + switch (desc->nr_channels) { + case 1: + return V_008F14_IMG_DATA_FORMAT_32; + case 2: + return V_008F14_IMG_DATA_FORMAT_32_32; #if 0 /* Not supported for render targets */ case 3: return V_008F14_IMG_DATA_FORMAT_32_32_32; #endif - case 4: - return V_008F14_IMG_DATA_FORMAT_32_32_32_32; - } - } + case 4: + return V_008F14_IMG_DATA_FORMAT_32_32_32_32; + } + } out_unknown: - return ~0; + return ~0; } static unsigned si_tex_wrap(unsigned wrap) { - switch (wrap) { - default: - case PIPE_TEX_WRAP_REPEAT: - return V_008F30_SQ_TEX_WRAP; - case PIPE_TEX_WRAP_CLAMP: - return V_008F30_SQ_TEX_CLAMP_HALF_BORDER; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - return V_008F30_SQ_TEX_CLAMP_LAST_TEXEL; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - 
return V_008F30_SQ_TEX_CLAMP_BORDER; - case PIPE_TEX_WRAP_MIRROR_REPEAT: - return V_008F30_SQ_TEX_MIRROR; - case PIPE_TEX_WRAP_MIRROR_CLAMP: - return V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER; - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: - return V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL; - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: - return V_008F30_SQ_TEX_MIRROR_ONCE_BORDER; - } + switch (wrap) { + default: + case PIPE_TEX_WRAP_REPEAT: + return V_008F30_SQ_TEX_WRAP; + case PIPE_TEX_WRAP_CLAMP: + return V_008F30_SQ_TEX_CLAMP_HALF_BORDER; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return V_008F30_SQ_TEX_CLAMP_LAST_TEXEL; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + return V_008F30_SQ_TEX_CLAMP_BORDER; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return V_008F30_SQ_TEX_MIRROR; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + return V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + return V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + return V_008F30_SQ_TEX_MIRROR_ONCE_BORDER; + } } static unsigned si_tex_mipfilter(unsigned filter) { - switch (filter) { - case PIPE_TEX_MIPFILTER_NEAREST: - return V_008F38_SQ_TEX_Z_FILTER_POINT; - case PIPE_TEX_MIPFILTER_LINEAR: - return V_008F38_SQ_TEX_Z_FILTER_LINEAR; - default: - case PIPE_TEX_MIPFILTER_NONE: - return V_008F38_SQ_TEX_Z_FILTER_NONE; - } + switch (filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + return V_008F38_SQ_TEX_Z_FILTER_POINT; + case PIPE_TEX_MIPFILTER_LINEAR: + return V_008F38_SQ_TEX_Z_FILTER_LINEAR; + default: + case PIPE_TEX_MIPFILTER_NONE: + return V_008F38_SQ_TEX_Z_FILTER_NONE; + } } static unsigned si_tex_compare(unsigned compare) { - switch (compare) { - default: - case PIPE_FUNC_NEVER: - return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER; - case PIPE_FUNC_LESS: - return V_008F30_SQ_TEX_DEPTH_COMPARE_LESS; - case PIPE_FUNC_EQUAL: - return V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL; - case PIPE_FUNC_LEQUAL: - return V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL; - case PIPE_FUNC_GREATER: - return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER; - case PIPE_FUNC_NOTEQUAL: - return V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL; - case PIPE_FUNC_GEQUAL: - return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL; - case PIPE_FUNC_ALWAYS: - return V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS; - } + switch (compare) { + default: + case PIPE_FUNC_NEVER: + return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER; + case PIPE_FUNC_LESS: + return V_008F30_SQ_TEX_DEPTH_COMPARE_LESS; + case PIPE_FUNC_EQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL; + case PIPE_FUNC_LEQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL; + case PIPE_FUNC_GREATER: + return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER; + case PIPE_FUNC_NOTEQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL; + case PIPE_FUNC_GEQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL; + case PIPE_FUNC_ALWAYS: + return V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS; + } } -static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex, - unsigned view_target, unsigned nr_samples) +static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex, unsigned view_target, + unsigned nr_samples) { - unsigned res_target = tex->buffer.b.b.target; - - if (view_target == PIPE_TEXTURE_CUBE || - view_target == PIPE_TEXTURE_CUBE_ARRAY) - res_target = view_target; - /* If interpreting cubemaps as something else, set 2D_ARRAY. */ - else if (res_target == PIPE_TEXTURE_CUBE || - res_target == PIPE_TEXTURE_CUBE_ARRAY) - res_target = PIPE_TEXTURE_2D_ARRAY; - - /* GFX9 allocates 1D textures as 2D. 
*/ - if ((res_target == PIPE_TEXTURE_1D || - res_target == PIPE_TEXTURE_1D_ARRAY) && - sscreen->info.chip_class == GFX9 && - tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) { - if (res_target == PIPE_TEXTURE_1D) - res_target = PIPE_TEXTURE_2D; - else - res_target = PIPE_TEXTURE_2D_ARRAY; - } - - switch (res_target) { - default: - case PIPE_TEXTURE_1D: - return V_008F1C_SQ_RSRC_IMG_1D; - case PIPE_TEXTURE_1D_ARRAY: - return V_008F1C_SQ_RSRC_IMG_1D_ARRAY; - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_RECT: - return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA : - V_008F1C_SQ_RSRC_IMG_2D; - case PIPE_TEXTURE_2D_ARRAY: - return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY : - V_008F1C_SQ_RSRC_IMG_2D_ARRAY; - case PIPE_TEXTURE_3D: - return V_008F1C_SQ_RSRC_IMG_3D; - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - return V_008F1C_SQ_RSRC_IMG_CUBE; - } + unsigned res_target = tex->buffer.b.b.target; + + if (view_target == PIPE_TEXTURE_CUBE || view_target == PIPE_TEXTURE_CUBE_ARRAY) + res_target = view_target; + /* If interpreting cubemaps as something else, set 2D_ARRAY. */ + else if (res_target == PIPE_TEXTURE_CUBE || res_target == PIPE_TEXTURE_CUBE_ARRAY) + res_target = PIPE_TEXTURE_2D_ARRAY; + + /* GFX9 allocates 1D textures as 2D. */ + if ((res_target == PIPE_TEXTURE_1D || res_target == PIPE_TEXTURE_1D_ARRAY) && + sscreen->info.chip_class == GFX9 && + tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) { + if (res_target == PIPE_TEXTURE_1D) + res_target = PIPE_TEXTURE_2D; + else + res_target = PIPE_TEXTURE_2D_ARRAY; + } + + switch (res_target) { + default: + case PIPE_TEXTURE_1D: + return V_008F1C_SQ_RSRC_IMG_1D; + case PIPE_TEXTURE_1D_ARRAY: + return V_008F1C_SQ_RSRC_IMG_1D_ARRAY; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: + return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA : V_008F1C_SQ_RSRC_IMG_2D; + case PIPE_TEXTURE_2D_ARRAY: + return nr_samples > 1 ? 
V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY : V_008F1C_SQ_RSRC_IMG_2D_ARRAY; + case PIPE_TEXTURE_3D: + return V_008F1C_SQ_RSRC_IMG_3D; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + return V_008F1C_SQ_RSRC_IMG_CUBE; + } } /* @@ -2034,1748 +1919,1663 @@ static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex, static bool si_is_sampler_format_supported(struct pipe_screen *screen, enum pipe_format format) { - struct si_screen *sscreen = (struct si_screen *)screen; + struct si_screen *sscreen = (struct si_screen *)screen; - if (sscreen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = &gfx10_format_table[format]; - if (!fmt->img_format || fmt->buffers_only) - return false; - return true; - } + if (sscreen->info.chip_class >= GFX10) { + const struct gfx10_format *fmt = &gfx10_format_table[format]; + if (!fmt->img_format || fmt->buffers_only) + return false; + return true; + } - const struct util_format_description *desc = util_format_description(format); - if (!desc) - return false; + const struct util_format_description *desc = util_format_description(format); + if (!desc) + return false; - return si_translate_texformat(screen, format, desc, - util_format_get_first_non_void_channel(format)) != ~0U; + return si_translate_texformat(screen, format, desc, + util_format_get_first_non_void_channel(format)) != ~0U; } static uint32_t si_translate_buffer_dataformat(struct pipe_screen *screen, - const struct util_format_description *desc, - int first_non_void) + const struct util_format_description *desc, + int first_non_void) { - int i; - - assert(((struct si_screen *)screen)->info.chip_class <= GFX9); - - if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) - return V_008F0C_BUF_DATA_FORMAT_10_11_11; - - assert(first_non_void >= 0); - - if (desc->nr_channels == 4 && - desc->channel[0].size == 10 && - desc->channel[1].size == 10 && - desc->channel[2].size == 10 && - desc->channel[3].size == 2) - return V_008F0C_BUF_DATA_FORMAT_2_10_10_10; - - /* See whether the components are of the same size. */ - for (i = 0; i < desc->nr_channels; i++) { - if (desc->channel[first_non_void].size != desc->channel[i].size) - return V_008F0C_BUF_DATA_FORMAT_INVALID; - } - - switch (desc->channel[first_non_void].size) { - case 8: - switch (desc->nr_channels) { - case 1: - case 3: /* 3 loads */ - return V_008F0C_BUF_DATA_FORMAT_8; - case 2: - return V_008F0C_BUF_DATA_FORMAT_8_8; - case 4: - return V_008F0C_BUF_DATA_FORMAT_8_8_8_8; - } - break; - case 16: - switch (desc->nr_channels) { - case 1: - case 3: /* 3 loads */ - return V_008F0C_BUF_DATA_FORMAT_16; - case 2: - return V_008F0C_BUF_DATA_FORMAT_16_16; - case 4: - return V_008F0C_BUF_DATA_FORMAT_16_16_16_16; - } - break; - case 32: - switch (desc->nr_channels) { - case 1: - return V_008F0C_BUF_DATA_FORMAT_32; - case 2: - return V_008F0C_BUF_DATA_FORMAT_32_32; - case 3: - return V_008F0C_BUF_DATA_FORMAT_32_32_32; - case 4: - return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; - } - break; - case 64: - /* Legacy double formats. 
*/ - switch (desc->nr_channels) { - case 1: /* 1 load */ - return V_008F0C_BUF_DATA_FORMAT_32_32; - case 2: /* 1 load */ - return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; - case 3: /* 3 loads */ - return V_008F0C_BUF_DATA_FORMAT_32_32; - case 4: /* 2 loads */ - return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; - } - break; - } - - return V_008F0C_BUF_DATA_FORMAT_INVALID; + int i; + + assert(((struct si_screen *)screen)->info.chip_class <= GFX9); + + if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) + return V_008F0C_BUF_DATA_FORMAT_10_11_11; + + assert(first_non_void >= 0); + + if (desc->nr_channels == 4 && desc->channel[0].size == 10 && desc->channel[1].size == 10 && + desc->channel[2].size == 10 && desc->channel[3].size == 2) + return V_008F0C_BUF_DATA_FORMAT_2_10_10_10; + + /* See whether the components are of the same size. */ + for (i = 0; i < desc->nr_channels; i++) { + if (desc->channel[first_non_void].size != desc->channel[i].size) + return V_008F0C_BUF_DATA_FORMAT_INVALID; + } + + switch (desc->channel[first_non_void].size) { + case 8: + switch (desc->nr_channels) { + case 1: + case 3: /* 3 loads */ + return V_008F0C_BUF_DATA_FORMAT_8; + case 2: + return V_008F0C_BUF_DATA_FORMAT_8_8; + case 4: + return V_008F0C_BUF_DATA_FORMAT_8_8_8_8; + } + break; + case 16: + switch (desc->nr_channels) { + case 1: + case 3: /* 3 loads */ + return V_008F0C_BUF_DATA_FORMAT_16; + case 2: + return V_008F0C_BUF_DATA_FORMAT_16_16; + case 4: + return V_008F0C_BUF_DATA_FORMAT_16_16_16_16; + } + break; + case 32: + switch (desc->nr_channels) { + case 1: + return V_008F0C_BUF_DATA_FORMAT_32; + case 2: + return V_008F0C_BUF_DATA_FORMAT_32_32; + case 3: + return V_008F0C_BUF_DATA_FORMAT_32_32_32; + case 4: + return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; + } + break; + case 64: + /* Legacy double formats. 
*/ + switch (desc->nr_channels) { + case 1: /* 1 load */ + return V_008F0C_BUF_DATA_FORMAT_32_32; + case 2: /* 1 load */ + return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; + case 3: /* 3 loads */ + return V_008F0C_BUF_DATA_FORMAT_32_32; + case 4: /* 2 loads */ + return V_008F0C_BUF_DATA_FORMAT_32_32_32_32; + } + break; + } + + return V_008F0C_BUF_DATA_FORMAT_INVALID; } static uint32_t si_translate_buffer_numformat(struct pipe_screen *screen, - const struct util_format_description *desc, - int first_non_void) + const struct util_format_description *desc, + int first_non_void) { - assert(((struct si_screen *)screen)->info.chip_class <= GFX9); - - if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) - return V_008F0C_BUF_NUM_FORMAT_FLOAT; - - assert(first_non_void >= 0); - - switch (desc->channel[first_non_void].type) { - case UTIL_FORMAT_TYPE_SIGNED: - case UTIL_FORMAT_TYPE_FIXED: - if (desc->channel[first_non_void].size >= 32 || - desc->channel[first_non_void].pure_integer) - return V_008F0C_BUF_NUM_FORMAT_SINT; - else if (desc->channel[first_non_void].normalized) - return V_008F0C_BUF_NUM_FORMAT_SNORM; - else - return V_008F0C_BUF_NUM_FORMAT_SSCALED; - break; - case UTIL_FORMAT_TYPE_UNSIGNED: - if (desc->channel[first_non_void].size >= 32 || - desc->channel[first_non_void].pure_integer) - return V_008F0C_BUF_NUM_FORMAT_UINT; - else if (desc->channel[first_non_void].normalized) - return V_008F0C_BUF_NUM_FORMAT_UNORM; - else - return V_008F0C_BUF_NUM_FORMAT_USCALED; - break; - case UTIL_FORMAT_TYPE_FLOAT: - default: - return V_008F0C_BUF_NUM_FORMAT_FLOAT; - } + assert(((struct si_screen *)screen)->info.chip_class <= GFX9); + + if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) + return V_008F0C_BUF_NUM_FORMAT_FLOAT; + + assert(first_non_void >= 0); + + switch (desc->channel[first_non_void].type) { + case UTIL_FORMAT_TYPE_SIGNED: + case UTIL_FORMAT_TYPE_FIXED: + if (desc->channel[first_non_void].size >= 32 || desc->channel[first_non_void].pure_integer) + return V_008F0C_BUF_NUM_FORMAT_SINT; + else if (desc->channel[first_non_void].normalized) + return V_008F0C_BUF_NUM_FORMAT_SNORM; + else + return V_008F0C_BUF_NUM_FORMAT_SSCALED; + break; + case UTIL_FORMAT_TYPE_UNSIGNED: + if (desc->channel[first_non_void].size >= 32 || desc->channel[first_non_void].pure_integer) + return V_008F0C_BUF_NUM_FORMAT_UINT; + else if (desc->channel[first_non_void].normalized) + return V_008F0C_BUF_NUM_FORMAT_UNORM; + else + return V_008F0C_BUF_NUM_FORMAT_USCALED; + break; + case UTIL_FORMAT_TYPE_FLOAT: + default: + return V_008F0C_BUF_NUM_FORMAT_FLOAT; + } } -static unsigned si_is_vertex_format_supported(struct pipe_screen *screen, - enum pipe_format format, - unsigned usage) +static unsigned si_is_vertex_format_supported(struct pipe_screen *screen, enum pipe_format format, + unsigned usage) { - struct si_screen *sscreen = (struct si_screen *)screen; - const struct util_format_description *desc; - int first_non_void; - unsigned data_format; - - assert((usage & ~(PIPE_BIND_SHADER_IMAGE | - PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_VERTEX_BUFFER)) == 0); - - desc = util_format_description(format); - if (!desc) - return 0; - - /* There are no native 8_8_8 or 16_16_16 data formats, and we currently - * select 8_8_8_8 and 16_16_16_16 instead. This works reasonably well - * for read-only access (with caveats surrounding bounds checks), but - * obviously fails for write access which we have to implement for - * shader images. 
Luckily, OpenGL doesn't expect this to be supported - * anyway, and so the only impact is on PBO uploads / downloads, which - * shouldn't be expected to be fast for GL_RGB anyway. - */ - if (desc->block.bits == 3 * 8 || - desc->block.bits == 3 * 16) { - if (usage & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW)) { - usage &= ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW); - if (!usage) - return 0; - } - } - - if (sscreen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = &gfx10_format_table[format]; - if (!fmt->img_format || fmt->img_format >= 128) - return 0; - return usage; - } - - first_non_void = util_format_get_first_non_void_channel(format); - data_format = si_translate_buffer_dataformat(screen, desc, first_non_void); - if (data_format == V_008F0C_BUF_DATA_FORMAT_INVALID) - return 0; - - return usage; + struct si_screen *sscreen = (struct si_screen *)screen; + const struct util_format_description *desc; + int first_non_void; + unsigned data_format; + + assert((usage & ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_VERTEX_BUFFER)) == + 0); + + desc = util_format_description(format); + if (!desc) + return 0; + + /* There are no native 8_8_8 or 16_16_16 data formats, and we currently + * select 8_8_8_8 and 16_16_16_16 instead. This works reasonably well + * for read-only access (with caveats surrounding bounds checks), but + * obviously fails for write access which we have to implement for + * shader images. Luckily, OpenGL doesn't expect this to be supported + * anyway, and so the only impact is on PBO uploads / downloads, which + * shouldn't be expected to be fast for GL_RGB anyway. + */ + if (desc->block.bits == 3 * 8 || desc->block.bits == 3 * 16) { + if (usage & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW)) { + usage &= ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW); + if (!usage) + return 0; + } + } + + if (sscreen->info.chip_class >= GFX10) { + const struct gfx10_format *fmt = &gfx10_format_table[format]; + if (!fmt->img_format || fmt->img_format >= 128) + return 0; + return usage; + } + + first_non_void = util_format_get_first_non_void_channel(format); + data_format = si_translate_buffer_dataformat(screen, desc, first_non_void); + if (data_format == V_008F0C_BUF_DATA_FORMAT_INVALID) + return 0; + + return usage; } static bool si_is_colorbuffer_format_supported(enum pipe_format format) { - return si_translate_colorformat(format) != V_028C70_COLOR_INVALID && - si_translate_colorswap(format, false) != ~0U; + return si_translate_colorformat(format) != V_028C70_COLOR_INVALID && + si_translate_colorswap(format, false) != ~0U; } static bool si_is_zs_format_supported(enum pipe_format format) { - return si_translate_dbformat(format) != V_028040_Z_INVALID; + return si_translate_dbformat(format) != V_028040_Z_INVALID; } -static bool si_is_format_supported(struct pipe_screen *screen, - enum pipe_format format, - enum pipe_texture_target target, - unsigned sample_count, - unsigned storage_sample_count, - unsigned usage) +static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format format, + enum pipe_texture_target target, unsigned sample_count, + unsigned storage_sample_count, unsigned usage) { - struct si_screen *sscreen = (struct si_screen *)screen; - unsigned retval = 0; - - if (target >= PIPE_MAX_TEXTURE_TYPES) { - PRINT_ERR("radeonsi: unsupported texture type %d\n", target); - return false; - } - - if (MAX2(1, sample_count) < MAX2(1, storage_sample_count)) - return false; - - if (sample_count > 1) { - if 
(!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE)) - return false; - - /* Only power-of-two sample counts are supported. */ - if (!util_is_power_of_two_or_zero(sample_count) || - !util_is_power_of_two_or_zero(storage_sample_count)) - return false; - - /* MSAA support without framebuffer attachments. */ - if (format == PIPE_FORMAT_NONE && sample_count <= 16) - return true; - - if (!sscreen->info.has_eqaa_surface_allocator || - util_format_is_depth_or_stencil(format)) { - /* Color without EQAA or depth/stencil. */ - if (sample_count > 8 || - sample_count != storage_sample_count) - return false; - } else { - /* Color with EQAA. */ - if (sample_count > 16 || - storage_sample_count > 8) - return false; - } - } - - if (usage & (PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_SHADER_IMAGE)) { - if (target == PIPE_BUFFER) { - retval |= si_is_vertex_format_supported( - screen, format, usage & (PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_SHADER_IMAGE)); - } else { - if (si_is_sampler_format_supported(screen, format)) - retval |= usage & (PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_SHADER_IMAGE); - } - } - - if ((usage & (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED | - PIPE_BIND_BLENDABLE)) && - si_is_colorbuffer_format_supported(format)) { - retval |= usage & - (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED); - if (!util_format_is_pure_integer(format) && - !util_format_is_depth_or_stencil(format)) - retval |= usage & PIPE_BIND_BLENDABLE; - } - - if ((usage & PIPE_BIND_DEPTH_STENCIL) && - si_is_zs_format_supported(format)) { - retval |= PIPE_BIND_DEPTH_STENCIL; - } - - if (usage & PIPE_BIND_VERTEX_BUFFER) { - retval |= si_is_vertex_format_supported(screen, format, - PIPE_BIND_VERTEX_BUFFER); - } - - if ((usage & PIPE_BIND_LINEAR) && - !util_format_is_compressed(format) && - !(usage & PIPE_BIND_DEPTH_STENCIL)) - retval |= PIPE_BIND_LINEAR; - - return retval == usage; + struct si_screen *sscreen = (struct si_screen *)screen; + unsigned retval = 0; + + if (target >= PIPE_MAX_TEXTURE_TYPES) { + PRINT_ERR("radeonsi: unsupported texture type %d\n", target); + return false; + } + + if (MAX2(1, sample_count) < MAX2(1, storage_sample_count)) + return false; + + if (sample_count > 1) { + if (!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE)) + return false; + + /* Only power-of-two sample counts are supported. */ + if (!util_is_power_of_two_or_zero(sample_count) || + !util_is_power_of_two_or_zero(storage_sample_count)) + return false; + + /* MSAA support without framebuffer attachments. */ + if (format == PIPE_FORMAT_NONE && sample_count <= 16) + return true; + + if (!sscreen->info.has_eqaa_surface_allocator || util_format_is_depth_or_stencil(format)) { + /* Color without EQAA or depth/stencil. */ + if (sample_count > 8 || sample_count != storage_sample_count) + return false; + } else { + /* Color with EQAA. 
*/ + if (sample_count > 16 || storage_sample_count > 8) + return false; + } + } + + if (usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) { + if (target == PIPE_BUFFER) { + retval |= si_is_vertex_format_supported( + screen, format, usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)); + } else { + if (si_is_sampler_format_supported(screen, format)) + retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE); + } + } + + if ((usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | + PIPE_BIND_SHARED | PIPE_BIND_BLENDABLE)) && + si_is_colorbuffer_format_supported(format)) { + retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | + PIPE_BIND_SHARED); + if (!util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format)) + retval |= usage & PIPE_BIND_BLENDABLE; + } + + if ((usage & PIPE_BIND_DEPTH_STENCIL) && si_is_zs_format_supported(format)) { + retval |= PIPE_BIND_DEPTH_STENCIL; + } + + if (usage & PIPE_BIND_VERTEX_BUFFER) { + retval |= si_is_vertex_format_supported(screen, format, PIPE_BIND_VERTEX_BUFFER); + } + + if ((usage & PIPE_BIND_LINEAR) && !util_format_is_compressed(format) && + !(usage & PIPE_BIND_DEPTH_STENCIL)) + retval |= PIPE_BIND_LINEAR; + + return retval == usage; } /* * framebuffer handling */ -static void si_choose_spi_color_formats(struct si_surface *surf, - unsigned format, unsigned swap, - unsigned ntype, bool is_depth) +static void si_choose_spi_color_formats(struct si_surface *surf, unsigned format, unsigned swap, + unsigned ntype, bool is_depth) { - /* Alpha is needed for alpha-to-coverage. - * Blending may be with or without alpha. - */ - unsigned normal = 0; /* most optimal, may not support blending or export alpha */ - unsigned alpha = 0; /* exports alpha, but may not support blending */ - unsigned blend = 0; /* supports blending, but may not export alpha */ - unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */ - - /* Choose the SPI color formats. These are required values for RB+. - * Other chips have multiple choices, though they are not necessarily better. - */ - switch (format) { - case V_028C70_COLOR_5_6_5: - case V_028C70_COLOR_1_5_5_5: - case V_028C70_COLOR_5_5_5_1: - case V_028C70_COLOR_4_4_4_4: - case V_028C70_COLOR_10_11_11: - case V_028C70_COLOR_11_11_10: - case V_028C70_COLOR_8: - case V_028C70_COLOR_8_8: - case V_028C70_COLOR_8_8_8_8: - case V_028C70_COLOR_10_10_10_2: - case V_028C70_COLOR_2_10_10_10: - if (ntype == V_028C70_NUMBER_UINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; - else if (ntype == V_028C70_NUMBER_SINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; - else - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; - break; - - case V_028C70_COLOR_16: - case V_028C70_COLOR_16_16: - case V_028C70_COLOR_16_16_16_16: - if (ntype == V_028C70_NUMBER_UNORM || - ntype == V_028C70_NUMBER_SNORM) { - /* UNORM16 and SNORM16 don't support blending */ - if (ntype == V_028C70_NUMBER_UNORM) - normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR; - else - normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR; - - /* Use 32 bits per channel for blending. 
*/ - if (format == V_028C70_COLOR_16) { - if (swap == V_028C70_SWAP_STD) { /* R */ - blend = V_028714_SPI_SHADER_32_R; - blend_alpha = V_028714_SPI_SHADER_32_AR; - } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ - blend = blend_alpha = V_028714_SPI_SHADER_32_AR; - else - assert(0); - } else if (format == V_028C70_COLOR_16_16) { - if (swap == V_028C70_SWAP_STD) { /* RG */ - blend = V_028714_SPI_SHADER_32_GR; - blend_alpha = V_028714_SPI_SHADER_32_ABGR; - } else if (swap == V_028C70_SWAP_ALT) /* RA */ - blend = blend_alpha = V_028714_SPI_SHADER_32_AR; - else - assert(0); - } else /* 16_16_16_16 */ - blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR; - } else if (ntype == V_028C70_NUMBER_UINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; - else if (ntype == V_028C70_NUMBER_SINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; - else if (ntype == V_028C70_NUMBER_FLOAT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; - else - assert(0); - break; - - case V_028C70_COLOR_32: - if (swap == V_028C70_SWAP_STD) { /* R */ - blend = normal = V_028714_SPI_SHADER_32_R; - alpha = blend_alpha = V_028714_SPI_SHADER_32_AR; - } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; - else - assert(0); - break; - - case V_028C70_COLOR_32_32: - if (swap == V_028C70_SWAP_STD) { /* RG */ - blend = normal = V_028714_SPI_SHADER_32_GR; - alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR; - } else if (swap == V_028C70_SWAP_ALT) /* RA */ - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; - else - assert(0); - break; - - case V_028C70_COLOR_32_32_32_32: - case V_028C70_COLOR_8_24: - case V_028C70_COLOR_24_8: - case V_028C70_COLOR_X24_8_32_FLOAT: - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; - break; - - default: - assert(0); - return; - } - - /* The DB->CB copy needs 32_ABGR. */ - if (is_depth) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; - - surf->spi_shader_col_format = normal; - surf->spi_shader_col_format_alpha = alpha; - surf->spi_shader_col_format_blend = blend; - surf->spi_shader_col_format_blend_alpha = blend_alpha; + /* Alpha is needed for alpha-to-coverage. + * Blending may be with or without alpha. + */ + unsigned normal = 0; /* most optimal, may not support blending or export alpha */ + unsigned alpha = 0; /* exports alpha, but may not support blending */ + unsigned blend = 0; /* supports blending, but may not export alpha */ + unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */ + + /* Choose the SPI color formats. These are required values for RB+. + * Other chips have multiple choices, though they are not necessarily better. 
+ */ + switch (format) { + case V_028C70_COLOR_5_6_5: + case V_028C70_COLOR_1_5_5_5: + case V_028C70_COLOR_5_5_5_1: + case V_028C70_COLOR_4_4_4_4: + case V_028C70_COLOR_10_11_11: + case V_028C70_COLOR_11_11_10: + case V_028C70_COLOR_8: + case V_028C70_COLOR_8_8: + case V_028C70_COLOR_8_8_8_8: + case V_028C70_COLOR_10_10_10_2: + case V_028C70_COLOR_2_10_10_10: + if (ntype == V_028C70_NUMBER_UINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; + else if (ntype == V_028C70_NUMBER_SINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; + else + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; + break; + + case V_028C70_COLOR_16: + case V_028C70_COLOR_16_16: + case V_028C70_COLOR_16_16_16_16: + if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM) { + /* UNORM16 and SNORM16 don't support blending */ + if (ntype == V_028C70_NUMBER_UNORM) + normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR; + else + normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR; + + /* Use 32 bits per channel for blending. */ + if (format == V_028C70_COLOR_16) { + if (swap == V_028C70_SWAP_STD) { /* R */ + blend = V_028714_SPI_SHADER_32_R; + blend_alpha = V_028714_SPI_SHADER_32_AR; + } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ + blend = blend_alpha = V_028714_SPI_SHADER_32_AR; + else + assert(0); + } else if (format == V_028C70_COLOR_16_16) { + if (swap == V_028C70_SWAP_STD) { /* RG */ + blend = V_028714_SPI_SHADER_32_GR; + blend_alpha = V_028714_SPI_SHADER_32_ABGR; + } else if (swap == V_028C70_SWAP_ALT) /* RA */ + blend = blend_alpha = V_028714_SPI_SHADER_32_AR; + else + assert(0); + } else /* 16_16_16_16 */ + blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR; + } else if (ntype == V_028C70_NUMBER_UINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; + else if (ntype == V_028C70_NUMBER_SINT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; + else if (ntype == V_028C70_NUMBER_FLOAT) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; + else + assert(0); + break; + + case V_028C70_COLOR_32: + if (swap == V_028C70_SWAP_STD) { /* R */ + blend = normal = V_028714_SPI_SHADER_32_R; + alpha = blend_alpha = V_028714_SPI_SHADER_32_AR; + } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; + else + assert(0); + break; + + case V_028C70_COLOR_32_32: + if (swap == V_028C70_SWAP_STD) { /* RG */ + blend = normal = V_028714_SPI_SHADER_32_GR; + alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR; + } else if (swap == V_028C70_SWAP_ALT) /* RA */ + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; + else + assert(0); + break; + + case V_028C70_COLOR_32_32_32_32: + case V_028C70_COLOR_8_24: + case V_028C70_COLOR_24_8: + case V_028C70_COLOR_X24_8_32_FLOAT: + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; + break; + + default: + assert(0); + return; + } + + /* The DB->CB copy needs 32_ABGR. 
*/ + if (is_depth) + alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; + + surf->spi_shader_col_format = normal; + surf->spi_shader_col_format_alpha = alpha; + surf->spi_shader_col_format_blend = blend; + surf->spi_shader_col_format_blend_alpha = blend_alpha; } -static void si_initialize_color_surface(struct si_context *sctx, - struct si_surface *surf) +static void si_initialize_color_surface(struct si_context *sctx, struct si_surface *surf) { - struct si_texture *tex = (struct si_texture*)surf->base.texture; - unsigned color_info, color_attrib; - unsigned format, swap, ntype, endian; - const struct util_format_description *desc; - int firstchan; - unsigned blend_clamp = 0, blend_bypass = 0; - - desc = util_format_description(surf->base.format); - for (firstchan = 0; firstchan < 4; firstchan++) { - if (desc->channel[firstchan].type != UTIL_FORMAT_TYPE_VOID) { - break; - } - } - if (firstchan == 4 || desc->channel[firstchan].type == UTIL_FORMAT_TYPE_FLOAT) { - ntype = V_028C70_NUMBER_FLOAT; - } else { - ntype = V_028C70_NUMBER_UNORM; - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) - ntype = V_028C70_NUMBER_SRGB; - else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_SIGNED) { - if (desc->channel[firstchan].pure_integer) { - ntype = V_028C70_NUMBER_SINT; - } else { - assert(desc->channel[firstchan].normalized); - ntype = V_028C70_NUMBER_SNORM; - } - } else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_UNSIGNED) { - if (desc->channel[firstchan].pure_integer) { - ntype = V_028C70_NUMBER_UINT; - } else { - assert(desc->channel[firstchan].normalized); - ntype = V_028C70_NUMBER_UNORM; - } - } - } - - format = si_translate_colorformat(surf->base.format); - if (format == V_028C70_COLOR_INVALID) { - PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format); - } - assert(format != V_028C70_COLOR_INVALID); - swap = si_translate_colorswap(surf->base.format, false); - endian = si_colorformat_endian_swap(format); - - /* blend clamp should be set for all NORM/SRGB types */ - if (ntype == V_028C70_NUMBER_UNORM || - ntype == V_028C70_NUMBER_SNORM || - ntype == V_028C70_NUMBER_SRGB) - blend_clamp = 1; - - /* set blend bypass according to docs if SINT/UINT or - 8/24 COLOR variants */ - if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT || - format == V_028C70_COLOR_8_24 || format == V_028C70_COLOR_24_8 || - format == V_028C70_COLOR_X24_8_32_FLOAT) { - blend_clamp = 0; - blend_bypass = 1; - } - - if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT) { - if (format == V_028C70_COLOR_8 || - format == V_028C70_COLOR_8_8 || - format == V_028C70_COLOR_8_8_8_8) - surf->color_is_int8 = true; - else if (format == V_028C70_COLOR_10_10_10_2 || - format == V_028C70_COLOR_2_10_10_10) - surf->color_is_int10 = true; - } - - color_info = S_028C70_FORMAT(format) | - S_028C70_COMP_SWAP(swap) | - S_028C70_BLEND_CLAMP(blend_clamp) | - S_028C70_BLEND_BYPASS(blend_bypass) | - S_028C70_SIMPLE_FLOAT(1) | - S_028C70_ROUND_MODE(ntype != V_028C70_NUMBER_UNORM && - ntype != V_028C70_NUMBER_SNORM && - ntype != V_028C70_NUMBER_SRGB && - format != V_028C70_COLOR_8_24 && - format != V_028C70_COLOR_24_8) | - S_028C70_NUMBER_TYPE(ntype) | - S_028C70_ENDIAN(endian); - - /* Intensity is implemented as Red, so treat it that way. 
*/ - color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1 || - util_format_is_intensity(surf->base.format)); - - if (tex->buffer.b.b.nr_samples > 1) { - unsigned log_samples = util_logbase2(tex->buffer.b.b.nr_samples); - unsigned log_fragments = util_logbase2(tex->buffer.b.b.nr_storage_samples); - - color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | - S_028C74_NUM_FRAGMENTS(log_fragments); - - if (tex->surface.fmask_offset) { - color_info |= S_028C70_COMPRESSION(1); - unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.fmask.bankh); - - if (sctx->chip_class == GFX6) { - /* due to a hw bug, FMASK_BANK_HEIGHT must be set on GFX6 too */ - color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh); - } - } - } - - if (sctx->chip_class >= GFX10) { - unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B; - - /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and - 64 for APU because all of our APUs to date use DIMMs which have - a request granularity size of 64B while all other chips have a - 32B request size */ - if (!sctx->screen->info.has_dedicated_vram) - min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B; - - surf->cb_dcc_control = - S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | - S_028C78_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) | - S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) | - S_028C78_INDEPENDENT_64B_BLOCKS(0) | - S_028C78_INDEPENDENT_128B_BLOCKS(1); - } else if (sctx->chip_class >= GFX8) { - unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B; - unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B; - - /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and - 64 for APU because all of our APUs to date use DIMMs which have - a request granularity size of 64B while all other chips have a - 32B request size */ - if (!sctx->screen->info.has_dedicated_vram) - min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B; - - if (tex->buffer.b.b.nr_storage_samples > 1) { - if (tex->surface.bpe == 1) - max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; - else if (tex->surface.bpe == 2) - max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B; - } - - surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) | - S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) | - S_028C78_INDEPENDENT_64B_BLOCKS(1); - } - - /* This must be set for fast clear to work without FMASK. 
*/ - if (!tex->surface.fmask_size && sctx->chip_class == GFX6) { - unsigned bankh = util_logbase2(tex->surface.u.legacy.bankh); - color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh); - } - - /* GFX10 field has the same base shift as the GFX6 field */ - unsigned color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) | - S_028C6C_SLICE_MAX_GFX10(surf->base.u.tex.last_layer); - unsigned mip0_depth = util_max_layer(&tex->buffer.b.b, 0); - - if (sctx->chip_class >= GFX10) { - color_view |= S_028C6C_MIP_LEVEL_GFX10(surf->base.u.tex.level); - - surf->cb_color_attrib3 = S_028EE0_MIP0_DEPTH(mip0_depth) | - S_028EE0_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type) | - S_028EE0_RESOURCE_LEVEL(1); - } else if (sctx->chip_class == GFX9) { - color_view |= S_028C6C_MIP_LEVEL_GFX9(surf->base.u.tex.level); - color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) | - S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type); - } - - if (sctx->chip_class >= GFX9) { - surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(surf->width0 - 1) | - S_028C68_MIP0_HEIGHT(surf->height0 - 1) | - S_028C68_MAX_MIP(tex->buffer.b.b.last_level); - } - - surf->cb_color_view = color_view; - surf->cb_color_info = color_info; - surf->cb_color_attrib = color_attrib; - - /* Determine pixel shader export format */ - si_choose_spi_color_formats(surf, format, swap, ntype, tex->is_depth); - - surf->color_initialized = true; + struct si_texture *tex = (struct si_texture *)surf->base.texture; + unsigned color_info, color_attrib; + unsigned format, swap, ntype, endian; + const struct util_format_description *desc; + int firstchan; + unsigned blend_clamp = 0, blend_bypass = 0; + + desc = util_format_description(surf->base.format); + for (firstchan = 0; firstchan < 4; firstchan++) { + if (desc->channel[firstchan].type != UTIL_FORMAT_TYPE_VOID) { + break; + } + } + if (firstchan == 4 || desc->channel[firstchan].type == UTIL_FORMAT_TYPE_FLOAT) { + ntype = V_028C70_NUMBER_FLOAT; + } else { + ntype = V_028C70_NUMBER_UNORM; + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) + ntype = V_028C70_NUMBER_SRGB; + else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_SIGNED) { + if (desc->channel[firstchan].pure_integer) { + ntype = V_028C70_NUMBER_SINT; + } else { + assert(desc->channel[firstchan].normalized); + ntype = V_028C70_NUMBER_SNORM; + } + } else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_UNSIGNED) { + if (desc->channel[firstchan].pure_integer) { + ntype = V_028C70_NUMBER_UINT; + } else { + assert(desc->channel[firstchan].normalized); + ntype = V_028C70_NUMBER_UNORM; + } + } + } + + format = si_translate_colorformat(surf->base.format); + if (format == V_028C70_COLOR_INVALID) { + PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format); + } + assert(format != V_028C70_COLOR_INVALID); + swap = si_translate_colorswap(surf->base.format, false); + endian = si_colorformat_endian_swap(format); + + /* blend clamp should be set for all NORM/SRGB types */ + if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM || + ntype == V_028C70_NUMBER_SRGB) + blend_clamp = 1; + + /* set blend bypass according to docs if SINT/UINT or + 8/24 COLOR variants */ + if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT || + format == V_028C70_COLOR_8_24 || format == V_028C70_COLOR_24_8 || + format == V_028C70_COLOR_X24_8_32_FLOAT) { + blend_clamp = 0; + blend_bypass = 1; + } + + if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT) { + if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_8_8 || + 
format == V_028C70_COLOR_8_8_8_8) + surf->color_is_int8 = true; + else if (format == V_028C70_COLOR_10_10_10_2 || format == V_028C70_COLOR_2_10_10_10) + surf->color_is_int10 = true; + } + + color_info = + S_028C70_FORMAT(format) | S_028C70_COMP_SWAP(swap) | S_028C70_BLEND_CLAMP(blend_clamp) | + S_028C70_BLEND_BYPASS(blend_bypass) | S_028C70_SIMPLE_FLOAT(1) | + S_028C70_ROUND_MODE(ntype != V_028C70_NUMBER_UNORM && ntype != V_028C70_NUMBER_SNORM && + ntype != V_028C70_NUMBER_SRGB && format != V_028C70_COLOR_8_24 && + format != V_028C70_COLOR_24_8) | + S_028C70_NUMBER_TYPE(ntype) | S_028C70_ENDIAN(endian); + + /* Intensity is implemented as Red, so treat it that way. */ + color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1 || + util_format_is_intensity(surf->base.format)); + + if (tex->buffer.b.b.nr_samples > 1) { + unsigned log_samples = util_logbase2(tex->buffer.b.b.nr_samples); + unsigned log_fragments = util_logbase2(tex->buffer.b.b.nr_storage_samples); + + color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | S_028C74_NUM_FRAGMENTS(log_fragments); + + if (tex->surface.fmask_offset) { + color_info |= S_028C70_COMPRESSION(1); + unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.fmask.bankh); + + if (sctx->chip_class == GFX6) { + /* due to a hw bug, FMASK_BANK_HEIGHT must be set on GFX6 too */ + color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh); + } + } + } + + if (sctx->chip_class >= GFX10) { + unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B; + + /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and + 64 for APU because all of our APUs to date use DIMMs which have + a request granularity size of 64B while all other chips have a + 32B request size */ + if (!sctx->screen->info.has_dedicated_vram) + min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B; + + surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | + S_028C78_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) | + S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) | + S_028C78_INDEPENDENT_64B_BLOCKS(0) | + S_028C78_INDEPENDENT_128B_BLOCKS(1); + } else if (sctx->chip_class >= GFX8) { + unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B; + unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B; + + /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and + 64 for APU because all of our APUs to date use DIMMs which have + a request granularity size of 64B while all other chips have a + 32B request size */ + if (!sctx->screen->info.has_dedicated_vram) + min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B; + + if (tex->buffer.b.b.nr_storage_samples > 1) { + if (tex->surface.bpe == 1) + max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; + else if (tex->surface.bpe == 2) + max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B; + } + + surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) | + S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) | + S_028C78_INDEPENDENT_64B_BLOCKS(1); + } + + /* This must be set for fast clear to work without FMASK. 
*/ + if (!tex->surface.fmask_size && sctx->chip_class == GFX6) { + unsigned bankh = util_logbase2(tex->surface.u.legacy.bankh); + color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh); + } + + /* GFX10 field has the same base shift as the GFX6 field */ + unsigned color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) | + S_028C6C_SLICE_MAX_GFX10(surf->base.u.tex.last_layer); + unsigned mip0_depth = util_max_layer(&tex->buffer.b.b, 0); + + if (sctx->chip_class >= GFX10) { + color_view |= S_028C6C_MIP_LEVEL_GFX10(surf->base.u.tex.level); + + surf->cb_color_attrib3 = S_028EE0_MIP0_DEPTH(mip0_depth) | + S_028EE0_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type) | + S_028EE0_RESOURCE_LEVEL(1); + } else if (sctx->chip_class == GFX9) { + color_view |= S_028C6C_MIP_LEVEL_GFX9(surf->base.u.tex.level); + color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) | + S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type); + } + + if (sctx->chip_class >= GFX9) { + surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(surf->width0 - 1) | + S_028C68_MIP0_HEIGHT(surf->height0 - 1) | + S_028C68_MAX_MIP(tex->buffer.b.b.last_level); + } + + surf->cb_color_view = color_view; + surf->cb_color_info = color_info; + surf->cb_color_attrib = color_attrib; + + /* Determine pixel shader export format */ + si_choose_spi_color_formats(surf, format, swap, ntype, tex->is_depth); + + surf->color_initialized = true; } -static void si_init_depth_surface(struct si_context *sctx, - struct si_surface *surf) +static void si_init_depth_surface(struct si_context *sctx, struct si_surface *surf) { - struct si_texture *tex = (struct si_texture*)surf->base.texture; - unsigned level = surf->base.u.tex.level; - unsigned format, stencil_format; - uint32_t z_info, s_info; - - format = si_translate_dbformat(tex->db_render_format); - stencil_format = tex->surface.has_stencil ? 
- V_028044_STENCIL_8 : V_028044_STENCIL_INVALID; - - assert(format != V_028040_Z_INVALID); - if (format == V_028040_Z_INVALID) - PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format); - - surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) | - S_028008_SLICE_MAX(surf->base.u.tex.last_layer); - surf->db_htile_data_base = 0; - surf->db_htile_surface = 0; - - if (sctx->chip_class >= GFX10) { - surf->db_depth_view |= S_028008_SLICE_START_HI(surf->base.u.tex.first_layer >> 11) | - S_028008_SLICE_MAX_HI(surf->base.u.tex.last_layer >> 11); - } - - if (sctx->chip_class >= GFX9) { - assert(tex->surface.u.gfx9.surf_offset == 0); - surf->db_depth_base = tex->buffer.gpu_address >> 8; - surf->db_stencil_base = (tex->buffer.gpu_address + - tex->surface.u.gfx9.stencil_offset) >> 8; - z_info = S_028038_FORMAT(format) | - S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) | - S_028038_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | - S_028038_MAXMIP(tex->buffer.b.b.last_level); - s_info = S_02803C_FORMAT(stencil_format) | - S_02803C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); - - if (sctx->chip_class == GFX9) { - surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.surf.epitch); - surf->db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.stencil.epitch); - } - surf->db_depth_view |= S_028008_MIPID(level); - surf->db_depth_size = S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) | - S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1); - - if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { - z_info |= S_028038_TILE_SURFACE_ENABLE(1) | - S_028038_ALLOW_EXPCLEAR(1); - - if (tex->tc_compatible_htile) { - unsigned max_zplanes = 4; - - if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM && - tex->buffer.b.b.nr_samples > 1) - max_zplanes = 2; - - z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1); - - if (sctx->chip_class >= GFX10) { - z_info |= S_028040_ITERATE_FLUSH(1); - s_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled); - } else { - z_info |= S_028038_ITERATE_FLUSH(1); - s_info |= S_02803C_ITERATE_FLUSH(1); - } - } - - if (tex->surface.has_stencil && !tex->htile_stencil_disabled) { - /* Stencil buffer workaround ported from the GFX6-GFX8 code. - * See that for explanation. - */ - s_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1); - } else { - /* Use all HTILE for depth if there's no stencil. 
*/ - s_info |= S_02803C_TILE_STENCIL_DISABLE(1); - } - - surf->db_htile_data_base = (tex->buffer.gpu_address + - tex->surface.htile_offset) >> 8; - surf->db_htile_surface = S_028ABC_FULL_CACHE(1) | - S_028ABC_PIPE_ALIGNED(tex->surface.u.gfx9.htile.pipe_aligned); - if (sctx->chip_class == GFX9) { - surf->db_htile_surface |= - S_028ABC_RB_ALIGNED(tex->surface.u.gfx9.htile.rb_aligned); - } - } - } else { - /* GFX6-GFX8 */ - struct legacy_surf_level *levelinfo = &tex->surface.u.legacy.level[level]; - - assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0); - - surf->db_depth_base = (tex->buffer.gpu_address + - tex->surface.u.legacy.level[level].offset) >> 8; - surf->db_stencil_base = (tex->buffer.gpu_address + - tex->surface.u.legacy.stencil_level[level].offset) >> 8; - - z_info = S_028040_FORMAT(format) | - S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)); - s_info = S_028044_FORMAT(stencil_format); - surf->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile); - - if (sctx->chip_class >= GFX7) { - struct radeon_info *info = &sctx->screen->info; - unsigned index = tex->surface.u.legacy.tiling_index[level]; - unsigned stencil_index = tex->surface.u.legacy.stencil_tiling_index[level]; - unsigned macro_index = tex->surface.u.legacy.macro_tile_index; - unsigned tile_mode = info->si_tile_mode_array[index]; - unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index]; - unsigned macro_mode = info->cik_macrotile_mode_array[macro_index]; - - surf->db_depth_info |= - S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) | - S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) | - S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) | - S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) | - S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) | - S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode)); - z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode)); - s_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode)); - } else { - unsigned tile_mode_index = si_tile_mode_index(tex, level, false); - z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index); - tile_mode_index = si_tile_mode_index(tex, level, true); - s_info |= S_028044_TILE_MODE_INDEX(tile_mode_index); - } - - surf->db_depth_size = S_028058_PITCH_TILE_MAX((levelinfo->nblk_x / 8) - 1) | - S_028058_HEIGHT_TILE_MAX((levelinfo->nblk_y / 8) - 1); - surf->db_depth_slice = S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x * - levelinfo->nblk_y) / 64 - 1); - - if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { - z_info |= S_028040_TILE_SURFACE_ENABLE(1) | - S_028040_ALLOW_EXPCLEAR(1); - - if (tex->surface.has_stencil) { - /* Workaround: For a not yet understood reason, the - * combination of MSAA, fast stencil clear and stencil - * decompress messes with subsequent stencil buffer - * uses. Problem was reproduced on Verde, Bonaire, - * Tonga, and Carrizo. - * - * Disabling EXPCLEAR works around the problem. - * - * Check piglit's arb_texture_multisample-stencil-clear - * test if you want to try changing this. - */ - if (tex->buffer.b.b.nr_samples <= 1) - s_info |= S_028044_ALLOW_EXPCLEAR(1); - } else if (!tex->tc_compatible_htile) { - /* Use all of the htile_buffer for depth if there's no stencil. - * This must not be set when TC-compatible HTILE is enabled - * due to a hw bug. 
- */ - s_info |= S_028044_TILE_STENCIL_DISABLE(1); - } - - surf->db_htile_data_base = (tex->buffer.gpu_address + - tex->surface.htile_offset) >> 8; - surf->db_htile_surface = S_028ABC_FULL_CACHE(1); - - if (tex->tc_compatible_htile) { - surf->db_htile_surface |= S_028ABC_TC_COMPATIBLE(1); - - /* 0 = full compression. N = only compress up to N-1 Z planes. */ - if (tex->buffer.b.b.nr_samples <= 1) - z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5); - else if (tex->buffer.b.b.nr_samples <= 4) - z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3); - else - z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2); - } - } - } - - surf->db_z_info = z_info; - surf->db_stencil_info = s_info; - - surf->depth_initialized = true; + struct si_texture *tex = (struct si_texture *)surf->base.texture; + unsigned level = surf->base.u.tex.level; + unsigned format, stencil_format; + uint32_t z_info, s_info; + + format = si_translate_dbformat(tex->db_render_format); + stencil_format = tex->surface.has_stencil ? V_028044_STENCIL_8 : V_028044_STENCIL_INVALID; + + assert(format != V_028040_Z_INVALID); + if (format == V_028040_Z_INVALID) + PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format); + + surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) | + S_028008_SLICE_MAX(surf->base.u.tex.last_layer); + surf->db_htile_data_base = 0; + surf->db_htile_surface = 0; + + if (sctx->chip_class >= GFX10) { + surf->db_depth_view |= S_028008_SLICE_START_HI(surf->base.u.tex.first_layer >> 11) | + S_028008_SLICE_MAX_HI(surf->base.u.tex.last_layer >> 11); + } + + if (sctx->chip_class >= GFX9) { + assert(tex->surface.u.gfx9.surf_offset == 0); + surf->db_depth_base = tex->buffer.gpu_address >> 8; + surf->db_stencil_base = (tex->buffer.gpu_address + tex->surface.u.gfx9.stencil_offset) >> 8; + z_info = S_028038_FORMAT(format) | + S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) | + S_028038_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | + S_028038_MAXMIP(tex->buffer.b.b.last_level); + s_info = S_02803C_FORMAT(stencil_format) | + S_02803C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode); + + if (sctx->chip_class == GFX9) { + surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.surf.epitch); + surf->db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.stencil.epitch); + } + surf->db_depth_view |= S_028008_MIPID(level); + surf->db_depth_size = + S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) | S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1); + + if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { + z_info |= S_028038_TILE_SURFACE_ENABLE(1) | S_028038_ALLOW_EXPCLEAR(1); + + if (tex->tc_compatible_htile) { + unsigned max_zplanes = 4; + + if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM && tex->buffer.b.b.nr_samples > 1) + max_zplanes = 2; + + z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1); + + if (sctx->chip_class >= GFX10) { + z_info |= S_028040_ITERATE_FLUSH(1); + s_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled); + } else { + z_info |= S_028038_ITERATE_FLUSH(1); + s_info |= S_02803C_ITERATE_FLUSH(1); + } + } + + if (tex->surface.has_stencil && !tex->htile_stencil_disabled) { + /* Stencil buffer workaround ported from the GFX6-GFX8 code. + * See that for explanation. + */ + s_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1); + } else { + /* Use all HTILE for depth if there's no stencil. 
*/ + s_info |= S_02803C_TILE_STENCIL_DISABLE(1); + } + + surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.htile_offset) >> 8; + surf->db_htile_surface = + S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(tex->surface.u.gfx9.htile.pipe_aligned); + if (sctx->chip_class == GFX9) { + surf->db_htile_surface |= S_028ABC_RB_ALIGNED(tex->surface.u.gfx9.htile.rb_aligned); + } + } + } else { + /* GFX6-GFX8 */ + struct legacy_surf_level *levelinfo = &tex->surface.u.legacy.level[level]; + + assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0); + + surf->db_depth_base = + (tex->buffer.gpu_address + tex->surface.u.legacy.level[level].offset) >> 8; + surf->db_stencil_base = + (tex->buffer.gpu_address + tex->surface.u.legacy.stencil_level[level].offset) >> 8; + + z_info = + S_028040_FORMAT(format) | S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)); + s_info = S_028044_FORMAT(stencil_format); + surf->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile); + + if (sctx->chip_class >= GFX7) { + struct radeon_info *info = &sctx->screen->info; + unsigned index = tex->surface.u.legacy.tiling_index[level]; + unsigned stencil_index = tex->surface.u.legacy.stencil_tiling_index[level]; + unsigned macro_index = tex->surface.u.legacy.macro_tile_index; + unsigned tile_mode = info->si_tile_mode_array[index]; + unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index]; + unsigned macro_mode = info->cik_macrotile_mode_array[macro_index]; + + surf->db_depth_info |= S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) | + S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) | + S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) | + S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) | + S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) | + S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode)); + z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode)); + s_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode)); + } else { + unsigned tile_mode_index = si_tile_mode_index(tex, level, false); + z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index); + tile_mode_index = si_tile_mode_index(tex, level, true); + s_info |= S_028044_TILE_MODE_INDEX(tile_mode_index); + } + + surf->db_depth_size = S_028058_PITCH_TILE_MAX((levelinfo->nblk_x / 8) - 1) | + S_028058_HEIGHT_TILE_MAX((levelinfo->nblk_y / 8) - 1); + surf->db_depth_slice = + S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x * levelinfo->nblk_y) / 64 - 1); + + if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { + z_info |= S_028040_TILE_SURFACE_ENABLE(1) | S_028040_ALLOW_EXPCLEAR(1); + + if (tex->surface.has_stencil) { + /* Workaround: For a not yet understood reason, the + * combination of MSAA, fast stencil clear and stencil + * decompress messes with subsequent stencil buffer + * uses. Problem was reproduced on Verde, Bonaire, + * Tonga, and Carrizo. + * + * Disabling EXPCLEAR works around the problem. + * + * Check piglit's arb_texture_multisample-stencil-clear + * test if you want to try changing this. + */ + if (tex->buffer.b.b.nr_samples <= 1) + s_info |= S_028044_ALLOW_EXPCLEAR(1); + } else if (!tex->tc_compatible_htile) { + /* Use all of the htile_buffer for depth if there's no stencil. + * This must not be set when TC-compatible HTILE is enabled + * due to a hw bug. 
+ */ + s_info |= S_028044_TILE_STENCIL_DISABLE(1); + } + + surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.htile_offset) >> 8; + surf->db_htile_surface = S_028ABC_FULL_CACHE(1); + + if (tex->tc_compatible_htile) { + surf->db_htile_surface |= S_028ABC_TC_COMPATIBLE(1); + + /* 0 = full compression. N = only compress up to N-1 Z planes. */ + if (tex->buffer.b.b.nr_samples <= 1) + z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5); + else if (tex->buffer.b.b.nr_samples <= 4) + z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3); + else + z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2); + } + } + } + + surf->db_z_info = z_info; + surf->db_stencil_info = s_info; + + surf->depth_initialized = true; } void si_update_fb_dirtiness_after_rendering(struct si_context *sctx) { - if (sctx->decompression_enabled) - return; - - if (sctx->framebuffer.state.zsbuf) { - struct pipe_surface *surf = sctx->framebuffer.state.zsbuf; - struct si_texture *tex = (struct si_texture *)surf->texture; - - tex->dirty_level_mask |= 1 << surf->u.tex.level; - - if (tex->surface.has_stencil) - tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level; - } - - unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask; - while (compressed_cb_mask) { - unsigned i = u_bit_scan(&compressed_cb_mask); - struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i]; - struct si_texture *tex = (struct si_texture*)surf->texture; - - if (tex->surface.fmask_offset) { - tex->dirty_level_mask |= 1 << surf->u.tex.level; - tex->fmask_is_identity = false; - } - if (tex->dcc_gather_statistics) - tex->separate_dcc_dirty = true; - } + if (sctx->decompression_enabled) + return; + + if (sctx->framebuffer.state.zsbuf) { + struct pipe_surface *surf = sctx->framebuffer.state.zsbuf; + struct si_texture *tex = (struct si_texture *)surf->texture; + + tex->dirty_level_mask |= 1 << surf->u.tex.level; + + if (tex->surface.has_stencil) + tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level; + } + + unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask; + while (compressed_cb_mask) { + unsigned i = u_bit_scan(&compressed_cb_mask); + struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i]; + struct si_texture *tex = (struct si_texture *)surf->texture; + + if (tex->surface.fmask_offset) { + tex->dirty_level_mask |= 1 << surf->u.tex.level; + tex->fmask_is_identity = false; + } + if (tex->dcc_gather_statistics) + tex->separate_dcc_dirty = true; + } } static void si_dec_framebuffer_counters(const struct pipe_framebuffer_state *state) { - for (int i = 0; i < state->nr_cbufs; ++i) { - struct si_surface *surf = NULL; - struct si_texture *tex; + for (int i = 0; i < state->nr_cbufs; ++i) { + struct si_surface *surf = NULL; + struct si_texture *tex; - if (!state->cbufs[i]) - continue; - surf = (struct si_surface*)state->cbufs[i]; - tex = (struct si_texture*)surf->base.texture; + if (!state->cbufs[i]) + continue; + surf = (struct si_surface *)state->cbufs[i]; + tex = (struct si_texture *)surf->base.texture; - p_atomic_dec(&tex->framebuffers_bound); - } + p_atomic_dec(&tex->framebuffers_bound); + } } static void si_set_framebuffer_state(struct pipe_context *ctx, - const struct pipe_framebuffer_state *state) + const struct pipe_framebuffer_state *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_surface *surf = NULL; - struct si_texture *tex; - bool old_any_dst_linear = sctx->framebuffer.any_dst_linear; - unsigned old_nr_samples = sctx->framebuffer.nr_samples; - unsigned old_colorbuf_enabled_4bit = 
sctx->framebuffer.colorbuf_enabled_4bit; - bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf; - bool old_has_stencil = - old_has_zsbuf && - ((struct si_texture*)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil; - bool unbound = false; - int i; - - /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs - * when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0. - * We could implement the full workaround here, but it's a useless case. - */ - if ((!state->width || !state->height) && (state->nr_cbufs || state->zsbuf)) { - unreachable("the framebuffer shouldn't have zero area"); - return; - } - - si_update_fb_dirtiness_after_rendering(sctx); - - for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - if (!sctx->framebuffer.state.cbufs[i]) - continue; - - tex = (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture; - if (tex->dcc_gather_statistics) - vi_separate_dcc_stop_query(sctx, tex); - } - - /* Disable DCC if the formats are incompatible. */ - for (i = 0; i < state->nr_cbufs; i++) { - if (!state->cbufs[i]) - continue; - - surf = (struct si_surface*)state->cbufs[i]; - tex = (struct si_texture*)surf->base.texture; - - if (!surf->dcc_incompatible) - continue; - - /* Since the DCC decompression calls back into set_framebuffer- - * _state, we need to unbind the framebuffer, so that - * vi_separate_dcc_stop_query isn't called twice with the same - * color buffer. - */ - if (!unbound) { - util_copy_framebuffer_state(&sctx->framebuffer.state, NULL); - unbound = true; - } - - if (vi_dcc_enabled(tex, surf->base.u.tex.level)) - if (!si_texture_disable_dcc(sctx, tex)) - si_decompress_dcc(sctx, tex); - - surf->dcc_incompatible = false; - } - - /* Only flush TC when changing the framebuffer state, because - * the only client not using TC that can change textures is - * the framebuffer. - * - * Wait for compute shaders because of possible transitions: - * - FB write -> shader read - * - shader write -> FB read - * - * DB caches are flushed on demand (using si_decompress_textures). - * - * When MSAA is enabled, CB and TC caches are flushed on demand - * (after FMASK decompression). Shader write -> FB read transitions - * cannot happen for MSAA textures, because MSAA shader images are - * not supported. - * - * Only flush and wait for CB if there is actually a bound color buffer. - */ - if (sctx->framebuffer.uncompressed_cb_mask) { - si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples, - sctx->framebuffer.CB_has_shader_readable_metadata, - sctx->framebuffer.all_DCC_pipe_aligned); - } - - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; - - /* u_blitter doesn't invoke depth decompression when it does multiple - * blits in a row, but the only case when it matters for DB is when - * doing generate_mipmap. So here we flush DB manually between - * individual generate_mipmap blits. - * Note that lower mipmap levels aren't compressed. - */ - if (sctx->generate_mipmap_for_depth) { - si_make_DB_shader_coherent(sctx, 1, false, - sctx->framebuffer.DB_has_shader_readable_metadata); - } else if (sctx->chip_class == GFX9) { - /* It appears that DB metadata "leaks" in a sequence of: - * - depth clear - * - DCC decompress for shader image writes (with DB disabled) - * - render with DEPTH_BEFORE_SHADER=1 - * Flushing DB metadata works around the problem. - */ - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META; - } - - /* Take the maximum of the old and new count. If the new count is lower, - * dirtying is needed to disable the unbound colorbuffers. 
- */ - sctx->framebuffer.dirty_cbufs |= - (1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1; - sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf; - - si_dec_framebuffer_counters(&sctx->framebuffer.state); - util_copy_framebuffer_state(&sctx->framebuffer.state, state); - - sctx->framebuffer.colorbuf_enabled_4bit = 0; - sctx->framebuffer.spi_shader_col_format = 0; - sctx->framebuffer.spi_shader_col_format_alpha = 0; - sctx->framebuffer.spi_shader_col_format_blend = 0; - sctx->framebuffer.spi_shader_col_format_blend_alpha = 0; - sctx->framebuffer.color_is_int8 = 0; - sctx->framebuffer.color_is_int10 = 0; - - sctx->framebuffer.compressed_cb_mask = 0; - sctx->framebuffer.uncompressed_cb_mask = 0; - sctx->framebuffer.displayable_dcc_cb_mask = 0; - sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state); - sctx->framebuffer.nr_color_samples = sctx->framebuffer.nr_samples; - sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples); - sctx->framebuffer.any_dst_linear = false; - sctx->framebuffer.CB_has_shader_readable_metadata = false; - sctx->framebuffer.DB_has_shader_readable_metadata = false; - sctx->framebuffer.all_DCC_pipe_aligned = true; - sctx->framebuffer.min_bytes_per_pixel = 0; - - for (i = 0; i < state->nr_cbufs; i++) { - if (!state->cbufs[i]) - continue; - - surf = (struct si_surface*)state->cbufs[i]; - tex = (struct si_texture*)surf->base.texture; - - if (!surf->color_initialized) { - si_initialize_color_surface(sctx, surf); - } - - sctx->framebuffer.colorbuf_enabled_4bit |= 0xf << (i * 4); - sctx->framebuffer.spi_shader_col_format |= - surf->spi_shader_col_format << (i * 4); - sctx->framebuffer.spi_shader_col_format_alpha |= - surf->spi_shader_col_format_alpha << (i * 4); - sctx->framebuffer.spi_shader_col_format_blend |= - surf->spi_shader_col_format_blend << (i * 4); - sctx->framebuffer.spi_shader_col_format_blend_alpha |= - surf->spi_shader_col_format_blend_alpha << (i * 4); - - if (surf->color_is_int8) - sctx->framebuffer.color_is_int8 |= 1 << i; - if (surf->color_is_int10) - sctx->framebuffer.color_is_int10 |= 1 << i; - - if (tex->surface.fmask_offset) - sctx->framebuffer.compressed_cb_mask |= 1 << i; - else - sctx->framebuffer.uncompressed_cb_mask |= 1 << i; - - if (tex->surface.dcc_offset) - sctx->framebuffer.displayable_dcc_cb_mask |= 1 << i; - - /* Don't update nr_color_samples for non-AA buffers. - * (e.g. destination of MSAA resolve) - */ - if (tex->buffer.b.b.nr_samples >= 2 && - tex->buffer.b.b.nr_storage_samples < tex->buffer.b.b.nr_samples) { - sctx->framebuffer.nr_color_samples = - MIN2(sctx->framebuffer.nr_color_samples, - tex->buffer.b.b.nr_storage_samples); - sctx->framebuffer.nr_color_samples = - MAX2(1, sctx->framebuffer.nr_color_samples); - } - - if (tex->surface.is_linear) - sctx->framebuffer.any_dst_linear = true; - - if (vi_dcc_enabled(tex, surf->base.u.tex.level)) { - sctx->framebuffer.CB_has_shader_readable_metadata = true; - - if (sctx->chip_class >= GFX9 && - !tex->surface.u.gfx9.dcc.pipe_aligned) - sctx->framebuffer.all_DCC_pipe_aligned = false; - } - - si_context_add_resource_size(sctx, surf->base.texture); - - p_atomic_inc(&tex->framebuffers_bound); - - if (tex->dcc_gather_statistics) { - /* Dirty tracking must be enabled for DCC usage analysis. */ - sctx->framebuffer.compressed_cb_mask |= 1 << i; - vi_separate_dcc_start_query(sctx, tex); - } - - /* Update the minimum but don't keep 0. 
*/ - if (!sctx->framebuffer.min_bytes_per_pixel || - tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) - sctx->framebuffer.min_bytes_per_pixel = tex->surface.bpe; - } - - /* For optimal DCC performance. */ - if (sctx->chip_class >= GFX10) - sctx->framebuffer.dcc_overwrite_combiner_watermark = 6; - else - sctx->framebuffer.dcc_overwrite_combiner_watermark = 4; - - struct si_texture *zstex = NULL; - - if (state->zsbuf) { - surf = (struct si_surface*)state->zsbuf; - zstex = (struct si_texture*)surf->base.texture; - - if (!surf->depth_initialized) { - si_init_depth_surface(sctx, surf); - } - - if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, - PIPE_MASK_ZS)) - sctx->framebuffer.DB_has_shader_readable_metadata = true; - - si_context_add_resource_size(sctx, surf->base.texture); - - /* Update the minimum but don't keep 0. */ - if (!sctx->framebuffer.min_bytes_per_pixel || - zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) - sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe; - } - - si_update_ps_colorbuf0_slot(sctx); - si_update_poly_offset_state(sctx); - si_update_ngg_small_prim_precision(sctx); - si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); - - if (sctx->screen->dpbb_allowed) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - - if (sctx->framebuffer.any_dst_linear != old_any_dst_linear) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - - if (sctx->screen->has_out_of_order_rast && - (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit || - !!sctx->framebuffer.state.zsbuf != old_has_zsbuf || - (zstex && zstex->surface.has_stencil != old_has_stencil))) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - - if (sctx->framebuffer.nr_samples != old_nr_samples) { - struct pipe_constant_buffer constbuf = {0}; - - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - - constbuf.buffer = sctx->sample_pos_buffer; - - /* Set sample locations as fragment shader constants. */ - switch (sctx->framebuffer.nr_samples) { - case 1: - constbuf.buffer_offset = 0; - break; - case 2: - constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x2 - - (ubyte*)sctx->sample_positions.x1; - break; - case 4: - constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x4 - - (ubyte*)sctx->sample_positions.x1; - break; - case 8: - constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x8 - - (ubyte*)sctx->sample_positions.x1; - break; - case 16: - constbuf.buffer_offset = (ubyte*)sctx->sample_positions.x16 - - (ubyte*)sctx->sample_positions.x1; - break; - default: - PRINT_ERR("Requested an invalid number of samples %i.\n", - sctx->framebuffer.nr_samples); - assert(0); - } - constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4; - si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf); - - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); - } - - sctx->do_update_shaders = true; - - if (!sctx->decompression_enabled) { - /* Prevent textures decompression when the framebuffer state - * changes come from the decompression passes themselves. 
- */ - sctx->need_check_render_feedback = true; - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_surface *surf = NULL; + struct si_texture *tex; + bool old_any_dst_linear = sctx->framebuffer.any_dst_linear; + unsigned old_nr_samples = sctx->framebuffer.nr_samples; + unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit; + bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf; + bool old_has_stencil = + old_has_zsbuf && + ((struct si_texture *)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil; + bool unbound = false; + int i; + + /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs + * when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0. + * We could implement the full workaround here, but it's a useless case. + */ + if ((!state->width || !state->height) && (state->nr_cbufs || state->zsbuf)) { + unreachable("the framebuffer shouldn't have zero area"); + return; + } + + si_update_fb_dirtiness_after_rendering(sctx); + + for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + if (!sctx->framebuffer.state.cbufs[i]) + continue; + + tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture; + if (tex->dcc_gather_statistics) + vi_separate_dcc_stop_query(sctx, tex); + } + + /* Disable DCC if the formats are incompatible. */ + for (i = 0; i < state->nr_cbufs; i++) { + if (!state->cbufs[i]) + continue; + + surf = (struct si_surface *)state->cbufs[i]; + tex = (struct si_texture *)surf->base.texture; + + if (!surf->dcc_incompatible) + continue; + + /* Since the DCC decompression calls back into set_framebuffer- + * _state, we need to unbind the framebuffer, so that + * vi_separate_dcc_stop_query isn't called twice with the same + * color buffer. + */ + if (!unbound) { + util_copy_framebuffer_state(&sctx->framebuffer.state, NULL); + unbound = true; + } + + if (vi_dcc_enabled(tex, surf->base.u.tex.level)) + if (!si_texture_disable_dcc(sctx, tex)) + si_decompress_dcc(sctx, tex); + + surf->dcc_incompatible = false; + } + + /* Only flush TC when changing the framebuffer state, because + * the only client not using TC that can change textures is + * the framebuffer. + * + * Wait for compute shaders because of possible transitions: + * - FB write -> shader read + * - shader write -> FB read + * + * DB caches are flushed on demand (using si_decompress_textures). + * + * When MSAA is enabled, CB and TC caches are flushed on demand + * (after FMASK decompression). Shader write -> FB read transitions + * cannot happen for MSAA textures, because MSAA shader images are + * not supported. + * + * Only flush and wait for CB if there is actually a bound color buffer. + */ + if (sctx->framebuffer.uncompressed_cb_mask) { + si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples, + sctx->framebuffer.CB_has_shader_readable_metadata, + sctx->framebuffer.all_DCC_pipe_aligned); + } + + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + + /* u_blitter doesn't invoke depth decompression when it does multiple + * blits in a row, but the only case when it matters for DB is when + * doing generate_mipmap. So here we flush DB manually between + * individual generate_mipmap blits. + * Note that lower mipmap levels aren't compressed. 
+ */ + if (sctx->generate_mipmap_for_depth) { + si_make_DB_shader_coherent(sctx, 1, false, sctx->framebuffer.DB_has_shader_readable_metadata); + } else if (sctx->chip_class == GFX9) { + /* It appears that DB metadata "leaks" in a sequence of: + * - depth clear + * - DCC decompress for shader image writes (with DB disabled) + * - render with DEPTH_BEFORE_SHADER=1 + * Flushing DB metadata works around the problem. + */ + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META; + } + + /* Take the maximum of the old and new count. If the new count is lower, + * dirtying is needed to disable the unbound colorbuffers. + */ + sctx->framebuffer.dirty_cbufs |= + (1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1; + sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf; + + si_dec_framebuffer_counters(&sctx->framebuffer.state); + util_copy_framebuffer_state(&sctx->framebuffer.state, state); + + sctx->framebuffer.colorbuf_enabled_4bit = 0; + sctx->framebuffer.spi_shader_col_format = 0; + sctx->framebuffer.spi_shader_col_format_alpha = 0; + sctx->framebuffer.spi_shader_col_format_blend = 0; + sctx->framebuffer.spi_shader_col_format_blend_alpha = 0; + sctx->framebuffer.color_is_int8 = 0; + sctx->framebuffer.color_is_int10 = 0; + + sctx->framebuffer.compressed_cb_mask = 0; + sctx->framebuffer.uncompressed_cb_mask = 0; + sctx->framebuffer.displayable_dcc_cb_mask = 0; + sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state); + sctx->framebuffer.nr_color_samples = sctx->framebuffer.nr_samples; + sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples); + sctx->framebuffer.any_dst_linear = false; + sctx->framebuffer.CB_has_shader_readable_metadata = false; + sctx->framebuffer.DB_has_shader_readable_metadata = false; + sctx->framebuffer.all_DCC_pipe_aligned = true; + sctx->framebuffer.min_bytes_per_pixel = 0; + + for (i = 0; i < state->nr_cbufs; i++) { + if (!state->cbufs[i]) + continue; + + surf = (struct si_surface *)state->cbufs[i]; + tex = (struct si_texture *)surf->base.texture; + + if (!surf->color_initialized) { + si_initialize_color_surface(sctx, surf); + } + + sctx->framebuffer.colorbuf_enabled_4bit |= 0xf << (i * 4); + sctx->framebuffer.spi_shader_col_format |= surf->spi_shader_col_format << (i * 4); + sctx->framebuffer.spi_shader_col_format_alpha |= surf->spi_shader_col_format_alpha << (i * 4); + sctx->framebuffer.spi_shader_col_format_blend |= surf->spi_shader_col_format_blend << (i * 4); + sctx->framebuffer.spi_shader_col_format_blend_alpha |= surf->spi_shader_col_format_blend_alpha + << (i * 4); + + if (surf->color_is_int8) + sctx->framebuffer.color_is_int8 |= 1 << i; + if (surf->color_is_int10) + sctx->framebuffer.color_is_int10 |= 1 << i; + + if (tex->surface.fmask_offset) + sctx->framebuffer.compressed_cb_mask |= 1 << i; + else + sctx->framebuffer.uncompressed_cb_mask |= 1 << i; + + if (tex->surface.dcc_offset) + sctx->framebuffer.displayable_dcc_cb_mask |= 1 << i; + + /* Don't update nr_color_samples for non-AA buffers. + * (e.g. 
destination of MSAA resolve) + */ + if (tex->buffer.b.b.nr_samples >= 2 && + tex->buffer.b.b.nr_storage_samples < tex->buffer.b.b.nr_samples) { + sctx->framebuffer.nr_color_samples = + MIN2(sctx->framebuffer.nr_color_samples, tex->buffer.b.b.nr_storage_samples); + sctx->framebuffer.nr_color_samples = MAX2(1, sctx->framebuffer.nr_color_samples); + } + + if (tex->surface.is_linear) + sctx->framebuffer.any_dst_linear = true; + + if (vi_dcc_enabled(tex, surf->base.u.tex.level)) { + sctx->framebuffer.CB_has_shader_readable_metadata = true; + + if (sctx->chip_class >= GFX9 && !tex->surface.u.gfx9.dcc.pipe_aligned) + sctx->framebuffer.all_DCC_pipe_aligned = false; + } + + si_context_add_resource_size(sctx, surf->base.texture); + + p_atomic_inc(&tex->framebuffers_bound); + + if (tex->dcc_gather_statistics) { + /* Dirty tracking must be enabled for DCC usage analysis. */ + sctx->framebuffer.compressed_cb_mask |= 1 << i; + vi_separate_dcc_start_query(sctx, tex); + } + + /* Update the minimum but don't keep 0. */ + if (!sctx->framebuffer.min_bytes_per_pixel || + tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) + sctx->framebuffer.min_bytes_per_pixel = tex->surface.bpe; + } + + /* For optimal DCC performance. */ + if (sctx->chip_class >= GFX10) + sctx->framebuffer.dcc_overwrite_combiner_watermark = 6; + else + sctx->framebuffer.dcc_overwrite_combiner_watermark = 4; + + struct si_texture *zstex = NULL; + + if (state->zsbuf) { + surf = (struct si_surface *)state->zsbuf; + zstex = (struct si_texture *)surf->base.texture; + + if (!surf->depth_initialized) { + si_init_depth_surface(sctx, surf); + } + + if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, PIPE_MASK_ZS)) + sctx->framebuffer.DB_has_shader_readable_metadata = true; + + si_context_add_resource_size(sctx, surf->base.texture); + + /* Update the minimum but don't keep 0. */ + if (!sctx->framebuffer.min_bytes_per_pixel || + zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) + sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe; + } + + si_update_ps_colorbuf0_slot(sctx); + si_update_poly_offset_state(sctx); + si_update_ngg_small_prim_precision(sctx); + si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); + si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); + + if (sctx->screen->dpbb_allowed) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + + if (sctx->framebuffer.any_dst_linear != old_any_dst_linear) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + + if (sctx->screen->has_out_of_order_rast && + (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit || + !!sctx->framebuffer.state.zsbuf != old_has_zsbuf || + (zstex && zstex->surface.has_stencil != old_has_stencil))) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + + if (sctx->framebuffer.nr_samples != old_nr_samples) { + struct pipe_constant_buffer constbuf = {0}; + + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + + constbuf.buffer = sctx->sample_pos_buffer; + + /* Set sample locations as fragment shader constants. 
*/ + switch (sctx->framebuffer.nr_samples) { + case 1: + constbuf.buffer_offset = 0; + break; + case 2: + constbuf.buffer_offset = + (ubyte *)sctx->sample_positions.x2 - (ubyte *)sctx->sample_positions.x1; + break; + case 4: + constbuf.buffer_offset = + (ubyte *)sctx->sample_positions.x4 - (ubyte *)sctx->sample_positions.x1; + break; + case 8: + constbuf.buffer_offset = + (ubyte *)sctx->sample_positions.x8 - (ubyte *)sctx->sample_positions.x1; + break; + case 16: + constbuf.buffer_offset = + (ubyte *)sctx->sample_positions.x16 - (ubyte *)sctx->sample_positions.x1; + break; + default: + PRINT_ERR("Requested an invalid number of samples %i.\n", sctx->framebuffer.nr_samples); + assert(0); + } + constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4; + si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf); + + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); + } + + sctx->do_update_shaders = true; + + if (!sctx->decompression_enabled) { + /* Prevent textures decompression when the framebuffer state + * changes come from the decompression passes themselves. + */ + sctx->need_check_render_feedback = true; + } } static void si_emit_framebuffer_state(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct pipe_framebuffer_state *state = &sctx->framebuffer.state; - unsigned i, nr_cbufs = state->nr_cbufs; - struct si_texture *tex = NULL; - struct si_surface *cb = NULL; - unsigned cb_color_info = 0; - - /* Colorbuffers. */ - for (i = 0; i < nr_cbufs; i++) { - uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base; - unsigned cb_color_attrib; - - if (!(sctx->framebuffer.dirty_cbufs & (1 << i))) - continue; - - cb = (struct si_surface*)state->cbufs[i]; - if (!cb) { - radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, - S_028C70_FORMAT(V_028C70_COLOR_INVALID)); - continue; - } - - tex = (struct si_texture *)cb->base.texture; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - &tex->buffer, RADEON_USAGE_READWRITE, - tex->buffer.b.b.nr_samples > 1 ? - RADEON_PRIO_COLOR_BUFFER_MSAA : - RADEON_PRIO_COLOR_BUFFER); - - if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - tex->cmask_buffer, RADEON_USAGE_READWRITE, - RADEON_PRIO_SEPARATE_META); - } - - if (tex->dcc_separate_buffer) - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - tex->dcc_separate_buffer, - RADEON_USAGE_READWRITE, - RADEON_PRIO_SEPARATE_META); - - /* Compute mutable surface parameters. */ - cb_color_base = tex->buffer.gpu_address >> 8; - cb_color_fmask = 0; - cb_color_cmask = tex->cmask_base_address_reg; - cb_dcc_base = 0; - cb_color_info = cb->cb_color_info | tex->cb_color_info; - cb_color_attrib = cb->cb_color_attrib; - - if (cb->base.u.tex.level > 0) - cb_color_info &= C_028C70_FAST_CLEAR; - - if (tex->surface.fmask_offset) { - cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8; - cb_color_fmask |= tex->surface.fmask_tile_swizzle; - } - - /* Set up DCC. */ - if (vi_dcc_enabled(tex, cb->base.u.tex.level)) { - bool is_msaa_resolve_dst = state->cbufs[0] && - state->cbufs[0]->texture->nr_samples > 1 && - state->cbufs[1] == &cb->base && - state->cbufs[1]->texture->nr_samples <= 1; - - if (!is_msaa_resolve_dst) - cb_color_info |= S_028C70_DCC_ENABLE(1); - - cb_dcc_base = ((!tex->dcc_separate_buffer ? 
tex->buffer.gpu_address : 0) + - tex->surface.dcc_offset) >> 8; - - unsigned dcc_tile_swizzle = tex->surface.tile_swizzle; - dcc_tile_swizzle &= (tex->surface.dcc_alignment - 1) >> 8; - cb_dcc_base |= dcc_tile_swizzle; - } - - if (sctx->chip_class >= GFX10) { - unsigned cb_color_attrib3; - - /* Set mutable surface parameters. */ - cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; - cb_color_base |= tex->surface.tile_swizzle; - if (!tex->surface.fmask_offset) - cb_color_fmask = cb_color_base; - if (cb->base.u.tex.level > 0) - cb_color_cmask = cb_color_base; - - cb_color_attrib3 = cb->cb_color_attrib3 | - S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | - S_028EE0_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | - S_028EE0_CMASK_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) | - S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.dcc.pipe_aligned); - - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ - - radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4, - cb_color_base >> 32); - radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4, - cb_color_cmask >> 32); - radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4, - cb_color_fmask >> 32); - radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, - cb_dcc_base >> 32); - radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, - cb->cb_color_attrib2); - radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, - cb_color_attrib3); - } else if (sctx->chip_class == GFX9) { - struct gfx9_surf_meta_flags meta; - - if (tex->surface.dcc_offset) - meta = tex->surface.u.gfx9.dcc; - else - meta = tex->surface.u.gfx9.cmask; - - /* Set mutable surface parameters. 
*/ - cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; - cb_color_base |= tex->surface.tile_swizzle; - if (!tex->surface.fmask_offset) - cb_color_fmask = cb_color_base; - if (cb->base.u.tex.level > 0) - cb_color_cmask = cb_color_base; - cb_color_attrib |= S_028C74_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | - S_028C74_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | - S_028C74_RB_ALIGNED(meta.rb_aligned) | - S_028C74_PIPE_ALIGNED(meta.pipe_aligned); - - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */ - radeon_emit(cs, cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ - radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */ - - radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4, - S_0287A0_EPITCH(tex->surface.u.gfx9.surf.epitch)); - } else { - /* Compute mutable surface parameters (GFX6-GFX8). */ - const struct legacy_surf_level *level_info = - &tex->surface.u.legacy.level[cb->base.u.tex.level]; - unsigned pitch_tile_max, slice_tile_max, tile_mode_index; - unsigned cb_color_pitch, cb_color_slice, cb_color_fmask_slice; - - cb_color_base += level_info->offset >> 8; - /* Only macrotiled modes can set tile swizzle. */ - if (level_info->mode == RADEON_SURF_MODE_2D) - cb_color_base |= tex->surface.tile_swizzle; - - if (!tex->surface.fmask_offset) - cb_color_fmask = cb_color_base; - if (cb->base.u.tex.level > 0) - cb_color_cmask = cb_color_base; - if (cb_dcc_base) - cb_dcc_base += level_info->dcc_offset >> 8; - - pitch_tile_max = level_info->nblk_x / 8 - 1; - slice_tile_max = level_info->nblk_x * - level_info->nblk_y / 64 - 1; - tile_mode_index = si_tile_mode_index(tex, cb->base.u.tex.level, false); - - cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index); - cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max); - cb_color_slice = S_028C68_TILE_MAX(slice_tile_max); - - if (tex->surface.fmask_offset) { - if (sctx->chip_class >= GFX7) - cb_color_pitch |= S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1); - cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index); - cb_color_fmask_slice = S_028C88_TILE_MAX(tex->surface.u.legacy.fmask.slice_tile_max); - } else { - /* This must be set for fast clear to work without FMASK. */ - if (sctx->chip_class >= GFX7) - cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max); - cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index); - cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max); - } - - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, - sctx->chip_class >= GFX8 ? 
14 : 13); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, cb_color_pitch); /* CB_COLOR0_PITCH */ - radeon_emit(cs, cb_color_slice); /* CB_COLOR0_SLICE */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, tex->surface.u.legacy.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - - if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */ - radeon_emit(cs, cb_dcc_base); - } - } - for (; i < 8 ; i++) - if (sctx->framebuffer.dirty_cbufs & (1 << i)) - radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); - - /* ZS buffer. */ - if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) { - struct si_surface *zb = (struct si_surface*)state->zsbuf; - struct si_texture *tex = (struct si_texture*)zb->base.texture; - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - &tex->buffer, RADEON_USAGE_READWRITE, - zb->base.texture->nr_samples > 1 ? - RADEON_PRIO_DEPTH_BUFFER_MSAA : - RADEON_PRIO_DEPTH_BUFFER); - - if (sctx->chip_class >= GFX10) { - radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); - radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); - - radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7); - radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */ - radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ - S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0)); - radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - - radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5); - radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ - radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ - } else if (sctx->chip_class == GFX9) { - radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3); - radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ - radeon_emit(cs, S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ - radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ - - radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10); - radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ - S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0)); - radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */ - radeon_emit(cs, 
zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_emit(cs, S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ - - radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2); - radeon_emit(cs, zb->db_z_info2); /* DB_Z_INFO2 */ - radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */ - } else { - radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); - - radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9); - radeon_emit(cs, zb->db_depth_info); /* DB_DEPTH_INFO */ - radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ - S_028040_ZRANGE_PRECISION(tex->depth_clear_value != 0)); - radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ - radeon_emit(cs, zb->db_depth_slice); /* DB_DEPTH_SLICE */ - } - - radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2); - radeon_emit(cs, tex->stencil_clear_value); /* R_028028_DB_STENCIL_CLEAR */ - radeon_emit(cs, fui(tex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */ - - radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view); - radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface); - } else if (sctx->framebuffer.dirty_zsbuf) { - if (sctx->chip_class == GFX9) - radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2); - else - radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2); - - radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ - radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ - } - - /* Framebuffer dimensions. */ - /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_config() */ - radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, - S_028208_BR_X(state->width) | S_028208_BR_Y(state->height)); - - if (sctx->screen->dfsm_allowed) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); - } - - sctx->framebuffer.dirty_cbufs = 0; - sctx->framebuffer.dirty_zsbuf = false; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct pipe_framebuffer_state *state = &sctx->framebuffer.state; + unsigned i, nr_cbufs = state->nr_cbufs; + struct si_texture *tex = NULL; + struct si_surface *cb = NULL; + unsigned cb_color_info = 0; + + /* Colorbuffers. */ + for (i = 0; i < nr_cbufs; i++) { + uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base; + unsigned cb_color_attrib; + + if (!(sctx->framebuffer.dirty_cbufs & (1 << i))) + continue; + + cb = (struct si_surface *)state->cbufs[i]; + if (!cb) { + radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, + S_028C70_FORMAT(V_028C70_COLOR_INVALID)); + continue; + } + + tex = (struct si_texture *)cb->base.texture; + radeon_add_to_buffer_list( + sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE, + tex->buffer.b.b.nr_samples > 1 ? 
RADEON_PRIO_COLOR_BUFFER_MSAA : RADEON_PRIO_COLOR_BUFFER); + + if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, tex->cmask_buffer, RADEON_USAGE_READWRITE, + RADEON_PRIO_SEPARATE_META); + } + + if (tex->dcc_separate_buffer) + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, tex->dcc_separate_buffer, + RADEON_USAGE_READWRITE, RADEON_PRIO_SEPARATE_META); + + /* Compute mutable surface parameters. */ + cb_color_base = tex->buffer.gpu_address >> 8; + cb_color_fmask = 0; + cb_color_cmask = tex->cmask_base_address_reg; + cb_dcc_base = 0; + cb_color_info = cb->cb_color_info | tex->cb_color_info; + cb_color_attrib = cb->cb_color_attrib; + + if (cb->base.u.tex.level > 0) + cb_color_info &= C_028C70_FAST_CLEAR; + + if (tex->surface.fmask_offset) { + cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8; + cb_color_fmask |= tex->surface.fmask_tile_swizzle; + } + + /* Set up DCC. */ + if (vi_dcc_enabled(tex, cb->base.u.tex.level)) { + bool is_msaa_resolve_dst = state->cbufs[0] && state->cbufs[0]->texture->nr_samples > 1 && + state->cbufs[1] == &cb->base && + state->cbufs[1]->texture->nr_samples <= 1; + + if (!is_msaa_resolve_dst) + cb_color_info |= S_028C70_DCC_ENABLE(1); + + cb_dcc_base = + ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + tex->surface.dcc_offset) >> + 8; + + unsigned dcc_tile_swizzle = tex->surface.tile_swizzle; + dcc_tile_swizzle &= (tex->surface.dcc_alignment - 1) >> 8; + cb_dcc_base |= dcc_tile_swizzle; + } + + if (sctx->chip_class >= GFX10) { + unsigned cb_color_attrib3; + + /* Set mutable surface parameters. */ + cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; + cb_color_base |= tex->surface.tile_swizzle; + if (!tex->surface.fmask_offset) + cb_color_fmask = cb_color_base; + if (cb->base.u.tex.level > 0) + cb_color_cmask = cb_color_base; + + cb_color_attrib3 = cb->cb_color_attrib3 | + S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | + S_028EE0_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | + S_028EE0_CMASK_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) | + S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.dcc.pipe_aligned); + + radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14); + radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(cs, 0); /* hole */ + radeon_emit(cs, 0); /* hole */ + radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(cs, 0); /* hole */ + radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(cs, 0); /* hole */ + radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ + + radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32); + radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4, + cb_color_cmask >> 32); + radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4, + cb_color_fmask >> 32); + radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32); + radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2); + radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, 
cb_color_attrib3); + } else if (sctx->chip_class == GFX9) { + struct gfx9_surf_meta_flags meta; + + if (tex->surface.dcc_offset) + meta = tex->surface.u.gfx9.dcc; + else + meta = tex->surface.u.gfx9.cmask; + + /* Set mutable surface parameters. */ + cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; + cb_color_base |= tex->surface.tile_swizzle; + if (!tex->surface.fmask_offset) + cb_color_fmask = cb_color_base; + if (cb->base.u.tex.level > 0) + cb_color_cmask = cb_color_base; + cb_color_attrib |= S_028C74_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) | + S_028C74_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | + S_028C74_RB_ALIGNED(meta.rb_aligned) | + S_028C74_PIPE_ALIGNED(meta.pipe_aligned); + + radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15); + radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */ + radeon_emit(cs, cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */ + radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */ + radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */ + radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ + radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */ + + radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4, + S_0287A0_EPITCH(tex->surface.u.gfx9.surf.epitch)); + } else { + /* Compute mutable surface parameters (GFX6-GFX8). */ + const struct legacy_surf_level *level_info = + &tex->surface.u.legacy.level[cb->base.u.tex.level]; + unsigned pitch_tile_max, slice_tile_max, tile_mode_index; + unsigned cb_color_pitch, cb_color_slice, cb_color_fmask_slice; + + cb_color_base += level_info->offset >> 8; + /* Only macrotiled modes can set tile swizzle. */ + if (level_info->mode == RADEON_SURF_MODE_2D) + cb_color_base |= tex->surface.tile_swizzle; + + if (!tex->surface.fmask_offset) + cb_color_fmask = cb_color_base; + if (cb->base.u.tex.level > 0) + cb_color_cmask = cb_color_base; + if (cb_dcc_base) + cb_dcc_base += level_info->dcc_offset >> 8; + + pitch_tile_max = level_info->nblk_x / 8 - 1; + slice_tile_max = level_info->nblk_x * level_info->nblk_y / 64 - 1; + tile_mode_index = si_tile_mode_index(tex, cb->base.u.tex.level, false); + + cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index); + cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max); + cb_color_slice = S_028C68_TILE_MAX(slice_tile_max); + + if (tex->surface.fmask_offset) { + if (sctx->chip_class >= GFX7) + cb_color_pitch |= + S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1); + cb_color_attrib |= + S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index); + cb_color_fmask_slice = S_028C88_TILE_MAX(tex->surface.u.legacy.fmask.slice_tile_max); + } else { + /* This must be set for fast clear to work without FMASK. 
*/ + if (sctx->chip_class >= GFX7) + cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max); + cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index); + cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max); + } + + radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, + sctx->chip_class >= GFX8 ? 14 : 13); + radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(cs, cb_color_pitch); /* CB_COLOR0_PITCH */ + radeon_emit(cs, cb_color_slice); /* CB_COLOR0_SLICE */ + radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(cs, tex->surface.u.legacy.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */ + radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(cs, cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */ + radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + + if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */ + radeon_emit(cs, cb_dcc_base); + } + } + for (; i < 8; i++) + if (sctx->framebuffer.dirty_cbufs & (1 << i)) + radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); + + /* ZS buffer. */ + if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) { + struct si_surface *zb = (struct si_surface *)state->zsbuf; + struct si_texture *tex = (struct si_texture *)zb->base.texture; + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE, + zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA + : RADEON_PRIO_DEPTH_BUFFER); + + if (sctx->chip_class >= GFX10) { + radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); + + radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7); + radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */ + radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ + S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0)); + radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + + radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5); + radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */ + radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ + radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ + radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ + } else if (sctx->chip_class == GFX9) { + radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3); + radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ + radeon_emit(cs, + S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ + radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ + + radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10); + radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ + S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0)); + radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ + 
radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(cs, + S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ + + radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2); + radeon_emit(cs, zb->db_z_info2); /* DB_Z_INFO2 */ + radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */ + } else { + radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + + radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9); + radeon_emit(cs, zb->db_depth_info); /* DB_DEPTH_INFO */ + radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */ + S_028040_ZRANGE_PRECISION(tex->depth_clear_value != 0)); + radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ + radeon_emit(cs, zb->db_depth_slice); /* DB_DEPTH_SLICE */ + } + + radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2); + radeon_emit(cs, tex->stencil_clear_value); /* R_028028_DB_STENCIL_CLEAR */ + radeon_emit(cs, fui(tex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */ + + radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view); + radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface); + } else if (sctx->framebuffer.dirty_zsbuf) { + if (sctx->chip_class == GFX9) + radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2); + else + radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2); + + radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ + radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ + } + + /* Framebuffer dimensions. */ + /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_config() */ + radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, + S_028208_BR_X(state->width) | S_028208_BR_Y(state->height)); + + if (sctx->screen->dfsm_allowed) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + } + + sctx->framebuffer.dirty_cbufs = 0; + sctx->framebuffer.dirty_zsbuf = false; } static void si_emit_msaa_sample_locs(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned nr_samples = sctx->framebuffer.nr_samples; - bool has_msaa_sample_loc_bug = sctx->screen->info.has_msaa_sample_loc_bug; - - /* Smoothing (only possible with nr_samples == 1) uses the same - * sample locations as the MSAA it simulates. - */ - if (nr_samples <= 1 && sctx->smoothing_enabled) - nr_samples = SI_NUM_SMOOTH_AA_SAMPLES; - - /* On Polaris, the small primitive filter uses the sample locations - * even when MSAA is off, so we need to make sure they're set to 0. - * - * GFX10 uses sample locations unconditionally, so they always need - * to be set up. 
- */ - if ((nr_samples >= 2 || has_msaa_sample_loc_bug || - sctx->chip_class >= GFX10) && - nr_samples != sctx->sample_locs_num_samples) { - sctx->sample_locs_num_samples = nr_samples; - si_emit_sample_locations(cs, nr_samples); - } - - if (sctx->family >= CHIP_POLARIS10) { - unsigned small_prim_filter_cntl = - S_028830_SMALL_PRIM_FILTER_ENABLE(1) | - /* line bug */ - S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12); - - /* The alternative of setting sample locations to 0 would - * require a DB flush to avoid Z errors, see - * https://bugs.freedesktop.org/show_bug.cgi?id=96908 - */ - if (has_msaa_sample_loc_bug && - sctx->framebuffer.nr_samples > 1 && - !rs->multisample_enable) - small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE; - - radeon_opt_set_context_reg(sctx, - R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL, - SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, - small_prim_filter_cntl); - } - - /* The exclusion bits can be set to improve rasterization efficiency - * if no sample lies on the pixel boundary (-8 sample offset). - */ - bool exclusion = sctx->chip_class >= GFX7 && - (!rs->multisample_enable || nr_samples != 16); - radeon_opt_set_context_reg(sctx, R_02882C_PA_SU_PRIM_FILTER_CNTL, - SI_TRACKED_PA_SU_PRIM_FILTER_CNTL, - S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | - S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion)); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + unsigned nr_samples = sctx->framebuffer.nr_samples; + bool has_msaa_sample_loc_bug = sctx->screen->info.has_msaa_sample_loc_bug; + + /* Smoothing (only possible with nr_samples == 1) uses the same + * sample locations as the MSAA it simulates. + */ + if (nr_samples <= 1 && sctx->smoothing_enabled) + nr_samples = SI_NUM_SMOOTH_AA_SAMPLES; + + /* On Polaris, the small primitive filter uses the sample locations + * even when MSAA is off, so we need to make sure they're set to 0. + * + * GFX10 uses sample locations unconditionally, so they always need + * to be set up. + */ + if ((nr_samples >= 2 || has_msaa_sample_loc_bug || sctx->chip_class >= GFX10) && + nr_samples != sctx->sample_locs_num_samples) { + sctx->sample_locs_num_samples = nr_samples; + si_emit_sample_locations(cs, nr_samples); + } + + if (sctx->family >= CHIP_POLARIS10) { + unsigned small_prim_filter_cntl = + S_028830_SMALL_PRIM_FILTER_ENABLE(1) | + /* line bug */ + S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12); + + /* The alternative of setting sample locations to 0 would + * require a DB flush to avoid Z errors, see + * https://bugs.freedesktop.org/show_bug.cgi?id=96908 + */ + if (has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1 && !rs->multisample_enable) + small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE; + + radeon_opt_set_context_reg(sctx, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL, + SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, small_prim_filter_cntl); + } + + /* The exclusion bits can be set to improve rasterization efficiency + * if no sample lies on the pixel boundary (-8 sample offset). 
+ */ + bool exclusion = sctx->chip_class >= GFX7 && (!rs->multisample_enable || nr_samples != 16); + radeon_opt_set_context_reg( + sctx, R_02882C_PA_SU_PRIM_FILTER_CNTL, SI_TRACKED_PA_SU_PRIM_FILTER_CNTL, + S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion)); } static bool si_out_of_order_rasterization(struct si_context *sctx) { - struct si_state_blend *blend = sctx->queued.named.blend; - struct si_state_dsa *dsa = sctx->queued.named.dsa; + struct si_state_blend *blend = sctx->queued.named.blend; + struct si_state_dsa *dsa = sctx->queued.named.dsa; - if (!sctx->screen->has_out_of_order_rast) - return false; + if (!sctx->screen->has_out_of_order_rast) + return false; - unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit; + unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit; - colormask &= blend->cb_target_enabled_4bit; + colormask &= blend->cb_target_enabled_4bit; - /* Conservative: No logic op. */ - if (colormask && blend->logicop_enable) - return false; + /* Conservative: No logic op. */ + if (colormask && blend->logicop_enable) + return false; - struct si_dsa_order_invariance dsa_order_invariant = { - .zs = true, .pass_set = true, .pass_last = false - }; + struct si_dsa_order_invariance dsa_order_invariant = {.zs = true, + .pass_set = true, + .pass_last = false}; - if (sctx->framebuffer.state.zsbuf) { - struct si_texture *zstex = - (struct si_texture*)sctx->framebuffer.state.zsbuf->texture; - bool has_stencil = zstex->surface.has_stencil; - dsa_order_invariant = dsa->order_invariance[has_stencil]; - if (!dsa_order_invariant.zs) - return false; + if (sctx->framebuffer.state.zsbuf) { + struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture; + bool has_stencil = zstex->surface.has_stencil; + dsa_order_invariant = dsa->order_invariance[has_stencil]; + if (!dsa_order_invariant.zs) + return false; - /* The set of PS invocations is always order invariant, - * except when early Z/S tests are requested. */ - if (sctx->ps_shader.cso && - sctx->ps_shader.cso->info.writes_memory && - sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] && - !dsa_order_invariant.pass_set) - return false; + /* The set of PS invocations is always order invariant, + * except when early Z/S tests are requested. */ + if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.writes_memory && + sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] && + !dsa_order_invariant.pass_set) + return false; - if (sctx->num_perfect_occlusion_queries != 0 && - !dsa_order_invariant.pass_set) - return false; - } + if (sctx->num_perfect_occlusion_queries != 0 && !dsa_order_invariant.pass_set) + return false; + } - if (!colormask) - return true; + if (!colormask) + return true; - unsigned blendmask = colormask & blend->blend_enable_4bit; + unsigned blendmask = colormask & blend->blend_enable_4bit; - if (blendmask) { - /* Only commutative blending. */ - if (blendmask & ~blend->commutative_4bit) - return false; + if (blendmask) { + /* Only commutative blending. 
*/ + if (blendmask & ~blend->commutative_4bit) + return false; - if (!dsa_order_invariant.pass_set) - return false; - } + if (!dsa_order_invariant.pass_set) + return false; + } - if (colormask & ~blendmask) { - if (!dsa_order_invariant.pass_last) - return false; - } + if (colormask & ~blendmask) { + if (!dsa_order_invariant.pass_last) + return false; + } - return true; + return true; } static void si_emit_msaa_config(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned num_tile_pipes = sctx->screen->info.num_tile_pipes; - /* 33% faster rendering to linear color buffers */ - bool dst_is_linear = sctx->framebuffer.any_dst_linear; - bool out_of_order_rast = si_out_of_order_rasterization(sctx); - unsigned sc_mode_cntl_1 = - S_028A4C_WALK_SIZE(dst_is_linear) | - S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) | - S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) | - S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) | - S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) | - /* always 1: */ - S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | - S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) | - S_028A4C_TILE_WALK_ORDER_ENABLE(1) | - S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) | - S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | - S_028A4C_FORCE_EOV_REZ_ENABLE(1); - unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | - S_028804_INCOHERENT_EQAA_READS(1) | - S_028804_INTERPOLATE_COMP_Z(1) | - S_028804_STATIC_ANCHOR_ASSOCIATIONS(1); - unsigned coverage_samples, color_samples, z_samples; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - - /* S: Coverage samples (up to 16x): - * - Scan conversion samples (PA_SC_AA_CONFIG.MSAA_NUM_SAMPLES) - * - CB FMASK samples (CB_COLORi_ATTRIB.NUM_SAMPLES) - * - * Z: Z/S samples (up to 8x, must be <= coverage samples and >= color samples): - * - Value seen by DB (DB_Z_INFO.NUM_SAMPLES) - * - Value seen by CB, must be correct even if Z/S is unbound (DB_EQAA.MAX_ANCHOR_SAMPLES) - * # Missing samples are derived from Z planes if Z is compressed (up to 16x quality), or - * # from the closest defined sample if Z is uncompressed (same quality as the number of - * # Z samples). - * - * F: Color samples (up to 8x, must be <= coverage samples): - * - CB color samples (CB_COLORi_ATTRIB.NUM_FRAGMENTS) - * - PS iter samples (DB_EQAA.PS_ITER_SAMPLES) - * - * Can be anything between coverage and color samples: - * - SampleMaskIn samples (PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES) - * - SampleMaskOut samples (DB_EQAA.MASK_EXPORT_NUM_SAMPLES) - * - Alpha-to-coverage samples (DB_EQAA.ALPHA_TO_MASK_NUM_SAMPLES) - * - Occlusion query samples (DB_COUNT_CONTROL.SAMPLE_RATE) - * # All are currently set the same as coverage samples. - * - * If color samples < coverage samples, FMASK has a higher bpp to store an "unknown" - * flag for undefined color samples. A shader-based resolve must handle unknowns - * or mask them out with AND. Unknowns can also be guessed from neighbors via - * an edge-detect shader-based resolve, which is required to make "color samples = 1" - * useful. The CB resolve always drops unknowns. 
- * - * Sensible AA configurations: - * EQAA 16s 8z 8f - might look the same as 16x MSAA if Z is compressed - * EQAA 16s 8z 4f - might look the same as 16x MSAA if Z is compressed - * EQAA 16s 4z 4f - might look the same as 16x MSAA if Z is compressed - * EQAA 8s 8z 8f = 8x MSAA - * EQAA 8s 8z 4f - might look the same as 8x MSAA - * EQAA 8s 8z 2f - might look the same as 8x MSAA with low-density geometry - * EQAA 8s 4z 4f - might look the same as 8x MSAA if Z is compressed - * EQAA 8s 4z 2f - might look the same as 8x MSAA with low-density geometry if Z is compressed - * EQAA 4s 4z 4f = 4x MSAA - * EQAA 4s 4z 2f - might look the same as 4x MSAA with low-density geometry - * EQAA 2s 2z 2f = 2x MSAA - */ - if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) { - coverage_samples = sctx->framebuffer.nr_samples; - color_samples = sctx->framebuffer.nr_color_samples; - - if (sctx->framebuffer.state.zsbuf) { - z_samples = sctx->framebuffer.state.zsbuf->texture->nr_samples; - z_samples = MAX2(1, z_samples); - } else { - z_samples = coverage_samples; - } - } else if (sctx->smoothing_enabled) { - coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES; - } else { - coverage_samples = color_samples = z_samples = 1; - } - - /* Required by OpenGL line rasterization. - * - * TODO: We should also enable perpendicular endcaps for AA lines, - * but that requires implementing line stippling in the pixel - * shader. SC can only do line stippling with axis-aligned - * endcaps. - */ - unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1); - unsigned sc_aa_config = 0; - - if (coverage_samples > 1) { - /* distance from the pixel center, indexed by log2(nr_samples) */ - static unsigned max_dist[] = { - 0, /* unused */ - 4, /* 2x MSAA */ - 6, /* 4x MSAA */ - 7, /* 8x MSAA */ - 8, /* 16x MSAA */ - }; - unsigned log_samples = util_logbase2(coverage_samples); - unsigned log_z_samples = util_logbase2(z_samples); - unsigned ps_iter_samples = si_get_ps_iter_samples(sctx); - unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples); - - sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1); - sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) | - S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) | - S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples); - - if (sctx->framebuffer.nr_samples > 1) { - db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) | - S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) | - S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | - S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples); - sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1); - } else if (sctx->smoothing_enabled) { - db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples); - } - } - - unsigned initial_cdw = cs->current.cdw; - - /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */ - radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, - SI_TRACKED_PA_SC_LINE_CNTL, sc_line_cntl, - sc_aa_config); - /* R_028804_DB_EQAA */ - radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, - db_eqaa); - /* R_028A4C_PA_SC_MODE_CNTL_1 */ - radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, - SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1); - - if (initial_cdw != cs->current.cdw) { - sctx->context_roll = true; - - /* GFX9: Flush DFSM when the AA mode changes. 
*/ - if (sctx->screen->dfsm_allowed) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); - } - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned num_tile_pipes = sctx->screen->info.num_tile_pipes; + /* 33% faster rendering to linear color buffers */ + bool dst_is_linear = sctx->framebuffer.any_dst_linear; + bool out_of_order_rast = si_out_of_order_rasterization(sctx); + unsigned sc_mode_cntl_1 = + S_028A4C_WALK_SIZE(dst_is_linear) | S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) | + S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) | + S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) | + S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) | + /* always 1: */ + S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) | + S_028A4C_TILE_WALK_ORDER_ENABLE(1) | S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) | + S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | S_028A4C_FORCE_EOV_REZ_ENABLE(1); + unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) | + S_028804_INTERPOLATE_COMP_Z(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1); + unsigned coverage_samples, color_samples, z_samples; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + /* S: Coverage samples (up to 16x): + * - Scan conversion samples (PA_SC_AA_CONFIG.MSAA_NUM_SAMPLES) + * - CB FMASK samples (CB_COLORi_ATTRIB.NUM_SAMPLES) + * + * Z: Z/S samples (up to 8x, must be <= coverage samples and >= color samples): + * - Value seen by DB (DB_Z_INFO.NUM_SAMPLES) + * - Value seen by CB, must be correct even if Z/S is unbound (DB_EQAA.MAX_ANCHOR_SAMPLES) + * # Missing samples are derived from Z planes if Z is compressed (up to 16x quality), or + * # from the closest defined sample if Z is uncompressed (same quality as the number of + * # Z samples). + * + * F: Color samples (up to 8x, must be <= coverage samples): + * - CB color samples (CB_COLORi_ATTRIB.NUM_FRAGMENTS) + * - PS iter samples (DB_EQAA.PS_ITER_SAMPLES) + * + * Can be anything between coverage and color samples: + * - SampleMaskIn samples (PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES) + * - SampleMaskOut samples (DB_EQAA.MASK_EXPORT_NUM_SAMPLES) + * - Alpha-to-coverage samples (DB_EQAA.ALPHA_TO_MASK_NUM_SAMPLES) + * - Occlusion query samples (DB_COUNT_CONTROL.SAMPLE_RATE) + * # All are currently set the same as coverage samples. + * + * If color samples < coverage samples, FMASK has a higher bpp to store an "unknown" + * flag for undefined color samples. A shader-based resolve must handle unknowns + * or mask them out with AND. Unknowns can also be guessed from neighbors via + * an edge-detect shader-based resolve, which is required to make "color samples = 1" + * useful. The CB resolve always drops unknowns. 
+ * + * Sensible AA configurations: + * EQAA 16s 8z 8f - might look the same as 16x MSAA if Z is compressed + * EQAA 16s 8z 4f - might look the same as 16x MSAA if Z is compressed + * EQAA 16s 4z 4f - might look the same as 16x MSAA if Z is compressed + * EQAA 8s 8z 8f = 8x MSAA + * EQAA 8s 8z 4f - might look the same as 8x MSAA + * EQAA 8s 8z 2f - might look the same as 8x MSAA with low-density geometry + * EQAA 8s 4z 4f - might look the same as 8x MSAA if Z is compressed + * EQAA 8s 4z 2f - might look the same as 8x MSAA with low-density geometry if Z is compressed + * EQAA 4s 4z 4f = 4x MSAA + * EQAA 4s 4z 2f - might look the same as 4x MSAA with low-density geometry + * EQAA 2s 2z 2f = 2x MSAA + */ + if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) { + coverage_samples = sctx->framebuffer.nr_samples; + color_samples = sctx->framebuffer.nr_color_samples; + + if (sctx->framebuffer.state.zsbuf) { + z_samples = sctx->framebuffer.state.zsbuf->texture->nr_samples; + z_samples = MAX2(1, z_samples); + } else { + z_samples = coverage_samples; + } + } else if (sctx->smoothing_enabled) { + coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES; + } else { + coverage_samples = color_samples = z_samples = 1; + } + + /* Required by OpenGL line rasterization. + * + * TODO: We should also enable perpendicular endcaps for AA lines, + * but that requires implementing line stippling in the pixel + * shader. SC can only do line stippling with axis-aligned + * endcaps. + */ + unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1); + unsigned sc_aa_config = 0; + + if (coverage_samples > 1) { + /* distance from the pixel center, indexed by log2(nr_samples) */ + static unsigned max_dist[] = { + 0, /* unused */ + 4, /* 2x MSAA */ + 6, /* 4x MSAA */ + 7, /* 8x MSAA */ + 8, /* 16x MSAA */ + }; + unsigned log_samples = util_logbase2(coverage_samples); + unsigned log_z_samples = util_logbase2(z_samples); + unsigned ps_iter_samples = si_get_ps_iter_samples(sctx); + unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples); + + sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1); + sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) | + S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) | + S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples); + + if (sctx->framebuffer.nr_samples > 1) { + db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) | + S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) | + S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | + S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples); + sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1); + } else if (sctx->smoothing_enabled) { + db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples); + } + } + + unsigned initial_cdw = cs->current.cdw; + + /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */ + radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL, + sc_line_cntl, sc_aa_config); + /* R_028804_DB_EQAA */ + radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, db_eqaa); + /* R_028A4C_PA_SC_MODE_CNTL_1 */ + radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1, + sc_mode_cntl_1); + + if (initial_cdw != cs->current.cdw) { + sctx->context_roll = true; + + /* GFX9: Flush DFSM when the AA mode changes. 
*/ + if (sctx->screen->dfsm_allowed) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); + } + } } void si_update_ps_iter_samples(struct si_context *sctx) { - if (sctx->framebuffer.nr_samples > 1) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - if (sctx->screen->dpbb_allowed) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + if (sctx->framebuffer.nr_samples > 1) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + if (sctx->screen->dpbb_allowed) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); } static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - /* The hardware can only do sample shading with 2^n samples. */ - min_samples = util_next_power_of_two(min_samples); + /* The hardware can only do sample shading with 2^n samples. */ + min_samples = util_next_power_of_two(min_samples); - if (sctx->ps_iter_samples == min_samples) - return; + if (sctx->ps_iter_samples == min_samples) + return; - sctx->ps_iter_samples = min_samples; - sctx->do_update_shaders = true; + sctx->ps_iter_samples = min_samples; + sctx->do_update_shaders = true; - si_update_ps_iter_samples(sctx); + si_update_ps_iter_samples(sctx); } /* @@ -3786,650 +3586,607 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) * Build the sampler view descriptor for a buffer texture. * @param state 256-bit descriptor; only the high 128 bits are filled in */ -void -si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf, - enum pipe_format format, - unsigned offset, unsigned size, - uint32_t *state) +void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf, + enum pipe_format format, unsigned offset, unsigned size, + uint32_t *state) { - const struct util_format_description *desc; - unsigned stride; - unsigned num_records; - - desc = util_format_description(format); - stride = desc->block.bits / 8; - - num_records = size / stride; - num_records = MIN2(num_records, (buf->b.b.width0 - offset) / stride); - - /* The NUM_RECORDS field has a different meaning depending on the chip, - * instruction type, STRIDE, and SWIZZLE_ENABLE. - * - * GFX6-7,10: - * - If STRIDE == 0, it's in byte units. - * - If STRIDE != 0, it's in units of STRIDE, used with inst.IDXEN. - * - * GFX8: - * - For SMEM and STRIDE == 0, it's in byte units. - * - For SMEM and STRIDE != 0, it's in units of STRIDE. - * - For VMEM and STRIDE == 0 or SWIZZLE_ENABLE == 0, it's in byte units. - * - For VMEM and STRIDE != 0 and SWIZZLE_ENABLE == 1, it's in units of STRIDE. - * NOTE: There is incompatibility between VMEM and SMEM opcodes due to SWIZZLE_- - * ENABLE. The workaround is to set STRIDE = 0 if SWIZZLE_ENABLE == 0 when - * using SMEM. This can be done in the shader by clearing STRIDE with s_and. - * That way the same descriptor can be used by both SMEM and VMEM. - * - * GFX9: - * - For SMEM and STRIDE == 0, it's in byte units. - * - For SMEM and STRIDE != 0, it's in units of STRIDE. - * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units. - * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE. 
- */ - if (screen->info.chip_class == GFX8) - num_records *= stride; - - state[4] = 0; - state[5] = S_008F04_STRIDE(stride); - state[6] = num_records; - state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | - S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | - S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | - S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); - - if (screen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = &gfx10_format_table[format]; - - /* OOB_SELECT chooses the out-of-bounds check: - * - 0: (index >= NUM_RECORDS) || (offset >= STRIDE) - * - 1: index >= NUM_RECORDS - * - 2: NUM_RECORDS == 0 - * - 3: if SWIZZLE_ENABLE == 0: offset >= NUM_RECORDS - * else: swizzle_address >= NUM_RECORDS - */ - state[7] |= S_008F0C_FORMAT(fmt->img_format) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - int first_non_void; - unsigned num_format, data_format; - - first_non_void = util_format_get_first_non_void_channel(format); - num_format = si_translate_buffer_numformat(&screen->b, desc, first_non_void); - data_format = si_translate_buffer_dataformat(&screen->b, desc, first_non_void); - - state[7] |= S_008F0C_NUM_FORMAT(num_format) | - S_008F0C_DATA_FORMAT(data_format); - } + const struct util_format_description *desc; + unsigned stride; + unsigned num_records; + + desc = util_format_description(format); + stride = desc->block.bits / 8; + + num_records = size / stride; + num_records = MIN2(num_records, (buf->b.b.width0 - offset) / stride); + + /* The NUM_RECORDS field has a different meaning depending on the chip, + * instruction type, STRIDE, and SWIZZLE_ENABLE. + * + * GFX6-7,10: + * - If STRIDE == 0, it's in byte units. + * - If STRIDE != 0, it's in units of STRIDE, used with inst.IDXEN. + * + * GFX8: + * - For SMEM and STRIDE == 0, it's in byte units. + * - For SMEM and STRIDE != 0, it's in units of STRIDE. + * - For VMEM and STRIDE == 0 or SWIZZLE_ENABLE == 0, it's in byte units. + * - For VMEM and STRIDE != 0 and SWIZZLE_ENABLE == 1, it's in units of STRIDE. + * NOTE: There is incompatibility between VMEM and SMEM opcodes due to SWIZZLE_- + * ENABLE. The workaround is to set STRIDE = 0 if SWIZZLE_ENABLE == 0 when + * using SMEM. This can be done in the shader by clearing STRIDE with s_and. + * That way the same descriptor can be used by both SMEM and VMEM. + * + * GFX9: + * - For SMEM and STRIDE == 0, it's in byte units. + * - For SMEM and STRIDE != 0, it's in units of STRIDE. + * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units. + * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE. 
+ */ + if (screen->info.chip_class == GFX8) + num_records *= stride; + + state[4] = 0; + state[5] = S_008F04_STRIDE(stride); + state[6] = num_records; + state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | + S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | + S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | + S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); + + if (screen->info.chip_class >= GFX10) { + const struct gfx10_format *fmt = &gfx10_format_table[format]; + + /* OOB_SELECT chooses the out-of-bounds check: + * - 0: (index >= NUM_RECORDS) || (offset >= STRIDE) + * - 1: index >= NUM_RECORDS + * - 2: NUM_RECORDS == 0 + * - 3: if SWIZZLE_ENABLE == 0: offset >= NUM_RECORDS + * else: swizzle_address >= NUM_RECORDS + */ + state[7] |= S_008F0C_FORMAT(fmt->img_format) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + int first_non_void; + unsigned num_format, data_format; + + first_non_void = util_format_get_first_non_void_channel(format); + num_format = si_translate_buffer_numformat(&screen->b, desc, first_non_void); + data_format = si_translate_buffer_dataformat(&screen->b, desc, first_non_void); + + state[7] |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format); + } } static unsigned gfx9_border_color_swizzle(const unsigned char swizzle[4]) { - unsigned bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; - - if (swizzle[3] == PIPE_SWIZZLE_X) { - /* For the pre-defined border color values (white, opaque - * black, transparent black), the only thing that matters is - * that the alpha channel winds up in the correct place - * (because the RGB channels are all the same) so either of - * these enumerations will work. - */ - if (swizzle[2] == PIPE_SWIZZLE_Y) - bc_swizzle = V_008F20_BC_SWIZZLE_WZYX; - else - bc_swizzle = V_008F20_BC_SWIZZLE_WXYZ; - } else if (swizzle[0] == PIPE_SWIZZLE_X) { - if (swizzle[1] == PIPE_SWIZZLE_Y) - bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; - else - bc_swizzle = V_008F20_BC_SWIZZLE_XWYZ; - } else if (swizzle[1] == PIPE_SWIZZLE_X) { - bc_swizzle = V_008F20_BC_SWIZZLE_YXWZ; - } else if (swizzle[2] == PIPE_SWIZZLE_X) { - bc_swizzle = V_008F20_BC_SWIZZLE_ZYXW; - } - - return bc_swizzle; + unsigned bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; + + if (swizzle[3] == PIPE_SWIZZLE_X) { + /* For the pre-defined border color values (white, opaque + * black, transparent black), the only thing that matters is + * that the alpha channel winds up in the correct place + * (because the RGB channels are all the same) so either of + * these enumerations will work. + */ + if (swizzle[2] == PIPE_SWIZZLE_Y) + bc_swizzle = V_008F20_BC_SWIZZLE_WZYX; + else + bc_swizzle = V_008F20_BC_SWIZZLE_WXYZ; + } else if (swizzle[0] == PIPE_SWIZZLE_X) { + if (swizzle[1] == PIPE_SWIZZLE_Y) + bc_swizzle = V_008F20_BC_SWIZZLE_XYZW; + else + bc_swizzle = V_008F20_BC_SWIZZLE_XWYZ; + } else if (swizzle[1] == PIPE_SWIZZLE_X) { + bc_swizzle = V_008F20_BC_SWIZZLE_YXWZ; + } else if (swizzle[2] == PIPE_SWIZZLE_X) { + bc_swizzle = V_008F20_BC_SWIZZLE_ZYXW; + } + + return bc_swizzle; } /** * Build the sampler view descriptor for a texture. 
*/ -static void -gfx10_make_texture_descriptor(struct si_screen *screen, - struct si_texture *tex, - bool sampler, - enum pipe_texture_target target, - enum pipe_format pipe_format, - const unsigned char state_swizzle[4], - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned width, unsigned height, unsigned depth, - uint32_t *state, - uint32_t *fmask_state) +static void gfx10_make_texture_descriptor( + struct si_screen *screen, struct si_texture *tex, bool sampler, enum pipe_texture_target target, + enum pipe_format pipe_format, const unsigned char state_swizzle[4], unsigned first_level, + unsigned last_level, unsigned first_layer, unsigned last_layer, unsigned width, unsigned height, + unsigned depth, uint32_t *state, uint32_t *fmask_state) { - struct pipe_resource *res = &tex->buffer.b.b; - const struct util_format_description *desc; - unsigned img_format; - unsigned char swizzle[4]; - unsigned type; - uint64_t va; - - desc = util_format_description(pipe_format); - img_format = gfx10_format_table[pipe_format].img_format; - - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { - const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0}; - const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1}; - const unsigned char swizzle_wwww[4] = {3, 3, 3, 3}; - bool is_stencil = false; - - switch (pipe_format) { - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - case PIPE_FORMAT_X32_S8X24_UINT: - case PIPE_FORMAT_X8Z24_UNORM: - util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); - is_stencil = true; - break; - case PIPE_FORMAT_X24S8_UINT: - /* - * X24S8 is implemented as an 8_8_8_8 data format, to - * fix texture gathers. This affects at least - * GL45-CTS.texture_cube_map_array.sampling on GFX8. - */ - util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle); - is_stencil = true; - break; - default: - util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle); - is_stencil = pipe_format == PIPE_FORMAT_S8_UINT; - } - - if (tex->upgraded_depth && !is_stencil) { - assert(img_format == V_008F0C_IMG_FORMAT_32_FLOAT); - img_format = V_008F0C_IMG_FORMAT_32_FLOAT_CLAMP; - } - } else { - util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); - } - - if (!sampler && - (res->target == PIPE_TEXTURE_CUBE || - res->target == PIPE_TEXTURE_CUBE_ARRAY)) { - /* For the purpose of shader images, treat cube maps as 2D - * arrays. - */ - type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY; - } else { - type = si_tex_dim(screen, tex, target, res->nr_samples); - } - - if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { - height = 1; - depth = res->array_size; - } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || - type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { - if (sampler || res->target != PIPE_TEXTURE_3D) - depth = res->array_size; - } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) - depth = res->array_size / 6; - - state[0] = 0; - state[1] = S_00A004_FORMAT(img_format) | - S_00A004_WIDTH_LO(width - 1); - state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | - S_00A008_HEIGHT(height - 1) | - S_00A008_RESOURCE_LEVEL(1); - state[3] = S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) | - S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | - S_00A00C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | - S_00A00C_DST_SEL_W(si_map_swizzle(swizzle[3])) | - S_00A00C_BASE_LEVEL(res->nr_samples > 1 ? - 0 : first_level) | - S_00A00C_LAST_LEVEL(res->nr_samples > 1 ? 
- util_logbase2(res->nr_samples) : - last_level) | - S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) | - S_00A00C_TYPE(type); - /* Depth is the the last accessible layer on gfx9+. The hw doesn't need - * to know the total number of layers. - */ - state[4] = S_00A010_DEPTH((type == V_008F1C_SQ_RSRC_IMG_3D && sampler) - ? depth - 1 : last_layer) | - S_00A010_BASE_ARRAY(first_layer); - state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) | - S_00A014_MAX_MIP(res->nr_samples > 1 ? - util_logbase2(res->nr_samples) : - tex->buffer.b.b.last_level) | - S_00A014_PERF_MOD(4); - state[6] = 0; - state[7] = 0; - - if (tex->surface.dcc_offset) { - state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | - S_00A018_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) | - S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); - } - - /* Initialize the sampler view for FMASK. */ - if (tex->surface.fmask_offset) { - uint32_t format; - - va = tex->buffer.gpu_address + tex->surface.fmask_offset; - -#define FMASK(s,f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) - switch (FMASK(res->nr_samples, res->nr_storage_samples)) { - case FMASK(2,1): - format = V_008F0C_IMG_FORMAT_FMASK8_S2_F1; - break; - case FMASK(2,2): - format = V_008F0C_IMG_FORMAT_FMASK8_S2_F2; - break; - case FMASK(4,1): - format = V_008F0C_IMG_FORMAT_FMASK8_S4_F1; - break; - case FMASK(4,2): - format = V_008F0C_IMG_FORMAT_FMASK8_S4_F2; - break; - case FMASK(4,4): - format = V_008F0C_IMG_FORMAT_FMASK8_S4_F4; - break; - case FMASK(8,1): - format = V_008F0C_IMG_FORMAT_FMASK8_S8_F1; - break; - case FMASK(8,2): - format = V_008F0C_IMG_FORMAT_FMASK16_S8_F2; - break; - case FMASK(8,4): - format = V_008F0C_IMG_FORMAT_FMASK32_S8_F4; - break; - case FMASK(8,8): - format = V_008F0C_IMG_FORMAT_FMASK32_S8_F8; - break; - case FMASK(16,1): - format = V_008F0C_IMG_FORMAT_FMASK16_S16_F1; - break; - case FMASK(16,2): - format = V_008F0C_IMG_FORMAT_FMASK32_S16_F2; - break; - case FMASK(16,4): - format = V_008F0C_IMG_FORMAT_FMASK64_S16_F4; - break; - case FMASK(16,8): - format = V_008F0C_IMG_FORMAT_FMASK64_S16_F8; - break; - default: - unreachable("invalid nr_samples"); - } + struct pipe_resource *res = &tex->buffer.b.b; + const struct util_format_description *desc; + unsigned img_format; + unsigned char swizzle[4]; + unsigned type; + uint64_t va; + + desc = util_format_description(pipe_format); + img_format = gfx10_format_table[pipe_format].img_format; + + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { + const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0}; + const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1}; + const unsigned char swizzle_wwww[4] = {3, 3, 3, 3}; + bool is_stencil = false; + + switch (pipe_format) { + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + case PIPE_FORMAT_X32_S8X24_UINT: + case PIPE_FORMAT_X8Z24_UNORM: + util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); + is_stencil = true; + break; + case PIPE_FORMAT_X24S8_UINT: + /* + * X24S8 is implemented as an 8_8_8_8 data format, to + * fix texture gathers. This affects at least + * GL45-CTS.texture_cube_map_array.sampling on GFX8. 
+ */ + util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle); + is_stencil = true; + break; + default: + util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle); + is_stencil = pipe_format == PIPE_FORMAT_S8_UINT; + } + + if (tex->upgraded_depth && !is_stencil) { + assert(img_format == V_008F0C_IMG_FORMAT_32_FLOAT); + img_format = V_008F0C_IMG_FORMAT_32_FLOAT_CLAMP; + } + } else { + util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); + } + + if (!sampler && (res->target == PIPE_TEXTURE_CUBE || res->target == PIPE_TEXTURE_CUBE_ARRAY)) { + /* For the purpose of shader images, treat cube maps as 2D + * arrays. + */ + type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY; + } else { + type = si_tex_dim(screen, tex, target, res->nr_samples); + } + + if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { + height = 1; + depth = res->array_size; + } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { + if (sampler || res->target != PIPE_TEXTURE_3D) + depth = res->array_size; + } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) + depth = res->array_size / 6; + + state[0] = 0; + state[1] = S_00A004_FORMAT(img_format) | S_00A004_WIDTH_LO(width - 1); + state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) | + S_00A008_RESOURCE_LEVEL(1); + state[3] = + S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) | + S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | + S_00A00C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | + S_00A00C_DST_SEL_W(si_map_swizzle(swizzle[3])) | + S_00A00C_BASE_LEVEL(res->nr_samples > 1 ? 0 : first_level) | + S_00A00C_LAST_LEVEL(res->nr_samples > 1 ? util_logbase2(res->nr_samples) : last_level) | + S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) | S_00A00C_TYPE(type); + /* Depth is the the last accessible layer on gfx9+. The hw doesn't need + * to know the total number of layers. + */ + state[4] = + S_00A010_DEPTH((type == V_008F1C_SQ_RSRC_IMG_3D && sampler) ? depth - 1 : last_layer) | + S_00A010_BASE_ARRAY(first_layer); + state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) | + S_00A014_MAX_MIP(res->nr_samples > 1 ? util_logbase2(res->nr_samples) + : tex->buffer.b.b.last_level) | + S_00A014_PERF_MOD(4); + state[6] = 0; + state[7] = 0; + + if (tex->surface.dcc_offset) { + state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | + S_00A018_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) | + S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); + } + + /* Initialize the sampler view for FMASK. 
*/ + if (tex->surface.fmask_offset) { + uint32_t format; + + va = tex->buffer.gpu_address + tex->surface.fmask_offset; + +#define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) + switch (FMASK(res->nr_samples, res->nr_storage_samples)) { + case FMASK(2, 1): + format = V_008F0C_IMG_FORMAT_FMASK8_S2_F1; + break; + case FMASK(2, 2): + format = V_008F0C_IMG_FORMAT_FMASK8_S2_F2; + break; + case FMASK(4, 1): + format = V_008F0C_IMG_FORMAT_FMASK8_S4_F1; + break; + case FMASK(4, 2): + format = V_008F0C_IMG_FORMAT_FMASK8_S4_F2; + break; + case FMASK(4, 4): + format = V_008F0C_IMG_FORMAT_FMASK8_S4_F4; + break; + case FMASK(8, 1): + format = V_008F0C_IMG_FORMAT_FMASK8_S8_F1; + break; + case FMASK(8, 2): + format = V_008F0C_IMG_FORMAT_FMASK16_S8_F2; + break; + case FMASK(8, 4): + format = V_008F0C_IMG_FORMAT_FMASK32_S8_F4; + break; + case FMASK(8, 8): + format = V_008F0C_IMG_FORMAT_FMASK32_S8_F8; + break; + case FMASK(16, 1): + format = V_008F0C_IMG_FORMAT_FMASK16_S16_F1; + break; + case FMASK(16, 2): + format = V_008F0C_IMG_FORMAT_FMASK32_S16_F2; + break; + case FMASK(16, 4): + format = V_008F0C_IMG_FORMAT_FMASK64_S16_F4; + break; + case FMASK(16, 8): + format = V_008F0C_IMG_FORMAT_FMASK64_S16_F8; + break; + default: + unreachable("invalid nr_samples"); + } #undef FMASK - fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle; - fmask_state[1] = S_00A004_BASE_ADDRESS_HI(va >> 40) | - S_00A004_FORMAT(format) | - S_00A004_WIDTH_LO(width - 1); - fmask_state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | - S_00A008_HEIGHT(height - 1) | - S_00A008_RESOURCE_LEVEL(1); - fmask_state[3] = S_00A00C_DST_SEL_X(V_008F1C_SQ_SEL_X) | - S_00A00C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | - S_00A00C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | - S_00A00C_DST_SEL_W(V_008F1C_SQ_SEL_X) | - S_00A00C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | - S_00A00C_TYPE(si_tex_dim(screen, tex, target, 0)); - fmask_state[4] = S_00A010_DEPTH(last_layer) | - S_00A010_BASE_ARRAY(first_layer); - fmask_state[5] = 0; - fmask_state[6] = S_00A018_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned); - fmask_state[7] = 0; - } + fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle; + fmask_state[1] = S_00A004_BASE_ADDRESS_HI(va >> 40) | S_00A004_FORMAT(format) | + S_00A004_WIDTH_LO(width - 1); + fmask_state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) | + S_00A008_RESOURCE_LEVEL(1); + fmask_state[3] = + S_00A00C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | + S_00A00C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_W(V_008F1C_SQ_SEL_X) | + S_00A00C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) | + S_00A00C_TYPE(si_tex_dim(screen, tex, target, 0)); + fmask_state[4] = S_00A010_DEPTH(last_layer) | S_00A010_BASE_ARRAY(first_layer); + fmask_state[5] = 0; + fmask_state[6] = S_00A018_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned); + fmask_state[7] = 0; + } } /** * Build the sampler view descriptor for a texture (SI-GFX9). 
*/ -static void -si_make_texture_descriptor(struct si_screen *screen, - struct si_texture *tex, - bool sampler, - enum pipe_texture_target target, - enum pipe_format pipe_format, - const unsigned char state_swizzle[4], - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned width, unsigned height, unsigned depth, - uint32_t *state, - uint32_t *fmask_state) +static void si_make_texture_descriptor(struct si_screen *screen, struct si_texture *tex, + bool sampler, enum pipe_texture_target target, + enum pipe_format pipe_format, + const unsigned char state_swizzle[4], unsigned first_level, + unsigned last_level, unsigned first_layer, + unsigned last_layer, unsigned width, unsigned height, + unsigned depth, uint32_t *state, uint32_t *fmask_state) { - struct pipe_resource *res = &tex->buffer.b.b; - const struct util_format_description *desc; - unsigned char swizzle[4]; - int first_non_void; - unsigned num_format, data_format, type, num_samples; - uint64_t va; - - desc = util_format_description(pipe_format); - - num_samples = desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ? - MAX2(1, res->nr_samples) : - MAX2(1, res->nr_storage_samples); - - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { - const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0}; - const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1}; - const unsigned char swizzle_wwww[4] = {3, 3, 3, 3}; - - switch (pipe_format) { - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - case PIPE_FORMAT_X32_S8X24_UINT: - case PIPE_FORMAT_X8Z24_UNORM: - util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); - break; - case PIPE_FORMAT_X24S8_UINT: - /* - * X24S8 is implemented as an 8_8_8_8 data format, to - * fix texture gathers. This affects at least - * GL45-CTS.texture_cube_map_array.sampling on GFX8. 
- */ - if (screen->info.chip_class <= GFX8) - util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle); - else - util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); - break; - default: - util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle); - } - } else { - util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); - } - - first_non_void = util_format_get_first_non_void_channel(pipe_format); - - switch (pipe_format) { - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - num_format = V_008F14_IMG_NUM_FORMAT_UNORM; - break; - default: - if (first_non_void < 0) { - if (util_format_is_compressed(pipe_format)) { - switch (pipe_format) { - case PIPE_FORMAT_DXT1_SRGB: - case PIPE_FORMAT_DXT1_SRGBA: - case PIPE_FORMAT_DXT3_SRGBA: - case PIPE_FORMAT_DXT5_SRGBA: - case PIPE_FORMAT_BPTC_SRGBA: - case PIPE_FORMAT_ETC2_SRGB8: - case PIPE_FORMAT_ETC2_SRGB8A1: - case PIPE_FORMAT_ETC2_SRGBA8: - num_format = V_008F14_IMG_NUM_FORMAT_SRGB; - break; - case PIPE_FORMAT_RGTC1_SNORM: - case PIPE_FORMAT_LATC1_SNORM: - case PIPE_FORMAT_RGTC2_SNORM: - case PIPE_FORMAT_LATC2_SNORM: - case PIPE_FORMAT_ETC2_R11_SNORM: - case PIPE_FORMAT_ETC2_RG11_SNORM: - /* implies float, so use SNORM/UNORM to determine - whether data is signed or not */ - case PIPE_FORMAT_BPTC_RGB_FLOAT: - num_format = V_008F14_IMG_NUM_FORMAT_SNORM; - break; - default: - num_format = V_008F14_IMG_NUM_FORMAT_UNORM; - break; - } - } else if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { - num_format = V_008F14_IMG_NUM_FORMAT_UNORM; - } else { - num_format = V_008F14_IMG_NUM_FORMAT_FLOAT; - } - } else if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { - num_format = V_008F14_IMG_NUM_FORMAT_SRGB; - } else { - num_format = V_008F14_IMG_NUM_FORMAT_UNORM; - - switch (desc->channel[first_non_void].type) { - case UTIL_FORMAT_TYPE_FLOAT: - num_format = V_008F14_IMG_NUM_FORMAT_FLOAT; - break; - case UTIL_FORMAT_TYPE_SIGNED: - if (desc->channel[first_non_void].normalized) - num_format = V_008F14_IMG_NUM_FORMAT_SNORM; - else if (desc->channel[first_non_void].pure_integer) - num_format = V_008F14_IMG_NUM_FORMAT_SINT; - else - num_format = V_008F14_IMG_NUM_FORMAT_SSCALED; - break; - case UTIL_FORMAT_TYPE_UNSIGNED: - if (desc->channel[first_non_void].normalized) - num_format = V_008F14_IMG_NUM_FORMAT_UNORM; - else if (desc->channel[first_non_void].pure_integer) - num_format = V_008F14_IMG_NUM_FORMAT_UINT; - else - num_format = V_008F14_IMG_NUM_FORMAT_USCALED; - } - } - } - - data_format = si_translate_texformat(&screen->b, pipe_format, desc, first_non_void); - if (data_format == ~0) { - data_format = 0; - } - - /* S8 with Z32 HTILE needs a special format. */ - if (screen->info.chip_class == GFX9 && - pipe_format == PIPE_FORMAT_S8_UINT && - tex->tc_compatible_htile) - data_format = V_008F14_IMG_DATA_FORMAT_S8_32; - - if (!sampler && - (res->target == PIPE_TEXTURE_CUBE || - res->target == PIPE_TEXTURE_CUBE_ARRAY || - (screen->info.chip_class <= GFX8 && - res->target == PIPE_TEXTURE_3D))) { - /* For the purpose of shader images, treat cube maps and 3D - * textures as 2D arrays. For 3D textures, the address - * calculations for mipmaps are different, so we rely on the - * caller to effectively disable mipmaps. 
- */ - type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY; - - assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0)); - } else { - type = si_tex_dim(screen, tex, target, num_samples); - } - - if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { - height = 1; - depth = res->array_size; - } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || - type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { - if (sampler || res->target != PIPE_TEXTURE_3D) - depth = res->array_size; - } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) - depth = res->array_size / 6; - - state[0] = 0; - state[1] = (S_008F14_DATA_FORMAT(data_format) | - S_008F14_NUM_FORMAT(num_format)); - state[2] = (S_008F18_WIDTH(width - 1) | - S_008F18_HEIGHT(height - 1) | - S_008F18_PERF_MOD(4)); - state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) | - S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | - S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | - S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) | - S_008F1C_BASE_LEVEL(num_samples > 1 ? 0 : first_level) | - S_008F1C_LAST_LEVEL(num_samples > 1 ? - util_logbase2(num_samples) : - last_level) | - S_008F1C_TYPE(type)); - state[4] = 0; - state[5] = S_008F24_BASE_ARRAY(first_layer); - state[6] = 0; - state[7] = 0; - - if (screen->info.chip_class == GFX9) { - unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle); - - /* Depth is the the last accessible layer on Gfx9. - * The hw doesn't need to know the total number of layers. - */ - if (type == V_008F1C_SQ_RSRC_IMG_3D) - state[4] |= S_008F20_DEPTH(depth - 1); - else - state[4] |= S_008F20_DEPTH(last_layer); - - state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle); - state[5] |= S_008F24_MAX_MIP(num_samples > 1 ? - util_logbase2(num_samples) : - tex->buffer.b.b.last_level); - } else { - state[3] |= S_008F1C_POW2_PAD(res->last_level > 0); - state[4] |= S_008F20_DEPTH(depth - 1); - state[5] |= S_008F24_LAST_ARRAY(last_layer); - } - - if (tex->surface.dcc_offset) { - state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); - } else { - /* The last dword is unused by hw. The shader uses it to clear - * bits in the first dword of sampler state. - */ - if (screen->info.chip_class <= GFX7 && res->nr_samples <= 1) { - if (first_level == last_level) - state[7] = C_008F30_MAX_ANISO_RATIO; - else - state[7] = 0xffffffff; - } - } - - /* Initialize the sampler view for FMASK. 
*/ - if (tex->surface.fmask_offset) { - uint32_t data_format, num_format; - - va = tex->buffer.gpu_address + tex->surface.fmask_offset; - -#define FMASK(s,f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) - if (screen->info.chip_class == GFX9) { - data_format = V_008F14_IMG_DATA_FORMAT_FMASK; - switch (FMASK(res->nr_samples, res->nr_storage_samples)) { - case FMASK(2,1): - num_format = V_008F14_IMG_FMASK_8_2_1; - break; - case FMASK(2,2): - num_format = V_008F14_IMG_FMASK_8_2_2; - break; - case FMASK(4,1): - num_format = V_008F14_IMG_FMASK_8_4_1; - break; - case FMASK(4,2): - num_format = V_008F14_IMG_FMASK_8_4_2; - break; - case FMASK(4,4): - num_format = V_008F14_IMG_FMASK_8_4_4; - break; - case FMASK(8,1): - num_format = V_008F14_IMG_FMASK_8_8_1; - break; - case FMASK(8,2): - num_format = V_008F14_IMG_FMASK_16_8_2; - break; - case FMASK(8,4): - num_format = V_008F14_IMG_FMASK_32_8_4; - break; - case FMASK(8,8): - num_format = V_008F14_IMG_FMASK_32_8_8; - break; - case FMASK(16,1): - num_format = V_008F14_IMG_FMASK_16_16_1; - break; - case FMASK(16,2): - num_format = V_008F14_IMG_FMASK_32_16_2; - break; - case FMASK(16,4): - num_format = V_008F14_IMG_FMASK_64_16_4; - break; - case FMASK(16,8): - num_format = V_008F14_IMG_FMASK_64_16_8; - break; - default: - unreachable("invalid nr_samples"); - } - } else { - switch (FMASK(res->nr_samples, res->nr_storage_samples)) { - case FMASK(2,1): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1; - break; - case FMASK(2,2): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2; - break; - case FMASK(4,1): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1; - break; - case FMASK(4,2): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2; - break; - case FMASK(4,4): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4; - break; - case FMASK(8,1): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1; - break; - case FMASK(8,2): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2; - break; - case FMASK(8,4): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4; - break; - case FMASK(8,8): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8; - break; - case FMASK(16,1): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1; - break; - case FMASK(16,2): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2; - break; - case FMASK(16,4): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4; - break; - case FMASK(16,8): - data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8; - break; - default: - unreachable("invalid nr_samples"); - } - num_format = V_008F14_IMG_NUM_FORMAT_UINT; - } + struct pipe_resource *res = &tex->buffer.b.b; + const struct util_format_description *desc; + unsigned char swizzle[4]; + int first_non_void; + unsigned num_format, data_format, type, num_samples; + uint64_t va; + + desc = util_format_description(pipe_format); + + num_samples = desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ? MAX2(1, res->nr_samples) + : MAX2(1, res->nr_storage_samples); + + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { + const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0}; + const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1}; + const unsigned char swizzle_wwww[4] = {3, 3, 3, 3}; + + switch (pipe_format) { + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + case PIPE_FORMAT_X32_S8X24_UINT: + case PIPE_FORMAT_X8Z24_UNORM: + util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); + break; + case PIPE_FORMAT_X24S8_UINT: + /* + * X24S8 is implemented as an 8_8_8_8 data format, to + * fix texture gathers. 
This affects at least + * GL45-CTS.texture_cube_map_array.sampling on GFX8. + */ + if (screen->info.chip_class <= GFX8) + util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle); + else + util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle); + break; + default: + util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle); + } + } else { + util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); + } + + first_non_void = util_format_get_first_non_void_channel(pipe_format); + + switch (pipe_format) { + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + break; + default: + if (first_non_void < 0) { + if (util_format_is_compressed(pipe_format)) { + switch (pipe_format) { + case PIPE_FORMAT_DXT1_SRGB: + case PIPE_FORMAT_DXT1_SRGBA: + case PIPE_FORMAT_DXT3_SRGBA: + case PIPE_FORMAT_DXT5_SRGBA: + case PIPE_FORMAT_BPTC_SRGBA: + case PIPE_FORMAT_ETC2_SRGB8: + case PIPE_FORMAT_ETC2_SRGB8A1: + case PIPE_FORMAT_ETC2_SRGBA8: + num_format = V_008F14_IMG_NUM_FORMAT_SRGB; + break; + case PIPE_FORMAT_RGTC1_SNORM: + case PIPE_FORMAT_LATC1_SNORM: + case PIPE_FORMAT_RGTC2_SNORM: + case PIPE_FORMAT_LATC2_SNORM: + case PIPE_FORMAT_ETC2_R11_SNORM: + case PIPE_FORMAT_ETC2_RG11_SNORM: + /* implies float, so use SNORM/UNORM to determine + whether data is signed or not */ + case PIPE_FORMAT_BPTC_RGB_FLOAT: + num_format = V_008F14_IMG_NUM_FORMAT_SNORM; + break; + default: + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + break; + } + } else if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + } else { + num_format = V_008F14_IMG_NUM_FORMAT_FLOAT; + } + } else if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { + num_format = V_008F14_IMG_NUM_FORMAT_SRGB; + } else { + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + + switch (desc->channel[first_non_void].type) { + case UTIL_FORMAT_TYPE_FLOAT: + num_format = V_008F14_IMG_NUM_FORMAT_FLOAT; + break; + case UTIL_FORMAT_TYPE_SIGNED: + if (desc->channel[first_non_void].normalized) + num_format = V_008F14_IMG_NUM_FORMAT_SNORM; + else if (desc->channel[first_non_void].pure_integer) + num_format = V_008F14_IMG_NUM_FORMAT_SINT; + else + num_format = V_008F14_IMG_NUM_FORMAT_SSCALED; + break; + case UTIL_FORMAT_TYPE_UNSIGNED: + if (desc->channel[first_non_void].normalized) + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + else if (desc->channel[first_non_void].pure_integer) + num_format = V_008F14_IMG_NUM_FORMAT_UINT; + else + num_format = V_008F14_IMG_NUM_FORMAT_USCALED; + } + } + } + + data_format = si_translate_texformat(&screen->b, pipe_format, desc, first_non_void); + if (data_format == ~0) { + data_format = 0; + } + + /* S8 with Z32 HTILE needs a special format. */ + if (screen->info.chip_class == GFX9 && pipe_format == PIPE_FORMAT_S8_UINT && + tex->tc_compatible_htile) + data_format = V_008F14_IMG_DATA_FORMAT_S8_32; + + if (!sampler && (res->target == PIPE_TEXTURE_CUBE || res->target == PIPE_TEXTURE_CUBE_ARRAY || + (screen->info.chip_class <= GFX8 && res->target == PIPE_TEXTURE_3D))) { + /* For the purpose of shader images, treat cube maps and 3D + * textures as 2D arrays. For 3D textures, the address + * calculations for mipmaps are different, so we rely on the + * caller to effectively disable mipmaps. 
+ */ + type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY; + + assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0)); + } else { + type = si_tex_dim(screen, tex, target, num_samples); + } + + if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { + height = 1; + depth = res->array_size; + } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { + if (sampler || res->target != PIPE_TEXTURE_3D) + depth = res->array_size; + } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) + depth = res->array_size / 6; + + state[0] = 0; + state[1] = (S_008F14_DATA_FORMAT(data_format) | S_008F14_NUM_FORMAT(num_format)); + state[2] = (S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1) | S_008F18_PERF_MOD(4)); + state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) | + S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | + S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | + S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) | + S_008F1C_BASE_LEVEL(num_samples > 1 ? 0 : first_level) | + S_008F1C_LAST_LEVEL(num_samples > 1 ? util_logbase2(num_samples) : last_level) | + S_008F1C_TYPE(type)); + state[4] = 0; + state[5] = S_008F24_BASE_ARRAY(first_layer); + state[6] = 0; + state[7] = 0; + + if (screen->info.chip_class == GFX9) { + unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle); + + /* Depth is the the last accessible layer on Gfx9. + * The hw doesn't need to know the total number of layers. + */ + if (type == V_008F1C_SQ_RSRC_IMG_3D) + state[4] |= S_008F20_DEPTH(depth - 1); + else + state[4] |= S_008F20_DEPTH(last_layer); + + state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle); + state[5] |= S_008F24_MAX_MIP(num_samples > 1 ? util_logbase2(num_samples) + : tex->buffer.b.b.last_level); + } else { + state[3] |= S_008F1C_POW2_PAD(res->last_level > 0); + state[4] |= S_008F20_DEPTH(depth - 1); + state[5] |= S_008F24_LAST_ARRAY(last_layer); + } + + if (tex->surface.dcc_offset) { + state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); + } else { + /* The last dword is unused by hw. The shader uses it to clear + * bits in the first dword of sampler state. + */ + if (screen->info.chip_class <= GFX7 && res->nr_samples <= 1) { + if (first_level == last_level) + state[7] = C_008F30_MAX_ANISO_RATIO; + else + state[7] = 0xffffffff; + } + } + + /* Initialize the sampler view for FMASK. 
*/ + if (tex->surface.fmask_offset) { + uint32_t data_format, num_format; + + va = tex->buffer.gpu_address + tex->surface.fmask_offset; + +#define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) + if (screen->info.chip_class == GFX9) { + data_format = V_008F14_IMG_DATA_FORMAT_FMASK; + switch (FMASK(res->nr_samples, res->nr_storage_samples)) { + case FMASK(2, 1): + num_format = V_008F14_IMG_FMASK_8_2_1; + break; + case FMASK(2, 2): + num_format = V_008F14_IMG_FMASK_8_2_2; + break; + case FMASK(4, 1): + num_format = V_008F14_IMG_FMASK_8_4_1; + break; + case FMASK(4, 2): + num_format = V_008F14_IMG_FMASK_8_4_2; + break; + case FMASK(4, 4): + num_format = V_008F14_IMG_FMASK_8_4_4; + break; + case FMASK(8, 1): + num_format = V_008F14_IMG_FMASK_8_8_1; + break; + case FMASK(8, 2): + num_format = V_008F14_IMG_FMASK_16_8_2; + break; + case FMASK(8, 4): + num_format = V_008F14_IMG_FMASK_32_8_4; + break; + case FMASK(8, 8): + num_format = V_008F14_IMG_FMASK_32_8_8; + break; + case FMASK(16, 1): + num_format = V_008F14_IMG_FMASK_16_16_1; + break; + case FMASK(16, 2): + num_format = V_008F14_IMG_FMASK_32_16_2; + break; + case FMASK(16, 4): + num_format = V_008F14_IMG_FMASK_64_16_4; + break; + case FMASK(16, 8): + num_format = V_008F14_IMG_FMASK_64_16_8; + break; + default: + unreachable("invalid nr_samples"); + } + } else { + switch (FMASK(res->nr_samples, res->nr_storage_samples)) { + case FMASK(2, 1): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1; + break; + case FMASK(2, 2): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2; + break; + case FMASK(4, 1): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1; + break; + case FMASK(4, 2): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2; + break; + case FMASK(4, 4): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4; + break; + case FMASK(8, 1): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1; + break; + case FMASK(8, 2): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2; + break; + case FMASK(8, 4): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4; + break; + case FMASK(8, 8): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8; + break; + case FMASK(16, 1): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1; + break; + case FMASK(16, 2): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2; + break; + case FMASK(16, 4): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4; + break; + case FMASK(16, 8): + data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8; + break; + default: + unreachable("invalid nr_samples"); + } + num_format = V_008F14_IMG_NUM_FORMAT_UINT; + } #undef FMASK - fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle; - fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) | - S_008F14_DATA_FORMAT(data_format) | - S_008F14_NUM_FORMAT(num_format); - fmask_state[2] = S_008F18_WIDTH(width - 1) | - S_008F18_HEIGHT(height - 1); - fmask_state[3] = S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | - S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | - S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | - S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) | - S_008F1C_TYPE(si_tex_dim(screen, tex, target, 0)); - fmask_state[4] = 0; - fmask_state[5] = S_008F24_BASE_ARRAY(first_layer); - fmask_state[6] = 0; - fmask_state[7] = 0; - - if (screen->info.chip_class == GFX9) { - fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode); - fmask_state[4] |= S_008F20_DEPTH(last_layer) | - S_008F20_PITCH(tex->surface.u.gfx9.fmask.epitch); - fmask_state[5] |= 
S_008F24_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) | - S_008F24_META_RB_ALIGNED(tex->surface.u.gfx9.cmask.rb_aligned); - } else { - fmask_state[3] |= S_008F1C_TILING_INDEX(tex->surface.u.legacy.fmask.tiling_index); - fmask_state[4] |= S_008F20_DEPTH(depth - 1) | - S_008F20_PITCH(tex->surface.u.legacy.fmask.pitch_in_pixels - 1); - fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer); - } - } + fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle; + fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) | S_008F14_DATA_FORMAT(data_format) | + S_008F14_NUM_FORMAT(num_format); + fmask_state[2] = S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1); + fmask_state[3] = + S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) | + S_008F1C_TYPE(si_tex_dim(screen, tex, target, 0)); + fmask_state[4] = 0; + fmask_state[5] = S_008F24_BASE_ARRAY(first_layer); + fmask_state[6] = 0; + fmask_state[7] = 0; + + if (screen->info.chip_class == GFX9) { + fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode); + fmask_state[4] |= + S_008F20_DEPTH(last_layer) | S_008F20_PITCH(tex->surface.u.gfx9.fmask.epitch); + fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) | + S_008F24_META_RB_ALIGNED(tex->surface.u.gfx9.cmask.rb_aligned); + } else { + fmask_state[3] |= S_008F1C_TILING_INDEX(tex->surface.u.legacy.fmask.tiling_index); + fmask_state[4] |= S_008F20_DEPTH(depth - 1) | + S_008F20_PITCH(tex->surface.u.legacy.fmask.pitch_in_pixels - 1); + fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer); + } + } } /** @@ -4442,1282 +4199,1195 @@ si_make_texture_descriptor(struct si_screen *screen, * @param height0 height0 override (for compressed textures as int) * @param force_level set the base address to the level (for compressed textures) */ -struct pipe_sampler_view * -si_create_sampler_view_custom(struct pipe_context *ctx, - struct pipe_resource *texture, - const struct pipe_sampler_view *state, - unsigned width0, unsigned height0, - unsigned force_level) +struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *state, + unsigned width0, unsigned height0, + unsigned force_level) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); - struct si_texture *tex = (struct si_texture*)texture; - unsigned base_level, first_level, last_level; - unsigned char state_swizzle[4]; - unsigned height, depth, width; - unsigned last_layer = state->u.tex.last_layer; - enum pipe_format pipe_format; - const struct legacy_surf_level *surflevel; - - if (!view) - return NULL; - - /* initialize base object */ - view->base = *state; - view->base.texture = NULL; - view->base.reference.count = 1; - view->base.context = ctx; - - assert(texture); - pipe_resource_reference(&view->base.texture, texture); - - if (state->format == PIPE_FORMAT_X24S8_UINT || - state->format == PIPE_FORMAT_S8X24_UINT || - state->format == PIPE_FORMAT_X32_S8X24_UINT || - state->format == PIPE_FORMAT_S8_UINT) - view->is_stencil_sampler = true; - - /* Buffer resource. 
*/ - if (texture->target == PIPE_BUFFER) { - si_make_buffer_descriptor(sctx->screen, - si_resource(texture), - state->format, - state->u.buf.offset, - state->u.buf.size, - view->state); - return &view->base; - } - - state_swizzle[0] = state->swizzle_r; - state_swizzle[1] = state->swizzle_g; - state_swizzle[2] = state->swizzle_b; - state_swizzle[3] = state->swizzle_a; - - base_level = 0; - first_level = state->u.tex.first_level; - last_level = state->u.tex.last_level; - width = width0; - height = height0; - depth = texture->depth0; - - if (sctx->chip_class <= GFX8 && force_level) { - assert(force_level == first_level && - force_level == last_level); - base_level = force_level; - first_level = 0; - last_level = 0; - width = u_minify(width, force_level); - height = u_minify(height, force_level); - depth = u_minify(depth, force_level); - } - - /* This is not needed if state trackers set last_layer correctly. */ - if (state->target == PIPE_TEXTURE_1D || - state->target == PIPE_TEXTURE_2D || - state->target == PIPE_TEXTURE_RECT || - state->target == PIPE_TEXTURE_CUBE) - last_layer = state->u.tex.first_layer; - - /* Texturing with separate depth and stencil. */ - pipe_format = state->format; - - /* Depth/stencil texturing sometimes needs separate texture. */ - if (tex->is_depth && !si_can_sample_zs(tex, view->is_stencil_sampler)) { - if (!tex->flushed_depth_texture && - !si_init_flushed_depth_texture(ctx, texture)) { - pipe_resource_reference(&view->base.texture, NULL); - FREE(view); - return NULL; - } - - assert(tex->flushed_depth_texture); - - /* Override format for the case where the flushed texture - * contains only Z or only S. - */ - if (tex->flushed_depth_texture->buffer.b.b.format != tex->buffer.b.b.format) - pipe_format = tex->flushed_depth_texture->buffer.b.b.format; - - tex = tex->flushed_depth_texture; - } - - surflevel = tex->surface.u.legacy.level; - - if (tex->db_compatible) { - if (!view->is_stencil_sampler) - pipe_format = tex->db_render_format; - - switch (pipe_format) { - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - pipe_format = PIPE_FORMAT_Z32_FLOAT; - break; - case PIPE_FORMAT_X8Z24_UNORM: - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - /* Z24 is always stored like this for DB - * compatibility. 
- */ - pipe_format = PIPE_FORMAT_Z24X8_UNORM; - break; - case PIPE_FORMAT_X24S8_UINT: - case PIPE_FORMAT_S8X24_UINT: - case PIPE_FORMAT_X32_S8X24_UINT: - pipe_format = PIPE_FORMAT_S8_UINT; - surflevel = tex->surface.u.legacy.stencil_level; - break; - default:; - } - } - - view->dcc_incompatible = - vi_dcc_formats_are_incompatible(texture, - state->u.tex.first_level, - state->format); - - sctx->screen->make_texture_descriptor(sctx->screen, tex, true, - state->target, pipe_format, state_swizzle, - first_level, last_level, - state->u.tex.first_layer, last_layer, - width, height, depth, - view->state, view->fmask_state); - - const struct util_format_description *desc = util_format_description(pipe_format); - view->is_integer = false; - - for (unsigned i = 0; i < desc->nr_channels; ++i) { - if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) - continue; - - /* Whether the number format is {U,S}{SCALED,INT} */ - view->is_integer = - (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED || - desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) && - (desc->channel[i].pure_integer || !desc->channel[i].normalized); - break; - } - - view->base_level_info = &surflevel[base_level]; - view->base_level = base_level; - view->block_width = util_format_get_blockwidth(pipe_format); - return &view->base; + struct si_context *sctx = (struct si_context *)ctx; + struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); + struct si_texture *tex = (struct si_texture *)texture; + unsigned base_level, first_level, last_level; + unsigned char state_swizzle[4]; + unsigned height, depth, width; + unsigned last_layer = state->u.tex.last_layer; + enum pipe_format pipe_format; + const struct legacy_surf_level *surflevel; + + if (!view) + return NULL; + + /* initialize base object */ + view->base = *state; + view->base.texture = NULL; + view->base.reference.count = 1; + view->base.context = ctx; + + assert(texture); + pipe_resource_reference(&view->base.texture, texture); + + if (state->format == PIPE_FORMAT_X24S8_UINT || state->format == PIPE_FORMAT_S8X24_UINT || + state->format == PIPE_FORMAT_X32_S8X24_UINT || state->format == PIPE_FORMAT_S8_UINT) + view->is_stencil_sampler = true; + + /* Buffer resource. */ + if (texture->target == PIPE_BUFFER) { + si_make_buffer_descriptor(sctx->screen, si_resource(texture), state->format, + state->u.buf.offset, state->u.buf.size, view->state); + return &view->base; + } + + state_swizzle[0] = state->swizzle_r; + state_swizzle[1] = state->swizzle_g; + state_swizzle[2] = state->swizzle_b; + state_swizzle[3] = state->swizzle_a; + + base_level = 0; + first_level = state->u.tex.first_level; + last_level = state->u.tex.last_level; + width = width0; + height = height0; + depth = texture->depth0; + + if (sctx->chip_class <= GFX8 && force_level) { + assert(force_level == first_level && force_level == last_level); + base_level = force_level; + first_level = 0; + last_level = 0; + width = u_minify(width, force_level); + height = u_minify(height, force_level); + depth = u_minify(depth, force_level); + } + + /* This is not needed if state trackers set last_layer correctly. */ + if (state->target == PIPE_TEXTURE_1D || state->target == PIPE_TEXTURE_2D || + state->target == PIPE_TEXTURE_RECT || state->target == PIPE_TEXTURE_CUBE) + last_layer = state->u.tex.first_layer; + + /* Texturing with separate depth and stencil. */ + pipe_format = state->format; + + /* Depth/stencil texturing sometimes needs separate texture. 
*/ + if (tex->is_depth && !si_can_sample_zs(tex, view->is_stencil_sampler)) { + if (!tex->flushed_depth_texture && !si_init_flushed_depth_texture(ctx, texture)) { + pipe_resource_reference(&view->base.texture, NULL); + FREE(view); + return NULL; + } + + assert(tex->flushed_depth_texture); + + /* Override format for the case where the flushed texture + * contains only Z or only S. + */ + if (tex->flushed_depth_texture->buffer.b.b.format != tex->buffer.b.b.format) + pipe_format = tex->flushed_depth_texture->buffer.b.b.format; + + tex = tex->flushed_depth_texture; + } + + surflevel = tex->surface.u.legacy.level; + + if (tex->db_compatible) { + if (!view->is_stencil_sampler) + pipe_format = tex->db_render_format; + + switch (pipe_format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + pipe_format = PIPE_FORMAT_Z32_FLOAT; + break; + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + /* Z24 is always stored like this for DB + * compatibility. + */ + pipe_format = PIPE_FORMAT_Z24X8_UNORM; + break; + case PIPE_FORMAT_X24S8_UINT: + case PIPE_FORMAT_S8X24_UINT: + case PIPE_FORMAT_X32_S8X24_UINT: + pipe_format = PIPE_FORMAT_S8_UINT; + surflevel = tex->surface.u.legacy.stencil_level; + break; + default:; + } + } + + view->dcc_incompatible = + vi_dcc_formats_are_incompatible(texture, state->u.tex.first_level, state->format); + + sctx->screen->make_texture_descriptor( + sctx->screen, tex, true, state->target, pipe_format, state_swizzle, first_level, last_level, + state->u.tex.first_layer, last_layer, width, height, depth, view->state, view->fmask_state); + + const struct util_format_description *desc = util_format_description(pipe_format); + view->is_integer = false; + + for (unsigned i = 0; i < desc->nr_channels; ++i) { + if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) + continue; + + /* Whether the number format is {U,S}{SCALED,INT} */ + view->is_integer = (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED || + desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) && + (desc->channel[i].pure_integer || !desc->channel[i].normalized); + break; + } + + view->base_level_info = &surflevel[base_level]; + view->base_level = base_level; + view->block_width = util_format_get_blockwidth(pipe_format); + return &view->base; } -static struct pipe_sampler_view * -si_create_sampler_view(struct pipe_context *ctx, - struct pipe_resource *texture, - const struct pipe_sampler_view *state) +static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *state) { - return si_create_sampler_view_custom(ctx, texture, state, - texture ? texture->width0 : 0, - texture ? texture->height0 : 0, 0); + return si_create_sampler_view_custom(ctx, texture, state, texture ? texture->width0 : 0, + texture ? 
texture->height0 : 0, 0); } -static void si_sampler_view_destroy(struct pipe_context *ctx, - struct pipe_sampler_view *state) +static void si_sampler_view_destroy(struct pipe_context *ctx, struct pipe_sampler_view *state) { - struct si_sampler_view *view = (struct si_sampler_view *)state; + struct si_sampler_view *view = (struct si_sampler_view *)state; - pipe_resource_reference(&state->texture, NULL); - FREE(view); + pipe_resource_reference(&state->texture, NULL); + FREE(view); } static bool wrap_mode_uses_border_color(unsigned wrap, bool linear_filter) { - return wrap == PIPE_TEX_WRAP_CLAMP_TO_BORDER || - wrap == PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER || - (linear_filter && - (wrap == PIPE_TEX_WRAP_CLAMP || - wrap == PIPE_TEX_WRAP_MIRROR_CLAMP)); + return wrap == PIPE_TEX_WRAP_CLAMP_TO_BORDER || wrap == PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER || + (linear_filter && (wrap == PIPE_TEX_WRAP_CLAMP || wrap == PIPE_TEX_WRAP_MIRROR_CLAMP)); } static uint32_t si_translate_border_color(struct si_context *sctx, - const struct pipe_sampler_state *state, - const union pipe_color_union *color, - bool is_integer) + const struct pipe_sampler_state *state, + const union pipe_color_union *color, bool is_integer) { - bool linear_filter = state->min_img_filter != PIPE_TEX_FILTER_NEAREST || - state->mag_img_filter != PIPE_TEX_FILTER_NEAREST; - - if (!wrap_mode_uses_border_color(state->wrap_s, linear_filter) && - !wrap_mode_uses_border_color(state->wrap_t, linear_filter) && - !wrap_mode_uses_border_color(state->wrap_r, linear_filter)) - return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); - -#define simple_border_types(elt) \ -do { \ - if (color->elt[0] == 0 && color->elt[1] == 0 && \ - color->elt[2] == 0 && color->elt[3] == 0) \ - return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); \ - if (color->elt[0] == 0 && color->elt[1] == 0 && \ - color->elt[2] == 0 && color->elt[3] == 1) \ - return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK); \ - if (color->elt[0] == 1 && color->elt[1] == 1 && \ - color->elt[2] == 1 && color->elt[3] == 1) \ - return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE); \ -} while (false) - - if (is_integer) - simple_border_types(ui); - else - simple_border_types(f); + bool linear_filter = state->min_img_filter != PIPE_TEX_FILTER_NEAREST || + state->mag_img_filter != PIPE_TEX_FILTER_NEAREST; + + if (!wrap_mode_uses_border_color(state->wrap_s, linear_filter) && + !wrap_mode_uses_border_color(state->wrap_t, linear_filter) && + !wrap_mode_uses_border_color(state->wrap_r, linear_filter)) + return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); + +#define simple_border_types(elt) \ + do { \ + if (color->elt[0] == 0 && color->elt[1] == 0 && color->elt[2] == 0 && color->elt[3] == 0) \ + return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); \ + if (color->elt[0] == 0 && color->elt[1] == 0 && color->elt[2] == 0 && color->elt[3] == 1) \ + return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK); \ + if (color->elt[0] == 1 && color->elt[1] == 1 && color->elt[2] == 1 && color->elt[3] == 1) \ + return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE); \ + } while (false) + + if (is_integer) + simple_border_types(ui); + else + simple_border_types(f); #undef simple_border_types - int i; - - /* Check if the border has been uploaded already. 
*/ - for (i = 0; i < sctx->border_color_count; i++) - if (memcmp(&sctx->border_color_table[i], color, - sizeof(*color)) == 0) - break; - - if (i >= SI_MAX_BORDER_COLORS) { - /* Getting 4096 unique border colors is very unlikely. */ - fprintf(stderr, "radeonsi: The border color table is full. " - "Any new border colors will be just black. " - "Please file a bug.\n"); - return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); - } - - if (i == sctx->border_color_count) { - /* Upload a new border color. */ - memcpy(&sctx->border_color_table[i], color, - sizeof(*color)); - util_memcpy_cpu_to_le32(&sctx->border_color_map[i], - color, sizeof(*color)); - sctx->border_color_count++; - } - - return S_008F3C_BORDER_COLOR_PTR(i) | - S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER); + int i; + + /* Check if the border has been uploaded already. */ + for (i = 0; i < sctx->border_color_count; i++) + if (memcmp(&sctx->border_color_table[i], color, sizeof(*color)) == 0) + break; + + if (i >= SI_MAX_BORDER_COLORS) { + /* Getting 4096 unique border colors is very unlikely. */ + fprintf(stderr, "radeonsi: The border color table is full. " + "Any new border colors will be just black. " + "Please file a bug.\n"); + return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); + } + + if (i == sctx->border_color_count) { + /* Upload a new border color. */ + memcpy(&sctx->border_color_table[i], color, sizeof(*color)); + util_memcpy_cpu_to_le32(&sctx->border_color_map[i], color, sizeof(*color)); + sctx->border_color_count++; + } + + return S_008F3C_BORDER_COLOR_PTR(i) | + S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER); } static inline int S_FIXED(float value, unsigned frac_bits) { - return value * (1 << frac_bits); + return value * (1 << frac_bits); } static inline unsigned si_tex_filter(unsigned filter, unsigned max_aniso) { - if (filter == PIPE_TEX_FILTER_LINEAR) - return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR - : V_008F38_SQ_TEX_XY_FILTER_BILINEAR; - else - return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT - : V_008F38_SQ_TEX_XY_FILTER_POINT; + if (filter == PIPE_TEX_FILTER_LINEAR) + return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR + : V_008F38_SQ_TEX_XY_FILTER_BILINEAR; + else + return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT + : V_008F38_SQ_TEX_XY_FILTER_POINT; } static inline unsigned si_tex_aniso_filter(unsigned filter) { - if (filter < 2) - return 0; - if (filter < 4) - return 1; - if (filter < 8) - return 2; - if (filter < 16) - return 3; - return 4; + if (filter < 2) + return 0; + if (filter < 4) + return 1; + if (filter < 8) + return 2; + if (filter < 16) + return 3; + return 4; } static void *si_create_sampler_state(struct pipe_context *ctx, - const struct pipe_sampler_state *state) + const struct pipe_sampler_state *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_screen *sscreen = sctx->screen; - struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state); - unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso - : state->max_anisotropy; - unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso); - union pipe_color_union clamped_border_color; - - if (!rstate) { - return NULL; - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_screen *sscreen = sctx->screen; + struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state); + unsigned max_aniso = sscreen->force_aniso >= 0 ? 
sscreen->force_aniso : state->max_anisotropy; + unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso); + union pipe_color_union clamped_border_color; + + if (!rstate) { + return NULL; + } #ifndef NDEBUG - rstate->magic = SI_SAMPLER_STATE_MAGIC; + rstate->magic = SI_SAMPLER_STATE_MAGIC; #endif - rstate->val[0] = (S_008F30_CLAMP_X(si_tex_wrap(state->wrap_s)) | - S_008F30_CLAMP_Y(si_tex_wrap(state->wrap_t)) | - S_008F30_CLAMP_Z(si_tex_wrap(state->wrap_r)) | - S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) | - S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_func)) | - S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) | - S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) | - S_008F30_ANISO_BIAS(max_aniso_ratio) | - S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) | - S_008F30_COMPAT_MODE(sctx->chip_class == GFX8 || sctx->chip_class == GFX9)); - rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) | - S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) | - S_008F34_PERF_MIP(max_aniso_ratio ? max_aniso_ratio + 6 : 0)); - rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) | - S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) | - S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter, max_aniso)) | - S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) | - S_008F38_MIP_POINT_PRECLAMP(0)); - rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color, false); - - if (sscreen->info.chip_class >= GFX10) { - rstate->val[2] |= S_008F38_ANISO_OVERRIDE_GFX10(1); - } else { - rstate->val[2] |= S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= GFX8) | - S_008F38_FILTER_PREC_FIX(1) | - S_008F38_ANISO_OVERRIDE_GFX6(sctx->chip_class >= GFX8); - } - - /* Create sampler resource for integer textures. */ - memcpy(rstate->integer_val, rstate->val, sizeof(rstate->val)); - rstate->integer_val[3] = si_translate_border_color(sctx, state, &state->border_color, true); - - /* Create sampler resource for upgraded depth textures. */ - memcpy(rstate->upgraded_depth_val, rstate->val, sizeof(rstate->val)); - - for (unsigned i = 0; i < 4; ++i) { - /* Use channel 0 on purpose, so that we can use OPAQUE_WHITE - * when the border color is 1.0. */ - clamped_border_color.f[i] = CLAMP(state->border_color.f[0], 0, 1); - } - - if (memcmp(&state->border_color, &clamped_border_color, sizeof(clamped_border_color)) == 0) { - if (sscreen->info.chip_class <= GFX9) - rstate->upgraded_depth_val[3] |= S_008F3C_UPGRADED_DEPTH(1); - } else { - rstate->upgraded_depth_val[3] = - si_translate_border_color(sctx, state, &clamped_border_color, false); - } - - return rstate; + rstate->val[0] = + (S_008F30_CLAMP_X(si_tex_wrap(state->wrap_s)) | S_008F30_CLAMP_Y(si_tex_wrap(state->wrap_t)) | + S_008F30_CLAMP_Z(si_tex_wrap(state->wrap_r)) | S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) | + S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_func)) | + S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) | + S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) | S_008F30_ANISO_BIAS(max_aniso_ratio) | + S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) | + S_008F30_COMPAT_MODE(sctx->chip_class == GFX8 || sctx->chip_class == GFX9)); + rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) | + S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) | + S_008F34_PERF_MIP(max_aniso_ratio ? 
max_aniso_ratio + 6 : 0)); + rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) | + S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) | + S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter, max_aniso)) | + S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) | + S_008F38_MIP_POINT_PRECLAMP(0)); + rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color, false); + + if (sscreen->info.chip_class >= GFX10) { + rstate->val[2] |= S_008F38_ANISO_OVERRIDE_GFX10(1); + } else { + rstate->val[2] |= S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= GFX8) | + S_008F38_FILTER_PREC_FIX(1) | + S_008F38_ANISO_OVERRIDE_GFX6(sctx->chip_class >= GFX8); + } + + /* Create sampler resource for integer textures. */ + memcpy(rstate->integer_val, rstate->val, sizeof(rstate->val)); + rstate->integer_val[3] = si_translate_border_color(sctx, state, &state->border_color, true); + + /* Create sampler resource for upgraded depth textures. */ + memcpy(rstate->upgraded_depth_val, rstate->val, sizeof(rstate->val)); + + for (unsigned i = 0; i < 4; ++i) { + /* Use channel 0 on purpose, so that we can use OPAQUE_WHITE + * when the border color is 1.0. */ + clamped_border_color.f[i] = CLAMP(state->border_color.f[0], 0, 1); + } + + if (memcmp(&state->border_color, &clamped_border_color, sizeof(clamped_border_color)) == 0) { + if (sscreen->info.chip_class <= GFX9) + rstate->upgraded_depth_val[3] |= S_008F3C_UPGRADED_DEPTH(1); + } else { + rstate->upgraded_depth_val[3] = + si_translate_border_color(sctx, state, &clamped_border_color, false); + } + + return rstate; } static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - if (sctx->sample_mask == (uint16_t)sample_mask) - return; + if (sctx->sample_mask == (uint16_t)sample_mask) + return; - sctx->sample_mask = sample_mask; - si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_mask); + sctx->sample_mask = sample_mask; + si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_mask); } static void si_emit_sample_mask(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned mask = sctx->sample_mask; - - /* Needed for line and polygon smoothing as well as for the Polaris - * small primitive filter. We expect the state tracker to take care of - * this for us. - */ - assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 || - (mask & 1 && sctx->blitter->running)); - - radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); - radeon_emit(cs, mask | (mask << 16)); - radeon_emit(cs, mask | (mask << 16)); + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned mask = sctx->sample_mask; + + /* Needed for line and polygon smoothing as well as for the Polaris + * small primitive filter. We expect the state tracker to take care of + * this for us. 
+ */ + assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 || + (mask & 1 && sctx->blitter->running)); + + radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); + radeon_emit(cs, mask | (mask << 16)); + radeon_emit(cs, mask | (mask << 16)); } static void si_delete_sampler_state(struct pipe_context *ctx, void *state) { #ifndef NDEBUG - struct si_sampler_state *s = state; + struct si_sampler_state *s = state; - assert(s->magic == SI_SAMPLER_STATE_MAGIC); - s->magic = 0; + assert(s->magic == SI_SAMPLER_STATE_MAGIC); + s->magic = 0; #endif - free(state); + free(state); } /* * Vertex elements & buffers */ -struct si_fast_udiv_info32 -si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits) +struct si_fast_udiv_info32 si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits) { - struct util_fast_udiv_info info = - util_compute_fast_udiv_info(D, num_bits, 32); - - struct si_fast_udiv_info32 result = { - info.multiplier, - info.pre_shift, - info.post_shift, - info.increment, - }; - return result; + struct util_fast_udiv_info info = util_compute_fast_udiv_info(D, num_bits, 32); + + struct si_fast_udiv_info32 result = { + info.multiplier, + info.pre_shift, + info.post_shift, + info.increment, + }; + return result; } -static void *si_create_vertex_elements(struct pipe_context *ctx, - unsigned count, - const struct pipe_vertex_element *elements) +static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, + const struct pipe_vertex_element *elements) { - struct si_screen *sscreen = (struct si_screen*)ctx->screen; - struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements); - bool used[SI_NUM_VERTEX_BUFFERS] = {}; - struct si_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {}; - STATIC_ASSERT(sizeof(struct si_fast_udiv_info32) == 16); - STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4); - STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4); - STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4); - STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4); - int i; - - assert(count <= SI_MAX_ATTRIBS); - if (!v) - return NULL; - - v->count = count; - - unsigned alloc_count = count > sscreen->num_vbos_in_user_sgprs ? - count - sscreen->num_vbos_in_user_sgprs : 0; - v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT); - - for (i = 0; i < count; ++i) { - const struct util_format_description *desc; - const struct util_format_channel_description *channel; - int first_non_void; - unsigned vbo_index = elements[i].vertex_buffer_index; - - if (vbo_index >= SI_NUM_VERTEX_BUFFERS) { - FREE(v); - return NULL; - } - - unsigned instance_divisor = elements[i].instance_divisor; - if (instance_divisor) { - v->uses_instance_divisors = true; - - if (instance_divisor == 1) { - v->instance_divisor_is_one |= 1u << i; - } else { - v->instance_divisor_is_fetched |= 1u << i; - divisor_factors[i] = - si_compute_fast_udiv_info32(instance_divisor, 32); - } - } - - if (!used[vbo_index]) { - v->first_vb_use_mask |= 1 << i; - used[vbo_index] = true; - } - - desc = util_format_description(elements[i].src_format); - first_non_void = util_format_get_first_non_void_channel(elements[i].src_format); - channel = first_non_void >= 0 ? 
&desc->channel[first_non_void] : NULL; - - v->format_size[i] = desc->block.bits / 8; - v->src_offset[i] = elements[i].src_offset; - v->vertex_buffer_index[i] = vbo_index; - - bool always_fix = false; - union si_vs_fix_fetch fix_fetch; - unsigned log_hw_load_size; /* the load element size as seen by the hardware */ - - fix_fetch.bits = 0; - log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3); - - if (channel) { - switch (channel->type) { - case UTIL_FORMAT_TYPE_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break; - case UTIL_FORMAT_TYPE_FIXED: fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; break; - case UTIL_FORMAT_TYPE_SIGNED: { - if (channel->pure_integer) - fix_fetch.u.format = AC_FETCH_FORMAT_SINT; - else if (channel->normalized) - fix_fetch.u.format = AC_FETCH_FORMAT_SNORM; - else - fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED; - break; - } - case UTIL_FORMAT_TYPE_UNSIGNED: { - if (channel->pure_integer) - fix_fetch.u.format = AC_FETCH_FORMAT_UINT; - else if (channel->normalized) - fix_fetch.u.format = AC_FETCH_FORMAT_UNORM; - else - fix_fetch.u.format = AC_FETCH_FORMAT_USCALED; - break; - } - default: unreachable("bad format type"); - } - } else { - switch (elements[i].src_format) { - case PIPE_FORMAT_R11G11B10_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break; - default: unreachable("bad other format"); - } - } - - if (desc->channel[0].size == 10) { - fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */ - log_hw_load_size = 2; - - /* The hardware always treats the 2-bit alpha channel as - * unsigned, so a shader workaround is needed. The affected - * chips are GFX8 and older except Stoney (GFX8.1). - */ - always_fix = sscreen->info.chip_class <= GFX8 && - sscreen->info.family != CHIP_STONEY && - channel->type == UTIL_FORMAT_TYPE_SIGNED; - } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) { - fix_fetch.u.log_size = 3; /* special encoding */ - fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; - log_hw_load_size = 2; - } else { - fix_fetch.u.log_size = util_logbase2(channel->size) - 3; - fix_fetch.u.num_channels_m1 = desc->nr_channels - 1; - - /* Always fix up: - * - doubles (multiple loads + truncate to float) - * - 32-bit requiring a conversion - */ - always_fix = - (fix_fetch.u.log_size == 3) || - (fix_fetch.u.log_size == 2 && - fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT && - fix_fetch.u.format != AC_FETCH_FORMAT_UINT && - fix_fetch.u.format != AC_FETCH_FORMAT_SINT); - - /* Also fixup 8_8_8 and 16_16_16. */ - if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) { - always_fix = true; - log_hw_load_size = fix_fetch.u.log_size; - } - } - - if (desc->swizzle[0] != PIPE_SWIZZLE_X) { - assert(desc->swizzle[0] == PIPE_SWIZZLE_Z && - (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0)); - fix_fetch.u.reverse = 1; - } - - /* Force the workaround for unaligned access here already if the - * offset relative to the vertex buffer base is unaligned. - * - * There is a theoretical case in which this is too conservative: - * if the vertex buffer's offset is also unaligned in just the - * right way, we end up with an aligned address after all. - * However, this case should be extremely rare in practice (it - * won't happen in well-behaved applications), and taking it - * into account would complicate the fast path (where everything - * is nicely aligned). 
- */ - bool check_alignment = - log_hw_load_size >= 1 && - (sscreen->info.chip_class == GFX6 || sscreen->info.chip_class == GFX10); - bool opencode = sscreen->options.vs_fetch_always_opencode; - - if (check_alignment && - (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0) - opencode = true; - - if (always_fix || check_alignment || opencode) - v->fix_fetch[i] = fix_fetch.bits; - - if (opencode) - v->fix_fetch_opencode |= 1 << i; - if (opencode || always_fix) - v->fix_fetch_always |= 1 << i; - - if (check_alignment && !opencode) { - assert(log_hw_load_size == 1 || log_hw_load_size == 2); - - v->fix_fetch_unaligned |= 1 << i; - v->hw_load_is_dword |= (log_hw_load_size - 1) << i; - v->vb_alignment_check_mask |= 1 << vbo_index; - } - - v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | - S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | - S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | - S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); - - if (sscreen->info.chip_class >= GFX10) { - const struct gfx10_format *fmt = - &gfx10_format_table[elements[i].src_format]; - assert(fmt->img_format != 0 && fmt->img_format < 128); - v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - unsigned data_format, num_format; - data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); - num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); - v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) | - S_008F0C_DATA_FORMAT(data_format); - } - } - - if (v->instance_divisor_is_fetched) { - unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched); - - v->instance_divisor_factor_buffer = - (struct si_resource*) - pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT, - num_divisors * sizeof(divisor_factors[0])); - if (!v->instance_divisor_factor_buffer) { - FREE(v); - return NULL; - } - void *map = sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf, - NULL, PIPE_TRANSFER_WRITE); - memcpy(map , divisor_factors, num_divisors * sizeof(divisor_factors[0])); - } - return v; + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements); + bool used[SI_NUM_VERTEX_BUFFERS] = {}; + struct si_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {}; + STATIC_ASSERT(sizeof(struct si_fast_udiv_info32) == 16); + STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4); + STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4); + STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4); + STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4); + int i; + + assert(count <= SI_MAX_ATTRIBS); + if (!v) + return NULL; + + v->count = count; + + unsigned alloc_count = + count > sscreen->num_vbos_in_user_sgprs ? 
count - sscreen->num_vbos_in_user_sgprs : 0; + v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT); + + for (i = 0; i < count; ++i) { + const struct util_format_description *desc; + const struct util_format_channel_description *channel; + int first_non_void; + unsigned vbo_index = elements[i].vertex_buffer_index; + + if (vbo_index >= SI_NUM_VERTEX_BUFFERS) { + FREE(v); + return NULL; + } + + unsigned instance_divisor = elements[i].instance_divisor; + if (instance_divisor) { + v->uses_instance_divisors = true; + + if (instance_divisor == 1) { + v->instance_divisor_is_one |= 1u << i; + } else { + v->instance_divisor_is_fetched |= 1u << i; + divisor_factors[i] = si_compute_fast_udiv_info32(instance_divisor, 32); + } + } + + if (!used[vbo_index]) { + v->first_vb_use_mask |= 1 << i; + used[vbo_index] = true; + } + + desc = util_format_description(elements[i].src_format); + first_non_void = util_format_get_first_non_void_channel(elements[i].src_format); + channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL; + + v->format_size[i] = desc->block.bits / 8; + v->src_offset[i] = elements[i].src_offset; + v->vertex_buffer_index[i] = vbo_index; + + bool always_fix = false; + union si_vs_fix_fetch fix_fetch; + unsigned log_hw_load_size; /* the load element size as seen by the hardware */ + + fix_fetch.bits = 0; + log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3); + + if (channel) { + switch (channel->type) { + case UTIL_FORMAT_TYPE_FLOAT: + fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; + break; + case UTIL_FORMAT_TYPE_FIXED: + fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; + break; + case UTIL_FORMAT_TYPE_SIGNED: { + if (channel->pure_integer) + fix_fetch.u.format = AC_FETCH_FORMAT_SINT; + else if (channel->normalized) + fix_fetch.u.format = AC_FETCH_FORMAT_SNORM; + else + fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED; + break; + } + case UTIL_FORMAT_TYPE_UNSIGNED: { + if (channel->pure_integer) + fix_fetch.u.format = AC_FETCH_FORMAT_UINT; + else if (channel->normalized) + fix_fetch.u.format = AC_FETCH_FORMAT_UNORM; + else + fix_fetch.u.format = AC_FETCH_FORMAT_USCALED; + break; + } + default: + unreachable("bad format type"); + } + } else { + switch (elements[i].src_format) { + case PIPE_FORMAT_R11G11B10_FLOAT: + fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; + break; + default: + unreachable("bad other format"); + } + } + + if (desc->channel[0].size == 10) { + fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */ + log_hw_load_size = 2; + + /* The hardware always treats the 2-bit alpha channel as + * unsigned, so a shader workaround is needed. The affected + * chips are GFX8 and older except Stoney (GFX8.1). + */ + always_fix = sscreen->info.chip_class <= GFX8 && sscreen->info.family != CHIP_STONEY && + channel->type == UTIL_FORMAT_TYPE_SIGNED; + } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) { + fix_fetch.u.log_size = 3; /* special encoding */ + fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; + log_hw_load_size = 2; + } else { + fix_fetch.u.log_size = util_logbase2(channel->size) - 3; + fix_fetch.u.num_channels_m1 = desc->nr_channels - 1; + + /* Always fix up: + * - doubles (multiple loads + truncate to float) + * - 32-bit requiring a conversion + */ + always_fix = (fix_fetch.u.log_size == 3) || + (fix_fetch.u.log_size == 2 && fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT && + fix_fetch.u.format != AC_FETCH_FORMAT_UINT && + fix_fetch.u.format != AC_FETCH_FORMAT_SINT); + + /* Also fixup 8_8_8 and 16_16_16. 
*/ + if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) { + always_fix = true; + log_hw_load_size = fix_fetch.u.log_size; + } + } + + if (desc->swizzle[0] != PIPE_SWIZZLE_X) { + assert(desc->swizzle[0] == PIPE_SWIZZLE_Z && + (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0)); + fix_fetch.u.reverse = 1; + } + + /* Force the workaround for unaligned access here already if the + * offset relative to the vertex buffer base is unaligned. + * + * There is a theoretical case in which this is too conservative: + * if the vertex buffer's offset is also unaligned in just the + * right way, we end up with an aligned address after all. + * However, this case should be extremely rare in practice (it + * won't happen in well-behaved applications), and taking it + * into account would complicate the fast path (where everything + * is nicely aligned). + */ + bool check_alignment = log_hw_load_size >= 1 && (sscreen->info.chip_class == GFX6 || + sscreen->info.chip_class == GFX10); + bool opencode = sscreen->options.vs_fetch_always_opencode; + + if (check_alignment && (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0) + opencode = true; + + if (always_fix || check_alignment || opencode) + v->fix_fetch[i] = fix_fetch.bits; + + if (opencode) + v->fix_fetch_opencode |= 1 << i; + if (opencode || always_fix) + v->fix_fetch_always |= 1 << i; + + if (check_alignment && !opencode) { + assert(log_hw_load_size == 1 || log_hw_load_size == 2); + + v->fix_fetch_unaligned |= 1 << i; + v->hw_load_is_dword |= (log_hw_load_size - 1) << i; + v->vb_alignment_check_mask |= 1 << vbo_index; + } + + v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | + S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | + S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | + S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); + + if (sscreen->info.chip_class >= GFX10) { + const struct gfx10_format *fmt = &gfx10_format_table[elements[i].src_format]; + assert(fmt->img_format != 0 && fmt->img_format < 128); + v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) | S_008F0C_RESOURCE_LEVEL(1); + } else { + unsigned data_format, num_format; + data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); + num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); + v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format); + } + } + + if (v->instance_divisor_is_fetched) { + unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched); + + v->instance_divisor_factor_buffer = (struct si_resource *)pipe_buffer_create( + &sscreen->b, 0, PIPE_USAGE_DEFAULT, num_divisors * sizeof(divisor_factors[0])); + if (!v->instance_divisor_factor_buffer) { + FREE(v); + return NULL; + } + void *map = + sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf, NULL, PIPE_TRANSFER_WRITE); + memcpy(map, divisor_factors, num_divisors * sizeof(divisor_factors[0])); + } + return v; } static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_vertex_elements *old = sctx->vertex_elements; - struct si_vertex_elements *v = (struct si_vertex_elements*)state; - - sctx->vertex_elements = v; - sctx->num_vertex_elements = v ? 
v->count : 0; - - if (sctx->num_vertex_elements) { - sctx->vertex_buffers_dirty = true; - } else { - sctx->vertex_buffer_pointer_dirty = false; - sctx->vertex_buffer_user_sgprs_dirty = false; - } - - if (v && - (!old || - old->count != v->count || - old->uses_instance_divisors != v->uses_instance_divisors || - /* we don't check which divisors changed */ - v->uses_instance_divisors || - (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & sctx->vertex_buffer_unaligned || - ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) && - memcmp(old->vertex_buffer_index, v->vertex_buffer_index, - sizeof(v->vertex_buffer_index[0]) * v->count)) || - /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are - * functions of fix_fetch and the src_offset alignment. - * If they change and fix_fetch doesn't, it must be due to different - * src_offset alignment, which is reflected in fix_fetch_opencode. */ - old->fix_fetch_opencode != v->fix_fetch_opencode || - memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count))) - sctx->do_update_shaders = true; - - if (v && v->instance_divisor_is_fetched) { - struct pipe_constant_buffer cb; - - cb.buffer = &v->instance_divisor_factor_buffer->b.b; - cb.user_buffer = NULL; - cb.buffer_offset = 0; - cb.buffer_size = 0xffffffff; - si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb); - } + struct si_context *sctx = (struct si_context *)ctx; + struct si_vertex_elements *old = sctx->vertex_elements; + struct si_vertex_elements *v = (struct si_vertex_elements *)state; + + sctx->vertex_elements = v; + sctx->num_vertex_elements = v ? v->count : 0; + + if (sctx->num_vertex_elements) { + sctx->vertex_buffers_dirty = true; + } else { + sctx->vertex_buffer_pointer_dirty = false; + sctx->vertex_buffer_user_sgprs_dirty = false; + } + + if (v && (!old || old->count != v->count || + old->uses_instance_divisors != v->uses_instance_divisors || + /* we don't check which divisors changed */ + v->uses_instance_divisors || + (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & + sctx->vertex_buffer_unaligned || + ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) && + memcmp(old->vertex_buffer_index, v->vertex_buffer_index, + sizeof(v->vertex_buffer_index[0]) * v->count)) || + /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are + * functions of fix_fetch and the src_offset alignment. + * If they change and fix_fetch doesn't, it must be due to different + * src_offset alignment, which is reflected in fix_fetch_opencode. 
*/ + old->fix_fetch_opencode != v->fix_fetch_opencode || + memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count))) + sctx->do_update_shaders = true; + + if (v && v->instance_divisor_is_fetched) { + struct pipe_constant_buffer cb; + + cb.buffer = &v->instance_divisor_factor_buffer->b.b; + cb.user_buffer = NULL; + cb.buffer_offset = 0; + cb.buffer_size = 0xffffffff; + si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb); + } } static void si_delete_vertex_element(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_vertex_elements *v = (struct si_vertex_elements*)state; - - if (sctx->vertex_elements == state) { - sctx->vertex_elements = NULL; - sctx->num_vertex_elements = 0; - } - si_resource_reference(&v->instance_divisor_factor_buffer, NULL); - FREE(state); + struct si_context *sctx = (struct si_context *)ctx; + struct si_vertex_elements *v = (struct si_vertex_elements *)state; + + if (sctx->vertex_elements == state) { + sctx->vertex_elements = NULL; + sctx->num_vertex_elements = 0; + } + si_resource_reference(&v->instance_divisor_factor_buffer, NULL); + FREE(state); } -static void si_set_vertex_buffers(struct pipe_context *ctx, - unsigned start_slot, unsigned count, - const struct pipe_vertex_buffer *buffers) +static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, unsigned count, + const struct pipe_vertex_buffer *buffers) { - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot; - unsigned updated_mask = u_bit_consecutive(start_slot, count); - uint32_t orig_unaligned = sctx->vertex_buffer_unaligned; - uint32_t unaligned = 0; - int i; - - assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer)); - - if (buffers) { - for (i = 0; i < count; i++) { - const struct pipe_vertex_buffer *src = buffers + i; - struct pipe_vertex_buffer *dsti = dst + i; - struct pipe_resource *buf = src->buffer.resource; - unsigned slot_bit = 1 << (start_slot + i); - - pipe_resource_reference(&dsti->buffer.resource, buf); - dsti->buffer_offset = src->buffer_offset; - dsti->stride = src->stride; - - if (dsti->buffer_offset & 3 || dsti->stride & 3) - unaligned |= slot_bit; - - si_context_add_resource_size(sctx, buf); - if (buf) - si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER; - } - } else { - for (i = 0; i < count; i++) { - pipe_resource_reference(&dst[i].buffer.resource, NULL); - } - unaligned &= ~updated_mask; - } - sctx->vertex_buffers_dirty = true; - sctx->vertex_buffer_unaligned = (orig_unaligned & ~updated_mask) | unaligned; - - /* Check whether alignment may have changed in a way that requires - * shader changes. This check is conservative: a vertex buffer can only - * trigger a shader change if the misalignment amount changes (e.g. - * from byte-aligned to short-aligned), but we only keep track of - * whether buffers are at least dword-aligned, since that should always - * be the case in well-behaved applications anyway. 
- */ - if (sctx->vertex_elements && - (sctx->vertex_elements->vb_alignment_check_mask & - (unaligned | orig_unaligned) & updated_mask)) - sctx->do_update_shaders = true; + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot; + unsigned updated_mask = u_bit_consecutive(start_slot, count); + uint32_t orig_unaligned = sctx->vertex_buffer_unaligned; + uint32_t unaligned = 0; + int i; + + assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer)); + + if (buffers) { + for (i = 0; i < count; i++) { + const struct pipe_vertex_buffer *src = buffers + i; + struct pipe_vertex_buffer *dsti = dst + i; + struct pipe_resource *buf = src->buffer.resource; + unsigned slot_bit = 1 << (start_slot + i); + + pipe_resource_reference(&dsti->buffer.resource, buf); + dsti->buffer_offset = src->buffer_offset; + dsti->stride = src->stride; + + if (dsti->buffer_offset & 3 || dsti->stride & 3) + unaligned |= slot_bit; + + si_context_add_resource_size(sctx, buf); + if (buf) + si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER; + } + } else { + for (i = 0; i < count; i++) { + pipe_resource_reference(&dst[i].buffer.resource, NULL); + } + unaligned &= ~updated_mask; + } + sctx->vertex_buffers_dirty = true; + sctx->vertex_buffer_unaligned = (orig_unaligned & ~updated_mask) | unaligned; + + /* Check whether alignment may have changed in a way that requires + * shader changes. This check is conservative: a vertex buffer can only + * trigger a shader change if the misalignment amount changes (e.g. + * from byte-aligned to short-aligned), but we only keep track of + * whether buffers are at least dword-aligned, since that should always + * be the case in well-behaved applications anyway. + */ + if (sctx->vertex_elements && (sctx->vertex_elements->vb_alignment_check_mask & + (unaligned | orig_unaligned) & updated_mask)) + sctx->do_update_shaders = true; } /* * Misc */ -static void si_set_tess_state(struct pipe_context *ctx, - const float default_outer_level[4], - const float default_inner_level[2]) +static void si_set_tess_state(struct pipe_context *ctx, const float default_outer_level[4], + const float default_inner_level[2]) { - struct si_context *sctx = (struct si_context *)ctx; - struct pipe_constant_buffer cb; - float array[8]; + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_constant_buffer cb; + float array[8]; - memcpy(array, default_outer_level, sizeof(float) * 4); - memcpy(array+4, default_inner_level, sizeof(float) * 2); + memcpy(array, default_outer_level, sizeof(float) * 4); + memcpy(array + 4, default_inner_level, sizeof(float) * 2); - cb.buffer = NULL; - cb.user_buffer = NULL; - cb.buffer_size = sizeof(array); + cb.buffer = NULL; + cb.user_buffer = NULL; + cb.buffer_size = sizeof(array); - si_upload_const_buffer(sctx, (struct si_resource**)&cb.buffer, - (void*)array, sizeof(array), - &cb.buffer_offset); + si_upload_const_buffer(sctx, (struct si_resource **)&cb.buffer, (void *)array, sizeof(array), + &cb.buffer_offset); - si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb); - pipe_resource_reference(&cb.buffer, NULL); + si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb); + pipe_resource_reference(&cb.buffer, NULL); } static void si_texture_barrier(struct pipe_context *ctx, unsigned flags) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - si_update_fb_dirtiness_after_rendering(sctx); + si_update_fb_dirtiness_after_rendering(sctx); - /* 
Multisample surfaces are flushed in si_decompress_textures. */ - if (sctx->framebuffer.uncompressed_cb_mask) { - si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples, - sctx->framebuffer.CB_has_shader_readable_metadata, - sctx->framebuffer.all_DCC_pipe_aligned); - } + /* Multisample surfaces are flushed in si_decompress_textures. */ + if (sctx->framebuffer.uncompressed_cb_mask) { + si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples, + sctx->framebuffer.CB_has_shader_readable_metadata, + sctx->framebuffer.all_DCC_pipe_aligned); + } } /* This only ensures coherency for shader image/buffer stores. */ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) { - struct si_context *sctx = (struct si_context *)ctx; - - if (!(flags & ~PIPE_BARRIER_UPDATE)) - return; - - /* Subsequent commands must wait for all shader invocations to - * complete. */ - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH; - - if (flags & PIPE_BARRIER_CONSTANT_BUFFER) - sctx->flags |= SI_CONTEXT_INV_SCACHE | - SI_CONTEXT_INV_VCACHE; - - if (flags & (PIPE_BARRIER_VERTEX_BUFFER | - PIPE_BARRIER_SHADER_BUFFER | - PIPE_BARRIER_TEXTURE | - PIPE_BARRIER_IMAGE | - PIPE_BARRIER_STREAMOUT_BUFFER | - PIPE_BARRIER_GLOBAL_BUFFER)) { - /* As far as I can tell, L1 contents are written back to L2 - * automatically at end of shader, but the contents of other - * L1 caches might still be stale. */ - sctx->flags |= SI_CONTEXT_INV_VCACHE; - } - - if (flags & PIPE_BARRIER_INDEX_BUFFER) { - /* Indices are read through TC L2 since GFX8. - * L1 isn't used. - */ - if (sctx->screen->info.chip_class <= GFX7) - sctx->flags |= SI_CONTEXT_WB_L2; - } - - /* MSAA color, any depth and any stencil are flushed in - * si_decompress_textures when needed. - */ - if (flags & PIPE_BARRIER_FRAMEBUFFER && - sctx->framebuffer.uncompressed_cb_mask) { - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; - - if (sctx->chip_class <= GFX8) - sctx->flags |= SI_CONTEXT_WB_L2; - } - - /* Indirect buffers use TC L2 on GFX9, but not older hw. */ - if (sctx->screen->info.chip_class <= GFX8 && - flags & PIPE_BARRIER_INDIRECT_BUFFER) - sctx->flags |= SI_CONTEXT_WB_L2; + struct si_context *sctx = (struct si_context *)ctx; + + if (!(flags & ~PIPE_BARRIER_UPDATE)) + return; + + /* Subsequent commands must wait for all shader invocations to + * complete. */ + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; + + if (flags & PIPE_BARRIER_CONSTANT_BUFFER) + sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE; + + if (flags & (PIPE_BARRIER_VERTEX_BUFFER | PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_TEXTURE | + PIPE_BARRIER_IMAGE | PIPE_BARRIER_STREAMOUT_BUFFER | PIPE_BARRIER_GLOBAL_BUFFER)) { + /* As far as I can tell, L1 contents are written back to L2 + * automatically at end of shader, but the contents of other + * L1 caches might still be stale. */ + sctx->flags |= SI_CONTEXT_INV_VCACHE; + } + + if (flags & PIPE_BARRIER_INDEX_BUFFER) { + /* Indices are read through TC L2 since GFX8. + * L1 isn't used. + */ + if (sctx->screen->info.chip_class <= GFX7) + sctx->flags |= SI_CONTEXT_WB_L2; + } + + /* MSAA color, any depth and any stencil are flushed in + * si_decompress_textures when needed. + */ + if (flags & PIPE_BARRIER_FRAMEBUFFER && sctx->framebuffer.uncompressed_cb_mask) { + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; + + if (sctx->chip_class <= GFX8) + sctx->flags |= SI_CONTEXT_WB_L2; + } + + /* Indirect buffers use TC L2 on GFX9, but not older hw. 
*/ + if (sctx->screen->info.chip_class <= GFX8 && flags & PIPE_BARRIER_INDIRECT_BUFFER) + sctx->flags |= SI_CONTEXT_WB_L2; } static void *si_create_blend_custom(struct si_context *sctx, unsigned mode) { - struct pipe_blend_state blend; + struct pipe_blend_state blend; - memset(&blend, 0, sizeof(blend)); - blend.independent_blend_enable = true; - blend.rt[0].colormask = 0xf; - return si_create_blend_state_mode(&sctx->b, &blend, mode); + memset(&blend, 0, sizeof(blend)); + blend.independent_blend_enable = true; + blend.rt[0].colormask = 0xf; + return si_create_blend_state_mode(&sctx->b, &blend, mode); } static void si_init_config(struct si_context *sctx); void si_init_state_compute_functions(struct si_context *sctx) { - sctx->b.create_sampler_state = si_create_sampler_state; - sctx->b.delete_sampler_state = si_delete_sampler_state; - sctx->b.create_sampler_view = si_create_sampler_view; - sctx->b.sampler_view_destroy = si_sampler_view_destroy; - sctx->b.memory_barrier = si_memory_barrier; + sctx->b.create_sampler_state = si_create_sampler_state; + sctx->b.delete_sampler_state = si_delete_sampler_state; + sctx->b.create_sampler_view = si_create_sampler_view; + sctx->b.sampler_view_destroy = si_sampler_view_destroy; + sctx->b.memory_barrier = si_memory_barrier; } void si_init_state_functions(struct si_context *sctx) { - sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state; - sctx->atoms.s.msaa_sample_locs.emit = si_emit_msaa_sample_locs; - sctx->atoms.s.db_render_state.emit = si_emit_db_render_state; - sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state; - sctx->atoms.s.msaa_config.emit = si_emit_msaa_config; - sctx->atoms.s.sample_mask.emit = si_emit_sample_mask; - sctx->atoms.s.cb_render_state.emit = si_emit_cb_render_state; - sctx->atoms.s.blend_color.emit = si_emit_blend_color; - sctx->atoms.s.clip_regs.emit = si_emit_clip_regs; - sctx->atoms.s.clip_state.emit = si_emit_clip_state; - sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref; - - sctx->b.create_blend_state = si_create_blend_state; - sctx->b.bind_blend_state = si_bind_blend_state; - sctx->b.delete_blend_state = si_delete_blend_state; - sctx->b.set_blend_color = si_set_blend_color; - - sctx->b.create_rasterizer_state = si_create_rs_state; - sctx->b.bind_rasterizer_state = si_bind_rs_state; - sctx->b.delete_rasterizer_state = si_delete_rs_state; - - sctx->b.create_depth_stencil_alpha_state = si_create_dsa_state; - sctx->b.bind_depth_stencil_alpha_state = si_bind_dsa_state; - sctx->b.delete_depth_stencil_alpha_state = si_delete_dsa_state; - - sctx->custom_dsa_flush = si_create_db_flush_dsa(sctx); - sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE); - sctx->custom_blend_fmask_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS); - sctx->custom_blend_eliminate_fastclear = si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR); - sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS); - - sctx->b.set_clip_state = si_set_clip_state; - sctx->b.set_stencil_ref = si_set_stencil_ref; - - sctx->b.set_framebuffer_state = si_set_framebuffer_state; - - sctx->b.set_sample_mask = si_set_sample_mask; - - sctx->b.create_vertex_elements_state = si_create_vertex_elements; - sctx->b.bind_vertex_elements_state = si_bind_vertex_elements; - sctx->b.delete_vertex_elements_state = si_delete_vertex_element; - sctx->b.set_vertex_buffers = si_set_vertex_buffers; - - sctx->b.texture_barrier = si_texture_barrier; - sctx->b.set_min_samples = si_set_min_samples; - 
sctx->b.set_tess_state = si_set_tess_state; - - sctx->b.set_active_query_state = si_set_active_query_state; - - si_init_config(sctx); + sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state; + sctx->atoms.s.msaa_sample_locs.emit = si_emit_msaa_sample_locs; + sctx->atoms.s.db_render_state.emit = si_emit_db_render_state; + sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state; + sctx->atoms.s.msaa_config.emit = si_emit_msaa_config; + sctx->atoms.s.sample_mask.emit = si_emit_sample_mask; + sctx->atoms.s.cb_render_state.emit = si_emit_cb_render_state; + sctx->atoms.s.blend_color.emit = si_emit_blend_color; + sctx->atoms.s.clip_regs.emit = si_emit_clip_regs; + sctx->atoms.s.clip_state.emit = si_emit_clip_state; + sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref; + + sctx->b.create_blend_state = si_create_blend_state; + sctx->b.bind_blend_state = si_bind_blend_state; + sctx->b.delete_blend_state = si_delete_blend_state; + sctx->b.set_blend_color = si_set_blend_color; + + sctx->b.create_rasterizer_state = si_create_rs_state; + sctx->b.bind_rasterizer_state = si_bind_rs_state; + sctx->b.delete_rasterizer_state = si_delete_rs_state; + + sctx->b.create_depth_stencil_alpha_state = si_create_dsa_state; + sctx->b.bind_depth_stencil_alpha_state = si_bind_dsa_state; + sctx->b.delete_depth_stencil_alpha_state = si_delete_dsa_state; + + sctx->custom_dsa_flush = si_create_db_flush_dsa(sctx); + sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE); + sctx->custom_blend_fmask_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS); + sctx->custom_blend_eliminate_fastclear = + si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR); + sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS); + + sctx->b.set_clip_state = si_set_clip_state; + sctx->b.set_stencil_ref = si_set_stencil_ref; + + sctx->b.set_framebuffer_state = si_set_framebuffer_state; + + sctx->b.set_sample_mask = si_set_sample_mask; + + sctx->b.create_vertex_elements_state = si_create_vertex_elements; + sctx->b.bind_vertex_elements_state = si_bind_vertex_elements; + sctx->b.delete_vertex_elements_state = si_delete_vertex_element; + sctx->b.set_vertex_buffers = si_set_vertex_buffers; + + sctx->b.texture_barrier = si_texture_barrier; + sctx->b.set_min_samples = si_set_min_samples; + sctx->b.set_tess_state = si_set_tess_state; + + sctx->b.set_active_query_state = si_set_active_query_state; + + si_init_config(sctx); } void si_init_screen_state_functions(struct si_screen *sscreen) { - sscreen->b.is_format_supported = si_is_format_supported; + sscreen->b.is_format_supported = si_is_format_supported; - if (sscreen->info.chip_class >= GFX10) { - sscreen->make_texture_descriptor = gfx10_make_texture_descriptor; - } else { - sscreen->make_texture_descriptor = si_make_texture_descriptor; - } + if (sscreen->info.chip_class >= GFX10) { + sscreen->make_texture_descriptor = gfx10_make_texture_descriptor; + } else { + sscreen->make_texture_descriptor = si_make_texture_descriptor; + } } -static void si_set_grbm_gfx_index(struct si_context *sctx, - struct si_pm4_state *pm4, unsigned value) +static void si_set_grbm_gfx_index(struct si_context *sctx, struct si_pm4_state *pm4, unsigned value) { - unsigned reg = sctx->chip_class >= GFX7 ? R_030800_GRBM_GFX_INDEX : - R_00802C_GRBM_GFX_INDEX; - si_pm4_set_reg(pm4, reg, value); + unsigned reg = sctx->chip_class >= GFX7 ? 
R_030800_GRBM_GFX_INDEX : R_00802C_GRBM_GFX_INDEX; + si_pm4_set_reg(pm4, reg, value); } -static void si_set_grbm_gfx_index_se(struct si_context *sctx, - struct si_pm4_state *pm4, unsigned se) +static void si_set_grbm_gfx_index_se(struct si_context *sctx, struct si_pm4_state *pm4, unsigned se) { - assert(se == ~0 || se < sctx->screen->info.max_se); - si_set_grbm_gfx_index(sctx, pm4, - (se == ~0 ? S_030800_SE_BROADCAST_WRITES(1) : - S_030800_SE_INDEX(se)) | - S_030800_SH_BROADCAST_WRITES(1) | - S_030800_INSTANCE_BROADCAST_WRITES(1)); + assert(se == ~0 || se < sctx->screen->info.max_se); + si_set_grbm_gfx_index(sctx, pm4, + (se == ~0 ? S_030800_SE_BROADCAST_WRITES(1) : S_030800_SE_INDEX(se)) | + S_030800_SH_BROADCAST_WRITES(1) | + S_030800_INSTANCE_BROADCAST_WRITES(1)); } -static void -si_write_harvested_raster_configs(struct si_context *sctx, - struct si_pm4_state *pm4, - unsigned raster_config, - unsigned raster_config_1) +static void si_write_harvested_raster_configs(struct si_context *sctx, struct si_pm4_state *pm4, + unsigned raster_config, unsigned raster_config_1) { - unsigned num_se = MAX2(sctx->screen->info.max_se, 1); - unsigned raster_config_se[4]; - unsigned se; - - ac_get_harvested_configs(&sctx->screen->info, - raster_config, - &raster_config_1, - raster_config_se); - - for (se = 0; se < num_se; se++) { - si_set_grbm_gfx_index_se(sctx, pm4, se); - si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]); - } - si_set_grbm_gfx_index(sctx, pm4, ~0); - - if (sctx->chip_class >= GFX7) { - si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1); - } + unsigned num_se = MAX2(sctx->screen->info.max_se, 1); + unsigned raster_config_se[4]; + unsigned se; + + ac_get_harvested_configs(&sctx->screen->info, raster_config, &raster_config_1, raster_config_se); + + for (se = 0; se < num_se; se++) { + si_set_grbm_gfx_index_se(sctx, pm4, se); + si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]); + } + si_set_grbm_gfx_index(sctx, pm4, ~0); + + if (sctx->chip_class >= GFX7) { + si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1); + } } static void si_set_raster_config(struct si_context *sctx, struct si_pm4_state *pm4) { - struct si_screen *sscreen = sctx->screen; - unsigned num_rb = MIN2(sscreen->info.num_render_backends, 16); - unsigned rb_mask = sscreen->info.enabled_rb_mask; - unsigned raster_config = sscreen->pa_sc_raster_config; - unsigned raster_config_1 = sscreen->pa_sc_raster_config_1; - - if (!rb_mask || util_bitcount(rb_mask) >= num_rb) { - /* Always use the default config when all backends are enabled - * (or when we failed to determine the enabled backends). - */ - si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, - raster_config); - if (sctx->chip_class >= GFX7) - si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, - raster_config_1); - } else { - si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1); - } + struct si_screen *sscreen = sctx->screen; + unsigned num_rb = MIN2(sscreen->info.num_render_backends, 16); + unsigned rb_mask = sscreen->info.enabled_rb_mask; + unsigned raster_config = sscreen->pa_sc_raster_config; + unsigned raster_config_1 = sscreen->pa_sc_raster_config_1; + + if (!rb_mask || util_bitcount(rb_mask) >= num_rb) { + /* Always use the default config when all backends are enabled + * (or when we failed to determine the enabled backends). 
+ */ + si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config); + if (sctx->chip_class >= GFX7) + si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1); + } else { + si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1); + } } static void si_init_config(struct si_context *sctx) { - struct si_screen *sscreen = sctx->screen; - uint64_t border_color_va = sctx->border_color_buffer->gpu_address; - bool has_clear_state = sscreen->info.has_clear_state; - struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); - - if (!pm4) - return; - - si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL); - si_pm4_cmd_add(pm4, CONTEXT_CONTROL_LOAD_ENABLE(1)); - si_pm4_cmd_add(pm4, CONTEXT_CONTROL_SHADOW_ENABLE(1)); - si_pm4_cmd_end(pm4, false); - - if (has_clear_state) { - si_pm4_cmd_begin(pm4, PKT3_CLEAR_STATE); - si_pm4_cmd_add(pm4, 0); - si_pm4_cmd_end(pm4, false); - } - - if (sctx->chip_class <= GFX8) - si_set_raster_config(sctx, pm4); - - si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); - if (!has_clear_state) - si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); - - /* FIXME calculate these values somehow ??? */ - if (sctx->chip_class <= GFX8) { - si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES); - si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40); - } - - if (!has_clear_state) { - si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2); - si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); - si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); - } - - if (sscreen->info.chip_class <= GFX9) - si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); - if (!has_clear_state) - si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); - if (sctx->chip_class < GFX7) - si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) | - S_008A14_CLIP_VTX_REORDER_ENA(1)); - - /* CLEAR_STATE doesn't restore these correctly. */ - si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1)); - si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR, - S_028244_BR_X(16384) | S_028244_BR_Y(16384)); - - /* CLEAR_STATE doesn't clear these correctly on certain generations. - * I don't know why. Deduced by trial and error. 
- */ - if (sctx->chip_class <= GFX7 || !has_clear_state) { - si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); - si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1)); - si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0); - si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR, - S_028034_BR_X(16384) | S_028034_BR_Y(16384)); - } - - if (!has_clear_state) { - si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE, - S_028230_ER_TRI(0xA) | - S_028230_ER_POINT(0xA) | - S_028230_ER_RECT(0xA) | - /* Required by DX10_DIAMOND_TEST_ENA: */ - S_028230_ER_LINE_LR(0x1A) | - S_028230_ER_LINE_RL(0x26) | - S_028230_ER_LINE_TB(0xA) | - S_028230_ER_LINE_BT(0xA)); - si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0); - si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0); - si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0); - si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0); - si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0); - } - - if (sctx->chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0); - si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0); - si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0); - si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0); - si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0); - si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0); - } else if (sctx->chip_class == GFX9) { - si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0); - si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0); - si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0); - } else { - /* These registers, when written, also overwrite the CLEAR_STATE - * context, so we can't rely on CLEAR_STATE setting them. - * It would be an issue if there was another UMD changing them. - */ - si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0); - si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0); - si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0); - } - - if (sctx->chip_class >= GFX7) { - if (sctx->chip_class >= GFX10) { - /* Logical CUs 16 - 31 */ - si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, - S_00B404_CU_EN(0xffff)); - si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, - S_00B104_CU_EN(0xffff)); - si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, - S_00B004_CU_EN(0xffff)); - } - - if (sctx->chip_class >= GFX9) { - si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, - S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F)); - } else { - si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, - S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F)); - si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, - S_00B41C_WAVE_LIMIT(0x3F)); - si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, - S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F)); - - /* If this is 0, Bonaire can hang even if GS isn't being used. - * Other chips are unaffected. These are suboptimal values, - * but we don't use on-chip GS. - */ - si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL, - S_028A44_ES_VERTS_PER_SUBGRP(64) | - S_028A44_GS_PRIMS_PER_SUBGRP(4)); - } - - /* Compute LATE_ALLOC_VS.LIMIT. */ - unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; - unsigned late_alloc_wave64 = 0; /* The limit is per SH. */ - unsigned cu_mask_vs = 0xffff; - unsigned cu_mask_gs = 0xffff; - - if (sctx->chip_class >= GFX10) { - /* For Wave32, the hw will launch twice the number of late - * alloc waves, so 1 == 2x wave32. 
- */ - if (!sscreen->info.use_late_alloc) { - late_alloc_wave64 = 0; - } else if (num_cu_per_sh <= 6) { - late_alloc_wave64 = num_cu_per_sh - 2; - } else { - late_alloc_wave64 = (num_cu_per_sh - 2) * 4; - - /* CU2 & CU3 disabled because of the dual CU design */ - /* Late alloc is not used for NGG on Navi14 due to a hw bug. */ - cu_mask_vs = 0xfff3; - cu_mask_gs = sscreen->use_ngg && - sctx->family != CHIP_NAVI14 ? 0xfff3 : 0xffff; - } - } else { - if (!sscreen->info.use_late_alloc) { - late_alloc_wave64 = 0; - } else if (num_cu_per_sh <= 4) { - /* Too few available compute units per SH. Disallowing - * VS to run on one CU could hurt us more than late VS - * allocation would help. - * - * 2 is the highest safe number that allows us to keep - * all CUs enabled. - */ - late_alloc_wave64 = 2; - } else { - /* This is a good initial value, allowing 1 late_alloc - * wave per SIMD on num_cu - 2. - */ - late_alloc_wave64 = (num_cu_per_sh - 2) * 4; - } - - if (late_alloc_wave64 > 2) - cu_mask_vs = 0xfffe; /* 1 CU disabled */ - } - - /* VS can't execute on one CU if the limit is > 2. */ - si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, - S_00B118_CU_EN(cu_mask_vs) | - S_00B118_WAVE_LIMIT(0x3F)); - si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, - S_00B11C_LIMIT(late_alloc_wave64)); - - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F)); - - si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, - S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F)); - } - - if (sctx->chip_class >= GFX10) { - /* Break up a pixel wave if it contains deallocs for more than - * half the parameter cache. - * - * To avoid a deadlock where pixel waves aren't launched - * because they're waiting for more pixels while the frontend - * is stuck waiting for PC space, the maximum allowed value is - * the size of the PC minus the largest possible allocation for - * a single primitive shader subgroup. - */ - si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, - S_028C50_MAX_DEALLOCS_IN_WAVE(512)); - si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); - - if (!has_clear_state) { - si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE, - sscreen->info.pa_sc_tile_steering_override); - } - - /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips. 
*/ - unsigned meta_write_policy, meta_read_policy; - /* TODO: investigate whether LRU improves performance on other chips too */ - if (sscreen->info.num_render_backends <= 4) { - meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */ - meta_read_policy = V_02807C_CACHE_LRU_RD; /* cache reads */ - } else { - meta_write_policy = V_02807C_CACHE_STREAM_WR; /* write combine */ - meta_read_policy = V_02807C_CACHE_NOA_RD; /* don't cache reads */ - } - - si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL, - S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) | - S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) | - S_02807C_HTILE_WR_POLICY(meta_write_policy) | - S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) | - S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) | - S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) | - S_02807C_HTILE_RD_POLICY(meta_read_policy)); - - si_pm4_set_reg(pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL, - S_028410_CMASK_WR_POLICY(meta_write_policy) | - S_028410_FMASK_WR_POLICY(meta_write_policy) | - S_028410_DCC_WR_POLICY(meta_write_policy) | - S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) | - S_028410_CMASK_RD_POLICY(meta_read_policy) | - S_028410_FMASK_RD_POLICY(meta_read_policy) | - S_028410_DCC_RD_POLICY(meta_read_policy) | - S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD)); - si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0); - - si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS, - S_00B0C0_SOFT_GROUPING_EN(1) | - S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1)); - si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0); - } - - if (sctx->chip_class >= GFX9) { - si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, - S_028B50_ACCUM_ISOLINE(40) | - S_028B50_ACCUM_TRI(30) | - S_028B50_ACCUM_QUAD(24) | - S_028B50_DONUT_SPLIT(24) | - S_028B50_TRAP_SPLIT(6)); - } else if (sctx->chip_class >= GFX8) { - unsigned vgt_tess_distribution; - - vgt_tess_distribution = - S_028B50_ACCUM_ISOLINE(32) | - S_028B50_ACCUM_TRI(11) | - S_028B50_ACCUM_QUAD(11) | - S_028B50_DONUT_SPLIT(16); - - /* Testing with Unigine Heaven extreme tesselation yielded best results - * with TRAP_SPLIT = 3. 
- */ - if (sctx->family == CHIP_FIJI || - sctx->family >= CHIP_POLARIS10) - vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3); - - si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution); - } else if (!has_clear_state) { - si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); - si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16); - } - - si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); - if (sctx->chip_class >= GFX7) { - si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, - S_028084_ADDRESS(border_color_va >> 40)); - } - si_pm4_add_bo(pm4, sctx->border_color_buffer, RADEON_USAGE_READ, - RADEON_PRIO_BORDER_COLORS); - - if (sctx->chip_class >= GFX9) { - si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1, - S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) | - S_028C48_MAX_PRIM_PER_BATCH(1023)); - si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, - S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1)); - si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0); - } - - si_pm4_upload_indirect_buffer(sctx, pm4); - sctx->init_config = pm4; + struct si_screen *sscreen = sctx->screen; + uint64_t border_color_va = sctx->border_color_buffer->gpu_address; + bool has_clear_state = sscreen->info.has_clear_state; + struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); + + if (!pm4) + return; + + si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL); + si_pm4_cmd_add(pm4, CONTEXT_CONTROL_LOAD_ENABLE(1)); + si_pm4_cmd_add(pm4, CONTEXT_CONTROL_SHADOW_ENABLE(1)); + si_pm4_cmd_end(pm4, false); + + if (has_clear_state) { + si_pm4_cmd_begin(pm4, PKT3_CLEAR_STATE); + si_pm4_cmd_add(pm4, 0); + si_pm4_cmd_end(pm4, false); + } + + if (sctx->chip_class <= GFX8) + si_set_raster_config(sctx, pm4); + + si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); + if (!has_clear_state) + si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); + + /* FIXME calculate these values somehow ??? */ + if (sctx->chip_class <= GFX8) { + si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES); + si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40); + } + + if (!has_clear_state) { + si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2); + si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); + si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); + } + + if (sscreen->info.chip_class <= GFX9) + si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); + if (!has_clear_state) + si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); + if (sctx->chip_class < GFX7) + si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, + S_008A14_NUM_CLIP_SEQ(3) | S_008A14_CLIP_VTX_REORDER_ENA(1)); + + /* CLEAR_STATE doesn't restore these correctly. */ + si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1)); + si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR, + S_028244_BR_X(16384) | S_028244_BR_Y(16384)); + + /* CLEAR_STATE doesn't clear these correctly on certain generations. + * I don't know why. Deduced by trial and error. 
+ */ + if (sctx->chip_class <= GFX7 || !has_clear_state) { + si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); + si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1)); + si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0); + si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR, + S_028034_BR_X(16384) | S_028034_BR_Y(16384)); + } + + if (!has_clear_state) { + si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE, + S_028230_ER_TRI(0xA) | S_028230_ER_POINT(0xA) | S_028230_ER_RECT(0xA) | + /* Required by DX10_DIAMOND_TEST_ENA: */ + S_028230_ER_LINE_LR(0x1A) | S_028230_ER_LINE_RL(0x26) | + S_028230_ER_LINE_TB(0xA) | S_028230_ER_LINE_BT(0xA)); + si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0); + si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0); + si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0); + si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0); + si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0); + } + + if (sctx->chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0); + si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0); + si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0); + si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0); + si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0); + si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0); + } else if (sctx->chip_class == GFX9) { + si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0); + si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0); + si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0); + } else { + /* These registers, when written, also overwrite the CLEAR_STATE + * context, so we can't rely on CLEAR_STATE setting them. + * It would be an issue if there was another UMD changing them. + */ + si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0); + si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0); + si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0); + } + + if (sctx->chip_class >= GFX7) { + if (sctx->chip_class >= GFX10) { + /* Logical CUs 16 - 31 */ + si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff)); + si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff)); + si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(0xffff)); + } + + if (sctx->chip_class >= GFX9) { + si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, + S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F)); + } else { + si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, + S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F)); + si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F)); + si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, + S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F)); + + /* If this is 0, Bonaire can hang even if GS isn't being used. + * Other chips are unaffected. These are suboptimal values, + * but we don't use on-chip GS. + */ + si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL, + S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4)); + } + + /* Compute LATE_ALLOC_VS.LIMIT. */ + unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; + unsigned late_alloc_wave64 = 0; /* The limit is per SH. */ + unsigned cu_mask_vs = 0xffff; + unsigned cu_mask_gs = 0xffff; + + if (sctx->chip_class >= GFX10) { + /* For Wave32, the hw will launch twice the number of late + * alloc waves, so 1 == 2x wave32. 
+ */ + if (!sscreen->info.use_late_alloc) { + late_alloc_wave64 = 0; + } else if (num_cu_per_sh <= 6) { + late_alloc_wave64 = num_cu_per_sh - 2; + } else { + late_alloc_wave64 = (num_cu_per_sh - 2) * 4; + + /* CU2 & CU3 disabled because of the dual CU design */ + /* Late alloc is not used for NGG on Navi14 due to a hw bug. */ + cu_mask_vs = 0xfff3; + cu_mask_gs = sscreen->use_ngg && sctx->family != CHIP_NAVI14 ? 0xfff3 : 0xffff; + } + } else { + if (!sscreen->info.use_late_alloc) { + late_alloc_wave64 = 0; + } else if (num_cu_per_sh <= 4) { + /* Too few available compute units per SH. Disallowing + * VS to run on one CU could hurt us more than late VS + * allocation would help. + * + * 2 is the highest safe number that allows us to keep + * all CUs enabled. + */ + late_alloc_wave64 = 2; + } else { + /* This is a good initial value, allowing 1 late_alloc + * wave per SIMD on num_cu - 2. + */ + late_alloc_wave64 = (num_cu_per_sh - 2) * 4; + } + + if (late_alloc_wave64 > 2) + cu_mask_vs = 0xfffe; /* 1 CU disabled */ + } + + /* VS can't execute on one CU if the limit is > 2. */ + si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, + S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F)); + si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64)); + + si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F)); + + si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, + S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F)); + } + + if (sctx->chip_class >= GFX10) { + /* Break up a pixel wave if it contains deallocs for more than + * half the parameter cache. + * + * To avoid a deadlock where pixel waves aren't launched + * because they're waiting for more pixels while the frontend + * is stuck waiting for PC space, the maximum allowed value is + * the size of the PC minus the largest possible allocation for + * a single primitive shader subgroup. + */ + si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512)); + si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); + + if (!has_clear_state) { + si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE, + sscreen->info.pa_sc_tile_steering_override); + } + + /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips. 
*/ + unsigned meta_write_policy, meta_read_policy; + /* TODO: investigate whether LRU improves performance on other chips too */ + if (sscreen->info.num_render_backends <= 4) { + meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */ + meta_read_policy = V_02807C_CACHE_LRU_RD; /* cache reads */ + } else { + meta_write_policy = V_02807C_CACHE_STREAM_WR; /* write combine */ + meta_read_policy = V_02807C_CACHE_NOA_RD; /* don't cache reads */ + } + + si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL, + S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) | + S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) | + S_02807C_HTILE_WR_POLICY(meta_write_policy) | + S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) | + S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) | + S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) | + S_02807C_HTILE_RD_POLICY(meta_read_policy)); + + si_pm4_set_reg( + pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL, + S_028410_CMASK_WR_POLICY(meta_write_policy) | S_028410_FMASK_WR_POLICY(meta_write_policy) | + S_028410_DCC_WR_POLICY(meta_write_policy) | + S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) | + S_028410_CMASK_RD_POLICY(meta_read_policy) | + S_028410_FMASK_RD_POLICY(meta_read_policy) | S_028410_DCC_RD_POLICY(meta_read_policy) | + S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD)); + si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0); + + si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS, + S_00B0C0_SOFT_GROUPING_EN(1) | S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1)); + si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0); + } + + if (sctx->chip_class >= GFX9) { + si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, + S_028B50_ACCUM_ISOLINE(40) | S_028B50_ACCUM_TRI(30) | S_028B50_ACCUM_QUAD(24) | + S_028B50_DONUT_SPLIT(24) | S_028B50_TRAP_SPLIT(6)); + } else if (sctx->chip_class >= GFX8) { + unsigned vgt_tess_distribution; + + vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(32) | S_028B50_ACCUM_TRI(11) | + S_028B50_ACCUM_QUAD(11) | S_028B50_DONUT_SPLIT(16); + + /* Testing with Unigine Heaven extreme tesselation yielded best results + * with TRAP_SPLIT = 3. 
+ */ + if (sctx->family == CHIP_FIJI || sctx->family >= CHIP_POLARIS10) + vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3); + + si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution); + } else if (!has_clear_state) { + si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); + si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16); + } + + si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); + if (sctx->chip_class >= GFX7) { + si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, S_028084_ADDRESS(border_color_va >> 40)); + } + si_pm4_add_bo(pm4, sctx->border_color_buffer, RADEON_USAGE_READ, RADEON_PRIO_BORDER_COLORS); + + if (sctx->chip_class >= GFX9) { + si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1, + S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) | + S_028C48_MAX_PRIM_PER_BATCH(1023)); + si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, + S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1)); + si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0); + } + + si_pm4_upload_indirect_buffer(sctx, pm4); + sctx->init_config = pm4; } diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 824bf4fef41..aa024b72e43 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -25,20 +25,19 @@ #ifndef SI_STATE_H #define SI_STATE_H -#include "si_pm4.h" - #include "pipebuffer/pb_slab.h" +#include "si_pm4.h" #include "util/u_blitter.h" -#define SI_NUM_GRAPHICS_SHADERS (PIPE_SHADER_TESS_EVAL+1) -#define SI_NUM_SHADERS (PIPE_SHADER_COMPUTE+1) +#define SI_NUM_GRAPHICS_SHADERS (PIPE_SHADER_TESS_EVAL + 1) +#define SI_NUM_SHADERS (PIPE_SHADER_COMPUTE + 1) -#define SI_NUM_VERTEX_BUFFERS SI_MAX_ATTRIBS -#define SI_NUM_SAMPLERS 32 /* OpenGL textures units per shader */ -#define SI_NUM_CONST_BUFFERS 16 -#define SI_NUM_IMAGES 16 -#define SI_NUM_IMAGE_SLOTS (SI_NUM_IMAGES * 2) /* the second half are FMASK slots */ -#define SI_NUM_SHADER_BUFFERS 16 +#define SI_NUM_VERTEX_BUFFERS SI_MAX_ATTRIBS +#define SI_NUM_SAMPLERS 32 /* OpenGL textures units per shader */ +#define SI_NUM_CONST_BUFFERS 16 +#define SI_NUM_IMAGES 16 +#define SI_NUM_IMAGE_SLOTS (SI_NUM_IMAGES * 2) /* the second half are FMASK slots */ +#define SI_NUM_SHADER_BUFFERS 16 struct si_screen; struct si_shader; @@ -48,351 +47,335 @@ struct si_texture; struct si_qbo_state; struct si_state_blend { - struct si_pm4_state pm4; - uint32_t cb_target_mask; - /* Set 0xf or 0x0 (4 bits) per render target if the following is - * true. ANDed with spi_shader_col_format. - */ - unsigned cb_target_enabled_4bit; - unsigned blend_enable_4bit; - unsigned need_src_alpha_4bit; - unsigned commutative_4bit; - unsigned dcc_msaa_corruption_4bit; - bool alpha_to_coverage:1; - bool alpha_to_one:1; - bool dual_src_blend:1; - bool logicop_enable:1; + struct si_pm4_state pm4; + uint32_t cb_target_mask; + /* Set 0xf or 0x0 (4 bits) per render target if the following is + * true. ANDed with spi_shader_col_format. 
+ */ + unsigned cb_target_enabled_4bit; + unsigned blend_enable_4bit; + unsigned need_src_alpha_4bit; + unsigned commutative_4bit; + unsigned dcc_msaa_corruption_4bit; + bool alpha_to_coverage : 1; + bool alpha_to_one : 1; + bool dual_src_blend : 1; + bool logicop_enable : 1; }; struct si_state_rasterizer { - struct si_pm4_state pm4; - /* poly offset states for 16-bit, 24-bit, and 32-bit zbuffers */ - struct si_pm4_state *pm4_poly_offset; - unsigned pa_sc_line_stipple; - unsigned pa_cl_clip_cntl; - float line_width; - float max_point_size; - unsigned sprite_coord_enable:8; - unsigned clip_plane_enable:8; - unsigned half_pixel_center:1; - unsigned flatshade:1; - unsigned flatshade_first:1; - unsigned two_side:1; - unsigned multisample_enable:1; - unsigned force_persample_interp:1; - unsigned line_stipple_enable:1; - unsigned poly_stipple_enable:1; - unsigned line_smooth:1; - unsigned poly_smooth:1; - unsigned uses_poly_offset:1; - unsigned clamp_fragment_color:1; - unsigned clamp_vertex_color:1; - unsigned rasterizer_discard:1; - unsigned scissor_enable:1; - unsigned clip_halfz:1; - unsigned cull_front:1; - unsigned cull_back:1; - unsigned depth_clamp_any:1; - unsigned provoking_vertex_first:1; - unsigned polygon_mode_enabled:1; - unsigned polygon_mode_is_lines:1; + struct si_pm4_state pm4; + /* poly offset states for 16-bit, 24-bit, and 32-bit zbuffers */ + struct si_pm4_state *pm4_poly_offset; + unsigned pa_sc_line_stipple; + unsigned pa_cl_clip_cntl; + float line_width; + float max_point_size; + unsigned sprite_coord_enable : 8; + unsigned clip_plane_enable : 8; + unsigned half_pixel_center : 1; + unsigned flatshade : 1; + unsigned flatshade_first : 1; + unsigned two_side : 1; + unsigned multisample_enable : 1; + unsigned force_persample_interp : 1; + unsigned line_stipple_enable : 1; + unsigned poly_stipple_enable : 1; + unsigned line_smooth : 1; + unsigned poly_smooth : 1; + unsigned uses_poly_offset : 1; + unsigned clamp_fragment_color : 1; + unsigned clamp_vertex_color : 1; + unsigned rasterizer_discard : 1; + unsigned scissor_enable : 1; + unsigned clip_halfz : 1; + unsigned cull_front : 1; + unsigned cull_back : 1; + unsigned depth_clamp_any : 1; + unsigned provoking_vertex_first : 1; + unsigned polygon_mode_enabled : 1; + unsigned polygon_mode_is_lines : 1; }; struct si_dsa_stencil_ref_part { - uint8_t valuemask[2]; - uint8_t writemask[2]; + uint8_t valuemask[2]; + uint8_t writemask[2]; }; struct si_dsa_order_invariance { - /** Whether the final result in Z/S buffers is guaranteed to be - * invariant under changes to the order in which fragments arrive. */ - bool zs:1; - - /** Whether the set of fragments that pass the combined Z/S test is - * guaranteed to be invariant under changes to the order in which - * fragments arrive. */ - bool pass_set:1; - - /** Whether the last fragment that passes the combined Z/S test at each - * sample is guaranteed to be invariant under changes to the order in - * which fragments arrive. */ - bool pass_last:1; + /** Whether the final result in Z/S buffers is guaranteed to be + * invariant under changes to the order in which fragments arrive. */ + bool zs : 1; + + /** Whether the set of fragments that pass the combined Z/S test is + * guaranteed to be invariant under changes to the order in which + * fragments arrive. */ + bool pass_set : 1; + + /** Whether the last fragment that passes the combined Z/S test at each + * sample is guaranteed to be invariant under changes to the order in + * which fragments arrive. 
*/ + bool pass_last : 1; }; struct si_state_dsa { - struct si_pm4_state pm4; - struct si_dsa_stencil_ref_part stencil_ref; - - /* 0 = without stencil buffer, 1 = when both Z and S buffers are present */ - struct si_dsa_order_invariance order_invariance[2]; - - ubyte alpha_func:3; - bool depth_enabled:1; - bool depth_write_enabled:1; - bool stencil_enabled:1; - bool stencil_write_enabled:1; - bool db_can_write:1; - + struct si_pm4_state pm4; + struct si_dsa_stencil_ref_part stencil_ref; + + /* 0 = without stencil buffer, 1 = when both Z and S buffers are present */ + struct si_dsa_order_invariance order_invariance[2]; + + ubyte alpha_func : 3; + bool depth_enabled : 1; + bool depth_write_enabled : 1; + bool stencil_enabled : 1; + bool stencil_write_enabled : 1; + bool db_can_write : 1; }; struct si_stencil_ref { - struct pipe_stencil_ref state; - struct si_dsa_stencil_ref_part dsa_part; + struct pipe_stencil_ref state; + struct si_dsa_stencil_ref_part dsa_part; }; -struct si_vertex_elements -{ - struct si_resource *instance_divisor_factor_buffer; - uint32_t rsrc_word3[SI_MAX_ATTRIBS]; - uint16_t src_offset[SI_MAX_ATTRIBS]; - uint8_t fix_fetch[SI_MAX_ATTRIBS]; - uint8_t format_size[SI_MAX_ATTRIBS]; - uint8_t vertex_buffer_index[SI_MAX_ATTRIBS]; - - /* Bitmask of elements that always need a fixup to be applied. */ - uint16_t fix_fetch_always; - - /* Bitmask of elements whose fetch should always be opencoded. */ - uint16_t fix_fetch_opencode; - - /* Bitmask of elements which need to be opencoded if the vertex buffer - * is unaligned. */ - uint16_t fix_fetch_unaligned; - - /* For elements in fix_fetch_unaligned: whether the effective - * element load size as seen by the hardware is a dword (as opposed - * to a short). - */ - uint16_t hw_load_is_dword; - - /* Bitmask of vertex buffers requiring alignment check */ - uint16_t vb_alignment_check_mask; - - uint8_t count; - bool uses_instance_divisors; - - uint16_t first_vb_use_mask; - /* Vertex buffer descriptor list size aligned for optimal prefetch. */ - uint16_t vb_desc_list_alloc_size; - uint16_t instance_divisor_is_one; /* bitmask of inputs */ - uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ +struct si_vertex_elements { + struct si_resource *instance_divisor_factor_buffer; + uint32_t rsrc_word3[SI_MAX_ATTRIBS]; + uint16_t src_offset[SI_MAX_ATTRIBS]; + uint8_t fix_fetch[SI_MAX_ATTRIBS]; + uint8_t format_size[SI_MAX_ATTRIBS]; + uint8_t vertex_buffer_index[SI_MAX_ATTRIBS]; + + /* Bitmask of elements that always need a fixup to be applied. */ + uint16_t fix_fetch_always; + + /* Bitmask of elements whose fetch should always be opencoded. */ + uint16_t fix_fetch_opencode; + + /* Bitmask of elements which need to be opencoded if the vertex buffer + * is unaligned. */ + uint16_t fix_fetch_unaligned; + + /* For elements in fix_fetch_unaligned: whether the effective + * element load size as seen by the hardware is a dword (as opposed + * to a short). + */ + uint16_t hw_load_is_dword; + + /* Bitmask of vertex buffers requiring alignment check */ + uint16_t vb_alignment_check_mask; + + uint8_t count; + bool uses_instance_divisors; + + uint16_t first_vb_use_mask; + /* Vertex buffer descriptor list size aligned for optimal prefetch. 
*/ + uint16_t vb_desc_list_alloc_size; + uint16_t instance_divisor_is_one; /* bitmask of inputs */ + uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ }; union si_state { - struct { - struct si_state_blend *blend; - struct si_state_rasterizer *rasterizer; - struct si_state_dsa *dsa; - struct si_pm4_state *poly_offset; - struct si_pm4_state *ls; - struct si_pm4_state *hs; - struct si_pm4_state *es; - struct si_pm4_state *gs; - struct si_pm4_state *vgt_shader_config; - struct si_pm4_state *vs; - struct si_pm4_state *ps; - } named; - struct si_pm4_state *array[0]; + struct { + struct si_state_blend *blend; + struct si_state_rasterizer *rasterizer; + struct si_state_dsa *dsa; + struct si_pm4_state *poly_offset; + struct si_pm4_state *ls; + struct si_pm4_state *hs; + struct si_pm4_state *es; + struct si_pm4_state *gs; + struct si_pm4_state *vgt_shader_config; + struct si_pm4_state *vs; + struct si_pm4_state *ps; + } named; + struct si_pm4_state *array[0]; }; -#define SI_STATE_IDX(name) \ - (offsetof(union si_state, named.name) / sizeof(struct si_pm4_state *)) +#define SI_STATE_IDX(name) (offsetof(union si_state, named.name) / sizeof(struct si_pm4_state *)) #define SI_STATE_BIT(name) (1 << SI_STATE_IDX(name)) -#define SI_NUM_STATES (sizeof(union si_state) / sizeof(struct si_pm4_state *)) +#define SI_NUM_STATES (sizeof(union si_state) / sizeof(struct si_pm4_state *)) static inline unsigned si_states_that_always_roll_context(void) { - return (SI_STATE_BIT(blend) | - SI_STATE_BIT(rasterizer) | - SI_STATE_BIT(dsa) | - SI_STATE_BIT(poly_offset) | - SI_STATE_BIT(vgt_shader_config)); + return (SI_STATE_BIT(blend) | SI_STATE_BIT(rasterizer) | SI_STATE_BIT(dsa) | + SI_STATE_BIT(poly_offset) | SI_STATE_BIT(vgt_shader_config)); } union si_state_atoms { - struct { - /* The order matters. */ - struct si_atom render_cond; - struct si_atom streamout_begin; - struct si_atom streamout_enable; /* must be after streamout_begin */ - struct si_atom framebuffer; - struct si_atom msaa_sample_locs; - struct si_atom db_render_state; - struct si_atom dpbb_state; - struct si_atom msaa_config; - struct si_atom sample_mask; - struct si_atom cb_render_state; - struct si_atom blend_color; - struct si_atom clip_regs; - struct si_atom clip_state; - struct si_atom shader_pointers; - struct si_atom guardband; - struct si_atom scissors; - struct si_atom viewports; - struct si_atom stencil_ref; - struct si_atom spi_map; - struct si_atom scratch_state; - struct si_atom window_rectangles; - struct si_atom shader_query; - } s; - struct si_atom array[0]; + struct { + /* The order matters. 
*/ + struct si_atom render_cond; + struct si_atom streamout_begin; + struct si_atom streamout_enable; /* must be after streamout_begin */ + struct si_atom framebuffer; + struct si_atom msaa_sample_locs; + struct si_atom db_render_state; + struct si_atom dpbb_state; + struct si_atom msaa_config; + struct si_atom sample_mask; + struct si_atom cb_render_state; + struct si_atom blend_color; + struct si_atom clip_regs; + struct si_atom clip_state; + struct si_atom shader_pointers; + struct si_atom guardband; + struct si_atom scissors; + struct si_atom viewports; + struct si_atom stencil_ref; + struct si_atom spi_map; + struct si_atom scratch_state; + struct si_atom window_rectangles; + struct si_atom shader_query; + } s; + struct si_atom array[0]; }; -#define SI_ATOM_BIT(name) (1 << (offsetof(union si_state_atoms, s.name) / \ - sizeof(struct si_atom))) -#define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct si_atom*)) +#define SI_ATOM_BIT(name) (1 << (offsetof(union si_state_atoms, s.name) / sizeof(struct si_atom))) +#define SI_NUM_ATOMS (sizeof(union si_state_atoms) / sizeof(struct si_atom *)) static inline unsigned si_atoms_that_always_roll_context(void) { - return (SI_ATOM_BIT(streamout_begin) | - SI_ATOM_BIT(streamout_enable) | - SI_ATOM_BIT(framebuffer) | - SI_ATOM_BIT(msaa_sample_locs) | - SI_ATOM_BIT(sample_mask) | - SI_ATOM_BIT(blend_color) | - SI_ATOM_BIT(clip_state) | - SI_ATOM_BIT(scissors) | - SI_ATOM_BIT(viewports) | - SI_ATOM_BIT(stencil_ref) | - SI_ATOM_BIT(scratch_state) | - SI_ATOM_BIT(window_rectangles)); + return (SI_ATOM_BIT(streamout_begin) | SI_ATOM_BIT(streamout_enable) | SI_ATOM_BIT(framebuffer) | + SI_ATOM_BIT(msaa_sample_locs) | SI_ATOM_BIT(sample_mask) | SI_ATOM_BIT(blend_color) | + SI_ATOM_BIT(clip_state) | SI_ATOM_BIT(scissors) | SI_ATOM_BIT(viewports) | + SI_ATOM_BIT(stencil_ref) | SI_ATOM_BIT(scratch_state) | SI_ATOM_BIT(window_rectangles)); } struct si_shader_data { - uint32_t sh_base[SI_NUM_SHADERS]; + uint32_t sh_base[SI_NUM_SHADERS]; }; -#define SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK \ - (S_02881C_USE_VTX_POINT_SIZE(1) | \ - S_02881C_USE_VTX_EDGE_FLAG(1) | \ - S_02881C_USE_VTX_RENDER_TARGET_INDX(1) | \ - S_02881C_USE_VTX_VIEWPORT_INDX(1) | \ - S_02881C_VS_OUT_MISC_VEC_ENA(1) | \ - S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1)) +#define SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK \ + (S_02881C_USE_VTX_POINT_SIZE(1) | S_02881C_USE_VTX_EDGE_FLAG(1) | \ + S_02881C_USE_VTX_RENDER_TARGET_INDX(1) | S_02881C_USE_VTX_VIEWPORT_INDX(1) | \ + S_02881C_VS_OUT_MISC_VEC_ENA(1) | S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1)) /* The list of registers whose emitted values are remembered by si_context. 
*/ -enum si_tracked_reg { - SI_TRACKED_DB_RENDER_CONTROL, /* 2 consecutive registers */ - SI_TRACKED_DB_COUNT_CONTROL, +enum si_tracked_reg +{ + SI_TRACKED_DB_RENDER_CONTROL, /* 2 consecutive registers */ + SI_TRACKED_DB_COUNT_CONTROL, - SI_TRACKED_DB_RENDER_OVERRIDE2, - SI_TRACKED_DB_SHADER_CONTROL, + SI_TRACKED_DB_RENDER_OVERRIDE2, + SI_TRACKED_DB_SHADER_CONTROL, - SI_TRACKED_CB_TARGET_MASK, - SI_TRACKED_CB_DCC_CONTROL, + SI_TRACKED_CB_TARGET_MASK, + SI_TRACKED_CB_DCC_CONTROL, - SI_TRACKED_SX_PS_DOWNCONVERT, /* 3 consecutive registers */ - SI_TRACKED_SX_BLEND_OPT_EPSILON, - SI_TRACKED_SX_BLEND_OPT_CONTROL, + SI_TRACKED_SX_PS_DOWNCONVERT, /* 3 consecutive registers */ + SI_TRACKED_SX_BLEND_OPT_EPSILON, + SI_TRACKED_SX_BLEND_OPT_CONTROL, - SI_TRACKED_PA_SC_LINE_CNTL, /* 2 consecutive registers */ - SI_TRACKED_PA_SC_AA_CONFIG, + SI_TRACKED_PA_SC_LINE_CNTL, /* 2 consecutive registers */ + SI_TRACKED_PA_SC_AA_CONFIG, - SI_TRACKED_DB_EQAA, - SI_TRACKED_PA_SC_MODE_CNTL_1, + SI_TRACKED_DB_EQAA, + SI_TRACKED_PA_SC_MODE_CNTL_1, - SI_TRACKED_PA_SU_PRIM_FILTER_CNTL, - SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, + SI_TRACKED_PA_SU_PRIM_FILTER_CNTL, + SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, /* set with SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK*/ - SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, /* set with ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK */ - SI_TRACKED_PA_CL_CLIP_CNTL, + SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, /* set with SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK*/ + SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, /* set with ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK */ + SI_TRACKED_PA_CL_CLIP_CNTL, - SI_TRACKED_PA_SC_BINNER_CNTL_0, - SI_TRACKED_DB_DFSM_CONTROL, + SI_TRACKED_PA_SC_BINNER_CNTL_0, + SI_TRACKED_DB_DFSM_CONTROL, - SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, /* 4 consecutive registers */ - SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ, - SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ, - SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ, + SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, /* 4 consecutive registers */ + SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ, + SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ, + SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ, - SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET, - SI_TRACKED_PA_SU_VTX_CNTL, + SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET, + SI_TRACKED_PA_SU_VTX_CNTL, - SI_TRACKED_PA_SC_CLIPRECT_RULE, + SI_TRACKED_PA_SC_CLIPRECT_RULE, - SI_TRACKED_PA_SC_LINE_STIPPLE, + SI_TRACKED_PA_SC_LINE_STIPPLE, - SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, + SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, - SI_TRACKED_VGT_GSVS_RING_OFFSET_1, /* 3 consecutive registers */ - SI_TRACKED_VGT_GSVS_RING_OFFSET_2, - SI_TRACKED_VGT_GSVS_RING_OFFSET_3, + SI_TRACKED_VGT_GSVS_RING_OFFSET_1, /* 3 consecutive registers */ + SI_TRACKED_VGT_GSVS_RING_OFFSET_2, + SI_TRACKED_VGT_GSVS_RING_OFFSET_3, - SI_TRACKED_VGT_GSVS_RING_ITEMSIZE, - SI_TRACKED_VGT_GS_MAX_VERT_OUT, + SI_TRACKED_VGT_GSVS_RING_ITEMSIZE, + SI_TRACKED_VGT_GS_MAX_VERT_OUT, - SI_TRACKED_VGT_GS_VERT_ITEMSIZE, /* 4 consecutive registers */ - SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1, - SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2, - SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3, + SI_TRACKED_VGT_GS_VERT_ITEMSIZE, /* 4 consecutive registers */ + SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1, + SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2, + SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3, - SI_TRACKED_VGT_GS_INSTANCE_CNT, - SI_TRACKED_VGT_GS_ONCHIP_CNTL, - SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, - SI_TRACKED_VGT_GS_MODE, - SI_TRACKED_VGT_PRIMITIVEID_EN, - SI_TRACKED_VGT_REUSE_OFF, - SI_TRACKED_SPI_VS_OUT_CONFIG, - SI_TRACKED_PA_CL_VTE_CNTL, - SI_TRACKED_PA_CL_NGG_CNTL, - SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP, - 
SI_TRACKED_GE_NGG_SUBGRP_CNTL, + SI_TRACKED_VGT_GS_INSTANCE_CNT, + SI_TRACKED_VGT_GS_ONCHIP_CNTL, + SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, + SI_TRACKED_VGT_GS_MODE, + SI_TRACKED_VGT_PRIMITIVEID_EN, + SI_TRACKED_VGT_REUSE_OFF, + SI_TRACKED_SPI_VS_OUT_CONFIG, + SI_TRACKED_PA_CL_VTE_CNTL, + SI_TRACKED_PA_CL_NGG_CNTL, + SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP, + SI_TRACKED_GE_NGG_SUBGRP_CNTL, - SI_TRACKED_SPI_SHADER_IDX_FORMAT, /* 2 consecutive registers */ - SI_TRACKED_SPI_SHADER_POS_FORMAT, + SI_TRACKED_SPI_SHADER_IDX_FORMAT, /* 2 consecutive registers */ + SI_TRACKED_SPI_SHADER_POS_FORMAT, - SI_TRACKED_SPI_PS_INPUT_ENA, /* 2 consecutive registers */ - SI_TRACKED_SPI_PS_INPUT_ADDR, + SI_TRACKED_SPI_PS_INPUT_ENA, /* 2 consecutive registers */ + SI_TRACKED_SPI_PS_INPUT_ADDR, - SI_TRACKED_SPI_BARYC_CNTL, - SI_TRACKED_SPI_PS_IN_CONTROL, + SI_TRACKED_SPI_BARYC_CNTL, + SI_TRACKED_SPI_PS_IN_CONTROL, - SI_TRACKED_SPI_SHADER_Z_FORMAT, /* 2 consecutive registers */ - SI_TRACKED_SPI_SHADER_COL_FORMAT, + SI_TRACKED_SPI_SHADER_Z_FORMAT, /* 2 consecutive registers */ + SI_TRACKED_SPI_SHADER_COL_FORMAT, - SI_TRACKED_CB_SHADER_MASK, - SI_TRACKED_VGT_TF_PARAM, - SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + SI_TRACKED_CB_SHADER_MASK, + SI_TRACKED_VGT_TF_PARAM, + SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, - SI_TRACKED_GE_PC_ALLOC, + SI_TRACKED_GE_PC_ALLOC, - SI_NUM_TRACKED_REGS, + SI_NUM_TRACKED_REGS, }; struct si_tracked_regs { - uint64_t reg_saved; - uint32_t reg_value[SI_NUM_TRACKED_REGS]; - uint32_t spi_ps_input_cntl[32]; + uint64_t reg_saved; + uint32_t reg_value[SI_NUM_TRACKED_REGS]; + uint32_t spi_ps_input_cntl[32]; }; /* Private read-write buffer slots. */ -enum { - SI_ES_RING_ESGS, - SI_GS_RING_ESGS, +enum +{ + SI_ES_RING_ESGS, + SI_GS_RING_ESGS, - SI_RING_GSVS, + SI_RING_GSVS, - SI_VS_STREAMOUT_BUF0, - SI_VS_STREAMOUT_BUF1, - SI_VS_STREAMOUT_BUF2, - SI_VS_STREAMOUT_BUF3, + SI_VS_STREAMOUT_BUF0, + SI_VS_STREAMOUT_BUF1, + SI_VS_STREAMOUT_BUF2, + SI_VS_STREAMOUT_BUF3, - SI_HS_CONST_DEFAULT_TESS_LEVELS, - SI_VS_CONST_INSTANCE_DIVISORS, - SI_VS_CONST_CLIP_PLANES, - SI_PS_CONST_POLY_STIPPLE, - SI_PS_CONST_SAMPLE_POSITIONS, + SI_HS_CONST_DEFAULT_TESS_LEVELS, + SI_VS_CONST_INSTANCE_DIVISORS, + SI_VS_CONST_CLIP_PLANES, + SI_PS_CONST_POLY_STIPPLE, + SI_PS_CONST_SAMPLE_POSITIONS, - /* Image descriptor of color buffer 0 for KHR_blend_equation_advanced. */ - SI_PS_IMAGE_COLORBUF0, - SI_PS_IMAGE_COLORBUF0_HI, - SI_PS_IMAGE_COLORBUF0_FMASK, - SI_PS_IMAGE_COLORBUF0_FMASK_HI, + /* Image descriptor of color buffer 0 for KHR_blend_equation_advanced. 
*/ + SI_PS_IMAGE_COLORBUF0, + SI_PS_IMAGE_COLORBUF0_HI, + SI_PS_IMAGE_COLORBUF0_FMASK, + SI_PS_IMAGE_COLORBUF0_FMASK_HI, - GFX10_GS_QUERY_BUF, + GFX10_GS_QUERY_BUF, - SI_NUM_RW_BUFFERS, + SI_NUM_RW_BUFFERS, }; /* Indices into sctx->descriptors, laid out so that gfx and compute pipelines @@ -406,122 +389,111 @@ enum { * 11 - compute const and shader buffers * 12 - compute samplers and images */ -enum { - SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, - SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, - SI_NUM_SHADER_DESCS, +enum +{ + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, + SI_NUM_SHADER_DESCS, }; -#define SI_DESCS_RW_BUFFERS 0 -#define SI_DESCS_FIRST_SHADER 1 -#define SI_DESCS_FIRST_COMPUTE (SI_DESCS_FIRST_SHADER + \ - PIPE_SHADER_COMPUTE * SI_NUM_SHADER_DESCS) -#define SI_NUM_DESCS (SI_DESCS_FIRST_SHADER + \ - SI_NUM_SHADERS * SI_NUM_SHADER_DESCS) +#define SI_DESCS_RW_BUFFERS 0 +#define SI_DESCS_FIRST_SHADER 1 +#define SI_DESCS_FIRST_COMPUTE (SI_DESCS_FIRST_SHADER + PIPE_SHADER_COMPUTE * SI_NUM_SHADER_DESCS) +#define SI_NUM_DESCS (SI_DESCS_FIRST_SHADER + SI_NUM_SHADERS * SI_NUM_SHADER_DESCS) -#define SI_DESCS_SHADER_MASK(name) \ - u_bit_consecutive(SI_DESCS_FIRST_SHADER + \ - PIPE_SHADER_##name * SI_NUM_SHADER_DESCS, \ - SI_NUM_SHADER_DESCS) +#define SI_DESCS_SHADER_MASK(name) \ + u_bit_consecutive(SI_DESCS_FIRST_SHADER + PIPE_SHADER_##name * SI_NUM_SHADER_DESCS, \ + SI_NUM_SHADER_DESCS) -static inline unsigned -si_const_and_shader_buffer_descriptors_idx(unsigned shader) +static inline unsigned si_const_and_shader_buffer_descriptors_idx(unsigned shader) { - return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS + - SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS; + return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS + + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS; } -static inline unsigned -si_sampler_and_image_descriptors_idx(unsigned shader) +static inline unsigned si_sampler_and_image_descriptors_idx(unsigned shader) { - return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS + - SI_SHADER_DESCS_SAMPLERS_AND_IMAGES; + return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS + + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES; } /* This represents descriptors in memory, such as buffer resources, * image resources, and sampler states. */ struct si_descriptors { - /* The list of descriptors in malloc'd memory. */ - uint32_t *list; - /* The list in mapped GPU memory. */ - uint32_t *gpu_list; - - /* The buffer where the descriptors have been uploaded. */ - struct si_resource *buffer; - uint64_t gpu_address; - - /* The maximum number of descriptors. */ - uint32_t num_elements; - - /* Slots that are used by currently-bound shaders. - * It determines which slots are uploaded. - */ - uint32_t first_active_slot; - uint32_t num_active_slots; - - /* The SH register offset relative to USER_DATA*_0 where the pointer - * to the descriptor array will be stored. */ - short shader_userdata_offset; - /* The size of one descriptor. */ - ubyte element_dw_size; - /* If there is only one slot enabled, bind it directly instead of - * uploading descriptors. -1 if disabled. */ - signed char slot_index_to_bind_directly; + /* The list of descriptors in malloc'd memory. */ + uint32_t *list; + /* The list in mapped GPU memory. */ + uint32_t *gpu_list; + + /* The buffer where the descriptors have been uploaded. */ + struct si_resource *buffer; + uint64_t gpu_address; + + /* The maximum number of descriptors. */ + uint32_t num_elements; + + /* Slots that are used by currently-bound shaders. 
+ * It determines which slots are uploaded. + */ + uint32_t first_active_slot; + uint32_t num_active_slots; + + /* The SH register offset relative to USER_DATA*_0 where the pointer + * to the descriptor array will be stored. */ + short shader_userdata_offset; + /* The size of one descriptor. */ + ubyte element_dw_size; + /* If there is only one slot enabled, bind it directly instead of + * uploading descriptors. -1 if disabled. */ + signed char slot_index_to_bind_directly; }; struct si_buffer_resources { - struct pipe_resource **buffers; /* this has num_buffers elements */ - unsigned *offsets; /* this has num_buffers elements */ + struct pipe_resource **buffers; /* this has num_buffers elements */ + unsigned *offsets; /* this has num_buffers elements */ - enum radeon_bo_priority priority:6; - enum radeon_bo_priority priority_constbuf:6; + enum radeon_bo_priority priority : 6; + enum radeon_bo_priority priority_constbuf : 6; - /* The i-th bit is set if that element is enabled (non-NULL resource). */ - unsigned enabled_mask; - unsigned writable_mask; + /* The i-th bit is set if that element is enabled (non-NULL resource). */ + unsigned enabled_mask; + unsigned writable_mask; }; -#define si_pm4_state_changed(sctx, member) \ - ((sctx)->queued.named.member != (sctx)->emitted.named.member) +#define si_pm4_state_changed(sctx, member) \ + ((sctx)->queued.named.member != (sctx)->emitted.named.member) -#define si_pm4_state_enabled_and_changed(sctx, member) \ - ((sctx)->queued.named.member && si_pm4_state_changed(sctx, member)) +#define si_pm4_state_enabled_and_changed(sctx, member) \ + ((sctx)->queued.named.member && si_pm4_state_changed(sctx, member)) -#define si_pm4_bind_state(sctx, member, value) \ - do { \ - (sctx)->queued.named.member = (value); \ - (sctx)->dirty_states |= SI_STATE_BIT(member); \ - } while(0) +#define si_pm4_bind_state(sctx, member, value) \ + do { \ + (sctx)->queued.named.member = (value); \ + (sctx)->dirty_states |= SI_STATE_BIT(member); \ + } while (0) -#define si_pm4_delete_state(sctx, member, value) \ - do { \ - if ((sctx)->queued.named.member == (value)) { \ - (sctx)->queued.named.member = NULL; \ - } \ - si_pm4_free_state(sctx, (struct si_pm4_state *)(value), \ - SI_STATE_IDX(member)); \ - } while(0) +#define si_pm4_delete_state(sctx, member, value) \ + do { \ + if ((sctx)->queued.named.member == (value)) { \ + (sctx)->queued.named.member = NULL; \ + } \ + si_pm4_free_state(sctx, (struct si_pm4_state *)(value), SI_STATE_IDX(member)); \ + } while (0) /* si_descriptors.c */ -void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, - struct si_texture *tex, - const struct legacy_surf_level *base_level_info, - unsigned base_level, unsigned first_level, - unsigned block_width, bool is_stencil, - uint32_t *state); +void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture *tex, + const struct legacy_surf_level *base_level_info, + unsigned base_level, unsigned first_level, unsigned block_width, + bool is_stencil, uint32_t *state); void si_update_ps_colorbuf0_slot(struct si_context *sctx); -void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, - uint slot, struct pipe_constant_buffer *cbuf); -void si_get_shader_buffers(struct si_context *sctx, - enum pipe_shader_type shader, - uint start_slot, uint count, - struct pipe_shader_buffer *sbuf); -void si_set_ring_buffer(struct si_context *sctx, uint slot, - struct pipe_resource *buffer, - unsigned stride, unsigned num_records, - bool add_tid, bool swizzle, - unsigned element_size, 
unsigned index_stride, uint64_t offset); +void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot, + struct pipe_constant_buffer *cbuf); +void si_get_shader_buffers(struct si_context *sctx, enum pipe_shader_type shader, uint start_slot, + uint count, struct pipe_shader_buffer *sbuf); +void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource *buffer, + unsigned stride, unsigned num_records, bool add_tid, bool swizzle, + unsigned element_size, unsigned index_stride, uint64_t offset); void si_init_all_descriptors(struct si_context *sctx); bool si_upload_vertex_buffer_descriptors(struct si_context *sctx); bool si_upload_graphics_shader_descriptors(struct si_context *sctx); @@ -530,102 +502,84 @@ void si_release_all_descriptors(struct si_context *sctx); void si_gfx_resources_add_all_to_bo_list(struct si_context *sctx); void si_compute_resources_add_all_to_bo_list(struct si_context *sctx); void si_all_descriptors_begin_new_cs(struct si_context *sctx); -void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf, - const uint8_t *ptr, unsigned size, uint32_t *const_offset); +void si_upload_const_buffer(struct si_context *sctx, struct si_resource **buf, const uint8_t *ptr, + unsigned size, uint32_t *const_offset); void si_update_all_texture_descriptors(struct si_context *sctx); void si_shader_change_notify(struct si_context *sctx); void si_update_needs_color_decompress_masks(struct si_context *sctx); void si_emit_graphics_shader_pointers(struct si_context *sctx); void si_emit_compute_shader_pointers(struct si_context *sctx); -void si_set_rw_buffer(struct si_context *sctx, - uint slot, const struct pipe_constant_buffer *input); +void si_set_rw_buffer(struct si_context *sctx, uint slot, const struct pipe_constant_buffer *input); void si_set_rw_shader_buffer(struct si_context *sctx, uint slot, - const struct pipe_shader_buffer *sbuffer); + const struct pipe_shader_buffer *sbuffer); void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, - uint64_t new_active_mask); -void si_set_active_descriptors_for_shader(struct si_context *sctx, - struct si_shader_selector *sel); -bool si_bindless_descriptor_can_reclaim_slab(void *priv, - struct pb_slab_entry *entry); -struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap, - unsigned entry_size, - unsigned group_index); + uint64_t new_active_mask); +void si_set_active_descriptors_for_shader(struct si_context *sctx, struct si_shader_selector *sel); +bool si_bindless_descriptor_can_reclaim_slab(void *priv, struct pb_slab_entry *entry); +struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap, unsigned entry_size, + unsigned group_index); void si_bindless_descriptor_slab_free(void *priv, struct pb_slab *pslab); void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf); /* si_state.c */ void si_init_state_compute_functions(struct si_context *sctx); void si_init_state_functions(struct si_context *sctx); void si_init_screen_state_functions(struct si_screen *sscreen); -void -si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf, - enum pipe_format format, - unsigned offset, unsigned size, - uint32_t *state); -struct pipe_sampler_view * -si_create_sampler_view_custom(struct pipe_context *ctx, - struct pipe_resource *texture, - const struct pipe_sampler_view *state, - unsigned width0, unsigned height0, - unsigned force_level); +void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf, + 
enum pipe_format format, unsigned offset, unsigned size, + uint32_t *state); +struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *state, + unsigned width0, unsigned height0, + unsigned force_level); void si_update_fb_dirtiness_after_rendering(struct si_context *sctx); void si_update_ps_iter_samples(struct si_context *sctx); void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st); void si_restore_qbo_state(struct si_context *sctx, struct si_qbo_state *st); -void si_set_occlusion_query_state(struct si_context *sctx, - bool old_perfect_enable); +void si_set_occlusion_query_state(struct si_context *sctx, bool old_perfect_enable); struct si_fast_udiv_info32 { unsigned multiplier; /* the "magic number" multiplier */ - unsigned pre_shift; /* shift for the dividend before multiplying */ + unsigned pre_shift; /* shift for the dividend before multiplying */ unsigned post_shift; /* shift for the dividend after multiplying */ - int increment; /* 0 or 1; if set then increment the numerator, using one of - the two strategies */ + int increment; /* 0 or 1; if set then increment the numerator, using one of + the two strategies */ }; -struct si_fast_udiv_info32 -si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits); +struct si_fast_udiv_info32 si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits); /* si_state_binning.c */ void si_emit_dpbb_state(struct si_context *sctx); /* si_state_shaders.c */ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, - unsigned char ir_sha1_cache_key[20]); -bool si_shader_cache_load_shader(struct si_screen *sscreen, - unsigned char ir_sha1_cache_key[20], - struct si_shader *shader); -void si_shader_cache_insert_shader(struct si_screen *sscreen, - unsigned char ir_sha1_cache_key[20], - struct si_shader *shader, - bool insert_into_disk_cache); + unsigned char ir_sha1_cache_key[20]); +bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], + struct si_shader *shader); +void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], + struct si_shader *shader, bool insert_into_disk_cache); bool si_update_shaders(struct si_context *sctx); void si_init_screen_live_shader_cache(struct si_screen *sscreen); void si_init_shader_functions(struct si_context *sctx); bool si_init_shader_cache(struct si_screen *sscreen); void si_destroy_shader_cache(struct si_screen *sscreen); void si_schedule_initial_compile(struct si_context *sctx, unsigned processor, - struct util_queue_fence *ready_fence, - struct si_compiler_ctx_state *compiler_ctx_state, - void *job, util_queue_execute_func execute); -void si_get_active_slot_masks(const struct si_shader_info *info, - uint32_t *const_and_shader_buffers, - uint64_t *samplers_and_images); -int si_shader_select_with_key(struct si_screen *sscreen, - struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key, - int thread_index, - bool optimized_or_none); -void si_shader_selector_key_vs(struct si_context *sctx, - struct si_shader_selector *vs, - struct si_shader_key *key, - struct si_vs_prolog_bits *prolog_key); + struct util_queue_fence *ready_fence, + struct si_compiler_ctx_state *compiler_ctx_state, void *job, + util_queue_execute_func execute); +void si_get_active_slot_masks(const struct si_shader_info *info, uint32_t *const_and_shader_buffers, + uint64_t *samplers_and_images); +int 
si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, + struct si_compiler_ctx_state *compiler_state, + struct si_shader_key *key, int thread_index, bool optimized_or_none); +void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs, + struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key); unsigned si_get_input_prim(const struct si_shader_selector *gs); bool si_update_ngg(struct si_context *sctx); /* si_state_draw.c */ void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, - unsigned cp_coher_cntl); + unsigned cp_coher_cntl); void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx); void gfx10_emit_cache_flush(struct si_context *sctx); void si_emit_cache_flush(struct si_context *sctx); @@ -639,35 +593,33 @@ void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples); /* si_state_streamout.c */ void si_streamout_buffers_dirty(struct si_context *sctx); void si_emit_streamout_end(struct si_context *sctx); -void si_update_prims_generated_query_state(struct si_context *sctx, - unsigned type, int diff); +void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff); void si_init_streamout_functions(struct si_context *sctx); - static inline unsigned si_get_constbuf_slot(unsigned slot) { - /* Constant buffers are in slots [16..31], ascending */ - return SI_NUM_SHADER_BUFFERS + slot; + /* Constant buffers are in slots [16..31], ascending */ + return SI_NUM_SHADER_BUFFERS + slot; } static inline unsigned si_get_shaderbuf_slot(unsigned slot) { - /* shader buffers are in slots [15..0], descending */ - return SI_NUM_SHADER_BUFFERS - 1 - slot; + /* shader buffers are in slots [15..0], descending */ + return SI_NUM_SHADER_BUFFERS - 1 - slot; } static inline unsigned si_get_sampler_slot(unsigned slot) { - /* 32 samplers are in sampler slots [16..47], 16 dw per slot, ascending */ - /* those are equivalent to image slots [32..95], 8 dw per slot, ascending */ - return SI_NUM_IMAGE_SLOTS / 2 + slot; + /* 32 samplers are in sampler slots [16..47], 16 dw per slot, ascending */ + /* those are equivalent to image slots [32..95], 8 dw per slot, ascending */ + return SI_NUM_IMAGE_SLOTS / 2 + slot; } static inline unsigned si_get_image_slot(unsigned slot) { - /* image slots are in [31..0] (sampler slots [15..0]), descending */ - /* images are in slots [31..16], while FMASKs are in slots [15..0] */ - return SI_NUM_IMAGE_SLOTS - 1 - slot; + /* image slots are in [31..0] (sampler slots [15..0]), descending */ + /* images are in slots [31..16], while FMASKs are in slots [15..0] */ + return SI_NUM_IMAGE_SLOTS - 1 - slot; } #endif diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c index 1251b53785b..39bb94366f2 100644 --- a/src/gallium/drivers/radeonsi/si_state_binning.c +++ b/src/gallium/drivers/radeonsi/si_state_binning.c @@ -28,577 +28,548 @@ #include "sid.h" struct uvec2 { - unsigned x, y; + unsigned x, y; }; struct si_bin_size_map { - unsigned start; - unsigned bin_size_x; - unsigned bin_size_y; + unsigned start; + unsigned bin_size_x; + unsigned bin_size_y; }; typedef struct si_bin_size_map si_bin_size_subtable[3][10]; /* Find the bin size where sum is >= table[i].start and < table[i + 1].start. 
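The si_fast_udiv_info32 fields declared above are the usual ingredients of a multiply-and-shift replacement for division by a divisor that is only known at run time. Below is a minimal standalone sketch of how such a precomputed result is typically consumed; the helper name and the exact application order are assumptions for illustration and are not part of this diff:

#include <stdint.h>

/* Hypothetical consumer of a precomputed si_fast_udiv_info32 'f' for divisor D:
 * returns n / D without a hardware divide. */
static inline uint32_t fast_udiv32(uint32_t n, struct si_fast_udiv_info32 f)
{
   n >>= f.pre_shift;                         /* shift the dividend before multiplying */
   uint64_t wide = (uint64_t)n + f.increment; /* optional +1 strategy, widened to avoid overflow */
   /* keep the high half of the 64-bit product, then shift once more */
   return (uint32_t)(((wide * f.multiplier) >> 32) >> f.post_shift);
}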
*/ -static struct uvec2 si_find_bin_size(struct si_screen *sscreen, - const si_bin_size_subtable table[], - unsigned sum) +static struct uvec2 si_find_bin_size(struct si_screen *sscreen, const si_bin_size_subtable table[], + unsigned sum) { - unsigned log_num_rb_per_se = - util_logbase2_ceil(sscreen->info.num_render_backends / - sscreen->info.max_se); - unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se); - unsigned i; - - /* Get the chip-specific subtable. */ - const struct si_bin_size_map *subtable = - &table[log_num_rb_per_se][log_num_se][0]; - - for (i = 0; subtable[i].bin_size_x != 0; i++) { - if (sum >= subtable[i].start && sum < subtable[i + 1].start) - break; - } - - struct uvec2 size = {subtable[i].bin_size_x, subtable[i].bin_size_y}; - return size; + unsigned log_num_rb_per_se = + util_logbase2_ceil(sscreen->info.num_render_backends / sscreen->info.max_se); + unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se); + unsigned i; + + /* Get the chip-specific subtable. */ + const struct si_bin_size_map *subtable = &table[log_num_rb_per_se][log_num_se][0]; + + for (i = 0; subtable[i].bin_size_x != 0; i++) { + if (sum >= subtable[i].start && sum < subtable[i + 1].start) + break; + } + + struct uvec2 size = {subtable[i].bin_size_x, subtable[i].bin_size_y}; + return size; } -static struct uvec2 si_get_color_bin_size(struct si_context *sctx, - unsigned cb_target_enabled_4bit) +static struct uvec2 si_get_color_bin_size(struct si_context *sctx, unsigned cb_target_enabled_4bit) { - unsigned num_fragments = sctx->framebuffer.nr_color_samples; - unsigned sum = 0; - - /* Compute the sum of all Bpp. */ - for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - if (!(cb_target_enabled_4bit & (0xf << (i * 4)))) - continue; - - struct si_texture *tex = - (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture; - sum += tex->surface.bpe; - } - - /* Multiply the sum by some function of the number of samples. 
*/ - if (num_fragments >= 2) { - if (si_get_ps_iter_samples(sctx) >= 2) - sum *= num_fragments; - else - sum *= 2; - } - - static const si_bin_size_subtable table[] = { - { - /* One RB / SE */ - { - /* One shader engine */ - { 0, 128, 128 }, - { 1, 64, 128 }, - { 2, 32, 128 }, - { 3, 16, 128 }, - { 17, 0, 0 }, - }, - { - /* Two shader engines */ - { 0, 128, 128 }, - { 2, 64, 128 }, - { 3, 32, 128 }, - { 5, 16, 128 }, - { 17, 0, 0 }, - }, - { - /* Four shader engines */ - { 0, 128, 128 }, - { 3, 64, 128 }, - { 5, 16, 128 }, - { 17, 0, 0 }, - }, - }, - { - /* Two RB / SE */ - { - /* One shader engine */ - { 0, 128, 128 }, - { 2, 64, 128 }, - { 3, 32, 128 }, - { 9, 16, 128 }, - { 33, 0, 0 }, - }, - { - /* Two shader engines */ - { 0, 128, 128 }, - { 3, 64, 128 }, - { 5, 32, 128 }, - { 9, 16, 128 }, - { 33, 0, 0 }, - }, - { - /* Four shader engines */ - { 0, 256, 256 }, - { 2, 128, 256 }, - { 3, 128, 128 }, - { 5, 64, 128 }, - { 9, 16, 128 }, - { 33, 0, 0 }, - }, - }, - { - /* Four RB / SE */ - { - /* One shader engine */ - { 0, 128, 256 }, - { 2, 128, 128 }, - { 3, 64, 128 }, - { 5, 32, 128 }, - { 9, 16, 128 }, - { 17, 0, 0 }, - }, - { - /* Two shader engines */ - { 0, 256, 256 }, - { 2, 128, 256 }, - { 3, 128, 128 }, - { 5, 64, 128 }, - { 9, 32, 128 }, - { 17, 16, 128 }, - { 33, 0, 0 }, - }, - { - /* Four shader engines */ - { 0, 256, 512 }, - { 2, 128, 512 }, - { 3, 64, 512 }, - { 5, 32, 512 }, - { 9, 32, 256 }, - { 17, 32, 128 }, - { 33, 0, 0 }, - }, - }, - }; - - return si_find_bin_size(sctx->screen, table, sum); + unsigned num_fragments = sctx->framebuffer.nr_color_samples; + unsigned sum = 0; + + /* Compute the sum of all Bpp. */ + for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + if (!(cb_target_enabled_4bit & (0xf << (i * 4)))) + continue; + + struct si_texture *tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture; + sum += tex->surface.bpe; + } + + /* Multiply the sum by some function of the number of samples. 
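A worked example of this selection, assuming a hypothetical screen with four render backends on one shader engine (log_num_rb_per_se = 2, log_num_se = 0, i.e. the "Four RB / SE" / "One shader engine" subtable): a single single-sampled RGBA8 colorbuffer gives sum = 4, which lands in the [3, 5) interval and selects a 64x128 bin; with 4 fragments and ps_iter_samples >= 2 the sum becomes 4 * 4 = 16, landing in [9, 17) and selecting 16x128. The trailing {17, 0, 0} entry is the sentinel that bounds the last interval and terminates the scan in si_find_bin_size.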
*/ + if (num_fragments >= 2) { + if (si_get_ps_iter_samples(sctx) >= 2) + sum *= num_fragments; + else + sum *= 2; + } + + static const si_bin_size_subtable table[] = { + { + /* One RB / SE */ + { + /* One shader engine */ + {0, 128, 128}, + {1, 64, 128}, + {2, 32, 128}, + {3, 16, 128}, + {17, 0, 0}, + }, + { + /* Two shader engines */ + {0, 128, 128}, + {2, 64, 128}, + {3, 32, 128}, + {5, 16, 128}, + {17, 0, 0}, + }, + { + /* Four shader engines */ + {0, 128, 128}, + {3, 64, 128}, + {5, 16, 128}, + {17, 0, 0}, + }, + }, + { + /* Two RB / SE */ + { + /* One shader engine */ + {0, 128, 128}, + {2, 64, 128}, + {3, 32, 128}, + {9, 16, 128}, + {33, 0, 0}, + }, + { + /* Two shader engines */ + {0, 128, 128}, + {3, 64, 128}, + {5, 32, 128}, + {9, 16, 128}, + {33, 0, 0}, + }, + { + /* Four shader engines */ + {0, 256, 256}, + {2, 128, 256}, + {3, 128, 128}, + {5, 64, 128}, + {9, 16, 128}, + {33, 0, 0}, + }, + }, + { + /* Four RB / SE */ + { + /* One shader engine */ + {0, 128, 256}, + {2, 128, 128}, + {3, 64, 128}, + {5, 32, 128}, + {9, 16, 128}, + {17, 0, 0}, + }, + { + /* Two shader engines */ + {0, 256, 256}, + {2, 128, 256}, + {3, 128, 128}, + {5, 64, 128}, + {9, 32, 128}, + {17, 16, 128}, + {33, 0, 0}, + }, + { + /* Four shader engines */ + {0, 256, 512}, + {2, 128, 512}, + {3, 64, 512}, + {5, 32, 512}, + {9, 32, 256}, + {17, 32, 128}, + {33, 0, 0}, + }, + }, + }; + + return si_find_bin_size(sctx->screen, table, sum); } static struct uvec2 si_get_depth_bin_size(struct si_context *sctx) { - struct si_state_dsa *dsa = sctx->queued.named.dsa; - - if (!sctx->framebuffer.state.zsbuf || - (!dsa->depth_enabled && !dsa->stencil_enabled)) { - /* Return the max size. */ - struct uvec2 size = {512, 512}; - return size; - } - - struct si_texture *tex = - (struct si_texture*)sctx->framebuffer.state.zsbuf->texture; - unsigned depth_coeff = dsa->depth_enabled ? 5 : 0; - unsigned stencil_coeff = tex->surface.has_stencil && - dsa->stencil_enabled ? 
1 : 0; - unsigned sum = 4 * (depth_coeff + stencil_coeff) * - MAX2(tex->buffer.b.b.nr_samples, 1); - - static const si_bin_size_subtable table[] = { - { - // One RB / SE - { - // One shader engine - { 0, 64, 512 }, - { 2, 64, 256 }, - { 4, 64, 128 }, - { 7, 32, 128 }, - { 13, 16, 128 }, - { 49, 0, 0 }, - }, - { - // Two shader engines - { 0, 128, 512 }, - { 2, 64, 512 }, - { 4, 64, 256 }, - { 7, 64, 128 }, - { 13, 32, 128 }, - { 25, 16, 128 }, - { 49, 0, 0 }, - }, - { - // Four shader engines - { 0, 256, 512 }, - { 2, 128, 512 }, - { 4, 64, 512 }, - { 7, 64, 256 }, - { 13, 64, 128 }, - { 25, 16, 128 }, - { 49, 0, 0 }, - }, - }, - { - // Two RB / SE - { - // One shader engine - { 0, 128, 512 }, - { 2, 64, 512 }, - { 4, 64, 256 }, - { 7, 64, 128 }, - { 13, 32, 128 }, - { 25, 16, 128 }, - { 97, 0, 0 }, - }, - { - // Two shader engines - { 0, 256, 512 }, - { 2, 128, 512 }, - { 4, 64, 512 }, - { 7, 64, 256 }, - { 13, 64, 128 }, - { 25, 32, 128 }, - { 49, 16, 128 }, - { 97, 0, 0 }, - }, - { - // Four shader engines - { 0, 512, 512 }, - { 2, 256, 512 }, - { 4, 128, 512 }, - { 7, 64, 512 }, - { 13, 64, 256 }, - { 25, 64, 128 }, - { 49, 16, 128 }, - { 97, 0, 0 }, - }, - }, - { - // Four RB / SE - { - // One shader engine - { 0, 256, 512 }, - { 2, 128, 512 }, - { 4, 64, 512 }, - { 7, 64, 256 }, - { 13, 64, 128 }, - { 25, 32, 128 }, - { 49, 16, 128 }, - { 193, 0, 0 }, - }, - { - // Two shader engines - { 0, 512, 512 }, - { 2, 256, 512 }, - { 4, 128, 512 }, - { 7, 64, 512 }, - { 13, 64, 256 }, - { 25, 64, 128 }, - { 49, 32, 128 }, - { 97, 16, 128 }, - { 193, 0, 0 }, - }, - { - // Four shader engines - { 0, 512, 512 }, - { 4, 256, 512 }, - { 7, 128, 512 }, - { 13, 64, 512 }, - { 25, 32, 512 }, - { 49, 32, 256 }, - { 97, 16, 128 }, - { 193, 0, 0 }, - }, - }, - }; - - return si_find_bin_size(sctx->screen, table, sum); + struct si_state_dsa *dsa = sctx->queued.named.dsa; + + if (!sctx->framebuffer.state.zsbuf || (!dsa->depth_enabled && !dsa->stencil_enabled)) { + /* Return the max size. */ + struct uvec2 size = {512, 512}; + return size; + } + + struct si_texture *tex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture; + unsigned depth_coeff = dsa->depth_enabled ? 5 : 0; + unsigned stencil_coeff = tex->surface.has_stencil && dsa->stencil_enabled ? 
1 : 0; + unsigned sum = 4 * (depth_coeff + stencil_coeff) * MAX2(tex->buffer.b.b.nr_samples, 1); + + static const si_bin_size_subtable table[] = { + { + // One RB / SE + { + // One shader engine + {0, 64, 512}, + {2, 64, 256}, + {4, 64, 128}, + {7, 32, 128}, + {13, 16, 128}, + {49, 0, 0}, + }, + { + // Two shader engines + {0, 128, 512}, + {2, 64, 512}, + {4, 64, 256}, + {7, 64, 128}, + {13, 32, 128}, + {25, 16, 128}, + {49, 0, 0}, + }, + { + // Four shader engines + {0, 256, 512}, + {2, 128, 512}, + {4, 64, 512}, + {7, 64, 256}, + {13, 64, 128}, + {25, 16, 128}, + {49, 0, 0}, + }, + }, + { + // Two RB / SE + { + // One shader engine + {0, 128, 512}, + {2, 64, 512}, + {4, 64, 256}, + {7, 64, 128}, + {13, 32, 128}, + {25, 16, 128}, + {97, 0, 0}, + }, + { + // Two shader engines + {0, 256, 512}, + {2, 128, 512}, + {4, 64, 512}, + {7, 64, 256}, + {13, 64, 128}, + {25, 32, 128}, + {49, 16, 128}, + {97, 0, 0}, + }, + { + // Four shader engines + {0, 512, 512}, + {2, 256, 512}, + {4, 128, 512}, + {7, 64, 512}, + {13, 64, 256}, + {25, 64, 128}, + {49, 16, 128}, + {97, 0, 0}, + }, + }, + { + // Four RB / SE + { + // One shader engine + {0, 256, 512}, + {2, 128, 512}, + {4, 64, 512}, + {7, 64, 256}, + {13, 64, 128}, + {25, 32, 128}, + {49, 16, 128}, + {193, 0, 0}, + }, + { + // Two shader engines + {0, 512, 512}, + {2, 256, 512}, + {4, 128, 512}, + {7, 64, 512}, + {13, 64, 256}, + {25, 64, 128}, + {49, 32, 128}, + {97, 16, 128}, + {193, 0, 0}, + }, + { + // Four shader engines + {0, 512, 512}, + {4, 256, 512}, + {7, 128, 512}, + {13, 64, 512}, + {25, 32, 512}, + {49, 32, 256}, + {97, 16, 128}, + {193, 0, 0}, + }, + }, + }; + + return si_find_bin_size(sctx->screen, table, sum); } -static void gfx10_get_bin_sizes(struct si_context *sctx, - unsigned cb_target_enabled_4bit, - struct uvec2 *color_bin_size, - struct uvec2 *depth_bin_size) +static void gfx10_get_bin_sizes(struct si_context *sctx, unsigned cb_target_enabled_4bit, + struct uvec2 *color_bin_size, struct uvec2 *depth_bin_size) { - const unsigned ZsTagSize = 64; - const unsigned ZsNumTags = 312; - const unsigned CcTagSize = 1024; - const unsigned CcReadTags = 31; - const unsigned FcTagSize = 256; - const unsigned FcReadTags = 44; - - const unsigned num_rbs = sctx->screen->info.num_render_backends; - const unsigned num_pipes = MAX2(num_rbs, sctx->screen->info.num_sdp_interfaces); - - const unsigned depthBinSizeTagPart = ((ZsNumTags * num_rbs / num_pipes) * (ZsTagSize * num_pipes)); - const unsigned colorBinSizeTagPart = ((CcReadTags * num_rbs / num_pipes) * (CcTagSize * num_pipes)); - const unsigned fmaskBinSizeTagPart = ((FcReadTags * num_rbs / num_pipes) * (FcTagSize * num_pipes)); - - const unsigned minBinSizeX = 128; - const unsigned minBinSizeY = 64; - - const unsigned num_fragments = sctx->framebuffer.nr_color_samples; - const unsigned num_samples = sctx->framebuffer.nr_samples; - const bool ps_iter_sample = si_get_ps_iter_samples(sctx) >= 2; - - /* Calculate cColor and cFmask(if applicable) */ - unsigned cColor = 0; - unsigned cFmask = 0; - bool has_fmask = false; - - for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { - if (!sctx->framebuffer.state.cbufs[i]) - continue; - - struct si_texture *tex = - (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture; - const unsigned mmrt = - num_fragments == 1 ? 1 : (ps_iter_sample ? 
num_fragments : 2); - - cColor += tex->surface.bpe * mmrt; - if (num_samples >= 2 /* if FMASK is bound */) { - const unsigned fragmentsLog2 = util_logbase2(num_fragments); - const unsigned samplesLog2 = util_logbase2(num_samples); - - static const unsigned cFmaskMrt[4 /* fragments */][5 /* samples */] = { - { 0, 1, 1, 1, 2 }, /* fragments = 1 */ - { 0, 1, 1, 2, 4 }, /* fragments = 2 */ - { 0, 1, 1, 4, 8 }, /* fragments = 4 */ - { 0, 1, 2, 4, 8 } /* fragments = 8 */ - }; - cFmask += cFmaskMrt[fragmentsLog2][samplesLog2]; - has_fmask = true; - } - } - cColor = MAX2(cColor, 1u); - - const unsigned colorLog2Pixels = util_logbase2(colorBinSizeTagPart / cColor); - const unsigned colorBinSizeX = 1 << ((colorLog2Pixels + 1) / 2); /* round up width */ - const unsigned colorBinSizeY = 1 << (colorLog2Pixels / 2); /* round down height */ - - unsigned binSizeX = colorBinSizeX; - unsigned binSizeY = colorBinSizeY; - - if (has_fmask) { - cFmask = MAX2(cFmask, 1u); - - const unsigned fmaskLog2Pixels = util_logbase2(fmaskBinSizeTagPart / cFmask); - const unsigned fmaskBinSizeX = 1 << ((fmaskLog2Pixels + 1) / 2); /* round up width */ - const unsigned fmaskBinSizeY = 1 << (fmaskLog2Pixels / 2); /* round down height */ - - /* use the smaller of the Color vs. Fmask bin sizes */ - if (fmaskLog2Pixels < colorLog2Pixels) { - binSizeX = fmaskBinSizeX; - binSizeY = fmaskBinSizeY; - } - } - - /* Return size adjusted for minimum bin size */ - color_bin_size->x = MAX2(binSizeX, minBinSizeX); - color_bin_size->y = MAX2(binSizeY, minBinSizeY); - - if (!sctx->framebuffer.state.zsbuf) { - /* Set to max sizes when no depth buffer is bound. */ - depth_bin_size->x = 512; - depth_bin_size->y = 512; - } else { - struct si_texture *zstex = (struct si_texture*)sctx->framebuffer.state.zsbuf->texture; - struct si_state_dsa *dsa = sctx->queued.named.dsa; - - const unsigned cPerDepthSample = dsa->depth_enabled ? 5 : 0; - const unsigned cPerStencilSample = dsa->stencil_enabled ? 
1 : 0; - const unsigned cDepth = (cPerDepthSample + cPerStencilSample) * - MAX2(zstex->buffer.b.b.nr_samples, 1); - - const unsigned depthLog2Pixels = util_logbase2(depthBinSizeTagPart / MAX2(cDepth, 1u)); - unsigned depthBinSizeX = 1 << ((depthLog2Pixels + 1) / 2); - unsigned depthBinSizeY = 1 << (depthLog2Pixels / 2); - - depth_bin_size->x = MAX2(depthBinSizeX, minBinSizeX); - depth_bin_size->y = MAX2(depthBinSizeY, minBinSizeY); - } + const unsigned ZsTagSize = 64; + const unsigned ZsNumTags = 312; + const unsigned CcTagSize = 1024; + const unsigned CcReadTags = 31; + const unsigned FcTagSize = 256; + const unsigned FcReadTags = 44; + + const unsigned num_rbs = sctx->screen->info.num_render_backends; + const unsigned num_pipes = MAX2(num_rbs, sctx->screen->info.num_sdp_interfaces); + + const unsigned depthBinSizeTagPart = + ((ZsNumTags * num_rbs / num_pipes) * (ZsTagSize * num_pipes)); + const unsigned colorBinSizeTagPart = + ((CcReadTags * num_rbs / num_pipes) * (CcTagSize * num_pipes)); + const unsigned fmaskBinSizeTagPart = + ((FcReadTags * num_rbs / num_pipes) * (FcTagSize * num_pipes)); + + const unsigned minBinSizeX = 128; + const unsigned minBinSizeY = 64; + + const unsigned num_fragments = sctx->framebuffer.nr_color_samples; + const unsigned num_samples = sctx->framebuffer.nr_samples; + const bool ps_iter_sample = si_get_ps_iter_samples(sctx) >= 2; + + /* Calculate cColor and cFmask(if applicable) */ + unsigned cColor = 0; + unsigned cFmask = 0; + bool has_fmask = false; + + for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { + if (!sctx->framebuffer.state.cbufs[i]) + continue; + + struct si_texture *tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture; + const unsigned mmrt = num_fragments == 1 ? 1 : (ps_iter_sample ? num_fragments : 2); + + cColor += tex->surface.bpe * mmrt; + if (num_samples >= 2 /* if FMASK is bound */) { + const unsigned fragmentsLog2 = util_logbase2(num_fragments); + const unsigned samplesLog2 = util_logbase2(num_samples); + + static const unsigned cFmaskMrt[4 /* fragments */][5 /* samples */] = { + {0, 1, 1, 1, 2}, /* fragments = 1 */ + {0, 1, 1, 2, 4}, /* fragments = 2 */ + {0, 1, 1, 4, 8}, /* fragments = 4 */ + {0, 1, 2, 4, 8} /* fragments = 8 */ + }; + cFmask += cFmaskMrt[fragmentsLog2][samplesLog2]; + has_fmask = true; + } + } + cColor = MAX2(cColor, 1u); + + const unsigned colorLog2Pixels = util_logbase2(colorBinSizeTagPart / cColor); + const unsigned colorBinSizeX = 1 << ((colorLog2Pixels + 1) / 2); /* round up width */ + const unsigned colorBinSizeY = 1 << (colorLog2Pixels / 2); /* round down height */ + + unsigned binSizeX = colorBinSizeX; + unsigned binSizeY = colorBinSizeY; + + if (has_fmask) { + cFmask = MAX2(cFmask, 1u); + + const unsigned fmaskLog2Pixels = util_logbase2(fmaskBinSizeTagPart / cFmask); + const unsigned fmaskBinSizeX = 1 << ((fmaskLog2Pixels + 1) / 2); /* round up width */ + const unsigned fmaskBinSizeY = 1 << (fmaskLog2Pixels / 2); /* round down height */ + + /* use the smaller of the Color vs. Fmask bin sizes */ + if (fmaskLog2Pixels < colorLog2Pixels) { + binSizeX = fmaskBinSizeX; + binSizeY = fmaskBinSizeY; + } + } + + /* Return size adjusted for minimum bin size */ + color_bin_size->x = MAX2(binSizeX, minBinSizeX); + color_bin_size->y = MAX2(binSizeY, minBinSizeY); + + if (!sctx->framebuffer.state.zsbuf) { + /* Set to max sizes when no depth buffer is bound. 
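The log2 split above converts a per-bin pixel budget of roughly 2^colorLog2Pixels into a power-of-two rectangle, rounding the width up and the height down: for example, colorLog2Pixels = 13 yields colorBinSizeX = 1 << 7 = 128 and colorBinSizeY = 1 << 6 = 64, an 8192-pixel bin that the minBinSizeX/minBinSizeY clamp above leaves unchanged, since 128x64 is exactly the minimum.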
*/ + depth_bin_size->x = 512; + depth_bin_size->y = 512; + } else { + struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture; + struct si_state_dsa *dsa = sctx->queued.named.dsa; + + const unsigned cPerDepthSample = dsa->depth_enabled ? 5 : 0; + const unsigned cPerStencilSample = dsa->stencil_enabled ? 1 : 0; + const unsigned cDepth = + (cPerDepthSample + cPerStencilSample) * MAX2(zstex->buffer.b.b.nr_samples, 1); + + const unsigned depthLog2Pixels = util_logbase2(depthBinSizeTagPart / MAX2(cDepth, 1u)); + unsigned depthBinSizeX = 1 << ((depthLog2Pixels + 1) / 2); + unsigned depthBinSizeY = 1 << (depthLog2Pixels / 2); + + depth_bin_size->x = MAX2(depthBinSizeX, minBinSizeX); + depth_bin_size->y = MAX2(depthBinSizeY, minBinSizeY); + } } static void si_emit_dpbb_disable(struct si_context *sctx) { - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - - if (sctx->chip_class >= GFX10) { - struct uvec2 bin_size = {}; - struct uvec2 bin_size_extend = {}; - - bin_size.x = 128; - bin_size.y = sctx->framebuffer.min_bytes_per_pixel <= 4 ? 128 : 64; - - if (bin_size.x >= 32) - bin_size_extend.x = util_logbase2(bin_size.x) - 5; - if (bin_size.y >= 32) - bin_size_extend.y = util_logbase2(bin_size.y) - 5; - - radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0, - SI_TRACKED_PA_SC_BINNER_CNTL_0, - S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) | - S_028C44_BIN_SIZE_X(bin_size.x == 16) | - S_028C44_BIN_SIZE_Y(bin_size.y == 16) | - S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) | - S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) | - S_028C44_DISABLE_START_OF_PRIM(1) | - S_028C44_FLUSH_ON_BINNING_TRANSITION(sctx->last_binning_enabled != 0)); - } else { - radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0, - SI_TRACKED_PA_SC_BINNER_CNTL_0, - S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) | - S_028C44_DISABLE_START_OF_PRIM(1) | - S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 || - sctx->family == CHIP_VEGA20 || - sctx->family >= CHIP_RAVEN2) && - sctx->last_binning_enabled != 0)); - } - - unsigned db_dfsm_control = sctx->chip_class >= GFX10 ? R_028038_DB_DFSM_CONTROL - : R_028060_DB_DFSM_CONTROL; - radeon_opt_set_context_reg(sctx, db_dfsm_control, - SI_TRACKED_DB_DFSM_CONTROL, - S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | - S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; - - sctx->last_binning_enabled = false; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + + if (sctx->chip_class >= GFX10) { + struct uvec2 bin_size = {}; + struct uvec2 bin_size_extend = {}; + + bin_size.x = 128; + bin_size.y = sctx->framebuffer.min_bytes_per_pixel <= 4 ? 
128 : 64; + + if (bin_size.x >= 32) + bin_size_extend.x = util_logbase2(bin_size.x) - 5; + if (bin_size.y >= 32) + bin_size_extend.y = util_logbase2(bin_size.y) - 5; + + radeon_opt_set_context_reg( + sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0, + S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) | + S_028C44_BIN_SIZE_X(bin_size.x == 16) | S_028C44_BIN_SIZE_Y(bin_size.y == 16) | + S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) | + S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) | S_028C44_DISABLE_START_OF_PRIM(1) | + S_028C44_FLUSH_ON_BINNING_TRANSITION(sctx->last_binning_enabled != 0)); + } else { + radeon_opt_set_context_reg( + sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0, + S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) | + S_028C44_DISABLE_START_OF_PRIM(1) | + S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 || + sctx->family == CHIP_VEGA20 || + sctx->family >= CHIP_RAVEN2) && + sctx->last_binning_enabled != 0)); + } + + unsigned db_dfsm_control = + sctx->chip_class >= GFX10 ? R_028038_DB_DFSM_CONTROL : R_028060_DB_DFSM_CONTROL; + radeon_opt_set_context_reg( + sctx, db_dfsm_control, SI_TRACKED_DB_DFSM_CONTROL, + S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; + + sctx->last_binning_enabled = false; } void si_emit_dpbb_state(struct si_context *sctx) { - struct si_screen *sscreen = sctx->screen; - struct si_state_blend *blend = sctx->queued.named.blend; - struct si_state_dsa *dsa = sctx->queued.named.dsa; - unsigned db_shader_control = sctx->ps_db_shader_control; - - assert(sctx->chip_class >= GFX9); - - if (!sscreen->dpbb_allowed || sctx->dpbb_force_off) { - si_emit_dpbb_disable(sctx); - return; - } - - bool ps_can_kill = G_02880C_KILL_ENABLE(db_shader_control) || - G_02880C_MASK_EXPORT_ENABLE(db_shader_control) || - G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) || - blend->alpha_to_coverage; - - bool db_can_reject_z_trivially = - !G_02880C_Z_EXPORT_ENABLE(db_shader_control) || - G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control) || - G_02880C_DEPTH_BEFORE_SHADER(db_shader_control); - - /* Disable DPBB when it's believed to be inefficient. */ - if (sscreen->info.num_render_backends > 4 && - ps_can_kill && - db_can_reject_z_trivially && - sctx->framebuffer.state.zsbuf && - dsa->db_can_write) { - si_emit_dpbb_disable(sctx); - return; - } - - /* Compute the bin size. */ - /* TODO: We could also look at enabled pixel shader outputs. */ - unsigned cb_target_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit & - blend->cb_target_enabled_4bit; - struct uvec2 color_bin_size, depth_bin_size; - - if (sctx->chip_class >= GFX10) { - gfx10_get_bin_sizes(sctx, cb_target_enabled_4bit, - &color_bin_size, &depth_bin_size); - } else { - color_bin_size = si_get_color_bin_size(sctx, cb_target_enabled_4bit); - depth_bin_size = si_get_depth_bin_size(sctx); - } - - unsigned color_area = color_bin_size.x * color_bin_size.y; - unsigned depth_area = depth_bin_size.x * depth_bin_size.y; - - struct uvec2 bin_size = color_area < depth_area ? color_bin_size - : depth_bin_size; - - if (!bin_size.x || !bin_size.y) { - si_emit_dpbb_disable(sctx); - return; - } - - /* Enable DFSM if it's preferred. 
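In the register encoding above, only a 16-pixel dimension sets the BIN_SIZE_X/BIN_SIZE_Y bit; dimensions of 32 and up go into the corresponding *_EXTEND field as log2(size) - 5, so 32 -> 0, 64 -> 1, 128 -> 2, up to 512 -> 4. The 128x64 fallback bin, for instance, is programmed as BIN_SIZE_X = 0, BIN_SIZE_Y = 0, BIN_SIZE_X_EXTEND = 2 and BIN_SIZE_Y_EXTEND = 1.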
*/ - unsigned punchout_mode = V_028060_FORCE_OFF; - bool disable_start_of_prim = true; - bool zs_eqaa_dfsm_bug = sctx->chip_class == GFX9 && - sctx->framebuffer.state.zsbuf && - sctx->framebuffer.nr_samples != - MAX2(1, sctx->framebuffer.state.zsbuf->texture->nr_samples); - - if (sscreen->dfsm_allowed && - !zs_eqaa_dfsm_bug && - cb_target_enabled_4bit && - !G_02880C_KILL_ENABLE(db_shader_control) && - /* These two also imply that DFSM is disabled when PS writes to memory. */ - !G_02880C_EXEC_ON_HIER_FAIL(db_shader_control) && - !G_02880C_EXEC_ON_NOOP(db_shader_control) && - G_02880C_Z_ORDER(db_shader_control) == V_02880C_EARLY_Z_THEN_LATE_Z) { - punchout_mode = V_028060_AUTO; - disable_start_of_prim = (cb_target_enabled_4bit & - blend->blend_enable_4bit) != 0; - } - - /* Tunable parameters. Also test with DFSM enabled/disabled. */ - unsigned context_states_per_bin; /* allowed range: [1, 6] */ - unsigned persistent_states_per_bin; /* allowed range: [1, 32] */ - unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */ - - /* Tuned for Raven. Vega might need different values. */ - if (sscreen->info.has_dedicated_vram) { - if (sscreen->info.num_render_backends > 4) { - context_states_per_bin = 1; - persistent_states_per_bin = 1; - } else { - context_states_per_bin = 3; - persistent_states_per_bin = 8; - } - } else { - /* This is a workaround for: - * https://bugs.freedesktop.org/show_bug.cgi?id=110214 - * (an alternative is to insert manual BATCH_BREAK event when - * a context_roll is detected). */ - context_states_per_bin = sctx->screen->info.has_gfx9_scissor_bug ? 1 : 6; - /* Using 32 here can cause GPU hangs on RAVEN1 */ - persistent_states_per_bin = 16; - } - fpovs_per_batch = 63; - - /* Emit registers. */ - struct uvec2 bin_size_extend = {}; - if (bin_size.x >= 32) - bin_size_extend.x = util_logbase2(bin_size.x) - 5; - if (bin_size.y >= 32) - bin_size_extend.y = util_logbase2(bin_size.y) - 5; - - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - radeon_opt_set_context_reg( - sctx, R_028C44_PA_SC_BINNER_CNTL_0, - SI_TRACKED_PA_SC_BINNER_CNTL_0, - S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | - S_028C44_BIN_SIZE_X(bin_size.x == 16) | - S_028C44_BIN_SIZE_Y(bin_size.y == 16) | - S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) | - S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) | - S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin - 1) | - S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin - 1) | - S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) | - S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) | - S_028C44_OPTIMAL_BIN_SELECTION(1) | - S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 || - sctx->family == CHIP_VEGA20 || - sctx->family >= CHIP_RAVEN2) && - sctx->last_binning_enabled != 1)); - - unsigned db_dfsm_control = sctx->chip_class >= GFX10 ? 
R_028038_DB_DFSM_CONTROL - : R_028060_DB_DFSM_CONTROL; - radeon_opt_set_context_reg(sctx, db_dfsm_control, - SI_TRACKED_DB_DFSM_CONTROL, - S_028060_PUNCHOUT_MODE(punchout_mode) | - S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; - - sctx->last_binning_enabled = true; + struct si_screen *sscreen = sctx->screen; + struct si_state_blend *blend = sctx->queued.named.blend; + struct si_state_dsa *dsa = sctx->queued.named.dsa; + unsigned db_shader_control = sctx->ps_db_shader_control; + + assert(sctx->chip_class >= GFX9); + + if (!sscreen->dpbb_allowed || sctx->dpbb_force_off) { + si_emit_dpbb_disable(sctx); + return; + } + + bool ps_can_kill = + G_02880C_KILL_ENABLE(db_shader_control) || G_02880C_MASK_EXPORT_ENABLE(db_shader_control) || + G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) || blend->alpha_to_coverage; + + bool db_can_reject_z_trivially = !G_02880C_Z_EXPORT_ENABLE(db_shader_control) || + G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control) || + G_02880C_DEPTH_BEFORE_SHADER(db_shader_control); + + /* Disable DPBB when it's believed to be inefficient. */ + if (sscreen->info.num_render_backends > 4 && ps_can_kill && db_can_reject_z_trivially && + sctx->framebuffer.state.zsbuf && dsa->db_can_write) { + si_emit_dpbb_disable(sctx); + return; + } + + /* Compute the bin size. */ + /* TODO: We could also look at enabled pixel shader outputs. */ + unsigned cb_target_enabled_4bit = + sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit; + struct uvec2 color_bin_size, depth_bin_size; + + if (sctx->chip_class >= GFX10) { + gfx10_get_bin_sizes(sctx, cb_target_enabled_4bit, &color_bin_size, &depth_bin_size); + } else { + color_bin_size = si_get_color_bin_size(sctx, cb_target_enabled_4bit); + depth_bin_size = si_get_depth_bin_size(sctx); + } + + unsigned color_area = color_bin_size.x * color_bin_size.y; + unsigned depth_area = depth_bin_size.x * depth_bin_size.y; + + struct uvec2 bin_size = color_area < depth_area ? color_bin_size : depth_bin_size; + + if (!bin_size.x || !bin_size.y) { + si_emit_dpbb_disable(sctx); + return; + } + + /* Enable DFSM if it's preferred. */ + unsigned punchout_mode = V_028060_FORCE_OFF; + bool disable_start_of_prim = true; + bool zs_eqaa_dfsm_bug = + sctx->chip_class == GFX9 && sctx->framebuffer.state.zsbuf && + sctx->framebuffer.nr_samples != MAX2(1, sctx->framebuffer.state.zsbuf->texture->nr_samples); + + if (sscreen->dfsm_allowed && !zs_eqaa_dfsm_bug && cb_target_enabled_4bit && + !G_02880C_KILL_ENABLE(db_shader_control) && + /* These two also imply that DFSM is disabled when PS writes to memory. */ + !G_02880C_EXEC_ON_HIER_FAIL(db_shader_control) && + !G_02880C_EXEC_ON_NOOP(db_shader_control) && + G_02880C_Z_ORDER(db_shader_control) == V_02880C_EARLY_Z_THEN_LATE_Z) { + punchout_mode = V_028060_AUTO; + disable_start_of_prim = (cb_target_enabled_4bit & blend->blend_enable_4bit) != 0; + } + + /* Tunable parameters. Also test with DFSM enabled/disabled. */ + unsigned context_states_per_bin; /* allowed range: [1, 6] */ + unsigned persistent_states_per_bin; /* allowed range: [1, 32] */ + unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */ + + /* Tuned for Raven. Vega might need different values. 
*/ + if (sscreen->info.has_dedicated_vram) { + if (sscreen->info.num_render_backends > 4) { + context_states_per_bin = 1; + persistent_states_per_bin = 1; + } else { + context_states_per_bin = 3; + persistent_states_per_bin = 8; + } + } else { + /* This is a workaround for: + * https://bugs.freedesktop.org/show_bug.cgi?id=110214 + * (an alternative is to insert manual BATCH_BREAK event when + * a context_roll is detected). */ + context_states_per_bin = sctx->screen->info.has_gfx9_scissor_bug ? 1 : 6; + /* Using 32 here can cause GPU hangs on RAVEN1 */ + persistent_states_per_bin = 16; + } + fpovs_per_batch = 63; + + /* Emit registers. */ + struct uvec2 bin_size_extend = {}; + if (bin_size.x >= 32) + bin_size_extend.x = util_logbase2(bin_size.x) - 5; + if (bin_size.y >= 32) + bin_size_extend.y = util_logbase2(bin_size.y) - 5; + + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + radeon_opt_set_context_reg( + sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0, + S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | S_028C44_BIN_SIZE_X(bin_size.x == 16) | + S_028C44_BIN_SIZE_Y(bin_size.y == 16) | S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) | + S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) | + S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin - 1) | + S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin - 1) | + S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) | + S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) | S_028C44_OPTIMAL_BIN_SELECTION(1) | + S_028C44_FLUSH_ON_BINNING_TRANSITION((sctx->family == CHIP_VEGA12 || + sctx->family == CHIP_VEGA20 || + sctx->family >= CHIP_RAVEN2) && + sctx->last_binning_enabled != 1)); + + unsigned db_dfsm_control = + sctx->chip_class >= GFX10 ? R_028038_DB_DFSM_CONTROL : R_028060_DB_DFSM_CONTROL; + radeon_opt_set_context_reg( + sctx, db_dfsm_control, SI_TRACKED_DB_DFSM_CONTROL, + S_028060_PUNCHOUT_MODE(punchout_mode) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; + + sctx->last_binning_enabled = true; } diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index dc6de604d21..7def05440e1 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -22,42 +22,39 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include "ac_debug.h" #include "si_build_pm4.h" #include "sid.h" - #include "util/u_index_modify.h" #include "util/u_log.h" -#include "util/u_upload_mgr.h" #include "util/u_prim.h" #include "util/u_suballoc.h" - -#include "ac_debug.h" +#include "util/u_upload_mgr.h" /* special primitive types */ -#define SI_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX +#define SI_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX static unsigned si_conv_pipe_prim(unsigned mode) { - static const unsigned prim_conv[] = { - [PIPE_PRIM_POINTS] = V_008958_DI_PT_POINTLIST, - [PIPE_PRIM_LINES] = V_008958_DI_PT_LINELIST, - [PIPE_PRIM_LINE_LOOP] = V_008958_DI_PT_LINELOOP, - [PIPE_PRIM_LINE_STRIP] = V_008958_DI_PT_LINESTRIP, - [PIPE_PRIM_TRIANGLES] = V_008958_DI_PT_TRILIST, - [PIPE_PRIM_TRIANGLE_STRIP] = V_008958_DI_PT_TRISTRIP, - [PIPE_PRIM_TRIANGLE_FAN] = V_008958_DI_PT_TRIFAN, - [PIPE_PRIM_QUADS] = V_008958_DI_PT_QUADLIST, - [PIPE_PRIM_QUAD_STRIP] = V_008958_DI_PT_QUADSTRIP, - [PIPE_PRIM_POLYGON] = V_008958_DI_PT_POLYGON, - [PIPE_PRIM_LINES_ADJACENCY] = V_008958_DI_PT_LINELIST_ADJ, - [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_008958_DI_PT_LINESTRIP_ADJ, - [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_008958_DI_PT_TRILIST_ADJ, - [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_008958_DI_PT_TRISTRIP_ADJ, - [PIPE_PRIM_PATCHES] = V_008958_DI_PT_PATCH, - [SI_PRIM_RECTANGLE_LIST] = V_008958_DI_PT_RECTLIST - }; - assert(mode < ARRAY_SIZE(prim_conv)); - return prim_conv[mode]; + static const unsigned prim_conv[] = { + [PIPE_PRIM_POINTS] = V_008958_DI_PT_POINTLIST, + [PIPE_PRIM_LINES] = V_008958_DI_PT_LINELIST, + [PIPE_PRIM_LINE_LOOP] = V_008958_DI_PT_LINELOOP, + [PIPE_PRIM_LINE_STRIP] = V_008958_DI_PT_LINESTRIP, + [PIPE_PRIM_TRIANGLES] = V_008958_DI_PT_TRILIST, + [PIPE_PRIM_TRIANGLE_STRIP] = V_008958_DI_PT_TRISTRIP, + [PIPE_PRIM_TRIANGLE_FAN] = V_008958_DI_PT_TRIFAN, + [PIPE_PRIM_QUADS] = V_008958_DI_PT_QUADLIST, + [PIPE_PRIM_QUAD_STRIP] = V_008958_DI_PT_QUADSTRIP, + [PIPE_PRIM_POLYGON] = V_008958_DI_PT_POLYGON, + [PIPE_PRIM_LINES_ADJACENCY] = V_008958_DI_PT_LINELIST_ADJ, + [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_008958_DI_PT_LINESTRIP_ADJ, + [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_008958_DI_PT_TRILIST_ADJ, + [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_008958_DI_PT_TRISTRIP_ADJ, + [PIPE_PRIM_PATCHES] = V_008958_DI_PT_PATCH, + [SI_PRIM_RECTANGLE_LIST] = V_008958_DI_PT_RECTLIST}; + assert(mode < ARRAY_SIZE(prim_conv)); + return prim_conv[mode]; } /** @@ -67,652 +64,597 @@ static unsigned si_conv_pipe_prim(unsigned mode) * The information about LDS and other non-compile-time parameters is then * written to userdata SGPRs. */ -static void si_emit_derived_tess_state(struct si_context *sctx, - const struct pipe_draw_info *info, - unsigned *num_patches) +static void si_emit_derived_tess_state(struct si_context *sctx, const struct pipe_draw_info *info, + unsigned *num_patches) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct si_shader *ls_current; - struct si_shader_selector *ls; - /* The TES pointer will only be used for sctx->last_tcs. - * It would be wrong to think that TCS = TES. */ - struct si_shader_selector *tcs = - sctx->tcs_shader.cso ? 
sctx->tcs_shader.cso : sctx->tes_shader.cso; - unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id; - bool has_primid_instancing_bug = sctx->chip_class == GFX6 && - sctx->screen->info.max_se == 1; - unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL]; - unsigned num_tcs_input_cp = info->vertices_per_patch; - unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs; - unsigned num_tcs_patch_outputs; - unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size; - unsigned input_patch_size, output_patch_size, output_patch0_offset; - unsigned perpatch_output_offset, lds_size; - unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets; - unsigned offchip_layout, hardware_lds_size, ls_hs_config; - - /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */ - if (sctx->chip_class >= GFX9) { - if (sctx->tcs_shader.cso) - ls_current = sctx->tcs_shader.current; - else - ls_current = sctx->fixed_func_tcs_shader.current; - - ls = ls_current->key.part.tcs.ls; - } else { - ls_current = sctx->vs_shader.current; - ls = sctx->vs_shader.cso; - } - - if (sctx->last_ls == ls_current && - sctx->last_tcs == tcs && - sctx->last_tes_sh_base == tes_sh_base && - sctx->last_num_tcs_input_cp == num_tcs_input_cp && - (!has_primid_instancing_bug || - (sctx->last_tess_uses_primid == tess_uses_primid))) { - *num_patches = sctx->last_num_patches; - return; - } - - sctx->last_ls = ls_current; - sctx->last_tcs = tcs; - sctx->last_tes_sh_base = tes_sh_base; - sctx->last_num_tcs_input_cp = num_tcs_input_cp; - sctx->last_tess_uses_primid = tess_uses_primid; - - /* This calculates how shader inputs and outputs among VS, TCS, and TES - * are laid out in LDS. */ - num_tcs_inputs = util_last_bit64(ls->outputs_written); - - if (sctx->tcs_shader.cso) { - num_tcs_outputs = util_last_bit64(tcs->outputs_written); - num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; - num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written); - } else { - /* No TCS. Route varyings from LS to TES. */ - num_tcs_outputs = num_tcs_inputs; - num_tcs_output_cp = num_tcs_input_cp; - num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */ - } - - input_vertex_size = ls->lshs_vertex_stride; - output_vertex_size = num_tcs_outputs * 16; - - input_patch_size = num_tcs_input_cp * input_vertex_size; - - pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size; - output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; - - /* Ensure that we only need one wave per SIMD so we don't need to check - * resource usage. Also ensures that the number of tcs in and out - * vertices per threadgroup are at most 256. - */ - unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp); - *num_patches = 256 / max_verts_per_patch; - - /* Make sure that the data fits in LDS. This assumes the shaders only - * use LDS for the inputs and outputs. - * - * While GFX7 can use 64K per threadgroup, there is a hang on Stoney - * with 2 CUs if we use more than 32K. The closed Vulkan driver also - * uses 32K at most on all GCN chips. - */ - hardware_lds_size = 32768; - *num_patches = MIN2(*num_patches, hardware_lds_size / (input_patch_size + - output_patch_size)); - - /* Make sure the output data fits in the offchip buffer */ - *num_patches = MIN2(*num_patches, - (sctx->screen->tess_offchip_block_dw_size * 4) / - output_patch_size); - - /* Not necessary for correctness, but improves performance. 
- * The hardware can do more, but the radeonsi shader constant is - * limited to 6 bits. - */ - *num_patches = MIN2(*num_patches, 63); /* triangles: 3 full waves except 3 lanes */ - - /* When distributed tessellation is unsupported, switch between SEs - * at a higher frequency to compensate for it. - */ - if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1) - *num_patches = MIN2(*num_patches, 16); /* recommended */ - - /* Make sure that vector lanes are reasonably occupied. It probably - * doesn't matter much because this is LS-HS, and TES is likely to - * occupy significantly more CUs. - */ - unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch; - unsigned wave_size = sctx->screen->ge_wave_size; - - if (temp_verts_per_tg > wave_size && temp_verts_per_tg % wave_size < wave_size*3/4) - *num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch; - - if (sctx->chip_class == GFX6) { - /* GFX6 bug workaround, related to power management. Limit LS-HS - * threadgroups to only one wave. - */ - unsigned one_wave = wave_size / max_verts_per_patch; - *num_patches = MIN2(*num_patches, one_wave); - } - - /* The VGT HS block increments the patch ID unconditionally - * within a single threadgroup. This results in incorrect - * patch IDs when instanced draws are used. - * - * The intended solution is to restrict threadgroups to - * a single instance by setting SWITCH_ON_EOI, which - * should cause IA to split instances up. However, this - * doesn't work correctly on GFX6 when there is no other - * SE to switch to. - */ - if (has_primid_instancing_bug && tess_uses_primid) - *num_patches = 1; - - sctx->last_num_patches = *num_patches; - - output_patch0_offset = input_patch_size * *num_patches; - perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size; - - /* Compute userdata SGPRs. */ - assert(((input_vertex_size / 4) & ~0xff) == 0); - assert(((output_vertex_size / 4) & ~0xff) == 0); - assert(((input_patch_size / 4) & ~0x1fff) == 0); - assert(((output_patch_size / 4) & ~0x1fff) == 0); - assert(((output_patch0_offset / 16) & ~0xffff) == 0); - assert(((perpatch_output_offset / 16) & ~0xffff) == 0); - assert(num_tcs_input_cp <= 32); - assert(num_tcs_output_cp <= 32); - - uint64_t ring_va = si_resource(sctx->tess_rings)->gpu_address; - assert((ring_va & u_bit_consecutive(0, 19)) == 0); - - tcs_in_layout = S_VS_STATE_LS_OUT_PATCH_SIZE(input_patch_size / 4) | - S_VS_STATE_LS_OUT_VERTEX_SIZE(input_vertex_size / 4); - tcs_out_layout = (output_patch_size / 4) | - (num_tcs_input_cp << 13) | - ring_va; - tcs_out_offsets = (output_patch0_offset / 16) | - ((perpatch_output_offset / 16) << 16); - offchip_layout = *num_patches | - (num_tcs_output_cp << 6) | - (pervertex_output_patch_size * *num_patches << 12); - - /* Compute the LDS size. */ - lds_size = output_patch0_offset + output_patch_size * *num_patches; - - if (sctx->chip_class >= GFX7) { - assert(lds_size <= 65536); - lds_size = align(lds_size, 512) / 512; - } else { - assert(lds_size <= 32768); - lds_size = align(lds_size, 256) / 256; - } - - /* Set SI_SGPR_VS_STATE_BITS. */ - sctx->current_vs_state &= C_VS_STATE_LS_OUT_PATCH_SIZE & - C_VS_STATE_LS_OUT_VERTEX_SIZE; - sctx->current_vs_state |= tcs_in_layout; - - /* We should be able to support in-shader LDS use with LLVM >= 9 - * by just adding the lds_sizes together, but it has never - * been tested. 
*/ - assert(ls_current->config.lds_size == 0); - - if (sctx->chip_class >= GFX9) { - unsigned hs_rsrc2 = ls_current->config.rsrc2; - - if (sctx->chip_class >= GFX10) - hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size); - else - hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size); - - radeon_set_sh_reg(cs, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2); - - /* Set userdata SGPRs for merged LS-HS. */ - radeon_set_sh_reg_seq(cs, - R_00B430_SPI_SHADER_USER_DATA_LS_0 + - GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3); - radeon_emit(cs, offchip_layout); - radeon_emit(cs, tcs_out_offsets); - radeon_emit(cs, tcs_out_layout); - } else { - unsigned ls_rsrc2 = ls_current->config.rsrc2; - - si_multiwave_lds_size_workaround(sctx->screen, &lds_size); - ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size); - - /* Due to a hw bug, RSRC2_LS must be written twice with another - * LS register written in between. */ - if (sctx->chip_class == GFX7 && sctx->family != CHIP_HAWAII) - radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2); - radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2); - radeon_emit(cs, ls_current->config.rsrc1); - radeon_emit(cs, ls_rsrc2); - - /* Set userdata SGPRs for TCS. */ - radeon_set_sh_reg_seq(cs, - R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4); - radeon_emit(cs, offchip_layout); - radeon_emit(cs, tcs_out_offsets); - radeon_emit(cs, tcs_out_layout); - radeon_emit(cs, tcs_in_layout); - } - - /* Set userdata SGPRs for TES. */ - radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2); - radeon_emit(cs, offchip_layout); - radeon_emit(cs, ring_va); - - ls_hs_config = S_028B58_NUM_PATCHES(*num_patches) | - S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) | - S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp); - - if (sctx->last_ls_hs_config != ls_hs_config) { - if (sctx->chip_class >= GFX7) { - radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2, - ls_hs_config); - } else { - radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, - ls_hs_config); - } - sctx->last_ls_hs_config = ls_hs_config; - sctx->context_roll = true; - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct si_shader *ls_current; + struct si_shader_selector *ls; + /* The TES pointer will only be used for sctx->last_tcs. + * It would be wrong to think that TCS = TES. */ + struct si_shader_selector *tcs = + sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso; + unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id; + bool has_primid_instancing_bug = sctx->chip_class == GFX6 && sctx->screen->info.max_se == 1; + unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL]; + unsigned num_tcs_input_cp = info->vertices_per_patch; + unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs; + unsigned num_tcs_patch_outputs; + unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size; + unsigned input_patch_size, output_patch_size, output_patch0_offset; + unsigned perpatch_output_offset, lds_size; + unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets; + unsigned offchip_layout, hardware_lds_size, ls_hs_config; + + /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. 
*/ + if (sctx->chip_class >= GFX9) { + if (sctx->tcs_shader.cso) + ls_current = sctx->tcs_shader.current; + else + ls_current = sctx->fixed_func_tcs_shader.current; + + ls = ls_current->key.part.tcs.ls; + } else { + ls_current = sctx->vs_shader.current; + ls = sctx->vs_shader.cso; + } + + if (sctx->last_ls == ls_current && sctx->last_tcs == tcs && + sctx->last_tes_sh_base == tes_sh_base && sctx->last_num_tcs_input_cp == num_tcs_input_cp && + (!has_primid_instancing_bug || (sctx->last_tess_uses_primid == tess_uses_primid))) { + *num_patches = sctx->last_num_patches; + return; + } + + sctx->last_ls = ls_current; + sctx->last_tcs = tcs; + sctx->last_tes_sh_base = tes_sh_base; + sctx->last_num_tcs_input_cp = num_tcs_input_cp; + sctx->last_tess_uses_primid = tess_uses_primid; + + /* This calculates how shader inputs and outputs among VS, TCS, and TES + * are laid out in LDS. */ + num_tcs_inputs = util_last_bit64(ls->outputs_written); + + if (sctx->tcs_shader.cso) { + num_tcs_outputs = util_last_bit64(tcs->outputs_written); + num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; + num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written); + } else { + /* No TCS. Route varyings from LS to TES. */ + num_tcs_outputs = num_tcs_inputs; + num_tcs_output_cp = num_tcs_input_cp; + num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */ + } + + input_vertex_size = ls->lshs_vertex_stride; + output_vertex_size = num_tcs_outputs * 16; + + input_patch_size = num_tcs_input_cp * input_vertex_size; + + pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size; + output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; + + /* Ensure that we only need one wave per SIMD so we don't need to check + * resource usage. Also ensures that the number of tcs in and out + * vertices per threadgroup are at most 256. + */ + unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp); + *num_patches = 256 / max_verts_per_patch; + + /* Make sure that the data fits in LDS. This assumes the shaders only + * use LDS for the inputs and outputs. + * + * While GFX7 can use 64K per threadgroup, there is a hang on Stoney + * with 2 CUs if we use more than 32K. The closed Vulkan driver also + * uses 32K at most on all GCN chips. + */ + hardware_lds_size = 32768; + *num_patches = MIN2(*num_patches, hardware_lds_size / (input_patch_size + output_patch_size)); + + /* Make sure the output data fits in the offchip buffer */ + *num_patches = + MIN2(*num_patches, (sctx->screen->tess_offchip_block_dw_size * 4) / output_patch_size); + + /* Not necessary for correctness, but improves performance. + * The hardware can do more, but the radeonsi shader constant is + * limited to 6 bits. + */ + *num_patches = MIN2(*num_patches, 63); /* triangles: 3 full waves except 3 lanes */ + + /* When distributed tessellation is unsupported, switch between SEs + * at a higher frequency to compensate for it. + */ + if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1) + *num_patches = MIN2(*num_patches, 16); /* recommended */ + + /* Make sure that vector lanes are reasonably occupied. It probably + * doesn't matter much because this is LS-HS, and TES is likely to + * occupy significantly more CUs. 
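To make the clamping above concrete with purely hypothetical sizes (none of these numbers come from this diff): 3 input and 3 output control points, a 64-byte input vertex, 4 TCS outputs (64-byte output vertex) and 2 patch outputs give input_patch_size = 192 and output_patch_size = 3 * 64 + 2 * 16 = 224. The 256-vertex-per-threadgroup limit allows 256 / 3 = 85 patches, the 32K LDS limit allows 32768 / (192 + 224) = 78, and the 6-bit shader constant caps the result at 63, assuming the off-chip ring limit is larger.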
+ */ + unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch; + unsigned wave_size = sctx->screen->ge_wave_size; + + if (temp_verts_per_tg > wave_size && temp_verts_per_tg % wave_size < wave_size * 3 / 4) + *num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch; + + if (sctx->chip_class == GFX6) { + /* GFX6 bug workaround, related to power management. Limit LS-HS + * threadgroups to only one wave. + */ + unsigned one_wave = wave_size / max_verts_per_patch; + *num_patches = MIN2(*num_patches, one_wave); + } + + /* The VGT HS block increments the patch ID unconditionally + * within a single threadgroup. This results in incorrect + * patch IDs when instanced draws are used. + * + * The intended solution is to restrict threadgroups to + * a single instance by setting SWITCH_ON_EOI, which + * should cause IA to split instances up. However, this + * doesn't work correctly on GFX6 when there is no other + * SE to switch to. + */ + if (has_primid_instancing_bug && tess_uses_primid) + *num_patches = 1; + + sctx->last_num_patches = *num_patches; + + output_patch0_offset = input_patch_size * *num_patches; + perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size; + + /* Compute userdata SGPRs. */ + assert(((input_vertex_size / 4) & ~0xff) == 0); + assert(((output_vertex_size / 4) & ~0xff) == 0); + assert(((input_patch_size / 4) & ~0x1fff) == 0); + assert(((output_patch_size / 4) & ~0x1fff) == 0); + assert(((output_patch0_offset / 16) & ~0xffff) == 0); + assert(((perpatch_output_offset / 16) & ~0xffff) == 0); + assert(num_tcs_input_cp <= 32); + assert(num_tcs_output_cp <= 32); + + uint64_t ring_va = si_resource(sctx->tess_rings)->gpu_address; + assert((ring_va & u_bit_consecutive(0, 19)) == 0); + + tcs_in_layout = S_VS_STATE_LS_OUT_PATCH_SIZE(input_patch_size / 4) | + S_VS_STATE_LS_OUT_VERTEX_SIZE(input_vertex_size / 4); + tcs_out_layout = (output_patch_size / 4) | (num_tcs_input_cp << 13) | ring_va; + tcs_out_offsets = (output_patch0_offset / 16) | ((perpatch_output_offset / 16) << 16); + offchip_layout = + *num_patches | (num_tcs_output_cp << 6) | (pervertex_output_patch_size * *num_patches << 12); + + /* Compute the LDS size. */ + lds_size = output_patch0_offset + output_patch_size * *num_patches; + + if (sctx->chip_class >= GFX7) { + assert(lds_size <= 65536); + lds_size = align(lds_size, 512) / 512; + } else { + assert(lds_size <= 32768); + lds_size = align(lds_size, 256) / 256; + } + + /* Set SI_SGPR_VS_STATE_BITS. */ + sctx->current_vs_state &= C_VS_STATE_LS_OUT_PATCH_SIZE & C_VS_STATE_LS_OUT_VERTEX_SIZE; + sctx->current_vs_state |= tcs_in_layout; + + /* We should be able to support in-shader LDS use with LLVM >= 9 + * by just adding the lds_sizes together, but it has never + * been tested. */ + assert(ls_current->config.lds_size == 0); + + if (sctx->chip_class >= GFX9) { + unsigned hs_rsrc2 = ls_current->config.rsrc2; + + if (sctx->chip_class >= GFX10) + hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size); + else + hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size); + + radeon_set_sh_reg(cs, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2); + + /* Set userdata SGPRs for merged LS-HS. 
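Continuing those hypothetical numbers: output_patch0_offset = 192 * 63 = 12096, so lds_size = 12096 + 224 * 63 = 26208 bytes, which chips from GFX7 on encode in 512-byte units as align(26208, 512) / 512 = 52, the value ORed into the LDS_SIZE field of SPI_SHADER_PGM_RSRC2_HS in the merged LS-HS path above.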
*/ + radeon_set_sh_reg_seq( + cs, R_00B430_SPI_SHADER_USER_DATA_LS_0 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3); + radeon_emit(cs, offchip_layout); + radeon_emit(cs, tcs_out_offsets); + radeon_emit(cs, tcs_out_layout); + } else { + unsigned ls_rsrc2 = ls_current->config.rsrc2; + + si_multiwave_lds_size_workaround(sctx->screen, &lds_size); + ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size); + + /* Due to a hw bug, RSRC2_LS must be written twice with another + * LS register written in between. */ + if (sctx->chip_class == GFX7 && sctx->family != CHIP_HAWAII) + radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2); + radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2); + radeon_emit(cs, ls_current->config.rsrc1); + radeon_emit(cs, ls_rsrc2); + + /* Set userdata SGPRs for TCS. */ + radeon_set_sh_reg_seq( + cs, R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4); + radeon_emit(cs, offchip_layout); + radeon_emit(cs, tcs_out_offsets); + radeon_emit(cs, tcs_out_layout); + radeon_emit(cs, tcs_in_layout); + } + + /* Set userdata SGPRs for TES. */ + radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2); + radeon_emit(cs, offchip_layout); + radeon_emit(cs, ring_va); + + ls_hs_config = S_028B58_NUM_PATCHES(*num_patches) | S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) | + S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp); + + if (sctx->last_ls_hs_config != ls_hs_config) { + if (sctx->chip_class >= GFX7) { + radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config); + } else { + radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config); + } + sctx->last_ls_hs_config = ls_hs_config; + sctx->context_roll = true; + } } static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info, - enum pipe_prim_type prim) + enum pipe_prim_type prim) { - switch (prim) { - case PIPE_PRIM_PATCHES: - return info->count / info->vertices_per_patch; - case PIPE_PRIM_POLYGON: - return info->count >= 3; - case SI_PRIM_RECTANGLE_LIST: - return info->count / 3; - default: - return u_decomposed_prims_for_vertices(prim, info->count); - } + switch (prim) { + case PIPE_PRIM_PATCHES: + return info->count / info->vertices_per_patch; + case PIPE_PRIM_POLYGON: + return info->count >= 3; + case SI_PRIM_RECTANGLE_LIST: + return info->count / 3; + default: + return u_decomposed_prims_for_vertices(prim, info->count); + } } -static unsigned -si_get_init_multi_vgt_param(struct si_screen *sscreen, - union si_vgt_param_key *key) +static unsigned si_get_init_multi_vgt_param(struct si_screen *sscreen, union si_vgt_param_key *key) { - STATIC_ASSERT(sizeof(union si_vgt_param_key) == 4); - unsigned max_primgroup_in_wave = 2; - - /* SWITCH_ON_EOP(0) is always preferable. */ - bool wd_switch_on_eop = false; - bool ia_switch_on_eop = false; - bool ia_switch_on_eoi = false; - bool partial_vs_wave = false; - bool partial_es_wave = false; - - if (key->u.uses_tess) { - /* SWITCH_ON_EOI must be set if PrimID is used. */ - if (key->u.tess_uses_prim_id) - ia_switch_on_eoi = true; - - /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */ - if ((sscreen->info.family == CHIP_TAHITI || - sscreen->info.family == CHIP_PITCAIRN || - sscreen->info.family == CHIP_BONAIRE) && - key->u.uses_gs) - partial_vs_wave = true; - - /* Needed for 028B6C_DISTRIBUTION_MODE != 0. 
(implies >= GFX8) */ - if (sscreen->info.has_distributed_tess) { - if (key->u.uses_gs) { - if (sscreen->info.chip_class == GFX8) - partial_es_wave = true; - } else { - partial_vs_wave = true; - } - } - } - - /* This is a hardware requirement. */ - if (key->u.line_stipple_enabled || - (sscreen->debug_flags & DBG(SWITCH_ON_EOP))) { - ia_switch_on_eop = true; - wd_switch_on_eop = true; - } - - if (sscreen->info.chip_class >= GFX7) { - /* WD_SWITCH_ON_EOP has no effect on GPUs with less than - * 4 shader engines. Set 1 to pass the assertion below. - * The other cases are hardware requirements. - * - * Polaris supports primitive restart with WD_SWITCH_ON_EOP=0 - * for points, line strips, and tri strips. - */ - if (sscreen->info.max_se <= 2 || - key->u.prim == PIPE_PRIM_POLYGON || - key->u.prim == PIPE_PRIM_LINE_LOOP || - key->u.prim == PIPE_PRIM_TRIANGLE_FAN || - key->u.prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY || - (key->u.primitive_restart && - (sscreen->info.family < CHIP_POLARIS10 || - (key->u.prim != PIPE_PRIM_POINTS && - key->u.prim != PIPE_PRIM_LINE_STRIP && - key->u.prim != PIPE_PRIM_TRIANGLE_STRIP))) || - key->u.count_from_stream_output) - wd_switch_on_eop = true; - - /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0. - * We don't know that for indirect drawing, so treat it as - * always problematic. */ - if (sscreen->info.family == CHIP_HAWAII && - key->u.uses_instancing) - wd_switch_on_eop = true; - - /* Performance recommendation for 4 SE Gfx7-8 parts if - * instances are smaller than a primgroup. - * Assume indirect draws always use small instances. - * This is needed for good VS wave utilization. - */ - if (sscreen->info.chip_class <= GFX8 && - sscreen->info.max_se == 4 && - key->u.multi_instances_smaller_than_primgroup) - wd_switch_on_eop = true; - - /* Required on GFX7 and later. */ - if (sscreen->info.max_se == 4 && !wd_switch_on_eop) - ia_switch_on_eoi = true; - - /* HW engineers suggested that PARTIAL_VS_WAVE_ON should be set - * to work around a GS hang. - */ - if (key->u.uses_gs && - (sscreen->info.family == CHIP_TONGA || - sscreen->info.family == CHIP_FIJI || - sscreen->info.family == CHIP_POLARIS10 || - sscreen->info.family == CHIP_POLARIS11 || - sscreen->info.family == CHIP_POLARIS12 || - sscreen->info.family == CHIP_VEGAM)) - partial_vs_wave = true; - - /* Required by Hawaii and, for some special cases, by GFX8. */ - if (ia_switch_on_eoi && - (sscreen->info.family == CHIP_HAWAII || - (sscreen->info.chip_class == GFX8 && - (key->u.uses_gs || max_primgroup_in_wave != 2)))) - partial_vs_wave = true; - - /* Instancing bug on Bonaire. */ - if (sscreen->info.family == CHIP_BONAIRE && ia_switch_on_eoi && - key->u.uses_instancing) - partial_vs_wave = true; - - /* This only applies to Polaris10 and later 4 SE chips. - * wd_switch_on_eop is already true on all other chips. - */ - if (!wd_switch_on_eop && key->u.primitive_restart) - partial_vs_wave = true; - - /* If the WD switch is false, the IA switch must be false too. */ - assert(wd_switch_on_eop || !ia_switch_on_eop); - } - - /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */ - if (sscreen->info.chip_class <= GFX8 && ia_switch_on_eoi) - partial_es_wave = true; - - return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | - S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) | - S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) | - S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) | - S_028AA8_WD_SWITCH_ON_EOP(sscreen->info.chip_class >= GFX7 ? 
wd_switch_on_eop : 0) | - /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */ - S_028AA8_MAX_PRIMGRP_IN_WAVE(sscreen->info.chip_class == GFX8 ? - max_primgroup_in_wave : 0) | - S_030960_EN_INST_OPT_BASIC(sscreen->info.chip_class >= GFX9) | - S_030960_EN_INST_OPT_ADV(sscreen->info.chip_class >= GFX9); + STATIC_ASSERT(sizeof(union si_vgt_param_key) == 4); + unsigned max_primgroup_in_wave = 2; + + /* SWITCH_ON_EOP(0) is always preferable. */ + bool wd_switch_on_eop = false; + bool ia_switch_on_eop = false; + bool ia_switch_on_eoi = false; + bool partial_vs_wave = false; + bool partial_es_wave = false; + + if (key->u.uses_tess) { + /* SWITCH_ON_EOI must be set if PrimID is used. */ + if (key->u.tess_uses_prim_id) + ia_switch_on_eoi = true; + + /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */ + if ((sscreen->info.family == CHIP_TAHITI || sscreen->info.family == CHIP_PITCAIRN || + sscreen->info.family == CHIP_BONAIRE) && + key->u.uses_gs) + partial_vs_wave = true; + + /* Needed for 028B6C_DISTRIBUTION_MODE != 0. (implies >= GFX8) */ + if (sscreen->info.has_distributed_tess) { + if (key->u.uses_gs) { + if (sscreen->info.chip_class == GFX8) + partial_es_wave = true; + } else { + partial_vs_wave = true; + } + } + } + + /* This is a hardware requirement. */ + if (key->u.line_stipple_enabled || (sscreen->debug_flags & DBG(SWITCH_ON_EOP))) { + ia_switch_on_eop = true; + wd_switch_on_eop = true; + } + + if (sscreen->info.chip_class >= GFX7) { + /* WD_SWITCH_ON_EOP has no effect on GPUs with less than + * 4 shader engines. Set 1 to pass the assertion below. + * The other cases are hardware requirements. + * + * Polaris supports primitive restart with WD_SWITCH_ON_EOP=0 + * for points, line strips, and tri strips. + */ + if (sscreen->info.max_se <= 2 || key->u.prim == PIPE_PRIM_POLYGON || + key->u.prim == PIPE_PRIM_LINE_LOOP || key->u.prim == PIPE_PRIM_TRIANGLE_FAN || + key->u.prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY || + (key->u.primitive_restart && + (sscreen->info.family < CHIP_POLARIS10 || + (key->u.prim != PIPE_PRIM_POINTS && key->u.prim != PIPE_PRIM_LINE_STRIP && + key->u.prim != PIPE_PRIM_TRIANGLE_STRIP))) || + key->u.count_from_stream_output) + wd_switch_on_eop = true; + + /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0. + * We don't know that for indirect drawing, so treat it as + * always problematic. */ + if (sscreen->info.family == CHIP_HAWAII && key->u.uses_instancing) + wd_switch_on_eop = true; + + /* Performance recommendation for 4 SE Gfx7-8 parts if + * instances are smaller than a primgroup. + * Assume indirect draws always use small instances. + * This is needed for good VS wave utilization. + */ + if (sscreen->info.chip_class <= GFX8 && sscreen->info.max_se == 4 && + key->u.multi_instances_smaller_than_primgroup) + wd_switch_on_eop = true; + + /* Required on GFX7 and later. */ + if (sscreen->info.max_se == 4 && !wd_switch_on_eop) + ia_switch_on_eoi = true; + + /* HW engineers suggested that PARTIAL_VS_WAVE_ON should be set + * to work around a GS hang. + */ + if (key->u.uses_gs && + (sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI || + sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 || + sscreen->info.family == CHIP_POLARIS12 || sscreen->info.family == CHIP_VEGAM)) + partial_vs_wave = true; + + /* Required by Hawaii and, for some special cases, by GFX8. 
*/ + if (ia_switch_on_eoi && + (sscreen->info.family == CHIP_HAWAII || + (sscreen->info.chip_class == GFX8 && (key->u.uses_gs || max_primgroup_in_wave != 2)))) + partial_vs_wave = true; + + /* Instancing bug on Bonaire. */ + if (sscreen->info.family == CHIP_BONAIRE && ia_switch_on_eoi && key->u.uses_instancing) + partial_vs_wave = true; + + /* This only applies to Polaris10 and later 4 SE chips. + * wd_switch_on_eop is already true on all other chips. + */ + if (!wd_switch_on_eop && key->u.primitive_restart) + partial_vs_wave = true; + + /* If the WD switch is false, the IA switch must be false too. */ + assert(wd_switch_on_eop || !ia_switch_on_eop); + } + + /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */ + if (sscreen->info.chip_class <= GFX8 && ia_switch_on_eoi) + partial_es_wave = true; + + return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) | + S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) | + S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) | + S_028AA8_WD_SWITCH_ON_EOP(sscreen->info.chip_class >= GFX7 ? wd_switch_on_eop : 0) | + /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */ + S_028AA8_MAX_PRIMGRP_IN_WAVE(sscreen->info.chip_class == GFX8 ? max_primgroup_in_wave + : 0) | + S_030960_EN_INST_OPT_BASIC(sscreen->info.chip_class >= GFX9) | + S_030960_EN_INST_OPT_ADV(sscreen->info.chip_class >= GFX9); } static void si_init_ia_multi_vgt_param_table(struct si_context *sctx) { - for (int prim = 0; prim <= SI_PRIM_RECTANGLE_LIST; prim++) - for (int uses_instancing = 0; uses_instancing < 2; uses_instancing++) - for (int multi_instances = 0; multi_instances < 2; multi_instances++) - for (int primitive_restart = 0; primitive_restart < 2; primitive_restart++) - for (int count_from_so = 0; count_from_so < 2; count_from_so++) - for (int line_stipple = 0; line_stipple < 2; line_stipple++) - for (int uses_tess = 0; uses_tess < 2; uses_tess++) - for (int tess_uses_primid = 0; tess_uses_primid < 2; tess_uses_primid++) - for (int uses_gs = 0; uses_gs < 2; uses_gs++) { - union si_vgt_param_key key; - - key.index = 0; - key.u.prim = prim; - key.u.uses_instancing = uses_instancing; - key.u.multi_instances_smaller_than_primgroup = multi_instances; - key.u.primitive_restart = primitive_restart; - key.u.count_from_stream_output = count_from_so; - key.u.line_stipple_enabled = line_stipple; - key.u.uses_tess = uses_tess; - key.u.tess_uses_prim_id = tess_uses_primid; - key.u.uses_gs = uses_gs; - - sctx->ia_multi_vgt_param[key.index] = - si_get_init_multi_vgt_param(sctx->screen, &key); - } + for (int prim = 0; prim <= SI_PRIM_RECTANGLE_LIST; prim++) + for (int uses_instancing = 0; uses_instancing < 2; uses_instancing++) + for (int multi_instances = 0; multi_instances < 2; multi_instances++) + for (int primitive_restart = 0; primitive_restart < 2; primitive_restart++) + for (int count_from_so = 0; count_from_so < 2; count_from_so++) + for (int line_stipple = 0; line_stipple < 2; line_stipple++) + for (int uses_tess = 0; uses_tess < 2; uses_tess++) + for (int tess_uses_primid = 0; tess_uses_primid < 2; tess_uses_primid++) + for (int uses_gs = 0; uses_gs < 2; uses_gs++) { + union si_vgt_param_key key; + + key.index = 0; + key.u.prim = prim; + key.u.uses_instancing = uses_instancing; + key.u.multi_instances_smaller_than_primgroup = multi_instances; + key.u.primitive_restart = primitive_restart; + key.u.count_from_stream_output = count_from_so; + key.u.line_stipple_enabled = line_stipple; + key.u.uses_tess = uses_tess; + key.u.tess_uses_prim_id = 
tess_uses_primid; + key.u.uses_gs = uses_gs; + + sctx->ia_multi_vgt_param[key.index] = + si_get_init_multi_vgt_param(sctx->screen, &key); + } } static bool si_is_line_stipple_enabled(struct si_context *sctx) { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - return rs->line_stipple_enable && - sctx->current_rast_prim != PIPE_PRIM_POINTS && - (rs->polygon_mode_is_lines || - util_prim_is_lines(sctx->current_rast_prim)); + return rs->line_stipple_enable && sctx->current_rast_prim != PIPE_PRIM_POINTS && + (rs->polygon_mode_is_lines || util_prim_is_lines(sctx->current_rast_prim)); } static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, - const struct pipe_draw_info *info, - enum pipe_prim_type prim, - unsigned num_patches, - unsigned instance_count, - bool primitive_restart) + const struct pipe_draw_info *info, + enum pipe_prim_type prim, unsigned num_patches, + unsigned instance_count, bool primitive_restart) { - union si_vgt_param_key key = sctx->ia_multi_vgt_param_key; - unsigned primgroup_size; - unsigned ia_multi_vgt_param; - - if (sctx->tes_shader.cso) { - primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */ - } else if (sctx->gs_shader.cso) { - primgroup_size = 64; /* recommended with a GS */ - } else { - primgroup_size = 128; /* recommended without a GS and tess */ - } - - key.u.prim = prim; - key.u.uses_instancing = info->indirect || instance_count > 1; - key.u.multi_instances_smaller_than_primgroup = - info->indirect || - (instance_count > 1 && - (info->count_from_stream_output || - si_num_prims_for_vertices(info, prim) < primgroup_size)); - key.u.primitive_restart = primitive_restart; - key.u.count_from_stream_output = info->count_from_stream_output != NULL; - key.u.line_stipple_enabled = si_is_line_stipple_enabled(sctx); - - ia_multi_vgt_param = sctx->ia_multi_vgt_param[key.index] | - S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1); - - if (sctx->gs_shader.cso) { - /* GS requirement. */ - if (sctx->chip_class <= GFX8 && - SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3) - ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1); - - /* GS hw bug with single-primitive instances and SWITCH_ON_EOI. - * The hw doc says all multi-SE chips are affected, but Vulkan - * only applies it to Hawaii. Do what Vulkan does. 
- */ - if (sctx->family == CHIP_HAWAII && - G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) && - (info->indirect || - (instance_count > 1 && - (info->count_from_stream_output || - si_num_prims_for_vertices(info, prim) <= 1)))) - sctx->flags |= SI_CONTEXT_VGT_FLUSH; - } - - return ia_multi_vgt_param; + union si_vgt_param_key key = sctx->ia_multi_vgt_param_key; + unsigned primgroup_size; + unsigned ia_multi_vgt_param; + + if (sctx->tes_shader.cso) { + primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */ + } else if (sctx->gs_shader.cso) { + primgroup_size = 64; /* recommended with a GS */ + } else { + primgroup_size = 128; /* recommended without a GS and tess */ + } + + key.u.prim = prim; + key.u.uses_instancing = info->indirect || instance_count > 1; + key.u.multi_instances_smaller_than_primgroup = + info->indirect || + (instance_count > 1 && + (info->count_from_stream_output || si_num_prims_for_vertices(info, prim) < primgroup_size)); + key.u.primitive_restart = primitive_restart; + key.u.count_from_stream_output = info->count_from_stream_output != NULL; + key.u.line_stipple_enabled = si_is_line_stipple_enabled(sctx); + + ia_multi_vgt_param = + sctx->ia_multi_vgt_param[key.index] | S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1); + + if (sctx->gs_shader.cso) { + /* GS requirement. */ + if (sctx->chip_class <= GFX8 && + SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3) + ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1); + + /* GS hw bug with single-primitive instances and SWITCH_ON_EOI. + * The hw doc says all multi-SE chips are affected, but Vulkan + * only applies it to Hawaii. Do what Vulkan does. + */ + if (sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) && + (info->indirect || (instance_count > 1 && (info->count_from_stream_output || + si_num_prims_for_vertices(info, prim) <= 1)))) + sctx->flags |= SI_CONTEXT_VGT_FLUSH; + } + + return ia_multi_vgt_param; } static unsigned si_conv_prim_to_gs_out(unsigned mode) { - static const int prim_conv[] = { - [PIPE_PRIM_POINTS] = V_028A6C_OUTPRIM_TYPE_POINTLIST, - [PIPE_PRIM_LINES] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, - [PIPE_PRIM_LINE_LOOP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, - [PIPE_PRIM_LINE_STRIP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, - [PIPE_PRIM_TRIANGLES] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_TRIANGLE_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_TRIANGLE_FAN] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_QUADS] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_QUAD_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_POLYGON] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_LINES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, - [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, - [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, - [PIPE_PRIM_PATCHES] = V_028A6C_OUTPRIM_TYPE_POINTLIST, - [SI_PRIM_RECTANGLE_LIST] = V_028A6C_VGT_OUT_RECT_V0, - }; - assert(mode < ARRAY_SIZE(prim_conv)); - - return prim_conv[mode]; + static const int prim_conv[] = { + [PIPE_PRIM_POINTS] = V_028A6C_OUTPRIM_TYPE_POINTLIST, + [PIPE_PRIM_LINES] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, + [PIPE_PRIM_LINE_LOOP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, + [PIPE_PRIM_LINE_STRIP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, + [PIPE_PRIM_TRIANGLES] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_TRIANGLE_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_TRIANGLE_FAN] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_QUADS] = 
V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_QUAD_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_POLYGON] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_LINES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, + [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, + [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, + [PIPE_PRIM_PATCHES] = V_028A6C_OUTPRIM_TYPE_POINTLIST, + [SI_PRIM_RECTANGLE_LIST] = V_028A6C_VGT_OUT_RECT_V0, + }; + assert(mode < ARRAY_SIZE(prim_conv)); + + return prim_conv[mode]; } /* rast_prim is the primitive type after GS. */ static void si_emit_rasterizer_prim_state(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - enum pipe_prim_type rast_prim = sctx->current_rast_prim; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned initial_cdw = cs->current.cdw; - - if (unlikely(si_is_line_stipple_enabled(sctx))) { - /* For lines, reset the stipple pattern at each primitive. Otherwise, - * reset the stipple pattern at each packet (line strips, line loops). - */ - unsigned value = rs->pa_sc_line_stipple | - S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1 : 2); - - radeon_opt_set_context_reg(sctx, R_028A0C_PA_SC_LINE_STIPPLE, - SI_TRACKED_PA_SC_LINE_STIPPLE, value); - } - - unsigned gs_out_prim = si_conv_prim_to_gs_out(rast_prim); - if (unlikely(gs_out_prim != sctx->last_gs_out_prim && - (sctx->ngg || sctx->gs_shader.cso))) { - radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim); - sctx->last_gs_out_prim = gs_out_prim; - } - - if (initial_cdw != cs->current.cdw) - sctx->context_roll = true; - - if (sctx->ngg) { - unsigned vtx_index = rs->flatshade_first ? 0 : gs_out_prim; - - sctx->current_vs_state &= C_VS_STATE_OUTPRIM & - C_VS_STATE_PROVOKING_VTX_INDEX; - sctx->current_vs_state |= S_VS_STATE_OUTPRIM(gs_out_prim) | - S_VS_STATE_PROVOKING_VTX_INDEX(vtx_index); - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + enum pipe_prim_type rast_prim = sctx->current_rast_prim; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + unsigned initial_cdw = cs->current.cdw; + + if (unlikely(si_is_line_stipple_enabled(sctx))) { + /* For lines, reset the stipple pattern at each primitive. Otherwise, + * reset the stipple pattern at each packet (line strips, line loops). + */ + unsigned value = + rs->pa_sc_line_stipple | S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1 : 2); + + radeon_opt_set_context_reg(sctx, R_028A0C_PA_SC_LINE_STIPPLE, SI_TRACKED_PA_SC_LINE_STIPPLE, + value); + } + + unsigned gs_out_prim = si_conv_prim_to_gs_out(rast_prim); + if (unlikely(gs_out_prim != sctx->last_gs_out_prim && (sctx->ngg || sctx->gs_shader.cso))) { + radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim); + sctx->last_gs_out_prim = gs_out_prim; + } + + if (initial_cdw != cs->current.cdw) + sctx->context_roll = true; + + if (sctx->ngg) { + unsigned vtx_index = rs->flatshade_first ? 
0 : gs_out_prim; + + sctx->current_vs_state &= C_VS_STATE_OUTPRIM & C_VS_STATE_PROVOKING_VTX_INDEX; + sctx->current_vs_state |= + S_VS_STATE_OUTPRIM(gs_out_prim) | S_VS_STATE_PROVOKING_VTX_INDEX(vtx_index); + } } -static void si_emit_vs_state(struct si_context *sctx, - const struct pipe_draw_info *info) +static void si_emit_vs_state(struct si_context *sctx, const struct pipe_draw_info *info) { - sctx->current_vs_state &= C_VS_STATE_INDEXED; - sctx->current_vs_state |= S_VS_STATE_INDEXED(!!info->index_size); - - if (sctx->num_vs_blit_sgprs) { - /* Re-emit the state after we leave u_blitter. */ - sctx->last_vs_state = ~0; - return; - } - - if (sctx->current_vs_state != sctx->last_vs_state) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - /* For the API vertex shader (VS_STATE_INDEXED, LS_OUT_*). */ - radeon_set_sh_reg(cs, - sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] + - SI_SGPR_VS_STATE_BITS * 4, - sctx->current_vs_state); - - /* Set CLAMP_VERTEX_COLOR and OUTPRIM in the last stage - * before the rasterizer. - * - * For TES or the GS copy shader without NGG: - */ - if (sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] != - R_00B130_SPI_SHADER_USER_DATA_VS_0) { - radeon_set_sh_reg(cs, - R_00B130_SPI_SHADER_USER_DATA_VS_0 + - SI_SGPR_VS_STATE_BITS * 4, - sctx->current_vs_state); - } - - /* For NGG: */ - if (sctx->screen->use_ngg && - sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] != - R_00B230_SPI_SHADER_USER_DATA_GS_0) { - radeon_set_sh_reg(cs, - R_00B230_SPI_SHADER_USER_DATA_GS_0 + - SI_SGPR_VS_STATE_BITS * 4, - sctx->current_vs_state); - } - - sctx->last_vs_state = sctx->current_vs_state; - } + sctx->current_vs_state &= C_VS_STATE_INDEXED; + sctx->current_vs_state |= S_VS_STATE_INDEXED(!!info->index_size); + + if (sctx->num_vs_blit_sgprs) { + /* Re-emit the state after we leave u_blitter. */ + sctx->last_vs_state = ~0; + return; + } + + if (sctx->current_vs_state != sctx->last_vs_state) { + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + /* For the API vertex shader (VS_STATE_INDEXED, LS_OUT_*). */ + radeon_set_sh_reg( + cs, sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_STATE_BITS * 4, + sctx->current_vs_state); + + /* Set CLAMP_VERTEX_COLOR and OUTPRIM in the last stage + * before the rasterizer. 
+ * + * For TES or the GS copy shader without NGG: + */ + if (sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] != R_00B130_SPI_SHADER_USER_DATA_VS_0) { + radeon_set_sh_reg(cs, R_00B130_SPI_SHADER_USER_DATA_VS_0 + SI_SGPR_VS_STATE_BITS * 4, + sctx->current_vs_state); + } + + /* For NGG: */ + if (sctx->screen->use_ngg && + sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] != R_00B230_SPI_SHADER_USER_DATA_GS_0) { + radeon_set_sh_reg(cs, R_00B230_SPI_SHADER_USER_DATA_GS_0 + SI_SGPR_VS_STATE_BITS * 4, + sctx->current_vs_state); + } + + sctx->last_vs_state = sctx->current_vs_state; + } } -static inline bool si_prim_restart_index_changed(struct si_context *sctx, - bool primitive_restart, - unsigned restart_index) +static inline bool si_prim_restart_index_changed(struct si_context *sctx, bool primitive_restart, + unsigned restart_index) { - return primitive_restart && - (restart_index != sctx->last_restart_index || - sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN); + return primitive_restart && (restart_index != sctx->last_restart_index || + sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN); } -static void si_emit_ia_multi_vgt_param(struct si_context *sctx, - const struct pipe_draw_info *info, - enum pipe_prim_type prim, - unsigned num_patches, - unsigned instance_count, - bool primitive_restart) +static void si_emit_ia_multi_vgt_param(struct si_context *sctx, const struct pipe_draw_info *info, + enum pipe_prim_type prim, unsigned num_patches, + unsigned instance_count, bool primitive_restart) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned ia_multi_vgt_param; - - ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, prim, num_patches, - instance_count, primitive_restart); - - /* Draw state. */ - if (ia_multi_vgt_param != sctx->last_multi_vgt_param) { - if (sctx->chip_class == GFX9) - radeon_set_uconfig_reg_idx(cs, sctx->screen, - R_030960_IA_MULTI_VGT_PARAM, 4, - ia_multi_vgt_param); - else if (sctx->chip_class >= GFX7) - radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); - else - radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); - - sctx->last_multi_vgt_param = ia_multi_vgt_param; - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned ia_multi_vgt_param; + + ia_multi_vgt_param = + si_get_ia_multi_vgt_param(sctx, info, prim, num_patches, instance_count, primitive_restart); + + /* Draw state. */ + if (ia_multi_vgt_param != sctx->last_multi_vgt_param) { + if (sctx->chip_class == GFX9) + radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030960_IA_MULTI_VGT_PARAM, 4, + ia_multi_vgt_param); + else if (sctx->chip_class >= GFX7) + radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); + else + radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); + + sctx->last_multi_vgt_param = ia_multi_vgt_param; + } } /* GFX10 removed IA_MULTI_VGT_PARAM in exchange for GE_CNTL. 
@@ -720,1601 +662,1460 @@ static void si_emit_ia_multi_vgt_param(struct si_context *sctx, */ static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches) { - union si_vgt_param_key key = sctx->ia_multi_vgt_param_key; - unsigned ge_cntl; - - if (sctx->ngg) { - if (sctx->tes_shader.cso) { - ge_cntl = S_03096C_PRIM_GRP_SIZE(num_patches) | - S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */ - S_03096C_BREAK_WAVE_AT_EOI(key.u.tess_uses_prim_id); - } else { - ge_cntl = si_get_vs_state(sctx)->ge_cntl; - } - } else { - unsigned primgroup_size; - unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */; - - if (sctx->tes_shader.cso) { - primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */ - } else if (sctx->gs_shader.cso) { - unsigned vgt_gs_onchip_cntl = sctx->gs_shader.current->ctx_reg.gs.vgt_gs_onchip_cntl; - primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl); - } else { - primgroup_size = 128; /* recommended without a GS and tess */ - } - - ge_cntl = S_03096C_PRIM_GRP_SIZE(primgroup_size) | - S_03096C_VERT_GRP_SIZE(vertgroup_size) | - S_03096C_BREAK_WAVE_AT_EOI(key.u.uses_tess && key.u.tess_uses_prim_id); - } - - ge_cntl |= S_03096C_PACKET_TO_ONE_PA(si_is_line_stipple_enabled(sctx)); - - if (ge_cntl != sctx->last_multi_vgt_param) { - radeon_set_uconfig_reg(sctx->gfx_cs, R_03096C_GE_CNTL, ge_cntl); - sctx->last_multi_vgt_param = ge_cntl; - } + union si_vgt_param_key key = sctx->ia_multi_vgt_param_key; + unsigned ge_cntl; + + if (sctx->ngg) { + if (sctx->tes_shader.cso) { + ge_cntl = S_03096C_PRIM_GRP_SIZE(num_patches) | + S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */ + S_03096C_BREAK_WAVE_AT_EOI(key.u.tess_uses_prim_id); + } else { + ge_cntl = si_get_vs_state(sctx)->ge_cntl; + } + } else { + unsigned primgroup_size; + unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */ + ; + + if (sctx->tes_shader.cso) { + primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */ + } else if (sctx->gs_shader.cso) { + unsigned vgt_gs_onchip_cntl = sctx->gs_shader.current->ctx_reg.gs.vgt_gs_onchip_cntl; + primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl); + } else { + primgroup_size = 128; /* recommended without a GS and tess */ + } + + ge_cntl = S_03096C_PRIM_GRP_SIZE(primgroup_size) | S_03096C_VERT_GRP_SIZE(vertgroup_size) | + S_03096C_BREAK_WAVE_AT_EOI(key.u.uses_tess && key.u.tess_uses_prim_id); + } + + ge_cntl |= S_03096C_PACKET_TO_ONE_PA(si_is_line_stipple_enabled(sctx)); + + if (ge_cntl != sctx->last_multi_vgt_param) { + radeon_set_uconfig_reg(sctx->gfx_cs, R_03096C_GE_CNTL, ge_cntl); + sctx->last_multi_vgt_param = ge_cntl; + } } -static void si_emit_draw_registers(struct si_context *sctx, - const struct pipe_draw_info *info, - enum pipe_prim_type prim, - unsigned num_patches, - unsigned instance_count, - bool primitive_restart) +static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_draw_info *info, + enum pipe_prim_type prim, unsigned num_patches, + unsigned instance_count, bool primitive_restart) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned vgt_prim = si_conv_pipe_prim(prim); - - if (sctx->chip_class >= GFX10) - gfx10_emit_ge_cntl(sctx, num_patches); - else - si_emit_ia_multi_vgt_param(sctx, info, prim, num_patches, - instance_count, primitive_restart); - - if (vgt_prim != sctx->last_prim) { - if (sctx->chip_class >= GFX10) - radeon_set_uconfig_reg(cs, R_030908_VGT_PRIMITIVE_TYPE, vgt_prim); - else if (sctx->chip_class >= GFX7) - 
radeon_set_uconfig_reg_idx(cs, sctx->screen, - R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim); - else - radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, vgt_prim); - - sctx->last_prim = vgt_prim; - } - - /* Primitive restart. */ - if (primitive_restart != sctx->last_primitive_restart_en) { - if (sctx->chip_class >= GFX9) - radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, - primitive_restart); - else - radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, - primitive_restart); - - sctx->last_primitive_restart_en = primitive_restart; - - } - if (si_prim_restart_index_changed(sctx, primitive_restart, info->restart_index)) { - radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, - info->restart_index); - sctx->last_restart_index = info->restart_index; - sctx->context_roll = true; - } + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned vgt_prim = si_conv_pipe_prim(prim); + + if (sctx->chip_class >= GFX10) + gfx10_emit_ge_cntl(sctx, num_patches); + else + si_emit_ia_multi_vgt_param(sctx, info, prim, num_patches, instance_count, primitive_restart); + + if (vgt_prim != sctx->last_prim) { + if (sctx->chip_class >= GFX10) + radeon_set_uconfig_reg(cs, R_030908_VGT_PRIMITIVE_TYPE, vgt_prim); + else if (sctx->chip_class >= GFX7) + radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim); + else + radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, vgt_prim); + + sctx->last_prim = vgt_prim; + } + + /* Primitive restart. */ + if (primitive_restart != sctx->last_primitive_restart_en) { + if (sctx->chip_class >= GFX9) + radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart); + else + radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart); + + sctx->last_primitive_restart_en = primitive_restart; + } + if (si_prim_restart_index_changed(sctx, primitive_restart, info->restart_index)) { + radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, info->restart_index); + sctx->last_restart_index = info->restart_index; + sctx->context_roll = true; + } } -static void si_emit_draw_packets(struct si_context *sctx, - const struct pipe_draw_info *info, - struct pipe_resource *indexbuf, - unsigned index_size, - unsigned index_offset, - unsigned instance_count, - bool dispatch_prim_discard_cs, - unsigned original_index_size) +static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info, + struct pipe_resource *indexbuf, unsigned index_size, + unsigned index_offset, unsigned instance_count, + bool dispatch_prim_discard_cs, unsigned original_index_size) { - struct pipe_draw_indirect_info *indirect = info->indirect; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX]; - bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off; - uint32_t index_max_size = 0; - uint64_t index_va = 0; - - if (info->count_from_stream_output) { - struct si_streamout_target *t = - (struct si_streamout_target*)info->count_from_stream_output; - - radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, - t->stride_in_dw); - si_cp_copy_data(sctx, sctx->gfx_cs, - COPY_DATA_REG, NULL, - R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, - COPY_DATA_SRC_MEM, t->buf_filled_size, - t->buf_filled_size_offset); - } - - /* draw packet */ - if (index_size) { - if (index_size != sctx->last_index_size) { - unsigned index_type; - - /* index type */ - switch (index_size) { - case 1: - 
index_type = V_028A7C_VGT_INDEX_8; - break; - case 2: - index_type = V_028A7C_VGT_INDEX_16 | - (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? - V_028A7C_VGT_DMA_SWAP_16_BIT : 0); - break; - case 4: - index_type = V_028A7C_VGT_INDEX_32 | - (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? - V_028A7C_VGT_DMA_SWAP_32_BIT : 0); - break; - default: - assert(!"unreachable"); - return; - } - - if (sctx->chip_class >= GFX9) { - radeon_set_uconfig_reg_idx(cs, sctx->screen, - R_03090C_VGT_INDEX_TYPE, 2, - index_type); - } else { - radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); - radeon_emit(cs, index_type); - } - - sctx->last_index_size = index_size; - } - - if (original_index_size) { - index_max_size = (indexbuf->width0 - index_offset) / - original_index_size; - /* Skip draw calls with 0-sized index buffers. - * They cause a hang on some chips, like Navi10-14. - */ - if (!index_max_size) - return; - - index_va = si_resource(indexbuf)->gpu_address + index_offset; - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(indexbuf), - RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER); - } - } else { - /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE, - * so the state must be re-emitted before the next indexed draw. - */ - if (sctx->chip_class >= GFX7) - sctx->last_index_size = -1; - } - - if (indirect) { - uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address; - - assert(indirect_va % 8 == 0); - - si_invalidate_draw_sh_constants(sctx); - - radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0)); - radeon_emit(cs, 1); - radeon_emit(cs, indirect_va); - radeon_emit(cs, indirect_va >> 32); - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(indirect->buffer), - RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); - - unsigned di_src_sel = index_size ? V_0287F0_DI_SRC_SEL_DMA - : V_0287F0_DI_SRC_SEL_AUTO_INDEX; - - assert(indirect->offset % 4 == 0); - - if (index_size) { - radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0)); - radeon_emit(cs, index_va); - radeon_emit(cs, index_va >> 32); - - radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0)); - radeon_emit(cs, index_max_size); - } - - if (!sctx->screen->has_draw_indirect_multi) { - radeon_emit(cs, PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT - : PKT3_DRAW_INDIRECT, - 3, render_cond_bit)); - radeon_emit(cs, indirect->offset); - radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, di_src_sel); - } else { - uint64_t count_va = 0; - - if (indirect->indirect_draw_count) { - struct si_resource *params_buf = - si_resource(indirect->indirect_draw_count); - - radeon_add_to_buffer_list( - sctx, sctx->gfx_cs, params_buf, - RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); - - count_va = params_buf->gpu_address + indirect->indirect_draw_count_offset; - } - - radeon_emit(cs, PKT3(index_size ? 
PKT3_DRAW_INDEX_INDIRECT_MULTI : - PKT3_DRAW_INDIRECT_MULTI, - 8, render_cond_bit)); - radeon_emit(cs, indirect->offset); - radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, ((sh_base_reg + SI_SGPR_DRAWID * 4 - SI_SH_REG_OFFSET) >> 2) | - S_2C3_DRAW_INDEX_ENABLE(1) | - S_2C3_COUNT_INDIRECT_ENABLE(!!indirect->indirect_draw_count)); - radeon_emit(cs, indirect->draw_count); - radeon_emit(cs, count_va); - radeon_emit(cs, count_va >> 32); - radeon_emit(cs, indirect->stride); - radeon_emit(cs, di_src_sel); - } - } else { - int base_vertex; - - if (sctx->last_instance_count == SI_INSTANCE_COUNT_UNKNOWN || - sctx->last_instance_count != instance_count) { - radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0)); - radeon_emit(cs, instance_count); - sctx->last_instance_count = instance_count; - } - - /* Base vertex and start instance. */ - base_vertex = original_index_size ? info->index_bias : info->start; - - if (sctx->num_vs_blit_sgprs) { - /* Re-emit draw constants after we leave u_blitter. */ - si_invalidate_draw_sh_constants(sctx); - - /* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. */ - radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4, - sctx->num_vs_blit_sgprs); - radeon_emit_array(cs, sctx->vs_blit_sh_data, - sctx->num_vs_blit_sgprs); - } else if (base_vertex != sctx->last_base_vertex || - sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN || - info->start_instance != sctx->last_start_instance || - info->drawid != sctx->last_drawid || - sh_base_reg != sctx->last_sh_base_reg) { - radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3); - radeon_emit(cs, base_vertex); - radeon_emit(cs, info->start_instance); - radeon_emit(cs, info->drawid); - - sctx->last_base_vertex = base_vertex; - sctx->last_start_instance = info->start_instance; - sctx->last_drawid = info->drawid; - sctx->last_sh_base_reg = sh_base_reg; - } - - if (index_size) { - if (dispatch_prim_discard_cs) { - index_va += info->start * original_index_size; - index_max_size = MIN2(index_max_size, info->count); - - si_dispatch_prim_discard_cs_and_draw(sctx, info, - original_index_size, - base_vertex, - index_va, index_max_size); - return; - } - - index_va += info->start * index_size; - - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); - radeon_emit(cs, index_max_size); - radeon_emit(cs, index_va); - radeon_emit(cs, index_va >> 32); - radeon_emit(cs, info->count); - radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA); - } else { - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit)); - radeon_emit(cs, info->count); - radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | - S_0287F0_USE_OPAQUE(!!info->count_from_stream_output)); - } - } + struct pipe_draw_indirect_info *indirect = info->indirect; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX]; + bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off; + uint32_t index_max_size = 0; + uint64_t index_va = 0; + + if (info->count_from_stream_output) { + struct si_streamout_target *t = (struct si_streamout_target *)info->count_from_stream_output; + + radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, t->stride_in_dw); + si_cp_copy_data(sctx, sctx->gfx_cs, COPY_DATA_REG, NULL, + R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, COPY_DATA_SRC_MEM, + t->buf_filled_size, t->buf_filled_size_offset); + } + 
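/* Editorial sketch, not part of the generated patch: the indexed path below
 * derives two values from the bound index buffer before emitting the draw
 * packet. For example, a 16-bit index buffer with width0 = 65536 bytes bound
 * at index_offset = 1024 holds (65536 - 1024) / 2 = 32256 usable indices, and
 * a draw with info->start = 100 reads from gpu_address + 1024 + 100 * 2.
 * The same arithmetic as standalone helpers (names are illustrative only,
 * not driver API):
 *
 *    static inline uint64_t example_index_va(uint64_t gpu_address, unsigned index_offset,
 *                                            unsigned start, unsigned index_size)
 *    {
 *       return gpu_address + index_offset + (uint64_t)start * index_size;
 *    }
 *
 *    static inline uint32_t example_index_max_size(uint32_t width0, unsigned index_offset,
 *                                                  unsigned index_size)
 *    {
 *       // A result of 0 means the draw is skipped below, since the code notes
 *       // that 0-sized index buffers can hang some chips (e.g. Navi10-14).
 *       return (width0 - index_offset) / index_size;
 *    }
 */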
+ /* draw packet */ + if (index_size) { + if (index_size != sctx->last_index_size) { + unsigned index_type; + + /* index type */ + switch (index_size) { + case 1: + index_type = V_028A7C_VGT_INDEX_8; + break; + case 2: + index_type = + V_028A7C_VGT_INDEX_16 | + (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? V_028A7C_VGT_DMA_SWAP_16_BIT : 0); + break; + case 4: + index_type = + V_028A7C_VGT_INDEX_32 | + (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? V_028A7C_VGT_DMA_SWAP_32_BIT : 0); + break; + default: + assert(!"unreachable"); + return; + } + + if (sctx->chip_class >= GFX9) { + radeon_set_uconfig_reg_idx(cs, sctx->screen, R_03090C_VGT_INDEX_TYPE, 2, index_type); + } else { + radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); + radeon_emit(cs, index_type); + } + + sctx->last_index_size = index_size; + } + + if (original_index_size) { + index_max_size = (indexbuf->width0 - index_offset) / original_index_size; + /* Skip draw calls with 0-sized index buffers. + * They cause a hang on some chips, like Navi10-14. + */ + if (!index_max_size) + return; + + index_va = si_resource(indexbuf)->gpu_address + index_offset; + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ, + RADEON_PRIO_INDEX_BUFFER); + } + } else { + /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE, + * so the state must be re-emitted before the next indexed draw. + */ + if (sctx->chip_class >= GFX7) + sctx->last_index_size = -1; + } + + if (indirect) { + uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address; + + assert(indirect_va % 8 == 0); + + si_invalidate_draw_sh_constants(sctx); + + radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0)); + radeon_emit(cs, 1); + radeon_emit(cs, indirect_va); + radeon_emit(cs, indirect_va >> 32); + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(indirect->buffer), + RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); + + unsigned di_src_sel = index_size ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX; + + assert(indirect->offset % 4 == 0); + + if (index_size) { + radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0)); + radeon_emit(cs, index_va); + radeon_emit(cs, index_va >> 32); + + radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0)); + radeon_emit(cs, index_max_size); + } + + if (!sctx->screen->has_draw_indirect_multi) { + radeon_emit(cs, PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, + render_cond_bit)); + radeon_emit(cs, indirect->offset); + radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, di_src_sel); + } else { + uint64_t count_va = 0; + + if (indirect->indirect_draw_count) { + struct si_resource *params_buf = si_resource(indirect->indirect_draw_count); + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, params_buf, RADEON_USAGE_READ, + RADEON_PRIO_DRAW_INDIRECT); + + count_va = params_buf->gpu_address + indirect->indirect_draw_count_offset; + } + + radeon_emit(cs, + PKT3(index_size ? 
PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8, + render_cond_bit)); + radeon_emit(cs, indirect->offset); + radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); + radeon_emit(cs, ((sh_base_reg + SI_SGPR_DRAWID * 4 - SI_SH_REG_OFFSET) >> 2) | + S_2C3_DRAW_INDEX_ENABLE(1) | + S_2C3_COUNT_INDIRECT_ENABLE(!!indirect->indirect_draw_count)); + radeon_emit(cs, indirect->draw_count); + radeon_emit(cs, count_va); + radeon_emit(cs, count_va >> 32); + radeon_emit(cs, indirect->stride); + radeon_emit(cs, di_src_sel); + } + } else { + int base_vertex; + + if (sctx->last_instance_count == SI_INSTANCE_COUNT_UNKNOWN || + sctx->last_instance_count != instance_count) { + radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0)); + radeon_emit(cs, instance_count); + sctx->last_instance_count = instance_count; + } + + /* Base vertex and start instance. */ + base_vertex = original_index_size ? info->index_bias : info->start; + + if (sctx->num_vs_blit_sgprs) { + /* Re-emit draw constants after we leave u_blitter. */ + si_invalidate_draw_sh_constants(sctx); + + /* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. */ + radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4, sctx->num_vs_blit_sgprs); + radeon_emit_array(cs, sctx->vs_blit_sh_data, sctx->num_vs_blit_sgprs); + } else if (base_vertex != sctx->last_base_vertex || + sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN || + info->start_instance != sctx->last_start_instance || + info->drawid != sctx->last_drawid || sh_base_reg != sctx->last_sh_base_reg) { + radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3); + radeon_emit(cs, base_vertex); + radeon_emit(cs, info->start_instance); + radeon_emit(cs, info->drawid); + + sctx->last_base_vertex = base_vertex; + sctx->last_start_instance = info->start_instance; + sctx->last_drawid = info->drawid; + sctx->last_sh_base_reg = sh_base_reg; + } + + if (index_size) { + if (dispatch_prim_discard_cs) { + index_va += info->start * original_index_size; + index_max_size = MIN2(index_max_size, info->count); + + si_dispatch_prim_discard_cs_and_draw(sctx, info, original_index_size, base_vertex, + index_va, index_max_size); + return; + } + + index_va += info->start * index_size; + + radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); + radeon_emit(cs, index_max_size); + radeon_emit(cs, index_va); + radeon_emit(cs, index_va >> 32); + radeon_emit(cs, info->count); + radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA); + } else { + radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit)); + radeon_emit(cs, info->count); + radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | + S_0287F0_USE_OPAQUE(!!info->count_from_stream_output)); + } + } } -void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, - unsigned cp_coher_cntl) +void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl) { - bool compute_ib = !sctx->has_graphics || - cs == sctx->prim_discard_compute_cs; - - assert(sctx->chip_class <= GFX9); - - if (sctx->chip_class == GFX9 || compute_ib) { - /* Flush caches and wait for the caches to assert idle. 
*/ - radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0)); - radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - } else { - /* ACQUIRE_MEM is only required on a compute ring. */ - radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0)); - radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - } - - /* ACQUIRE_MEM has an implicit context roll if the current context - * is busy. */ - if (!compute_ib) - sctx->context_roll = true; + bool compute_ib = !sctx->has_graphics || cs == sctx->prim_discard_compute_cs; + + assert(sctx->chip_class <= GFX9); + + if (sctx->chip_class == GFX9 || compute_ib) { + /* Flush caches and wait for the caches to assert idle. */ + radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0)); + radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ + radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ + radeon_emit(cs, 0); /* CP_COHER_BASE */ + radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ + radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ + } else { + /* ACQUIRE_MEM is only required on a compute ring. */ + radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0)); + radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ + radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(cs, 0); /* CP_COHER_BASE */ + radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ + } + + /* ACQUIRE_MEM has an implicit context roll if the current context + * is busy. */ + if (!compute_ib) + sctx->context_roll = true; } void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx) { - if (!si_compute_prim_discard_enabled(sctx)) - return; - - if (!sctx->barrier_buf) { - u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4, - &sctx->barrier_buf_offset, - (struct pipe_resource**)&sctx->barrier_buf); - } - - /* Emit a placeholder to signal the next compute IB to start. - * See si_compute_prim_discard.c for explanation. - */ - uint32_t signal = 1; - si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset, - 4, V_370_MEM, V_370_ME, &signal); - - sctx->last_pkt3_write_data = - &sctx->gfx_cs->current.buf[sctx->gfx_cs->current.cdw - 5]; - - /* Only the last occurence of WRITE_DATA will be executed. - * The packet will be enabled in si_flush_gfx_cs. - */ - *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0); + if (!si_compute_prim_discard_enabled(sctx)) + return; + + if (!sctx->barrier_buf) { + u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4, &sctx->barrier_buf_offset, + (struct pipe_resource **)&sctx->barrier_buf); + } + + /* Emit a placeholder to signal the next compute IB to start. + * See si_compute_prim_discard.c for explanation. + */ + uint32_t signal = 1; + si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset, 4, V_370_MEM, V_370_ME, + &signal); + + sctx->last_pkt3_write_data = &sctx->gfx_cs->current.buf[sctx->gfx_cs->current.cdw - 5]; + + /* Only the last occurence of WRITE_DATA will be executed. + * The packet will be enabled in si_flush_gfx_cs. 
+ */ + *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0); } void gfx10_emit_cache_flush(struct si_context *ctx) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - uint32_t gcr_cntl = 0; - unsigned cb_db_event = 0; - unsigned flags = ctx->flags; - - if (!ctx->has_graphics) { - /* Only process compute flags. */ - flags &= SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_INV_SCACHE | - SI_CONTEXT_INV_VCACHE | - SI_CONTEXT_INV_L2 | - SI_CONTEXT_WB_L2 | - SI_CONTEXT_INV_L2_METADATA | - SI_CONTEXT_CS_PARTIAL_FLUSH; - } - - /* We don't need these. */ - assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | - SI_CONTEXT_FLUSH_AND_INV_DB_META))); - - if (flags & SI_CONTEXT_VGT_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); - } - - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) - ctx->num_cb_cache_flushes++; - if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) - ctx->num_db_cache_flushes++; - - if (flags & SI_CONTEXT_INV_ICACHE) - gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL); - if (flags & SI_CONTEXT_INV_SCACHE) { - /* TODO: When writing to the SMEM L1 cache, we need to set SEQ - * to FORWARD when both L1 and L2 are written out (WB or INV). - */ - gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1); - } - if (flags & SI_CONTEXT_INV_VCACHE) - gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1); - - /* The L2 cache ops are: - * - INV: - invalidate lines that reflect memory (were loaded from memory) - * - don't touch lines that were overwritten (were stored by gfx clients) - * - WB: - don't touch lines that reflect memory - * - write back lines that were overwritten - * - WB | INV: - invalidate lines that reflect memory - * - write back lines that were overwritten - * - * GLM doesn't support WB alone. If WB is set, INV must be set too. - */ - if (flags & SI_CONTEXT_INV_L2) { - /* Writeback and invalidate everything in L2. */ - gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | - S_586_GLM_INV(1) | S_586_GLM_WB(1); - ctx->num_L2_invalidates++; - } else if (flags & SI_CONTEXT_WB_L2) { - gcr_cntl |= S_586_GL2_WB(1) | - S_586_GLM_WB(1) | S_586_GLM_INV(1); - } else if (flags & SI_CONTEXT_INV_L2_METADATA) { - gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1); - } - - if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) { - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { - /* Flush CMASK/FMASK/DCC. Will wait for idle later. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | - EVENT_INDEX(0)); - } - if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) { - /* Flush HTILE. Will wait for idle later. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | - EVENT_INDEX(0)); - } - - /* First flush CB/DB, then L1/L2. */ - gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD); - - if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) == - (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) { - cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; - } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { - cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS; - } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) { - cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS; - } else { - assert(0); - } - } else { - /* Wait for graphics shaders to go idle if requested. 
*/ - if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - /* Only count explicit shader flushes, not implicit ones. */ - ctx->num_vs_flushes++; - ctx->num_ps_flushes++; - } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - ctx->num_vs_flushes++; - } - } - - if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4))); - ctx->num_cs_flushes++; - ctx->compute_is_busy = false; - } - - if (cb_db_event) { - /* CB/DB flush and invalidate (or possibly just a wait for a - * meta flush) via RELEASE_MEM. - * - * Combine this with other cache flushes when possible; this - * requires affected shaders to be idle, so do it after the - * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always - * implied). - */ - uint64_t va; - - /* Do the flush (enqueue the event and wait for it). */ - va = ctx->wait_mem_scratch->gpu_address; - ctx->wait_mem_number++; - - /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */ - unsigned glm_wb = G_586_GLM_WB(gcr_cntl); - unsigned glm_inv = G_586_GLM_INV(gcr_cntl); - unsigned glv_inv = G_586_GLV_INV(gcr_cntl); - unsigned gl1_inv = G_586_GL1_INV(gcr_cntl); - assert(G_586_GL2_US(gcr_cntl) == 0); - assert(G_586_GL2_RANGE(gcr_cntl) == 0); - assert(G_586_GL2_DISCARD(gcr_cntl) == 0); - unsigned gl2_inv = G_586_GL2_INV(gcr_cntl); - unsigned gl2_wb = G_586_GL2_WB(gcr_cntl); - unsigned gcr_seq = G_586_SEQ(gcr_cntl); - - gcr_cntl &= C_586_GLM_WB & - C_586_GLM_INV & - C_586_GLV_INV & - C_586_GL1_INV & - C_586_GL2_INV & - C_586_GL2_WB; /* keep SEQ */ - - si_cp_release_mem(ctx, cs, cb_db_event, - S_490_GLM_WB(glm_wb) | - S_490_GLM_INV(glm_inv) | - S_490_GLV_INV(glv_inv) | - S_490_GL1_INV(gl1_inv) | - S_490_GL2_INV(gl2_inv) | - S_490_GL2_WB(gl2_wb) | - S_490_SEQ(gcr_seq), - EOP_DST_SEL_MEM, - EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, - EOP_DATA_SEL_VALUE_32BIT, - ctx->wait_mem_scratch, va, - ctx->wait_mem_number, SI_NOT_QUERY); - si_cp_wait_mem(ctx, ctx->gfx_cs, va, ctx->wait_mem_number, 0xffffffff, - WAIT_REG_MEM_EQUAL); - } - - /* Ignore fields that only modify the behavior of other fields. */ - if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) { - /* Flush caches and wait for the caches to assert idle. - * The cache flush is executed in the ME, but the PFP waits - * for completion. - */ - radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - radeon_emit(cs, 0); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - radeon_emit(cs, gcr_cntl); /* GCR_CNTL */ - } else if (cb_db_event || - (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | - SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH))) { - /* We need to ensure that PFP waits as well. 
*/ - radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); - radeon_emit(cs, 0); - } - - if (flags & SI_CONTEXT_START_PIPELINE_STATS) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | - EVENT_INDEX(0)); - } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | - EVENT_INDEX(0)); - } - - ctx->flags = 0; + struct radeon_cmdbuf *cs = ctx->gfx_cs; + uint32_t gcr_cntl = 0; + unsigned cb_db_event = 0; + unsigned flags = ctx->flags; + + if (!ctx->has_graphics) { + /* Only process compute flags. */ + flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA | + SI_CONTEXT_CS_PARTIAL_FLUSH; + } + + /* We don't need these. */ + assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | SI_CONTEXT_FLUSH_AND_INV_DB_META))); + + if (flags & SI_CONTEXT_VGT_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + } + + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) + ctx->num_cb_cache_flushes++; + if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) + ctx->num_db_cache_flushes++; + + if (flags & SI_CONTEXT_INV_ICACHE) + gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL); + if (flags & SI_CONTEXT_INV_SCACHE) { + /* TODO: When writing to the SMEM L1 cache, we need to set SEQ + * to FORWARD when both L1 and L2 are written out (WB or INV). + */ + gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1); + } + if (flags & SI_CONTEXT_INV_VCACHE) + gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1); + + /* The L2 cache ops are: + * - INV: - invalidate lines that reflect memory (were loaded from memory) + * - don't touch lines that were overwritten (were stored by gfx clients) + * - WB: - don't touch lines that reflect memory + * - write back lines that were overwritten + * - WB | INV: - invalidate lines that reflect memory + * - write back lines that were overwritten + * + * GLM doesn't support WB alone. If WB is set, INV must be set too. + */ + if (flags & SI_CONTEXT_INV_L2) { + /* Writeback and invalidate everything in L2. */ + gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1); + ctx->num_L2_invalidates++; + } else if (flags & SI_CONTEXT_WB_L2) { + gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1); + } else if (flags & SI_CONTEXT_INV_L2_METADATA) { + gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1); + } + + if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) { + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { + /* Flush CMASK/FMASK/DCC. Will wait for idle later. */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); + } + if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) { + /* Flush HTILE. Will wait for idle later. */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); + } + + /* First flush CB/DB, then L1/L2. 
*/ + gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD); + + if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) == + (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) { + cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; + } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { + cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS; + } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) { + cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS; + } else { + assert(0); + } + } else { + /* Wait for graphics shaders to go idle if requested. */ + if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + /* Only count explicit shader flushes, not implicit ones. */ + ctx->num_vs_flushes++; + ctx->num_ps_flushes++; + } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + ctx->num_vs_flushes++; + } + } + + if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4))); + ctx->num_cs_flushes++; + ctx->compute_is_busy = false; + } + + if (cb_db_event) { + /* CB/DB flush and invalidate (or possibly just a wait for a + * meta flush) via RELEASE_MEM. + * + * Combine this with other cache flushes when possible; this + * requires affected shaders to be idle, so do it after the + * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always + * implied). + */ + uint64_t va; + + /* Do the flush (enqueue the event and wait for it). */ + va = ctx->wait_mem_scratch->gpu_address; + ctx->wait_mem_number++; + + /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */ + unsigned glm_wb = G_586_GLM_WB(gcr_cntl); + unsigned glm_inv = G_586_GLM_INV(gcr_cntl); + unsigned glv_inv = G_586_GLV_INV(gcr_cntl); + unsigned gl1_inv = G_586_GL1_INV(gcr_cntl); + assert(G_586_GL2_US(gcr_cntl) == 0); + assert(G_586_GL2_RANGE(gcr_cntl) == 0); + assert(G_586_GL2_DISCARD(gcr_cntl) == 0); + unsigned gl2_inv = G_586_GL2_INV(gcr_cntl); + unsigned gl2_wb = G_586_GL2_WB(gcr_cntl); + unsigned gcr_seq = G_586_SEQ(gcr_cntl); + + gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV & + C_586_GL2_WB; /* keep SEQ */ + + si_cp_release_mem(ctx, cs, cb_db_event, + S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) | + S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) | + S_490_SEQ(gcr_seq), + EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, + EOP_DATA_SEL_VALUE_32BIT, ctx->wait_mem_scratch, va, ctx->wait_mem_number, + SI_NOT_QUERY); + si_cp_wait_mem(ctx, ctx->gfx_cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL); + } + + /* Ignore fields that only modify the behavior of other fields. */ + if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) { + /* Flush caches and wait for the caches to assert idle. + * The cache flush is executed in the ME, but the PFP waits + * for completion. 
+ */ + radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); + radeon_emit(cs, 0); /* CP_COHER_CNTL */ + radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ + radeon_emit(cs, 0); /* CP_COHER_BASE */ + radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ + radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ + radeon_emit(cs, gcr_cntl); /* GCR_CNTL */ + } else if (cb_db_event || (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH))) { + /* We need to ensure that PFP waits as well. */ + radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cs, 0); + } + + if (flags & SI_CONTEXT_START_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0)); + } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); + } + + ctx->flags = 0; } void si_emit_cache_flush(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - uint32_t flags = sctx->flags; - - if (!sctx->has_graphics) { - /* Only process compute flags. */ - flags &= SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_INV_SCACHE | - SI_CONTEXT_INV_VCACHE | - SI_CONTEXT_INV_L2 | - SI_CONTEXT_WB_L2 | - SI_CONTEXT_INV_L2_METADATA | - SI_CONTEXT_CS_PARTIAL_FLUSH; - } - - uint32_t cp_coher_cntl = 0; - const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | - SI_CONTEXT_FLUSH_AND_INV_DB); - const bool is_barrier = flush_cb_db || - /* INV_ICACHE == beginning of gfx IB. Checking - * INV_ICACHE fixes corruption for DeusExMD with - * compute-based culling, but I don't know why. - */ - flags & (SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_VS_PARTIAL_FLUSH) || - (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && - sctx->compute_is_busy); - - assert(sctx->chip_class <= GFX9); - - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) - sctx->num_cb_cache_flushes++; - if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) - sctx->num_db_cache_flushes++; - - /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either - * bit is set. An alternative way is to write SQC_CACHES, but that - * doesn't seem to work reliably. Since the bug doesn't affect - * correctness (it only does more work than necessary) and - * the performance impact is likely negligible, there is no plan - * to add a workaround for it. - */ - - if (flags & SI_CONTEXT_INV_ICACHE) - cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); - if (flags & SI_CONTEXT_INV_SCACHE) - cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); - - if (sctx->chip_class <= GFX8) { - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { - cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | - S_0085F0_CB0_DEST_BASE_ENA(1) | - S_0085F0_CB1_DEST_BASE_ENA(1) | - S_0085F0_CB2_DEST_BASE_ENA(1) | - S_0085F0_CB3_DEST_BASE_ENA(1) | - S_0085F0_CB4_DEST_BASE_ENA(1) | - S_0085F0_CB5_DEST_BASE_ENA(1) | - S_0085F0_CB6_DEST_BASE_ENA(1) | - S_0085F0_CB7_DEST_BASE_ENA(1); - - /* Necessary for DCC */ - if (sctx->chip_class == GFX8) - si_cp_release_mem(sctx, cs, - V_028A90_FLUSH_AND_INV_CB_DATA_TS, - 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, - EOP_DATA_SEL_DISCARD, NULL, - 0, 0, SI_NOT_QUERY); - } - if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) - cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | - S_0085F0_DB_DEST_BASE_ENA(1); - } - - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { - /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. 
*/ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); - } - if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | - SI_CONTEXT_FLUSH_AND_INV_DB_META)) { - /* Flush HTILE. SURFACE_SYNC will wait for idle. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); - } - - /* Wait for shader engines to go idle. - * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait - * for everything including CB/DB cache flushes. - */ - if (!flush_cb_db) { - if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - /* Only count explicit shader flushes, not implicit ones - * done by SURFACE_SYNC. - */ - sctx->num_vs_flushes++; - sctx->num_ps_flushes++; - } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - sctx->num_vs_flushes++; - } - } - - if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && - sctx->compute_is_busy) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - sctx->num_cs_flushes++; - sctx->compute_is_busy = false; - } - - /* VGT state synchronization. */ - if (flags & SI_CONTEXT_VGT_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); - } - if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0)); - } - - /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't - * wait for idle on GFX9. We have to use a TS event. - */ - if (sctx->chip_class == GFX9 && flush_cb_db) { - uint64_t va; - unsigned tc_flags, cb_db_event; - - /* Set the CB/DB flush event. */ - switch (flush_cb_db) { - case SI_CONTEXT_FLUSH_AND_INV_CB: - cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS; - break; - case SI_CONTEXT_FLUSH_AND_INV_DB: - cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS; - break; - default: - /* both CB & DB */ - cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; - } - - /* These are the only allowed combinations. If you need to - * do multiple operations at once, do them separately. - * All operations that invalidate L2 also seem to invalidate - * metadata. Volatile (VOL) and WC flushes are not listed here. - * - * TC | TC_WB = writeback & invalidate L2 & L1 - * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC - * TC_WB | TC_NC = writeback L2 for MTYPE == NC - * TC | TC_NC = invalidate L2 for MTYPE == NC - * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.) - * TCL1 = invalidate L1 - */ - tc_flags = 0; - - if (flags & SI_CONTEXT_INV_L2_METADATA) { - tc_flags = EVENT_TC_ACTION_ENA | - EVENT_TC_MD_ACTION_ENA; - } - - /* Ideally flush TC together with CB/DB. */ - if (flags & SI_CONTEXT_INV_L2) { - /* Writeback and invalidate everything in L2 & L1. */ - tc_flags = EVENT_TC_ACTION_ENA | - EVENT_TC_WB_ACTION_ENA; - - /* Clear the flags. */ - flags &= ~(SI_CONTEXT_INV_L2 | - SI_CONTEXT_WB_L2 | - SI_CONTEXT_INV_VCACHE); - sctx->num_L2_invalidates++; - } - - /* Do the flush (enqueue the event and wait for it). 
*/ - va = sctx->wait_mem_scratch->gpu_address; - sctx->wait_mem_number++; - - si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, - EOP_DST_SEL_MEM, - EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, - EOP_DATA_SEL_VALUE_32BIT, - sctx->wait_mem_scratch, va, - sctx->wait_mem_number, SI_NOT_QUERY); - si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, - WAIT_REG_MEM_EQUAL); - } - - /* Make sure ME is idle (it executes most packets) before continuing. - * This prevents read-after-write hazards between PFP and ME. - */ - if (sctx->has_graphics && - (cp_coher_cntl || - (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | - SI_CONTEXT_INV_VCACHE | - SI_CONTEXT_INV_L2 | - SI_CONTEXT_WB_L2)))) { - radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); - radeon_emit(cs, 0); - } - - /* GFX6-GFX8 only: - * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC - * waits for idle, so it should be last. SURFACE_SYNC is done in PFP. - * - * cp_coher_cntl should contain all necessary flags except TC flags - * at this point. - * - * GFX6-GFX7 don't support L2 write-back. - */ - if (flags & SI_CONTEXT_INV_L2 || - (sctx->chip_class <= GFX7 && - (flags & SI_CONTEXT_WB_L2))) { - /* Invalidate L1 & L2. (L1 is always invalidated on GFX6) - * WB must be set on GFX8+ when TC_ACTION is set. - */ - si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl | - S_0085F0_TC_ACTION_ENA(1) | - S_0085F0_TCL1_ACTION_ENA(1) | - S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8)); - cp_coher_cntl = 0; - sctx->num_L2_invalidates++; - } else { - /* L1 invalidation and L2 writeback must be done separately, - * because both operations can't be done together. - */ - if (flags & SI_CONTEXT_WB_L2) { - /* WB = write-back - * NC = apply to non-coherent MTYPEs - * (i.e. MTYPE <= 1, which is what we use everywhere) - * - * WB doesn't work without NC. - */ - si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl | - S_0301F0_TC_WB_ACTION_ENA(1) | - S_0301F0_TC_NC_ACTION_ENA(1)); - cp_coher_cntl = 0; - sctx->num_L2_writebacks++; - } - if (flags & SI_CONTEXT_INV_VCACHE) { - /* Invalidate per-CU VMEM L1. */ - si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl | - S_0085F0_TCL1_ACTION_ENA(1)); - cp_coher_cntl = 0; - } - } - - /* If TC flushes haven't cleared this... */ - if (cp_coher_cntl) - si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl); - - if (is_barrier) - si_prim_discard_signal_next_compute_ib_start(sctx); - - if (flags & SI_CONTEXT_START_PIPELINE_STATS) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | - EVENT_INDEX(0)); - } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | - EVENT_INDEX(0)); - } - - sctx->flags = 0; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + uint32_t flags = sctx->flags; + + if (!sctx->has_graphics) { + /* Only process compute flags. */ + flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA | + SI_CONTEXT_CS_PARTIAL_FLUSH; + } + + uint32_t cp_coher_cntl = 0; + const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB); + const bool is_barrier = + flush_cb_db || + /* INV_ICACHE == beginning of gfx IB. Checking + * INV_ICACHE fixes corruption for DeusExMD with + * compute-based culling, but I don't know why. 
+ */ + flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) || + (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy); + + assert(sctx->chip_class <= GFX9); + + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) + sctx->num_cb_cache_flushes++; + if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) + sctx->num_db_cache_flushes++; + + /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either + * bit is set. An alternative way is to write SQC_CACHES, but that + * doesn't seem to work reliably. Since the bug doesn't affect + * correctness (it only does more work than necessary) and + * the performance impact is likely negligible, there is no plan + * to add a workaround for it. + */ + + if (flags & SI_CONTEXT_INV_ICACHE) + cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); + if (flags & SI_CONTEXT_INV_SCACHE) + cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); + + if (sctx->chip_class <= GFX8) { + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { + cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) | + S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) | + S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) | + S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) | + S_0085F0_CB7_DEST_BASE_ENA(1); + + /* Necessary for DCC */ + if (sctx->chip_class == GFX8) + si_cp_release_mem(sctx, cs, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, EOP_DST_SEL_MEM, + EOP_INT_SEL_NONE, EOP_DATA_SEL_DISCARD, NULL, 0, 0, SI_NOT_QUERY); + } + if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) + cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1); + } + + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { + /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); + } + if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_FLUSH_AND_INV_DB_META)) { + /* Flush HTILE. SURFACE_SYNC will wait for idle. */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); + } + + /* Wait for shader engines to go idle. + * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait + * for everything including CB/DB cache flushes. + */ + if (!flush_cb_db) { + if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + /* Only count explicit shader flushes, not implicit ones + * done by SURFACE_SYNC. + */ + sctx->num_vs_flushes++; + sctx->num_ps_flushes++; + } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + sctx->num_vs_flushes++; + } + } + + if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + sctx->num_cs_flushes++; + sctx->compute_is_busy = false; + } + + /* VGT state synchronization. */ + if (flags & SI_CONTEXT_VGT_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + } + if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0)); + } + + /* GFX9: Wait for idle if we're flushing CB or DB. 
ACQUIRE_MEM doesn't + * wait for idle on GFX9. We have to use a TS event. + */ + if (sctx->chip_class == GFX9 && flush_cb_db) { + uint64_t va; + unsigned tc_flags, cb_db_event; + + /* Set the CB/DB flush event. */ + switch (flush_cb_db) { + case SI_CONTEXT_FLUSH_AND_INV_CB: + cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS; + break; + case SI_CONTEXT_FLUSH_AND_INV_DB: + cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS; + break; + default: + /* both CB & DB */ + cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; + } + + /* These are the only allowed combinations. If you need to + * do multiple operations at once, do them separately. + * All operations that invalidate L2 also seem to invalidate + * metadata. Volatile (VOL) and WC flushes are not listed here. + * + * TC | TC_WB = writeback & invalidate L2 & L1 + * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC + * TC_WB | TC_NC = writeback L2 for MTYPE == NC + * TC | TC_NC = invalidate L2 for MTYPE == NC + * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.) + * TCL1 = invalidate L1 + */ + tc_flags = 0; + + if (flags & SI_CONTEXT_INV_L2_METADATA) { + tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA; + } + + /* Ideally flush TC together with CB/DB. */ + if (flags & SI_CONTEXT_INV_L2) { + /* Writeback and invalidate everything in L2 & L1. */ + tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA; + + /* Clear the flags. */ + flags &= ~(SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_VCACHE); + sctx->num_L2_invalidates++; + } + + /* Do the flush (enqueue the event and wait for it). */ + va = sctx->wait_mem_scratch->gpu_address; + sctx->wait_mem_number++; + + si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM, + EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT, + sctx->wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY); + si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL); + } + + /* Make sure ME is idle (it executes most packets) before continuing. + * This prevents read-after-write hazards between PFP and ME. + */ + if (sctx->has_graphics && + (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) { + radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cs, 0); + } + + /* GFX6-GFX8 only: + * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC + * waits for idle, so it should be last. SURFACE_SYNC is done in PFP. + * + * cp_coher_cntl should contain all necessary flags except TC flags + * at this point. + * + * GFX6-GFX7 don't support L2 write-back. + */ + if (flags & SI_CONTEXT_INV_L2 || (sctx->chip_class <= GFX7 && (flags & SI_CONTEXT_WB_L2))) { + /* Invalidate L1 & L2. (L1 is always invalidated on GFX6) + * WB must be set on GFX8+ when TC_ACTION is set. + */ + si_emit_surface_sync(sctx, sctx->gfx_cs, + cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) | + S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8)); + cp_coher_cntl = 0; + sctx->num_L2_invalidates++; + } else { + /* L1 invalidation and L2 writeback must be done separately, + * because both operations can't be done together. + */ + if (flags & SI_CONTEXT_WB_L2) { + /* WB = write-back + * NC = apply to non-coherent MTYPEs + * (i.e. MTYPE <= 1, which is what we use everywhere) + * + * WB doesn't work without NC. 
+ */ + si_emit_surface_sync( + sctx, sctx->gfx_cs, + cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1)); + cp_coher_cntl = 0; + sctx->num_L2_writebacks++; + } + if (flags & SI_CONTEXT_INV_VCACHE) { + /* Invalidate per-CU VMEM L1. */ + si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1)); + cp_coher_cntl = 0; + } + } + + /* If TC flushes haven't cleared this... */ + if (cp_coher_cntl) + si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl); + + if (is_barrier) + si_prim_discard_signal_next_compute_ib_start(sctx); + + if (flags & SI_CONTEXT_START_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0)); + } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); + } + + sctx->flags = 0; } -static void si_get_draw_start_count(struct si_context *sctx, - const struct pipe_draw_info *info, - unsigned *start, unsigned *count) +static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_draw_info *info, + unsigned *start, unsigned *count) { - struct pipe_draw_indirect_info *indirect = info->indirect; - - if (indirect) { - unsigned indirect_count; - struct pipe_transfer *transfer; - unsigned begin, end; - unsigned map_size; - unsigned *data; - - if (indirect->indirect_draw_count) { - data = pipe_buffer_map_range(&sctx->b, - indirect->indirect_draw_count, - indirect->indirect_draw_count_offset, - sizeof(unsigned), - PIPE_TRANSFER_READ, &transfer); - - indirect_count = *data; - - pipe_buffer_unmap(&sctx->b, transfer); - } else { - indirect_count = indirect->draw_count; - } - - if (!indirect_count) { - *start = *count = 0; - return; - } - - map_size = (indirect_count - 1) * indirect->stride + 3 * sizeof(unsigned); - data = pipe_buffer_map_range(&sctx->b, indirect->buffer, - indirect->offset, map_size, - PIPE_TRANSFER_READ, &transfer); - - begin = UINT_MAX; - end = 0; - - for (unsigned i = 0; i < indirect_count; ++i) { - unsigned count = data[0]; - unsigned start = data[2]; - - if (count > 0) { - begin = MIN2(begin, start); - end = MAX2(end, start + count); - } - - data += indirect->stride / sizeof(unsigned); - } - - pipe_buffer_unmap(&sctx->b, transfer); - - if (begin < end) { - *start = begin; - *count = end - begin; - } else { - *start = *count = 0; - } - } else { - *start = info->start; - *count = info->count; - } + struct pipe_draw_indirect_info *indirect = info->indirect; + + if (indirect) { + unsigned indirect_count; + struct pipe_transfer *transfer; + unsigned begin, end; + unsigned map_size; + unsigned *data; + + if (indirect->indirect_draw_count) { + data = pipe_buffer_map_range(&sctx->b, indirect->indirect_draw_count, + indirect->indirect_draw_count_offset, sizeof(unsigned), + PIPE_TRANSFER_READ, &transfer); + + indirect_count = *data; + + pipe_buffer_unmap(&sctx->b, transfer); + } else { + indirect_count = indirect->draw_count; + } + + if (!indirect_count) { + *start = *count = 0; + return; + } + + map_size = (indirect_count - 1) * indirect->stride + 3 * sizeof(unsigned); + data = pipe_buffer_map_range(&sctx->b, indirect->buffer, indirect->offset, map_size, + PIPE_TRANSFER_READ, &transfer); + + begin = UINT_MAX; + end = 0; + + for (unsigned i = 0; i < indirect_count; ++i) { + unsigned count = data[0]; + unsigned start = data[2]; + + if (count > 0) { + begin = MIN2(begin, start); + end = MAX2(end, start + 
count); + } + + data += indirect->stride / sizeof(unsigned); + } + + pipe_buffer_unmap(&sctx->b, transfer); + + if (begin < end) { + *start = begin; + *count = end - begin; + } else { + *start = *count = 0; + } + } else { + *start = info->start; + *count = info->count; + } } static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info, - enum pipe_prim_type prim, unsigned instance_count, - bool primitive_restart, unsigned skip_atom_mask) + enum pipe_prim_type prim, unsigned instance_count, + bool primitive_restart, unsigned skip_atom_mask) { - unsigned num_patches = 0; + unsigned num_patches = 0; - si_emit_rasterizer_prim_state(sctx); - if (sctx->tes_shader.cso) - si_emit_derived_tess_state(sctx, info, &num_patches); + si_emit_rasterizer_prim_state(sctx); + if (sctx->tes_shader.cso) + si_emit_derived_tess_state(sctx, info, &num_patches); - /* Emit state atoms. */ - unsigned mask = sctx->dirty_atoms & ~skip_atom_mask; - while (mask) - sctx->atoms.array[u_bit_scan(&mask)].emit(sctx); + /* Emit state atoms. */ + unsigned mask = sctx->dirty_atoms & ~skip_atom_mask; + while (mask) + sctx->atoms.array[u_bit_scan(&mask)].emit(sctx); - sctx->dirty_atoms &= skip_atom_mask; + sctx->dirty_atoms &= skip_atom_mask; - /* Emit states. */ - mask = sctx->dirty_states; - while (mask) { - unsigned i = u_bit_scan(&mask); - struct si_pm4_state *state = sctx->queued.array[i]; + /* Emit states. */ + mask = sctx->dirty_states; + while (mask) { + unsigned i = u_bit_scan(&mask); + struct si_pm4_state *state = sctx->queued.array[i]; - if (!state || sctx->emitted.array[i] == state) - continue; + if (!state || sctx->emitted.array[i] == state) + continue; - si_pm4_emit(sctx, state); - sctx->emitted.array[i] = state; - } - sctx->dirty_states = 0; + si_pm4_emit(sctx, state); + sctx->emitted.array[i] = state; + } + sctx->dirty_states = 0; - /* Emit draw states. */ - si_emit_vs_state(sctx, info); - si_emit_draw_registers(sctx, info, prim, num_patches, instance_count, - primitive_restart); + /* Emit draw states. */ + si_emit_vs_state(sctx, info); + si_emit_draw_registers(sctx, info, prim, num_patches, instance_count, primitive_restart); } -static bool -si_all_vs_resources_read_only(struct si_context *sctx, - struct pipe_resource *indexbuf) +static bool si_all_vs_resources_read_only(struct si_context *sctx, struct pipe_resource *indexbuf) { - struct radeon_winsys *ws = sctx->ws; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - - /* Index buffer. */ - if (indexbuf && - ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf, - RADEON_USAGE_WRITE)) - goto has_write_reference; - - /* Vertex buffers. */ - struct si_vertex_elements *velems = sctx->vertex_elements; - unsigned num_velems = velems->count; - - for (unsigned i = 0; i < num_velems; i++) { - if (!((1 << i) & velems->first_vb_use_mask)) - continue; - - unsigned vb_index = velems->vertex_buffer_index[i]; - struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource; - if (!res) - continue; - - if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, - RADEON_USAGE_WRITE)) - goto has_write_reference; - } - - /* Constant and shader buffers. 
*/ - struct si_descriptors *buffers = - &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)]; - for (unsigned i = 0; i < buffers->num_active_slots; i++) { - unsigned index = buffers->first_active_slot + i; - struct pipe_resource *res = - sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index]; - if (!res) - continue; - - if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, - RADEON_USAGE_WRITE)) - goto has_write_reference; - } - - /* Samplers. */ - struct si_shader_selector *vs = sctx->vs_shader.cso; - if (vs->info.samplers_declared) { - unsigned num_samplers = util_last_bit(vs->info.samplers_declared); - - for (unsigned i = 0; i < num_samplers; i++) { - struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i]; - if (!view) - continue; - - if (ws->cs_is_buffer_referenced(cs, - si_resource(view->texture)->buf, - RADEON_USAGE_WRITE)) - goto has_write_reference; - } - } - - /* Images. */ - if (vs->info.images_declared) { - unsigned num_images = util_last_bit(vs->info.images_declared); - - for (unsigned i = 0; i < num_images; i++) { - struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource; - if (!res) - continue; - - if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, - RADEON_USAGE_WRITE)) - goto has_write_reference; - } - } - - return true; + struct radeon_winsys *ws = sctx->ws; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + + /* Index buffer. */ + if (indexbuf && ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf, RADEON_USAGE_WRITE)) + goto has_write_reference; + + /* Vertex buffers. */ + struct si_vertex_elements *velems = sctx->vertex_elements; + unsigned num_velems = velems->count; + + for (unsigned i = 0; i < num_velems; i++) { + if (!((1 << i) & velems->first_vb_use_mask)) + continue; + + unsigned vb_index = velems->vertex_buffer_index[i]; + struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource; + if (!res) + continue; + + if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE)) + goto has_write_reference; + } + + /* Constant and shader buffers. */ + struct si_descriptors *buffers = + &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)]; + for (unsigned i = 0; i < buffers->num_active_slots; i++) { + unsigned index = buffers->first_active_slot + i; + struct pipe_resource *res = sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index]; + if (!res) + continue; + + if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE)) + goto has_write_reference; + } + + /* Samplers. */ + struct si_shader_selector *vs = sctx->vs_shader.cso; + if (vs->info.samplers_declared) { + unsigned num_samplers = util_last_bit(vs->info.samplers_declared); + + for (unsigned i = 0; i < num_samplers; i++) { + struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i]; + if (!view) + continue; + + if (ws->cs_is_buffer_referenced(cs, si_resource(view->texture)->buf, RADEON_USAGE_WRITE)) + goto has_write_reference; + } + } + + /* Images. 
*/ + if (vs->info.images_declared) { + unsigned num_images = util_last_bit(vs->info.images_declared); + + for (unsigned i = 0; i < num_images; i++) { + struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource; + if (!res) + continue; + + if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE)) + goto has_write_reference; + } + } + + return true; has_write_reference: - /* If the current gfx IB has enough packets, flush it to remove write - * references to buffers. - */ - if (cs->prev_dw + cs->current.cdw > 2048) { - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - assert(si_all_vs_resources_read_only(sctx, indexbuf)); - return true; - } - return false; + /* If the current gfx IB has enough packets, flush it to remove write + * references to buffers. + */ + if (cs->prev_dw + cs->current.cdw > 2048) { + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + assert(si_all_vs_resources_read_only(sctx, indexbuf)); + return true; + } + return false; } static ALWAYS_INLINE bool pd_msg(const char *s) { - if (SI_PRIM_DISCARD_DEBUG) - printf("PD failed: %s\n", s); - return false; + if (SI_PRIM_DISCARD_DEBUG) + printf("PD failed: %s\n", s); + return false; } static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct pipe_resource *indexbuf = info->index.resource; - unsigned dirty_tex_counter, dirty_buf_counter; - enum pipe_prim_type rast_prim, prim = info->mode; - unsigned index_size = info->index_size; - unsigned index_offset = info->indirect ? info->start * index_size : 0; - unsigned instance_count = info->instance_count; - bool primitive_restart = info->primitive_restart && - (!sctx->screen->options.prim_restart_tri_strips_only || - (prim != PIPE_PRIM_TRIANGLE_STRIP && - prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)); - - if (likely(!info->indirect)) { - /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is - * no workaround for indirect draws, but we can at least skip - * direct draws. - */ - if (unlikely(!instance_count)) - return; - - /* Handle count == 0. */ - if (unlikely(!info->count && - (index_size || !info->count_from_stream_output))) - return; - } - - struct si_shader_selector *vs = sctx->vs_shader.cso; - if (unlikely(!vs || - sctx->num_vertex_elements < vs->num_vs_inputs || - (!sctx->ps_shader.cso && !rs->rasterizer_discard) || - (!!sctx->tes_shader.cso != (prim == PIPE_PRIM_PATCHES)))) { - assert(0); - return; - } - - /* Recompute and re-emit the texture resource states if needed. */ - dirty_tex_counter = p_atomic_read(&sctx->screen->dirty_tex_counter); - if (unlikely(dirty_tex_counter != sctx->last_dirty_tex_counter)) { - sctx->last_dirty_tex_counter = dirty_tex_counter; - sctx->framebuffer.dirty_cbufs |= - ((1 << sctx->framebuffer.state.nr_cbufs) - 1); - sctx->framebuffer.dirty_zsbuf = true; - si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); - si_update_all_texture_descriptors(sctx); - } - - dirty_buf_counter = p_atomic_read(&sctx->screen->dirty_buf_counter); - if (unlikely(dirty_buf_counter != sctx->last_dirty_buf_counter)) { - sctx->last_dirty_buf_counter = dirty_buf_counter; - /* Rebind all buffers unconditionally. */ - si_rebind_buffer(sctx, NULL); - } - - si_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)); - - /* Set the rasterization primitive type. 
- * - * This must be done after si_decompress_textures, which can call - * draw_vbo recursively, and before si_update_shaders, which uses - * current_rast_prim for this draw_vbo call. */ - if (sctx->gs_shader.cso) { - /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ - rast_prim = sctx->gs_shader.cso->rast_prim; - } else if (sctx->tes_shader.cso) { - /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ - rast_prim = sctx->tes_shader.cso->rast_prim; - } else if (util_rast_prim_is_triangles(prim)) { - rast_prim = PIPE_PRIM_TRIANGLES; - } else { - /* Only possibilities, POINTS, LINE*, RECTANGLES */ - rast_prim = prim; - } - - if (rast_prim != sctx->current_rast_prim) { - if (util_prim_is_points_or_lines(sctx->current_rast_prim) != - util_prim_is_points_or_lines(rast_prim)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); - - sctx->current_rast_prim = rast_prim; - sctx->do_update_shaders = true; - } - - if (sctx->tes_shader.cso && - sctx->screen->info.has_ls_vgpr_init_bug) { - /* Determine whether the LS VGPR fix should be applied. - * - * It is only required when num input CPs > num output CPs, - * which cannot happen with the fixed function TCS. We should - * also update this bit when switching from TCS to fixed - * function TCS. - */ - struct si_shader_selector *tcs = sctx->tcs_shader.cso; - bool ls_vgpr_fix = - tcs && - info->vertices_per_patch > - tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; - - if (ls_vgpr_fix != sctx->ls_vgpr_fix) { - sctx->ls_vgpr_fix = ls_vgpr_fix; - sctx->do_update_shaders = true; - } - } - - if (sctx->chip_class <= GFX9 && sctx->gs_shader.cso) { - /* Determine whether the GS triangle strip adjacency fix should - * be applied. Rotate every other triangle if - * - triangle strips with adjacency are fed to the GS and - * - primitive restart is disabled (the rotation doesn't help - * when the restart occurs after an odd number of triangles). - */ - bool gs_tri_strip_adj_fix = - !sctx->tes_shader.cso && - prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY && - !primitive_restart; - - if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) { - sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix; - sctx->do_update_shaders = true; - } - } - - if (index_size) { - /* Translate or upload, if needed. */ - /* 8-bit indices are supported on GFX8. */ - if (sctx->chip_class <= GFX7 && index_size == 1) { - unsigned start, count, start_offset, size, offset; - void *ptr; - - si_get_draw_start_count(sctx, info, &start, &count); - start_offset = start * 2; - size = count * 2; - - indexbuf = NULL; - u_upload_alloc(ctx->stream_uploader, start_offset, - size, - si_optimal_tcc_alignment(sctx, size), - &offset, &indexbuf, &ptr); - if (!indexbuf) - return; - - util_shorten_ubyte_elts_to_userptr(&sctx->b, info, 0, 0, - index_offset + start, - count, ptr); - - /* info->start will be added by the drawing code */ - index_offset = offset - start_offset; - index_size = 2; - } else if (info->has_user_indices) { - unsigned start_offset; - - assert(!info->indirect); - start_offset = info->start * index_size; - - indexbuf = NULL; - u_upload_data(ctx->stream_uploader, start_offset, - info->count * index_size, - sctx->screen->info.tcc_cache_line_size, - (char*)info->index.user + start_offset, - &index_offset, &indexbuf); - if (!indexbuf) - return; - - /* info->start will be added by the drawing code */ - index_offset -= start_offset; - } else if (sctx->chip_class <= GFX7 && - si_resource(indexbuf)->TC_L2_dirty) { - /* GFX8 reads index buffers through TC L2, so it doesn't - * need this. 
*/ - sctx->flags |= SI_CONTEXT_WB_L2; - si_resource(indexbuf)->TC_L2_dirty = false; - } - } - - bool dispatch_prim_discard_cs = false; - bool prim_discard_cs_instancing = false; - unsigned original_index_size = index_size; - unsigned direct_count = 0; - - if (info->indirect) { - struct pipe_draw_indirect_info *indirect = info->indirect; - - /* Add the buffer size for memory checking in need_cs_space. */ - si_context_add_resource_size(sctx, indirect->buffer); - - /* Indirect buffers use TC L2 on GFX9, but not older hw. */ - if (sctx->chip_class <= GFX8) { - if (si_resource(indirect->buffer)->TC_L2_dirty) { - sctx->flags |= SI_CONTEXT_WB_L2; - si_resource(indirect->buffer)->TC_L2_dirty = false; - } - - if (indirect->indirect_draw_count && - si_resource(indirect->indirect_draw_count)->TC_L2_dirty) { - sctx->flags |= SI_CONTEXT_WB_L2; - si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false; - } - } - } else { - /* Multiply by 3 for strips and fans to get an approximate vertex - * count as triangles. */ - direct_count = info->count * instance_count * - (prim == PIPE_PRIM_TRIANGLES ? 1 : 3); - } - - /* Determine if we can use the primitive discard compute shader. */ - if (si_compute_prim_discard_enabled(sctx) && - (direct_count > sctx->prim_discard_vertex_count_threshold ? - (sctx->compute_num_verts_rejected += direct_count, true) : /* Add, then return true. */ - (sctx->compute_num_verts_ineligible += direct_count, false)) && /* Add, then return false. */ - (!info->count_from_stream_output || pd_msg("draw_opaque")) && - (primitive_restart ? - /* Supported prim types with primitive restart: */ - (prim == PIPE_PRIM_TRIANGLE_STRIP || pd_msg("bad prim type with primitive restart")) && - /* Disallow instancing with primitive restart: */ - (instance_count == 1 || pd_msg("instance_count > 1 with primitive restart")) : - /* Supported prim types without primitive restart + allow instancing: */ - (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | - (1 << PIPE_PRIM_TRIANGLE_STRIP) | - (1 << PIPE_PRIM_TRIANGLE_FAN)) && - /* Instancing is limited to 16-bit indices, because InstanceID is packed into VertexID. */ - /* TODO: DrawArraysInstanced doesn't sometimes work, so it's disabled. */ - (instance_count == 1 || - (instance_count <= USHRT_MAX && index_size && index_size <= 2) || - pd_msg("instance_count too large or index_size == 4 or DrawArraysInstanced"))) && - (info->drawid == 0 || !sctx->vs_shader.cso->info.uses_drawid || pd_msg("draw_id > 0")) && - (!sctx->render_cond || pd_msg("render condition")) && - /* Forced enablement ignores pipeline statistics queries. */ - (sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) || - (!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) || - pd_msg("pipestat or primgen query")) && - (!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) && - (!sctx->tes_shader.cso || pd_msg("uses tess")) && - (!sctx->gs_shader.cso || pd_msg("uses GS")) && - (!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) && - !rs->polygon_mode_enabled && + struct si_context *sctx = (struct si_context *)ctx; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct pipe_resource *indexbuf = info->index.resource; + unsigned dirty_tex_counter, dirty_buf_counter; + enum pipe_prim_type rast_prim, prim = info->mode; + unsigned index_size = info->index_size; + unsigned index_offset = info->indirect ? 
info->start * index_size : 0; + unsigned instance_count = info->instance_count; + bool primitive_restart = + info->primitive_restart && + (!sctx->screen->options.prim_restart_tri_strips_only || + (prim != PIPE_PRIM_TRIANGLE_STRIP && prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)); + + if (likely(!info->indirect)) { + /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is + * no workaround for indirect draws, but we can at least skip + * direct draws. + */ + if (unlikely(!instance_count)) + return; + + /* Handle count == 0. */ + if (unlikely(!info->count && (index_size || !info->count_from_stream_output))) + return; + } + + struct si_shader_selector *vs = sctx->vs_shader.cso; + if (unlikely(!vs || sctx->num_vertex_elements < vs->num_vs_inputs || + (!sctx->ps_shader.cso && !rs->rasterizer_discard) || + (!!sctx->tes_shader.cso != (prim == PIPE_PRIM_PATCHES)))) { + assert(0); + return; + } + + /* Recompute and re-emit the texture resource states if needed. */ + dirty_tex_counter = p_atomic_read(&sctx->screen->dirty_tex_counter); + if (unlikely(dirty_tex_counter != sctx->last_dirty_tex_counter)) { + sctx->last_dirty_tex_counter = dirty_tex_counter; + sctx->framebuffer.dirty_cbufs |= ((1 << sctx->framebuffer.state.nr_cbufs) - 1); + sctx->framebuffer.dirty_zsbuf = true; + si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer); + si_update_all_texture_descriptors(sctx); + } + + dirty_buf_counter = p_atomic_read(&sctx->screen->dirty_buf_counter); + if (unlikely(dirty_buf_counter != sctx->last_dirty_buf_counter)) { + sctx->last_dirty_buf_counter = dirty_buf_counter; + /* Rebind all buffers unconditionally. */ + si_rebind_buffer(sctx, NULL); + } + + si_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS)); + + /* Set the rasterization primitive type. + * + * This must be done after si_decompress_textures, which can call + * draw_vbo recursively, and before si_update_shaders, which uses + * current_rast_prim for this draw_vbo call. */ + if (sctx->gs_shader.cso) { + /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ + rast_prim = sctx->gs_shader.cso->rast_prim; + } else if (sctx->tes_shader.cso) { + /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ + rast_prim = sctx->tes_shader.cso->rast_prim; + } else if (util_rast_prim_is_triangles(prim)) { + rast_prim = PIPE_PRIM_TRIANGLES; + } else { + /* Only possibilities, POINTS, LINE*, RECTANGLES */ + rast_prim = prim; + } + + if (rast_prim != sctx->current_rast_prim) { + if (util_prim_is_points_or_lines(sctx->current_rast_prim) != + util_prim_is_points_or_lines(rast_prim)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); + + sctx->current_rast_prim = rast_prim; + sctx->do_update_shaders = true; + } + + if (sctx->tes_shader.cso && sctx->screen->info.has_ls_vgpr_init_bug) { + /* Determine whether the LS VGPR fix should be applied. + * + * It is only required when num input CPs > num output CPs, + * which cannot happen with the fixed function TCS. We should + * also update this bit when switching from TCS to fixed + * function TCS. + */ + struct si_shader_selector *tcs = sctx->tcs_shader.cso; + bool ls_vgpr_fix = + tcs && info->vertices_per_patch > tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT]; + + if (ls_vgpr_fix != sctx->ls_vgpr_fix) { + sctx->ls_vgpr_fix = ls_vgpr_fix; + sctx->do_update_shaders = true; + } + } + + if (sctx->chip_class <= GFX9 && sctx->gs_shader.cso) { + /* Determine whether the GS triangle strip adjacency fix should + * be applied. 
Rotate every other triangle if + * - triangle strips with adjacency are fed to the GS and + * - primitive restart is disabled (the rotation doesn't help + * when the restart occurs after an odd number of triangles). + */ + bool gs_tri_strip_adj_fix = + !sctx->tes_shader.cso && prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY && !primitive_restart; + + if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) { + sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix; + sctx->do_update_shaders = true; + } + } + + if (index_size) { + /* Translate or upload, if needed. */ + /* 8-bit indices are supported on GFX8. */ + if (sctx->chip_class <= GFX7 && index_size == 1) { + unsigned start, count, start_offset, size, offset; + void *ptr; + + si_get_draw_start_count(sctx, info, &start, &count); + start_offset = start * 2; + size = count * 2; + + indexbuf = NULL; + u_upload_alloc(ctx->stream_uploader, start_offset, size, + si_optimal_tcc_alignment(sctx, size), &offset, &indexbuf, &ptr); + if (!indexbuf) + return; + + util_shorten_ubyte_elts_to_userptr(&sctx->b, info, 0, 0, index_offset + start, count, ptr); + + /* info->start will be added by the drawing code */ + index_offset = offset - start_offset; + index_size = 2; + } else if (info->has_user_indices) { + unsigned start_offset; + + assert(!info->indirect); + start_offset = info->start * index_size; + + indexbuf = NULL; + u_upload_data(ctx->stream_uploader, start_offset, info->count * index_size, + sctx->screen->info.tcc_cache_line_size, + (char *)info->index.user + start_offset, &index_offset, &indexbuf); + if (!indexbuf) + return; + + /* info->start will be added by the drawing code */ + index_offset -= start_offset; + } else if (sctx->chip_class <= GFX7 && si_resource(indexbuf)->TC_L2_dirty) { + /* GFX8 reads index buffers through TC L2, so it doesn't + * need this. */ + sctx->flags |= SI_CONTEXT_WB_L2; + si_resource(indexbuf)->TC_L2_dirty = false; + } + } + + bool dispatch_prim_discard_cs = false; + bool prim_discard_cs_instancing = false; + unsigned original_index_size = index_size; + unsigned direct_count = 0; + + if (info->indirect) { + struct pipe_draw_indirect_info *indirect = info->indirect; + + /* Add the buffer size for memory checking in need_cs_space. */ + si_context_add_resource_size(sctx, indirect->buffer); + + /* Indirect buffers use TC L2 on GFX9, but not older hw. */ + if (sctx->chip_class <= GFX8) { + if (si_resource(indirect->buffer)->TC_L2_dirty) { + sctx->flags |= SI_CONTEXT_WB_L2; + si_resource(indirect->buffer)->TC_L2_dirty = false; + } + + if (indirect->indirect_draw_count && + si_resource(indirect->indirect_draw_count)->TC_L2_dirty) { + sctx->flags |= SI_CONTEXT_WB_L2; + si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false; + } + } + } else { + /* Multiply by 3 for strips and fans to get an approximate vertex + * count as triangles. */ + direct_count = info->count * instance_count * (prim == PIPE_PRIM_TRIANGLES ? 1 : 3); + } + + /* Determine if we can use the primitive discard compute shader. */ + if (si_compute_prim_discard_enabled(sctx) && + (direct_count > sctx->prim_discard_vertex_count_threshold + ? (sctx->compute_num_verts_rejected += direct_count, true) + : /* Add, then return true. */ + (sctx->compute_num_verts_ineligible += direct_count, + false)) && /* Add, then return false. */ + (!info->count_from_stream_output || pd_msg("draw_opaque")) && + (primitive_restart ? 
+ /* Supported prim types with primitive restart: */ + (prim == PIPE_PRIM_TRIANGLE_STRIP || pd_msg("bad prim type with primitive restart")) && + /* Disallow instancing with primitive restart: */ + (instance_count == 1 || pd_msg("instance_count > 1 with primitive restart")) + : + /* Supported prim types without primitive restart + allow instancing: */ + (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) | + (1 << PIPE_PRIM_TRIANGLE_FAN)) && + /* Instancing is limited to 16-bit indices, because InstanceID is packed into + VertexID. */ + /* TODO: DrawArraysInstanced doesn't sometimes work, so it's disabled. */ + (instance_count == 1 || + (instance_count <= USHRT_MAX && index_size && index_size <= 2) || + pd_msg("instance_count too large or index_size == 4 or DrawArraysInstanced"))) && + (info->drawid == 0 || !sctx->vs_shader.cso->info.uses_drawid || pd_msg("draw_id > 0")) && + (!sctx->render_cond || pd_msg("render condition")) && + /* Forced enablement ignores pipeline statistics queries. */ + (sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) || + (!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) || + pd_msg("pipestat or primgen query")) && + (!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) && + (!sctx->tes_shader.cso || pd_msg("uses tess")) && + (!sctx->gs_shader.cso || pd_msg("uses GS")) && + (!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) && + !rs->polygon_mode_enabled && #if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */ - (!sctx->vs_shader.cso->info.uses_bindless_images || pd_msg("uses bindless images")) && - (!sctx->vs_shader.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) && - (!sctx->vs_shader.cso->info.writes_memory || pd_msg("writes memory")) && - (!sctx->vs_shader.cso->info.writes_viewport_index || pd_msg("writes viewport index")) && - !sctx->vs_shader.cso->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] && - !sctx->vs_shader.cso->so.num_outputs && + (!sctx->vs_shader.cso->info.uses_bindless_images || pd_msg("uses bindless images")) && + (!sctx->vs_shader.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) && + (!sctx->vs_shader.cso->info.writes_memory || pd_msg("writes memory")) && + (!sctx->vs_shader.cso->info.writes_viewport_index || pd_msg("writes viewport index")) && + !sctx->vs_shader.cso->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] && + !sctx->vs_shader.cso->so.num_outputs && #else - (sctx->vs_shader.cso->prim_discard_cs_allowed || pd_msg("VS shader uses unsupported features")) && + (sctx->vs_shader.cso->prim_discard_cs_allowed || + pd_msg("VS shader uses unsupported features")) && #endif - /* Check that all buffers are used for read only, because compute - * dispatches can run ahead. */ - (si_all_vs_resources_read_only(sctx, index_size ? 
indexbuf : NULL) || pd_msg("write reference"))) { - switch (si_prepare_prim_discard_or_split_draw(sctx, info, primitive_restart)) { - case SI_PRIM_DISCARD_ENABLED: - original_index_size = index_size; - prim_discard_cs_instancing = instance_count > 1; - dispatch_prim_discard_cs = true; - - /* The compute shader changes/lowers the following: */ - prim = PIPE_PRIM_TRIANGLES; - index_size = 4; - instance_count = 1; - primitive_restart = false; - sctx->compute_num_verts_rejected -= direct_count; - sctx->compute_num_verts_accepted += direct_count; - break; - case SI_PRIM_DISCARD_DISABLED: - break; - case SI_PRIM_DISCARD_DRAW_SPLIT: - sctx->compute_num_verts_rejected -= direct_count; - goto return_cleanup; - } - } - - if (prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) { - sctx->prim_discard_cs_instancing = prim_discard_cs_instancing; - sctx->do_update_shaders = true; - } - - /* Update NGG culling settings. */ - if (sctx->ngg && - !dispatch_prim_discard_cs && - rast_prim == PIPE_PRIM_TRIANGLES && - (sctx->screen->always_use_ngg_culling || - /* At least 1024 non-indexed vertices (8 subgroups) are needed - * per draw call (no TES/GS) to enable NGG culling. - */ - (!index_size && direct_count >= 1024 && - (prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) && - !sctx->tes_shader.cso && !sctx->gs_shader.cso)) && - si_get_vs(sctx)->cso->ngg_culling_allowed) { - unsigned ngg_culling = 0; - - if (rs->rasterizer_discard) { - ngg_culling |= SI_NGG_CULL_FRONT_FACE | - SI_NGG_CULL_BACK_FACE; - } else { - /* Polygon mode can't use view and small primitive culling, - * because it draws points or lines where the culling depends - * on the point or line width. - */ - if (!rs->polygon_mode_enabled) - ngg_culling |= SI_NGG_CULL_VIEW_SMALLPRIMS; - - if (sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front) - ngg_culling |= SI_NGG_CULL_FRONT_FACE; - if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back) - ngg_culling |= SI_NGG_CULL_BACK_FACE; - } - - /* Use NGG fast launch for certain non-indexed primitive types. - * A draw must have at least 1 full primitive. - */ - if (ngg_culling && !index_size && direct_count >= 3 && - !sctx->tes_shader.cso && !sctx->gs_shader.cso) { - if (prim == PIPE_PRIM_TRIANGLES) - ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST; - else if (prim == PIPE_PRIM_TRIANGLE_STRIP) - ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP; - } - - if (ngg_culling != sctx->ngg_culling) { - /* Insert a VGT_FLUSH when enabling fast launch changes to prevent hangs. - * See issues #2418, #2426, #2434 - */ - if (ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) - sctx->flags |= SI_CONTEXT_VGT_FLUSH; - sctx->ngg_culling = ngg_culling; - sctx->do_update_shaders = true; - } - } else if (sctx->ngg_culling) { - sctx->ngg_culling = false; - sctx->do_update_shaders = true; - } - - if (sctx->do_update_shaders && !si_update_shaders(sctx)) - goto return_cleanup; - - si_need_gfx_cs_space(sctx); - - if (sctx->bo_list_add_all_gfx_resources) - si_gfx_resources_add_all_to_bo_list(sctx); - - /* Since we've called si_context_add_resource_size for vertex buffers, - * this must be called after si_need_cs_space, because we must let - * need_cs_space flush before we add buffers to the buffer list. - */ - if (!si_upload_vertex_buffer_descriptors(sctx)) - goto return_cleanup; - - /* Vega10/Raven scissor bug workaround. When any context register is - * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR - * registers must be written too. 
- */ - unsigned masked_atoms = 0; - - if (sctx->screen->info.has_gfx9_scissor_bug) { - masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors); - - if (info->count_from_stream_output || - sctx->dirty_atoms & si_atoms_that_always_roll_context() || - sctx->dirty_states & si_states_that_always_roll_context()) - sctx->context_roll = true; - } - - /* Use optimal packet order based on whether we need to sync the pipeline. */ - if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB | - SI_CONTEXT_FLUSH_AND_INV_DB | - SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH))) { - /* If we have to wait for idle, set all states first, so that all - * SET packets are processed in parallel with previous draw calls. - * Then draw and prefetch at the end. This ensures that the time - * the CUs are idle is very short. - */ - if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND)) - masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond); - - if (!si_upload_graphics_shader_descriptors(sctx)) - goto return_cleanup; - - /* Emit all states except possibly render condition. */ - si_emit_all_states(sctx, info, prim, instance_count, - primitive_restart, masked_atoms); - sctx->emit_cache_flush(sctx); - /* <-- CUs are idle here. */ - - if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) - sctx->atoms.s.render_cond.emit(sctx); - - if (sctx->screen->info.has_gfx9_scissor_bug && - (sctx->context_roll || - si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) - sctx->atoms.s.scissors.emit(sctx); - - sctx->dirty_atoms = 0; - - si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, - instance_count, dispatch_prim_discard_cs, - original_index_size); - /* <-- CUs are busy here. */ - - /* Start prefetches after the draw has been started. Both will run - * in parallel, but starting the draw first is more important. - */ - if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask) - cik_emit_prefetch_L2(sctx, false); - } else { - /* If we don't wait for idle, start prefetches first, then set - * states, and draw at the end. - */ - if (sctx->flags) - sctx->emit_cache_flush(sctx); - - /* Only prefetch the API VS and VBO descriptors. */ - if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask) - cik_emit_prefetch_L2(sctx, true); - - if (!si_upload_graphics_shader_descriptors(sctx)) - goto return_cleanup; - - si_emit_all_states(sctx, info, prim, instance_count, - primitive_restart, masked_atoms); - - if (sctx->screen->info.has_gfx9_scissor_bug && - (sctx->context_roll || - si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) - sctx->atoms.s.scissors.emit(sctx); - - sctx->dirty_atoms = 0; - - si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, - instance_count, dispatch_prim_discard_cs, - original_index_size); - - /* Prefetch the remaining shaders after the draw has been - * started. */ - if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask) - cik_emit_prefetch_L2(sctx, false); - } - - /* Mark the displayable dcc buffer as dirty in order to update - * it on the next call to si_flush_resource. */ - if (sctx->screen->info.use_display_dcc_with_retile_blit) { - /* Don't use si_update_fb_dirtiness_after_rendering because it'll - * cause unnecessary texture decompressions on each draw. 
*/ - unsigned displayable_dcc_cb_mask = sctx->framebuffer.displayable_dcc_cb_mask; - while (displayable_dcc_cb_mask) { - unsigned i = u_bit_scan(&displayable_dcc_cb_mask); - struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i]; - struct si_texture *tex = (struct si_texture*) surf->texture; - tex->displayable_dcc_dirty = true; - } - } - - /* Clear the context roll flag after the draw call. */ - sctx->context_roll = false; - - if (unlikely(sctx->current_saved_cs)) { - si_trace_emit(sctx); - si_log_draw_state(sctx, sctx->log); - } - - /* Workaround for a VGT hang when streamout is enabled. - * It must be done after drawing. */ - if ((sctx->family == CHIP_HAWAII || - sctx->family == CHIP_TONGA || - sctx->family == CHIP_FIJI) && - si_get_strmout_en(sctx)) { - sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC; - } - - if (unlikely(sctx->decompression_enabled)) { - sctx->num_decompress_calls++; - } else { - sctx->num_draw_calls++; - if (sctx->framebuffer.state.nr_cbufs > 1) - sctx->num_mrt_draw_calls++; - if (primitive_restart) - sctx->num_prim_restart_calls++; - if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size)) - sctx->num_spill_draw_calls++; - } + /* Check that all buffers are used for read only, because compute + * dispatches can run ahead. */ + (si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) || + pd_msg("write reference"))) { + switch (si_prepare_prim_discard_or_split_draw(sctx, info, primitive_restart)) { + case SI_PRIM_DISCARD_ENABLED: + original_index_size = index_size; + prim_discard_cs_instancing = instance_count > 1; + dispatch_prim_discard_cs = true; + + /* The compute shader changes/lowers the following: */ + prim = PIPE_PRIM_TRIANGLES; + index_size = 4; + instance_count = 1; + primitive_restart = false; + sctx->compute_num_verts_rejected -= direct_count; + sctx->compute_num_verts_accepted += direct_count; + break; + case SI_PRIM_DISCARD_DISABLED: + break; + case SI_PRIM_DISCARD_DRAW_SPLIT: + sctx->compute_num_verts_rejected -= direct_count; + goto return_cleanup; + } + } + + if (prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) { + sctx->prim_discard_cs_instancing = prim_discard_cs_instancing; + sctx->do_update_shaders = true; + } + + /* Update NGG culling settings. */ + if (sctx->ngg && !dispatch_prim_discard_cs && rast_prim == PIPE_PRIM_TRIANGLES && + (sctx->screen->always_use_ngg_culling || + /* At least 1024 non-indexed vertices (8 subgroups) are needed + * per draw call (no TES/GS) to enable NGG culling. + */ + (!index_size && direct_count >= 1024 && + (prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) && + !sctx->tes_shader.cso && !sctx->gs_shader.cso)) && + si_get_vs(sctx)->cso->ngg_culling_allowed) { + unsigned ngg_culling = 0; + + if (rs->rasterizer_discard) { + ngg_culling |= SI_NGG_CULL_FRONT_FACE | SI_NGG_CULL_BACK_FACE; + } else { + /* Polygon mode can't use view and small primitive culling, + * because it draws points or lines where the culling depends + * on the point or line width. + */ + if (!rs->polygon_mode_enabled) + ngg_culling |= SI_NGG_CULL_VIEW_SMALLPRIMS; + + if (sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front) + ngg_culling |= SI_NGG_CULL_FRONT_FACE; + if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back) + ngg_culling |= SI_NGG_CULL_BACK_FACE; + } + + /* Use NGG fast launch for certain non-indexed primitive types. + * A draw must have at least 1 full primitive. 
+ */ + if (ngg_culling && !index_size && direct_count >= 3 && !sctx->tes_shader.cso && + !sctx->gs_shader.cso) { + if (prim == PIPE_PRIM_TRIANGLES) + ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST; + else if (prim == PIPE_PRIM_TRIANGLE_STRIP) + ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP; + } + + if (ngg_culling != sctx->ngg_culling) { + /* Insert a VGT_FLUSH when enabling fast launch changes to prevent hangs. + * See issues #2418, #2426, #2434 + */ + if (ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) + sctx->flags |= SI_CONTEXT_VGT_FLUSH; + sctx->ngg_culling = ngg_culling; + sctx->do_update_shaders = true; + } + } else if (sctx->ngg_culling) { + sctx->ngg_culling = false; + sctx->do_update_shaders = true; + } + + if (sctx->do_update_shaders && !si_update_shaders(sctx)) + goto return_cleanup; + + si_need_gfx_cs_space(sctx); + + if (sctx->bo_list_add_all_gfx_resources) + si_gfx_resources_add_all_to_bo_list(sctx); + + /* Since we've called si_context_add_resource_size for vertex buffers, + * this must be called after si_need_cs_space, because we must let + * need_cs_space flush before we add buffers to the buffer list. + */ + if (!si_upload_vertex_buffer_descriptors(sctx)) + goto return_cleanup; + + /* Vega10/Raven scissor bug workaround. When any context register is + * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR + * registers must be written too. + */ + unsigned masked_atoms = 0; + + if (sctx->screen->info.has_gfx9_scissor_bug) { + masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors); + + if (info->count_from_stream_output || + sctx->dirty_atoms & si_atoms_that_always_roll_context() || + sctx->dirty_states & si_states_that_always_roll_context()) + sctx->context_roll = true; + } + + /* Use optimal packet order based on whether we need to sync the pipeline. */ + if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB | + SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH))) { + /* If we have to wait for idle, set all states first, so that all + * SET packets are processed in parallel with previous draw calls. + * Then draw and prefetch at the end. This ensures that the time + * the CUs are idle is very short. + */ + if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND)) + masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond); + + if (!si_upload_graphics_shader_descriptors(sctx)) + goto return_cleanup; + + /* Emit all states except possibly render condition. */ + si_emit_all_states(sctx, info, prim, instance_count, primitive_restart, masked_atoms); + sctx->emit_cache_flush(sctx); + /* <-- CUs are idle here. */ + + if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) + sctx->atoms.s.render_cond.emit(sctx); + + if (sctx->screen->info.has_gfx9_scissor_bug && + (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) + sctx->atoms.s.scissors.emit(sctx); + + sctx->dirty_atoms = 0; + + si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, instance_count, + dispatch_prim_discard_cs, original_index_size); + /* <-- CUs are busy here. */ + + /* Start prefetches after the draw has been started. Both will run + * in parallel, but starting the draw first is more important. + */ + if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask) + cik_emit_prefetch_L2(sctx, false); + } else { + /* If we don't wait for idle, start prefetches first, then set + * states, and draw at the end. 
+ */ + if (sctx->flags) + sctx->emit_cache_flush(sctx); + + /* Only prefetch the API VS and VBO descriptors. */ + if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask) + cik_emit_prefetch_L2(sctx, true); + + if (!si_upload_graphics_shader_descriptors(sctx)) + goto return_cleanup; + + si_emit_all_states(sctx, info, prim, instance_count, primitive_restart, masked_atoms); + + if (sctx->screen->info.has_gfx9_scissor_bug && + (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) + sctx->atoms.s.scissors.emit(sctx); + + sctx->dirty_atoms = 0; + + si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, instance_count, + dispatch_prim_discard_cs, original_index_size); + + /* Prefetch the remaining shaders after the draw has been + * started. */ + if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask) + cik_emit_prefetch_L2(sctx, false); + } + + /* Mark the displayable dcc buffer as dirty in order to update + * it on the next call to si_flush_resource. */ + if (sctx->screen->info.use_display_dcc_with_retile_blit) { + /* Don't use si_update_fb_dirtiness_after_rendering because it'll + * cause unnecessary texture decompressions on each draw. */ + unsigned displayable_dcc_cb_mask = sctx->framebuffer.displayable_dcc_cb_mask; + while (displayable_dcc_cb_mask) { + unsigned i = u_bit_scan(&displayable_dcc_cb_mask); + struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i]; + struct si_texture *tex = (struct si_texture *)surf->texture; + tex->displayable_dcc_dirty = true; + } + } + + /* Clear the context roll flag after the draw call. */ + sctx->context_roll = false; + + if (unlikely(sctx->current_saved_cs)) { + si_trace_emit(sctx); + si_log_draw_state(sctx, sctx->log); + } + + /* Workaround for a VGT hang when streamout is enabled. + * It must be done after drawing. */ + if ((sctx->family == CHIP_HAWAII || sctx->family == CHIP_TONGA || sctx->family == CHIP_FIJI) && + si_get_strmout_en(sctx)) { + sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC; + } + + if (unlikely(sctx->decompression_enabled)) { + sctx->num_decompress_calls++; + } else { + sctx->num_draw_calls++; + if (sctx->framebuffer.state.nr_cbufs > 1) + sctx->num_mrt_draw_calls++; + if (primitive_restart) + sctx->num_prim_restart_calls++; + if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size)) + sctx->num_spill_draw_calls++; + } return_cleanup: - if (index_size && indexbuf != info->index.resource) - pipe_resource_reference(&indexbuf, NULL); + if (index_size && indexbuf != info->index.resource) + pipe_resource_reference(&indexbuf, NULL); } -static void -si_draw_rectangle(struct blitter_context *blitter, - void *vertex_elements_cso, - blitter_get_vs_func get_vs, - int x1, int y1, int x2, int y2, - float depth, unsigned num_instances, - enum blitter_attrib_type type, - const union blitter_attrib *attrib) +static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elements_cso, + blitter_get_vs_func get_vs, int x1, int y1, int x2, int y2, + float depth, unsigned num_instances, enum blitter_attrib_type type, + const union blitter_attrib *attrib) { - struct pipe_context *pipe = util_blitter_get_pipe(blitter); - struct si_context *sctx = (struct si_context*)pipe; - - /* Pack position coordinates as signed int16. 
*/ - sctx->vs_blit_sh_data[0] = (uint32_t)(x1 & 0xffff) | - ((uint32_t)(y1 & 0xffff) << 16); - sctx->vs_blit_sh_data[1] = (uint32_t)(x2 & 0xffff) | - ((uint32_t)(y2 & 0xffff) << 16); - sctx->vs_blit_sh_data[2] = fui(depth); - - switch (type) { - case UTIL_BLITTER_ATTRIB_COLOR: - memcpy(&sctx->vs_blit_sh_data[3], attrib->color, - sizeof(float)*4); - break; - case UTIL_BLITTER_ATTRIB_TEXCOORD_XY: - case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW: - memcpy(&sctx->vs_blit_sh_data[3], &attrib->texcoord, - sizeof(attrib->texcoord)); - break; - case UTIL_BLITTER_ATTRIB_NONE:; - } - - pipe->bind_vs_state(pipe, si_get_blitter_vs(sctx, type, num_instances)); - - struct pipe_draw_info info = {}; - info.mode = SI_PRIM_RECTANGLE_LIST; - info.count = 3; - info.instance_count = num_instances; - - /* Don't set per-stage shader pointers for VS. */ - sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX); - sctx->vertex_buffer_pointer_dirty = false; - sctx->vertex_buffer_user_sgprs_dirty = false; - - si_draw_vbo(pipe, &info); + struct pipe_context *pipe = util_blitter_get_pipe(blitter); + struct si_context *sctx = (struct si_context *)pipe; + + /* Pack position coordinates as signed int16. */ + sctx->vs_blit_sh_data[0] = (uint32_t)(x1 & 0xffff) | ((uint32_t)(y1 & 0xffff) << 16); + sctx->vs_blit_sh_data[1] = (uint32_t)(x2 & 0xffff) | ((uint32_t)(y2 & 0xffff) << 16); + sctx->vs_blit_sh_data[2] = fui(depth); + + switch (type) { + case UTIL_BLITTER_ATTRIB_COLOR: + memcpy(&sctx->vs_blit_sh_data[3], attrib->color, sizeof(float) * 4); + break; + case UTIL_BLITTER_ATTRIB_TEXCOORD_XY: + case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW: + memcpy(&sctx->vs_blit_sh_data[3], &attrib->texcoord, sizeof(attrib->texcoord)); + break; + case UTIL_BLITTER_ATTRIB_NONE:; + } + + pipe->bind_vs_state(pipe, si_get_blitter_vs(sctx, type, num_instances)); + + struct pipe_draw_info info = {}; + info.mode = SI_PRIM_RECTANGLE_LIST; + info.count = 3; + info.instance_count = num_instances; + + /* Don't set per-stage shader pointers for VS. 
*/ + sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX); + sctx->vertex_buffer_pointer_dirty = false; + sctx->vertex_buffer_user_sgprs_dirty = false; + + si_draw_vbo(pipe, &info); } void si_trace_emit(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - uint32_t trace_id = ++sctx->current_saved_cs->trace_id; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + uint32_t trace_id = ++sctx->current_saved_cs->trace_id; - si_cp_write_data(sctx, sctx->current_saved_cs->trace_buf, - 0, 4, V_370_MEM, V_370_ME, &trace_id); + si_cp_write_data(sctx, sctx->current_saved_cs->trace_buf, 0, 4, V_370_MEM, V_370_ME, &trace_id); - radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id)); + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id)); - if (sctx->log) - u_log_flush(sctx->log); + if (sctx->log) + u_log_flush(sctx->log); } void si_init_draw_functions(struct si_context *sctx) { - sctx->b.draw_vbo = si_draw_vbo; + sctx->b.draw_vbo = si_draw_vbo; - sctx->blitter->draw_rectangle = si_draw_rectangle; + sctx->blitter->draw_rectangle = si_draw_rectangle; - si_init_ia_multi_vgt_param_table(sctx); + si_init_ia_multi_vgt_param_table(sctx); } diff --git a/src/gallium/drivers/radeonsi/si_state_msaa.c b/src/gallium/drivers/radeonsi/si_state_msaa.c index 0fa38918b20..9ebb1e5dcb4 100644 --- a/src/gallium/drivers/radeonsi/si_state_msaa.c +++ b/src/gallium/drivers/radeonsi/si_state_msaa.c @@ -25,17 +25,16 @@ #include "si_build_pm4.h" /* For MSAA sample positions. */ -#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \ - ((((unsigned)(s0x) & 0xf) << 0) | (((unsigned)(s0y) & 0xf) << 4) | \ - (((unsigned)(s1x) & 0xf) << 8) | (((unsigned)(s1y) & 0xf) << 12) | \ - (((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \ - (((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28)) +#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \ + ((((unsigned)(s0x)&0xf) << 0) | (((unsigned)(s0y)&0xf) << 4) | (((unsigned)(s1x)&0xf) << 8) | \ + (((unsigned)(s1y)&0xf) << 12) | (((unsigned)(s2x)&0xf) << 16) | \ + (((unsigned)(s2y)&0xf) << 20) | (((unsigned)(s3x)&0xf) << 24) | (((unsigned)(s3y)&0xf) << 28)) /* For obtaining location coordinates from registers */ -#define SEXT4(x) ((int)((x) | ((x) & 0x8 ? 0xfffffff0 : 0))) -#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index) * 4)) & 0xf) -#define GET_SX(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2) -#define GET_SY(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1) +#define SEXT4(x) ((int)((x) | ((x)&0x8 ? 0xfffffff0 : 0))) +#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index)*4)) & 0xf) +#define GET_SX(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2) +#define GET_SY(reg, index) GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1) /* The following sample ordering is required by EQAA. 
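The FILL_SREG and GET_SX/GET_SY macros above encode each sample position as a signed 4-bit x/y offset in 1/16-pixel units, four samples per dword; si_get_sample_position below turns them back into [0,1) window coordinates with (v + 8) / 16.0f. A minimal standalone sketch of that round trip (an illustration only, not part of the patch) reusing the macro definitions verbatim and the 2x MSAA pattern:

   #include <stdint.h>
   #include <stdio.h>

   /* Macros copied from the patch above. */
   #define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)                                        \
      ((((unsigned)(s0x)&0xf) << 0) | (((unsigned)(s0y)&0xf) << 4) | (((unsigned)(s1x)&0xf) << 8) | \
       (((unsigned)(s1y)&0xf) << 12) | (((unsigned)(s2x)&0xf) << 16) |                              \
       (((unsigned)(s2y)&0xf) << 20) | (((unsigned)(s3x)&0xf) << 24) | (((unsigned)(s3y)&0xf) << 28))
   #define SEXT4(x)               ((int)((x) | ((x)&0x8 ? 0xfffffff0 : 0)))
   #define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index)*4)) & 0xf)
   #define GET_SX(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2)
   #define GET_SY(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1)

   int main(void)
   {
      /* 2x MSAA pattern from the patch: offsets (-4,-4) and (4,4) in 1/16-pixel units. */
      uint32_t locs[1] = {FILL_SREG(-4, -4, 4, 4, 0, 0, 0, 0)};

      for (unsigned i = 0; i < 2; i++) {
         float x = (GET_SX(locs, i) + 8) / 16.0f; /* same conversion as si_get_sample_position */
         float y = (GET_SY(locs, i) + 8) / 16.0f;
         printf("sample %u: (%.4f, %.4f)\n", i, x, y); /* (0.25, 0.25) and (0.75, 0.75) */
      }
      return 0;
   }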
* @@ -88,132 +87,128 @@ /* 1x MSAA */ static const uint32_t sample_locs_1x = - FILL_SREG( 0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */ + FILL_SREG(0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */ static const uint64_t centroid_priority_1x = 0x0000000000000000ull; /* 2x MSAA (the positions are sorted for EQAA) */ static const uint32_t sample_locs_2x = - FILL_SREG(-4,-4, 4, 4, 0, 0, 0, 0); /* S2 & S3 fields are not used by 2x MSAA */ + FILL_SREG(-4, -4, 4, 4, 0, 0, 0, 0); /* S2 & S3 fields are not used by 2x MSAA */ static const uint64_t centroid_priority_2x = 0x1010101010101010ull; /* 4x MSAA (the positions are sorted for EQAA) */ -static const uint32_t sample_locs_4x = - FILL_SREG(-2,-6, 2, 6, -6, 2, 6,-2); +static const uint32_t sample_locs_4x = FILL_SREG(-2, -6, 2, 6, -6, 2, 6, -2); static const uint64_t centroid_priority_4x = 0x3210321032103210ull; /* 8x MSAA (the positions are sorted for EQAA) */ static const uint32_t sample_locs_8x[] = { - FILL_SREG(-3,-5, 5, 1, -1, 3, 7,-7), - FILL_SREG(-7,-1, 3, 7, -5, 5, 1,-3), - /* The following are unused by hardware, but we emit them to IBs - * instead of multiple SET_CONTEXT_REG packets. */ - 0, - 0, + FILL_SREG(-3, -5, 5, 1, -1, 3, 7, -7), + FILL_SREG(-7, -1, 3, 7, -5, 5, 1, -3), + /* The following are unused by hardware, but we emit them to IBs + * instead of multiple SET_CONTEXT_REG packets. */ + 0, + 0, }; static const uint64_t centroid_priority_8x = 0x3546012735460127ull; /* 16x MSAA (the positions are sorted for EQAA) */ static const uint32_t sample_locs_16x[] = { - FILL_SREG(-5,-2, 5, 3, -2, 6, 3,-5), - FILL_SREG(-4,-6, 1, 1, -6, 4, 7,-4), - FILL_SREG(-1,-3, 6, 7, -3, 2, 0,-7), - FILL_SREG(-7,-8, 2, 5, -8, 0, 4,-1), + FILL_SREG(-5, -2, 5, 3, -2, 6, 3, -5), + FILL_SREG(-4, -6, 1, 1, -6, 4, 7, -4), + FILL_SREG(-1, -3, 6, 7, -3, 2, 0, -7), + FILL_SREG(-7, -8, 2, 5, -8, 0, 4, -1), }; static const uint64_t centroid_priority_16x = 0xc97e64b231d0fa85ull; static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_count, - unsigned sample_index, float *out_value) + unsigned sample_index, float *out_value) { - const uint32_t *sample_locs; - - switch (sample_count) { - case 1: - default: - sample_locs = &sample_locs_1x; - break; - case 2: - sample_locs = &sample_locs_2x; - break; - case 4: - sample_locs = &sample_locs_4x; - break; - case 8: - sample_locs = sample_locs_8x; - break; - case 16: - sample_locs = sample_locs_16x; - break; - } - - out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f; - out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f; + const uint32_t *sample_locs; + + switch (sample_count) { + case 1: + default: + sample_locs = &sample_locs_1x; + break; + case 2: + sample_locs = &sample_locs_2x; + break; + case 4: + sample_locs = &sample_locs_4x; + break; + case 8: + sample_locs = sample_locs_8x; + break; + case 16: + sample_locs = sample_locs_16x; + break; + } + + out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f; + out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f; } -static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs, - uint64_t centroid_priority, - uint32_t sample_locs) +static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority, + uint32_t sample_locs) { - radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); - radeon_emit(cs, centroid_priority); - radeon_emit(cs, centroid_priority >> 32); - radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 
sample_locs); - radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs); - radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs); - radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs); + radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); + radeon_emit(cs, centroid_priority); + radeon_emit(cs, centroid_priority >> 32); + radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs); + radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs); + radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs); + radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs); } -static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs, - uint64_t centroid_priority, - const uint32_t *sample_locs, - unsigned num_samples) +static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority, + const uint32_t *sample_locs, unsigned num_samples) { - radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); - radeon_emit(cs, centroid_priority); - radeon_emit(cs, centroid_priority >> 32); - radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, - num_samples == 8 ? 14 : 16); - radeon_emit_array(cs, sample_locs, 4); - radeon_emit_array(cs, sample_locs, 4); - radeon_emit_array(cs, sample_locs, 4); - radeon_emit_array(cs, sample_locs, num_samples == 8 ? 2 : 4); + radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); + radeon_emit(cs, centroid_priority); + radeon_emit(cs, centroid_priority >> 32); + radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, + num_samples == 8 ? 14 : 16); + radeon_emit_array(cs, sample_locs, 4); + radeon_emit_array(cs, sample_locs, 4); + radeon_emit_array(cs, sample_locs, 4); + radeon_emit_array(cs, sample_locs, num_samples == 8 ? 
2 : 4); } void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples) { - switch (nr_samples) { - default: - case 1: - si_emit_max_4_sample_locs(cs, centroid_priority_1x, sample_locs_1x); - break; - case 2: - si_emit_max_4_sample_locs(cs, centroid_priority_2x, sample_locs_2x); - break; - case 4: - si_emit_max_4_sample_locs(cs, centroid_priority_4x, sample_locs_4x); - break; - case 8: - si_emit_max_16_sample_locs(cs, centroid_priority_8x, sample_locs_8x, 8); - break; - case 16: - si_emit_max_16_sample_locs(cs, centroid_priority_16x, sample_locs_16x, 16); - break; - } + switch (nr_samples) { + default: + case 1: + si_emit_max_4_sample_locs(cs, centroid_priority_1x, sample_locs_1x); + break; + case 2: + si_emit_max_4_sample_locs(cs, centroid_priority_2x, sample_locs_2x); + break; + case 4: + si_emit_max_4_sample_locs(cs, centroid_priority_4x, sample_locs_4x); + break; + case 8: + si_emit_max_16_sample_locs(cs, centroid_priority_8x, sample_locs_8x, 8); + break; + case 16: + si_emit_max_16_sample_locs(cs, centroid_priority_16x, sample_locs_16x, 16); + break; + } } void si_init_msaa_functions(struct si_context *sctx) { - int i; + int i; - sctx->b.get_sample_position = si_get_sample_position; + sctx->b.get_sample_position = si_get_sample_position; - si_get_sample_position(&sctx->b, 1, 0, sctx->sample_positions.x1[0]); + si_get_sample_position(&sctx->b, 1, 0, sctx->sample_positions.x1[0]); - for (i = 0; i < 2; i++) - si_get_sample_position(&sctx->b, 2, i, sctx->sample_positions.x2[i]); - for (i = 0; i < 4; i++) - si_get_sample_position(&sctx->b, 4, i, sctx->sample_positions.x4[i]); - for (i = 0; i < 8; i++) - si_get_sample_position(&sctx->b, 8, i, sctx->sample_positions.x8[i]); - for (i = 0; i < 16; i++) - si_get_sample_position(&sctx->b, 16, i, sctx->sample_positions.x16[i]); + for (i = 0; i < 2; i++) + si_get_sample_position(&sctx->b, 2, i, sctx->sample_positions.x2[i]); + for (i = 0; i < 4; i++) + si_get_sample_position(&sctx->b, 4, i, sctx->sample_positions.x4[i]); + for (i = 0; i < 8; i++) + si_get_sample_position(&sctx->b, 8, i, sctx->sample_positions.x8[i]); + for (i = 0; i < 16; i++) + si_get_sample_position(&sctx->b, 16, i, sctx->sample_positions.x16[i]); } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index be7cda1d332..d322cd1f341 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -22,96 +22,91 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "si_build_pm4.h" -#include "sid.h" - +#include "ac_exp_param.h" +#include "ac_shader_util.h" #include "compiler/nir/nir_serialize.h" #include "nir/tgsi_to_nir.h" -#include "util/hash_table.h" +#include "si_build_pm4.h" +#include "sid.h" #include "util/crc32.h" +#include "util/disk_cache.h" +#include "util/hash_table.h" +#include "util/mesa-sha1.h" #include "util/u_async_debug.h" #include "util/u_memory.h" #include "util/u_prim.h" -#include "util/disk_cache.h" -#include "util/mesa-sha1.h" -#include "ac_exp_param.h" -#include "ac_shader_util.h" - /* SHADER_CACHE */ /** * Return the IR key for the shader cache. 
*/ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, - unsigned char ir_sha1_cache_key[20]) -{ - struct blob blob = {}; - unsigned ir_size; - void *ir_binary; - - if (sel->nir_binary) { - ir_binary = sel->nir_binary; - ir_size = sel->nir_size; - } else { - assert(sel->nir); - - blob_init(&blob); - nir_serialize(&blob, sel->nir, true); - ir_binary = blob.data; - ir_size = blob.size; - } - - /* These settings affect the compilation, but they are not derived - * from the input shader IR. - */ - unsigned shader_variant_flags = 0; - - if (ngg) - shader_variant_flags |= 1 << 0; - if (sel->nir) - shader_variant_flags |= 1 << 1; - if (si_get_wave_size(sel->screen, sel->type, ngg, es, false) == 32) - shader_variant_flags |= 1 << 2; - if (sel->type == PIPE_SHADER_FRAGMENT && - sel->info.uses_derivatives && - sel->info.uses_kill && - sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) - shader_variant_flags |= 1 << 3; - - /* This varies depending on whether compute-based culling is enabled. */ - shader_variant_flags |= sel->screen->num_vbos_in_user_sgprs << 4; - - struct mesa_sha1 ctx; - _mesa_sha1_init(&ctx); - _mesa_sha1_update(&ctx, &shader_variant_flags, 4); - _mesa_sha1_update(&ctx, ir_binary, ir_size); - if (sel->type == PIPE_SHADER_VERTEX || - sel->type == PIPE_SHADER_TESS_EVAL || - sel->type == PIPE_SHADER_GEOMETRY) - _mesa_sha1_update(&ctx, &sel->so, sizeof(sel->so)); - _mesa_sha1_final(&ctx, ir_sha1_cache_key); - - if (ir_binary == blob.data) - blob_finish(&blob); + unsigned char ir_sha1_cache_key[20]) +{ + struct blob blob = {}; + unsigned ir_size; + void *ir_binary; + + if (sel->nir_binary) { + ir_binary = sel->nir_binary; + ir_size = sel->nir_size; + } else { + assert(sel->nir); + + blob_init(&blob); + nir_serialize(&blob, sel->nir, true); + ir_binary = blob.data; + ir_size = blob.size; + } + + /* These settings affect the compilation, but they are not derived + * from the input shader IR. + */ + unsigned shader_variant_flags = 0; + + if (ngg) + shader_variant_flags |= 1 << 0; + if (sel->nir) + shader_variant_flags |= 1 << 1; + if (si_get_wave_size(sel->screen, sel->type, ngg, es, false) == 32) + shader_variant_flags |= 1 << 2; + if (sel->type == PIPE_SHADER_FRAGMENT && sel->info.uses_derivatives && sel->info.uses_kill && + sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) + shader_variant_flags |= 1 << 3; + + /* This varies depending on whether compute-based culling is enabled. */ + shader_variant_flags |= sel->screen->num_vbos_in_user_sgprs << 4; + + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + _mesa_sha1_update(&ctx, &shader_variant_flags, 4); + _mesa_sha1_update(&ctx, ir_binary, ir_size); + if (sel->type == PIPE_SHADER_VERTEX || sel->type == PIPE_SHADER_TESS_EVAL || + sel->type == PIPE_SHADER_GEOMETRY) + _mesa_sha1_update(&ctx, &sel->so, sizeof(sel->so)); + _mesa_sha1_final(&ctx, ir_sha1_cache_key); + + if (ir_binary == blob.data) + blob_finish(&blob); } /** Copy "data" to "ptr" and return the next dword following copied data. */ static uint32_t *write_data(uint32_t *ptr, const void *data, unsigned size) { - /* data may be NULL if size == 0 */ - if (size) - memcpy(ptr, data, size); - ptr += DIV_ROUND_UP(size, 4); - return ptr; + /* data may be NULL if size == 0 */ + if (size) + memcpy(ptr, data, size); + ptr += DIV_ROUND_UP(size, 4); + return ptr; } /** Read data from "ptr". Return the next dword following the data. 
*/ static uint32_t *read_data(uint32_t *ptr, void *data, unsigned size) { - memcpy(data, ptr, size); - ptr += DIV_ROUND_UP(size, 4); - return ptr; + memcpy(data, ptr, size); + ptr += DIV_ROUND_UP(size, 4); + return ptr; } /** @@ -120,8 +115,8 @@ static uint32_t *read_data(uint32_t *ptr, void *data, unsigned size) */ static uint32_t *write_chunk(uint32_t *ptr, const void *data, unsigned size) { - *ptr++ = size; - return write_data(ptr, data, size); + *ptr++ = size; + return write_data(ptr, data, size); } /** @@ -130,12 +125,12 @@ static uint32_t *write_chunk(uint32_t *ptr, const void *data, unsigned size) */ static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size) { - *size = *ptr++; - assert(*data == NULL); - if (!*size) - return ptr; - *data = malloc(*size); - return read_data(ptr, *data, *size); + *size = *ptr++; + assert(*data == NULL); + if (!*size) + return ptr; + *data = malloc(*size); + return read_data(ptr, *data, *size); } /** @@ -144,258 +139,236 @@ static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size) */ static void *si_get_shader_binary(struct si_shader *shader) { - /* There is always a size of data followed by the data itself. */ - unsigned llvm_ir_size = shader->binary.llvm_ir_string ? - strlen(shader->binary.llvm_ir_string) + 1 : 0; - - /* Refuse to allocate overly large buffers and guard against integer - * overflow. */ - if (shader->binary.elf_size > UINT_MAX / 4 || - llvm_ir_size > UINT_MAX / 4) - return NULL; - - unsigned size = - 4 + /* total size */ - 4 + /* CRC32 of the data below */ - align(sizeof(shader->config), 4) + - align(sizeof(shader->info), 4) + - 4 + align(shader->binary.elf_size, 4) + - 4 + align(llvm_ir_size, 4); - void *buffer = CALLOC(1, size); - uint32_t *ptr = (uint32_t*)buffer; - - if (!buffer) - return NULL; - - *ptr++ = size; - ptr++; /* CRC32 is calculated at the end. */ - - ptr = write_data(ptr, &shader->config, sizeof(shader->config)); - ptr = write_data(ptr, &shader->info, sizeof(shader->info)); - ptr = write_chunk(ptr, shader->binary.elf_buffer, shader->binary.elf_size); - ptr = write_chunk(ptr, shader->binary.llvm_ir_string, llvm_ir_size); - assert((char *)ptr - (char *)buffer == size); - - /* Compute CRC32. */ - ptr = (uint32_t*)buffer; - ptr++; - *ptr = util_hash_crc32(ptr + 1, size - 8); - - return buffer; + /* There is always a size of data followed by the data itself. */ + unsigned llvm_ir_size = + shader->binary.llvm_ir_string ? strlen(shader->binary.llvm_ir_string) + 1 : 0; + + /* Refuse to allocate overly large buffers and guard against integer + * overflow. */ + if (shader->binary.elf_size > UINT_MAX / 4 || llvm_ir_size > UINT_MAX / 4) + return NULL; + + unsigned size = 4 + /* total size */ + 4 + /* CRC32 of the data below */ + align(sizeof(shader->config), 4) + align(sizeof(shader->info), 4) + 4 + + align(shader->binary.elf_size, 4) + 4 + align(llvm_ir_size, 4); + void *buffer = CALLOC(1, size); + uint32_t *ptr = (uint32_t *)buffer; + + if (!buffer) + return NULL; + + *ptr++ = size; + ptr++; /* CRC32 is calculated at the end. */ + + ptr = write_data(ptr, &shader->config, sizeof(shader->config)); + ptr = write_data(ptr, &shader->info, sizeof(shader->info)); + ptr = write_chunk(ptr, shader->binary.elf_buffer, shader->binary.elf_size); + ptr = write_chunk(ptr, shader->binary.llvm_ir_string, llvm_ir_size); + assert((char *)ptr - (char *)buffer == size); + + /* Compute CRC32. 
*/ + ptr = (uint32_t *)buffer; + ptr++; + *ptr = util_hash_crc32(ptr + 1, size - 8); + + return buffer; } static bool si_load_shader_binary(struct si_shader *shader, void *binary) { - uint32_t *ptr = (uint32_t*)binary; - uint32_t size = *ptr++; - uint32_t crc32 = *ptr++; - unsigned chunk_size; - unsigned elf_size; - - if (util_hash_crc32(ptr, size - 8) != crc32) { - fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n"); - return false; - } - - ptr = read_data(ptr, &shader->config, sizeof(shader->config)); - ptr = read_data(ptr, &shader->info, sizeof(shader->info)); - ptr = read_chunk(ptr, (void**)&shader->binary.elf_buffer, - &elf_size); - shader->binary.elf_size = elf_size; - ptr = read_chunk(ptr, (void**)&shader->binary.llvm_ir_string, &chunk_size); - - return true; + uint32_t *ptr = (uint32_t *)binary; + uint32_t size = *ptr++; + uint32_t crc32 = *ptr++; + unsigned chunk_size; + unsigned elf_size; + + if (util_hash_crc32(ptr, size - 8) != crc32) { + fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n"); + return false; + } + + ptr = read_data(ptr, &shader->config, sizeof(shader->config)); + ptr = read_data(ptr, &shader->info, sizeof(shader->info)); + ptr = read_chunk(ptr, (void **)&shader->binary.elf_buffer, &elf_size); + shader->binary.elf_size = elf_size; + ptr = read_chunk(ptr, (void **)&shader->binary.llvm_ir_string, &chunk_size); + + return true; } /** * Insert a shader into the cache. It's assumed the shader is not in the cache. * Use si_shader_cache_load_shader before calling this. */ -void si_shader_cache_insert_shader(struct si_screen *sscreen, - unsigned char ir_sha1_cache_key[20], - struct si_shader *shader, - bool insert_into_disk_cache) -{ - void *hw_binary; - struct hash_entry *entry; - uint8_t key[CACHE_KEY_SIZE]; - - entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key); - if (entry) - return; /* already added */ - - hw_binary = si_get_shader_binary(shader); - if (!hw_binary) - return; - - if (_mesa_hash_table_insert(sscreen->shader_cache, - mem_dup(ir_sha1_cache_key, 20), - hw_binary) == NULL) { - FREE(hw_binary); - return; - } - - if (sscreen->disk_shader_cache && insert_into_disk_cache) { - disk_cache_compute_key(sscreen->disk_shader_cache, - ir_sha1_cache_key, 20, key); - disk_cache_put(sscreen->disk_shader_cache, key, hw_binary, - *((uint32_t *) hw_binary), NULL); - } -} - -bool si_shader_cache_load_shader(struct si_screen *sscreen, - unsigned char ir_sha1_cache_key[20], - struct si_shader *shader) -{ - struct hash_entry *entry = - _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key); - - if (entry) { - if (si_load_shader_binary(shader, entry->data)) { - p_atomic_inc(&sscreen->num_memory_shader_cache_hits); - return true; - } - } - p_atomic_inc(&sscreen->num_memory_shader_cache_misses); - - if (!sscreen->disk_shader_cache) - return false; - - unsigned char sha1[CACHE_KEY_SIZE]; - disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, - 20, sha1); - - size_t binary_size; - uint8_t *buffer = disk_cache_get(sscreen->disk_shader_cache, sha1, - &binary_size); - if (buffer) { - if (binary_size >= sizeof(uint32_t) && - *((uint32_t*)buffer) == binary_size) { - if (si_load_shader_binary(shader, buffer)) { - free(buffer); - si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, - shader, false); - p_atomic_inc(&sscreen->num_disk_shader_cache_hits); - return true; - } - } else { - /* Something has gone wrong discard the item from the cache and - * rebuild/link from source. 
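For reference, the cache blob built by si_get_shader_binary and validated by si_load_shader_binary / si_shader_cache_load_shader is laid out as two header dwords (total size, then a CRC32 of everything that follows) followed by the fixed config/info structs and two [size, data] chunks (ELF image, optional LLVM IR string). Below is a small standalone sketch of the same size-prefixed-chunk-plus-checksum idea in plain C; toy_hash, put_chunk and the fake payload strings are stand-ins invented for the example, not Mesa helpers (the driver uses util_hash_crc32 and the write_chunk/read_chunk helpers shown above):

   #include <assert.h>
   #include <stdint.h>
   #include <stdio.h>
   #include <string.h>

   /* Toy checksum, stand-in for util_hash_crc32; only the layout matters here. */
   static uint32_t toy_hash(const void *data, size_t size)
   {
      const uint8_t *p = data;
      uint32_t h = 0;
      for (size_t i = 0; i < size; i++)
         h = h * 33 + p[i];
      return h;
   }

   /* Append one [size][data] chunk and return the next dword. */
   static uint32_t *put_chunk(uint32_t *ptr, const void *data, uint32_t size)
   {
      *ptr++ = size;
      if (size)
         memcpy(ptr, data, size);
      return ptr + (size + 3) / 4;
   }

   int main(void)
   {
      const char elf[] = "fake ELF payload";
      const char ir[] = "fake LLVM IR string";
      uint32_t buf[64] = {0};

      uint32_t *ptr = buf;
      *ptr++ = 0; /* dword 0: total size, patched below */
      ptr++;      /* dword 1: checksum slot, filled in last */
      ptr = put_chunk(ptr, elf, sizeof(elf));
      ptr = put_chunk(ptr, ir, sizeof(ir));

      uint32_t size = (uint32_t)((char *)ptr - (char *)buf);
      buf[0] = size;
      buf[1] = toy_hash(buf + 2, size - 8); /* cover everything after the 8-byte header */

      /* A loader rejects the blob when the stored checksum does not match. */
      assert(toy_hash(buf + 2, buf[0] - 8) == buf[1]);
      printf("blob: %u bytes, checksum 0x%08x\n", buf[0], buf[1]);
      return 0;
   }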
- */ - assert(!"Invalid radeonsi shader disk cache item!"); - disk_cache_remove(sscreen->disk_shader_cache, sha1); - } - } - - free(buffer); - p_atomic_inc(&sscreen->num_disk_shader_cache_misses); - return false; +void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], + struct si_shader *shader, bool insert_into_disk_cache) +{ + void *hw_binary; + struct hash_entry *entry; + uint8_t key[CACHE_KEY_SIZE]; + + entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key); + if (entry) + return; /* already added */ + + hw_binary = si_get_shader_binary(shader); + if (!hw_binary) + return; + + if (_mesa_hash_table_insert(sscreen->shader_cache, mem_dup(ir_sha1_cache_key, 20), hw_binary) == + NULL) { + FREE(hw_binary); + return; + } + + if (sscreen->disk_shader_cache && insert_into_disk_cache) { + disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, key); + disk_cache_put(sscreen->disk_shader_cache, key, hw_binary, *((uint32_t *)hw_binary), NULL); + } +} + +bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], + struct si_shader *shader) +{ + struct hash_entry *entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key); + + if (entry) { + if (si_load_shader_binary(shader, entry->data)) { + p_atomic_inc(&sscreen->num_memory_shader_cache_hits); + return true; + } + } + p_atomic_inc(&sscreen->num_memory_shader_cache_misses); + + if (!sscreen->disk_shader_cache) + return false; + + unsigned char sha1[CACHE_KEY_SIZE]; + disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, sha1); + + size_t binary_size; + uint8_t *buffer = disk_cache_get(sscreen->disk_shader_cache, sha1, &binary_size); + if (buffer) { + if (binary_size >= sizeof(uint32_t) && *((uint32_t *)buffer) == binary_size) { + if (si_load_shader_binary(shader, buffer)) { + free(buffer); + si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, false); + p_atomic_inc(&sscreen->num_disk_shader_cache_hits); + return true; + } + } else { + /* Something has gone wrong discard the item from the cache and + * rebuild/link from source. + */ + assert(!"Invalid radeonsi shader disk cache item!"); + disk_cache_remove(sscreen->disk_shader_cache, sha1); + } + } + + free(buffer); + p_atomic_inc(&sscreen->num_disk_shader_cache_misses); + return false; } static uint32_t si_shader_cache_key_hash(const void *key) { - /* Take the first dword of SHA1. */ - return *(uint32_t*)key; + /* Take the first dword of SHA1. */ + return *(uint32_t *)key; } static bool si_shader_cache_key_equals(const void *a, const void *b) { - /* Compare SHA1s. */ - return memcmp(a, b, 20) == 0; + /* Compare SHA1s. 
*/ + return memcmp(a, b, 20) == 0; } static void si_destroy_shader_cache_entry(struct hash_entry *entry) { - FREE((void*)entry->key); - FREE(entry->data); + FREE((void *)entry->key); + FREE(entry->data); } bool si_init_shader_cache(struct si_screen *sscreen) { - (void) simple_mtx_init(&sscreen->shader_cache_mutex, mtx_plain); - sscreen->shader_cache = - _mesa_hash_table_create(NULL, - si_shader_cache_key_hash, - si_shader_cache_key_equals); + (void)simple_mtx_init(&sscreen->shader_cache_mutex, mtx_plain); + sscreen->shader_cache = + _mesa_hash_table_create(NULL, si_shader_cache_key_hash, si_shader_cache_key_equals); - return sscreen->shader_cache != NULL; + return sscreen->shader_cache != NULL; } void si_destroy_shader_cache(struct si_screen *sscreen) { - if (sscreen->shader_cache) - _mesa_hash_table_destroy(sscreen->shader_cache, - si_destroy_shader_cache_entry); - simple_mtx_destroy(&sscreen->shader_cache_mutex); + if (sscreen->shader_cache) + _mesa_hash_table_destroy(sscreen->shader_cache, si_destroy_shader_cache_entry); + simple_mtx_destroy(&sscreen->shader_cache_mutex); } /* SHADER STATES */ -static void si_set_tesseval_regs(struct si_screen *sscreen, - const struct si_shader_selector *tes, - struct si_pm4_state *pm4) -{ - const struct si_shader_info *info = &tes->info; - unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE]; - unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING]; - bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW]; - bool tes_point_mode = info->properties[TGSI_PROPERTY_TES_POINT_MODE]; - unsigned type, partitioning, topology, distribution_mode; - - switch (tes_prim_mode) { - case PIPE_PRIM_LINES: - type = V_028B6C_TESS_ISOLINE; - break; - case PIPE_PRIM_TRIANGLES: - type = V_028B6C_TESS_TRIANGLE; - break; - case PIPE_PRIM_QUADS: - type = V_028B6C_TESS_QUAD; - break; - default: - assert(0); - return; - } - - switch (tes_spacing) { - case PIPE_TESS_SPACING_FRACTIONAL_ODD: - partitioning = V_028B6C_PART_FRAC_ODD; - break; - case PIPE_TESS_SPACING_FRACTIONAL_EVEN: - partitioning = V_028B6C_PART_FRAC_EVEN; - break; - case PIPE_TESS_SPACING_EQUAL: - partitioning = V_028B6C_PART_INTEGER; - break; - default: - assert(0); - return; - } - - if (tes_point_mode) - topology = V_028B6C_OUTPUT_POINT; - else if (tes_prim_mode == PIPE_PRIM_LINES) - topology = V_028B6C_OUTPUT_LINE; - else if (tes_vertex_order_cw) - /* for some reason, this must be the other way around */ - topology = V_028B6C_OUTPUT_TRIANGLE_CCW; - else - topology = V_028B6C_OUTPUT_TRIANGLE_CW; - - if (sscreen->info.has_distributed_tess) { - if (sscreen->info.family == CHIP_FIJI || - sscreen->info.family >= CHIP_POLARIS10) - distribution_mode = V_028B6C_DISTRIBUTION_MODE_TRAPEZOIDS; - else - distribution_mode = V_028B6C_DISTRIBUTION_MODE_DONUTS; - } else - distribution_mode = V_028B6C_DISTRIBUTION_MODE_NO_DIST; - - assert(pm4->shader); - pm4->shader->vgt_tf_param = S_028B6C_TYPE(type) | - S_028B6C_PARTITIONING(partitioning) | - S_028B6C_TOPOLOGY(topology) | - S_028B6C_DISTRIBUTION_MODE(distribution_mode); +static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shader_selector *tes, + struct si_pm4_state *pm4) +{ + const struct si_shader_info *info = &tes->info; + unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE]; + unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING]; + bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW]; + bool tes_point_mode = 
info->properties[TGSI_PROPERTY_TES_POINT_MODE]; + unsigned type, partitioning, topology, distribution_mode; + + switch (tes_prim_mode) { + case PIPE_PRIM_LINES: + type = V_028B6C_TESS_ISOLINE; + break; + case PIPE_PRIM_TRIANGLES: + type = V_028B6C_TESS_TRIANGLE; + break; + case PIPE_PRIM_QUADS: + type = V_028B6C_TESS_QUAD; + break; + default: + assert(0); + return; + } + + switch (tes_spacing) { + case PIPE_TESS_SPACING_FRACTIONAL_ODD: + partitioning = V_028B6C_PART_FRAC_ODD; + break; + case PIPE_TESS_SPACING_FRACTIONAL_EVEN: + partitioning = V_028B6C_PART_FRAC_EVEN; + break; + case PIPE_TESS_SPACING_EQUAL: + partitioning = V_028B6C_PART_INTEGER; + break; + default: + assert(0); + return; + } + + if (tes_point_mode) + topology = V_028B6C_OUTPUT_POINT; + else if (tes_prim_mode == PIPE_PRIM_LINES) + topology = V_028B6C_OUTPUT_LINE; + else if (tes_vertex_order_cw) + /* for some reason, this must be the other way around */ + topology = V_028B6C_OUTPUT_TRIANGLE_CCW; + else + topology = V_028B6C_OUTPUT_TRIANGLE_CW; + + if (sscreen->info.has_distributed_tess) { + if (sscreen->info.family == CHIP_FIJI || sscreen->info.family >= CHIP_POLARIS10) + distribution_mode = V_028B6C_DISTRIBUTION_MODE_TRAPEZOIDS; + else + distribution_mode = V_028B6C_DISTRIBUTION_MODE_DONUTS; + } else + distribution_mode = V_028B6C_DISTRIBUTION_MODE_NO_DIST; + + assert(pm4->shader); + pm4->shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | + S_028B6C_TOPOLOGY(topology) | + S_028B6C_DISTRIBUTION_MODE(distribution_mode); } /* Polaris needs different VTX_REUSE_DEPTH settings depending on @@ -412,722 +385,674 @@ static void si_set_tesseval_regs(struct si_screen *sscreen, * * If "shader" is NULL, it's assumed it's not LS or GS copy shader. */ -static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, - struct si_shader_selector *sel, - struct si_shader *shader, - struct si_pm4_state *pm4) +static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_shader_selector *sel, + struct si_shader *shader, struct si_pm4_state *pm4) { - unsigned type = sel->type; - - if (sscreen->info.family < CHIP_POLARIS10 || - sscreen->info.chip_class >= GFX10) - return; - - /* VS as VS, or VS as ES: */ - if ((type == PIPE_SHADER_VERTEX && - (!shader || - (!shader->key.as_ls && !shader->is_gs_copy_shader))) || - /* TES as VS, or TES as ES: */ - type == PIPE_SHADER_TESS_EVAL) { - unsigned vtx_reuse_depth = 30; - - if (type == PIPE_SHADER_TESS_EVAL && - sel->info.properties[TGSI_PROPERTY_TES_SPACING] == - PIPE_TESS_SPACING_FRACTIONAL_ODD) - vtx_reuse_depth = 14; - - assert(pm4->shader); - pm4->shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth; - } + unsigned type = sel->type; + + if (sscreen->info.family < CHIP_POLARIS10 || sscreen->info.chip_class >= GFX10) + return; + + /* VS as VS, or VS as ES: */ + if ((type == PIPE_SHADER_VERTEX && + (!shader || (!shader->key.as_ls && !shader->is_gs_copy_shader))) || + /* TES as VS, or TES as ES: */ + type == PIPE_SHADER_TESS_EVAL) { + unsigned vtx_reuse_depth = 30; + + if (type == PIPE_SHADER_TESS_EVAL && + sel->info.properties[TGSI_PROPERTY_TES_SPACING] == PIPE_TESS_SPACING_FRACTIONAL_ODD) + vtx_reuse_depth = 14; + + assert(pm4->shader); + pm4->shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth; + } } static struct si_pm4_state *si_get_shader_pm4_state(struct si_shader *shader) { - if (shader->pm4) - si_pm4_clear_state(shader->pm4); - else - shader->pm4 = CALLOC_STRUCT(si_pm4_state); - - if (shader->pm4) { - shader->pm4->shader = shader; - return 
shader->pm4; - } else { - fprintf(stderr, "radeonsi: Failed to create pm4 state.\n"); - return NULL; - } + if (shader->pm4) + si_pm4_clear_state(shader->pm4); + else + shader->pm4 = CALLOC_STRUCT(si_pm4_state); + + if (shader->pm4) { + shader->pm4->shader = shader; + return shader->pm4; + } else { + fprintf(stderr, "radeonsi: Failed to create pm4 state.\n"); + return NULL; + } } static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader, - unsigned num_always_on_user_sgprs) + unsigned num_always_on_user_sgprs) { - struct si_shader_selector *vs = shader->previous_stage_sel ? - shader->previous_stage_sel : shader->selector; - unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs; + struct si_shader_selector *vs = + shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector; + unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs; - /* 1 SGPR is reserved for the vertex buffer pointer. */ - assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1); + /* 1 SGPR is reserved for the vertex buffer pointer. */ + assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1); - if (num_vbos_in_user_sgprs) - return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4; + if (num_vbos_in_user_sgprs) + return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4; - /* Add the pointer to VBO descriptors. */ - return num_always_on_user_sgprs + 1; + /* Add the pointer to VBO descriptors. */ + return num_always_on_user_sgprs + 1; } /* Return VGPR_COMP_CNT for the API vertex shader. This can be hw LS, LSHS, ES, ESGS, VS. */ -static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, - struct si_shader *shader, bool legacy_vs_prim_id) -{ - assert(shader->selector->type == PIPE_SHADER_VERTEX || - (shader->previous_stage_sel && - shader->previous_stage_sel->type == PIPE_SHADER_VERTEX)); - - /* GFX6-9 LS (VertexID, RelAutoindex, InstanceID / StepRate0(==1), ...). - * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0(==1), VSPrimID, ...) - * GFX10 LS (VertexID, RelAutoindex, UserVGPR1, InstanceID). - * GFX10 ES,VS (VertexID, UserVGPR0, UserVGPR1 or VSPrimID, UserVGPR2 or InstanceID) - */ - bool is_ls = shader->selector->type == PIPE_SHADER_TESS_CTRL || shader->key.as_ls; - - if (sscreen->info.chip_class >= GFX10 && shader->info.uses_instanceid) - return 3; - else if ((is_ls && shader->info.uses_instanceid) || legacy_vs_prim_id) - return 2; - else if (is_ls || shader->info.uses_instanceid) - return 1; - else - return 0; +static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, struct si_shader *shader, + bool legacy_vs_prim_id) +{ + assert(shader->selector->type == PIPE_SHADER_VERTEX || + (shader->previous_stage_sel && shader->previous_stage_sel->type == PIPE_SHADER_VERTEX)); + + /* GFX6-9 LS (VertexID, RelAutoindex, InstanceID / StepRate0(==1), ...). + * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0(==1), VSPrimID, ...) + * GFX10 LS (VertexID, RelAutoindex, UserVGPR1, InstanceID). 
+ * GFX10 ES,VS (VertexID, UserVGPR0, UserVGPR1 or VSPrimID, UserVGPR2 or + * InstanceID) + */ + bool is_ls = shader->selector->type == PIPE_SHADER_TESS_CTRL || shader->key.as_ls; + + if (sscreen->info.chip_class >= GFX10 && shader->info.uses_instanceid) + return 3; + else if ((is_ls && shader->info.uses_instanceid) || legacy_vs_prim_id) + return 2; + else if (is_ls || shader->info.uses_instanceid) + return 1; + else + return 0; } static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader) { - struct si_pm4_state *pm4; - uint64_t va; + struct si_pm4_state *pm4; + uint64_t va; - assert(sscreen->info.chip_class <= GFX8); + assert(sscreen->info.chip_class <= GFX8); - pm4 = si_get_shader_pm4_state(shader); - if (!pm4) - return; + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; - va = shader->bo->gpu_address; - si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); - si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40)); + si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); + si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40)); - shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) | - S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) | - S_00B528_DX10_CLAMP(1) | - S_00B528_FLOAT_MODE(shader->config.float_mode); - shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) | - S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) | + S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) | + S_00B528_DX10_CLAMP(1) | S_00B528_FLOAT_MODE(shader->config.float_mode); + shader->config.rsrc2 = + S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) | + S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); } static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) { - struct si_pm4_state *pm4; - uint64_t va; - - pm4 = si_get_shader_pm4_state(shader); - if (!pm4) - return; - - va = shader->bo->gpu_address; - si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - - if (sscreen->info.chip_class >= GFX9) { - if (sscreen->info.chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); - si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40)); - } else { - si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8); - si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, S_00B414_MEM_BASE(va >> 40)); - } - - unsigned num_user_sgprs = - si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR); - - shader->config.rsrc2 = - S_00B42C_USER_SGPR(num_user_sgprs) | - S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); - - if (sscreen->info.chip_class >= GFX10) - shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); - else - shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); - } else { - si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8); - si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, S_00B424_MEM_BASE(va >> 40)); - - shader->config.rsrc2 = - 
S_00B42C_USER_SGPR(GFX6_TCS_NUM_USER_SGPR) | - S_00B42C_OC_LDS_EN(1) | - S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); - } - - si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS, - S_00B428_VGPRS((shader->config.num_vgprs - 1) / - (sscreen->ge_wave_size == 32 ? 8 : 4)) | - (sscreen->info.chip_class <= GFX9 ? - S_00B428_SGPRS((shader->config.num_sgprs - 1) / 8) : 0) | - S_00B428_DX10_CLAMP(1) | - S_00B428_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | - S_00B428_WGP_MODE(sscreen->info.chip_class >= GFX10) | - S_00B428_FLOAT_MODE(shader->config.float_mode) | - S_00B428_LS_VGPR_COMP_CNT(sscreen->info.chip_class >= GFX9 ? - si_get_vs_vgpr_comp_cnt(sscreen, shader, false) : 0)); - - if (sscreen->info.chip_class <= GFX8) { - si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, - shader->config.rsrc2); - } + struct si_pm4_state *pm4; + uint64_t va; + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + + if (sscreen->info.chip_class >= GFX9) { + if (sscreen->info.chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); + si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, S_00B524_MEM_BASE(va >> 40)); + } else { + si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8); + si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, S_00B414_MEM_BASE(va >> 40)); + } + + unsigned num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR); + + shader->config.rsrc2 = S_00B42C_USER_SGPR(num_user_sgprs) | + S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + + if (sscreen->info.chip_class >= GFX10) + shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); + else + shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); + } else { + si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8); + si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, S_00B424_MEM_BASE(va >> 40)); + + shader->config.rsrc2 = S_00B42C_USER_SGPR(GFX6_TCS_NUM_USER_SGPR) | S_00B42C_OC_LDS_EN(1) | + S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + } + + si_pm4_set_reg( + pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS, + S_00B428_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) | + (sscreen->info.chip_class <= GFX9 ? S_00B428_SGPRS((shader->config.num_sgprs - 1) / 8) + : 0) | + S_00B428_DX10_CLAMP(1) | S_00B428_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | + S_00B428_WGP_MODE(sscreen->info.chip_class >= GFX10) | + S_00B428_FLOAT_MODE(shader->config.float_mode) | + S_00B428_LS_VGPR_COMP_CNT(sscreen->info.chip_class >= GFX9 + ? 
si_get_vs_vgpr_comp_cnt(sscreen, shader, false) + : 0)); + + if (sscreen->info.chip_class <= GFX8) { + si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, shader->config.rsrc2); + } } static void si_emit_shader_es(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.es->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; + struct si_shader *shader = sctx->queued.named.es->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; - if (!shader) - return; + if (!shader) + return; - radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, - SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, - shader->selector->esgs_itemsize / 4); + radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, + SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, + shader->selector->esgs_itemsize / 4); - if (shader->selector->type == PIPE_SHADER_TESS_EVAL) - radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, - SI_TRACKED_VGT_TF_PARAM, - shader->vgt_tf_param); + if (shader->selector->type == PIPE_SHADER_TESS_EVAL) + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); - if (shader->vgt_vertex_reuse_block_cntl) - radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, - SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, - shader->vgt_vertex_reuse_block_cntl); + if (shader->vgt_vertex_reuse_block_cntl) + radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, + SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + shader->vgt_vertex_reuse_block_cntl); - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; } static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) { - struct si_pm4_state *pm4; - unsigned num_user_sgprs; - unsigned vgpr_comp_cnt; - uint64_t va; - unsigned oc_lds_en; - - assert(sscreen->info.chip_class <= GFX8); - - pm4 = si_get_shader_pm4_state(shader); - if (!pm4) - return; - - pm4->atom.emit = si_emit_shader_es; - va = shader->bo->gpu_address; - si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - - if (shader->selector->type == PIPE_SHADER_VERTEX) { - vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); - num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR); - } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { - vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2; - num_user_sgprs = SI_TES_NUM_USER_SGPR; - } else - unreachable("invalid shader selector type"); - - oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 
1 : 0; - - si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40)); - si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES, - S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B328_SGPRS((shader->config.num_sgprs - 1) / 8) | - S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) | - S_00B328_DX10_CLAMP(1) | - S_00B328_FLOAT_MODE(shader->config.float_mode)); - si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES, - S_00B32C_USER_SGPR(num_user_sgprs) | - S_00B32C_OC_LDS_EN(oc_lds_en) | - S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); - - if (shader->selector->type == PIPE_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->selector, pm4); - - polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); -} - -void gfx9_get_gs_info(struct si_shader_selector *es, - struct si_shader_selector *gs, - struct gfx9_gs_info *out) -{ - unsigned gs_num_invocations = MAX2(gs->gs_num_invocations, 1); - unsigned input_prim = gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; - bool uses_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY && - input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; - - /* All these are in dwords: */ - /* We can't allow using the whole LDS, because GS waves compete with - * other shader stages for LDS space. */ - const unsigned max_lds_size = 8 * 1024; - const unsigned esgs_itemsize = es->esgs_itemsize / 4; - unsigned esgs_lds_size; - - /* All these are per subgroup: */ - const unsigned max_out_prims = 32 * 1024; - const unsigned max_es_verts = 255; - const unsigned ideal_gs_prims = 64; - unsigned max_gs_prims, gs_prims; - unsigned min_es_verts, es_verts, worst_case_es_verts; - - if (uses_adjacency || gs_num_invocations > 1) - max_gs_prims = 127 / gs_num_invocations; - else - max_gs_prims = 255; - - /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations. - * Make sure we don't go over the maximum value. - */ - if (gs->gs_max_out_vertices > 0) { - max_gs_prims = MIN2(max_gs_prims, - max_out_prims / - (gs->gs_max_out_vertices * gs_num_invocations)); - } - assert(max_gs_prims > 0); - - /* If the primitive has adjacency, halve the number of vertices - * that will be reused in multiple primitives. - */ - min_es_verts = gs->gs_input_verts_per_prim / (uses_adjacency ? 2 : 1); - - gs_prims = MIN2(ideal_gs_prims, max_gs_prims); - worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); - - /* Compute ESGS LDS size based on the worst case number of ES vertices - * needed to create the target number of GS prims per subgroup. - */ - esgs_lds_size = esgs_itemsize * worst_case_es_verts; - - /* If total LDS usage is too big, refactor partitions based on ratio - * of ESGS item sizes. - */ - if (esgs_lds_size > max_lds_size) { - /* Our target GS Prims Per Subgroup was too large. Calculate - * the maximum number of GS Prims Per Subgroup that will fit - * into LDS, capped by the maximum that the hardware can support. - */ - gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), - max_gs_prims); - assert(gs_prims > 0); - worst_case_es_verts = MIN2(min_es_verts * gs_prims, - max_es_verts); - - esgs_lds_size = esgs_itemsize * worst_case_es_verts; - assert(esgs_lds_size <= max_lds_size); - } - - /* Now calculate remaining ESGS information. 
*/ - if (esgs_lds_size) - es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts); - else - es_verts = max_es_verts; - - /* Vertices for adjacency primitives are not always reused, so restore - * it for ES_VERTS_PER_SUBGRP. - */ - min_es_verts = gs->gs_input_verts_per_prim; - - /* For normal primitives, the VGT only checks if they are past the ES - * verts per subgroup after allocating a full GS primitive and if they - * are, kick off a new subgroup. But if those additional ES verts are - * unique (e.g. not reused) we need to make sure there is enough LDS - * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP. - */ - es_verts -= min_es_verts - 1; - - out->es_verts_per_subgroup = es_verts; - out->gs_prims_per_subgroup = gs_prims; - out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations; - out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup * - gs->gs_max_out_vertices; - out->esgs_ring_size = 4 * esgs_lds_size; - - assert(out->max_prims_per_subgroup <= max_out_prims); + struct si_pm4_state *pm4; + unsigned num_user_sgprs; + unsigned vgpr_comp_cnt; + uint64_t va; + unsigned oc_lds_en; + + assert(sscreen->info.chip_class <= GFX8); + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + pm4->atom.emit = si_emit_shader_es; + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + + if (shader->selector->type == PIPE_SHADER_VERTEX) { + vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); + num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR); + } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { + vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2; + num_user_sgprs = SI_TES_NUM_USER_SGPR; + } else + unreachable("invalid shader selector type"); + + oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0; + + si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); + si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40)); + si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES, + S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B328_SGPRS((shader->config.num_sgprs - 1) / 8) | + S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B328_DX10_CLAMP(1) | + S_00B328_FLOAT_MODE(shader->config.float_mode)); + si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES, + S_00B32C_USER_SGPR(num_user_sgprs) | S_00B32C_OC_LDS_EN(oc_lds_en) | + S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); + + if (shader->selector->type == PIPE_SHADER_TESS_EVAL) + si_set_tesseval_regs(sscreen, shader->selector, pm4); + + polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); +} + +void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs, + struct gfx9_gs_info *out) +{ + unsigned gs_num_invocations = MAX2(gs->gs_num_invocations, 1); + unsigned input_prim = gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; + bool uses_adjacency = + input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; + + /* All these are in dwords: */ + /* We can't allow using the whole LDS, because GS waves compete with + * other shader stages for LDS space. 
*/ + const unsigned max_lds_size = 8 * 1024; + const unsigned esgs_itemsize = es->esgs_itemsize / 4; + unsigned esgs_lds_size; + + /* All these are per subgroup: */ + const unsigned max_out_prims = 32 * 1024; + const unsigned max_es_verts = 255; + const unsigned ideal_gs_prims = 64; + unsigned max_gs_prims, gs_prims; + unsigned min_es_verts, es_verts, worst_case_es_verts; + + if (uses_adjacency || gs_num_invocations > 1) + max_gs_prims = 127 / gs_num_invocations; + else + max_gs_prims = 255; + + /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations. + * Make sure we don't go over the maximum value. + */ + if (gs->gs_max_out_vertices > 0) { + max_gs_prims = + MIN2(max_gs_prims, max_out_prims / (gs->gs_max_out_vertices * gs_num_invocations)); + } + assert(max_gs_prims > 0); + + /* If the primitive has adjacency, halve the number of vertices + * that will be reused in multiple primitives. + */ + min_es_verts = gs->gs_input_verts_per_prim / (uses_adjacency ? 2 : 1); + + gs_prims = MIN2(ideal_gs_prims, max_gs_prims); + worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); + + /* Compute ESGS LDS size based on the worst case number of ES vertices + * needed to create the target number of GS prims per subgroup. + */ + esgs_lds_size = esgs_itemsize * worst_case_es_verts; + + /* If total LDS usage is too big, refactor partitions based on ratio + * of ESGS item sizes. + */ + if (esgs_lds_size > max_lds_size) { + /* Our target GS Prims Per Subgroup was too large. Calculate + * the maximum number of GS Prims Per Subgroup that will fit + * into LDS, capped by the maximum that the hardware can support. + */ + gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims); + assert(gs_prims > 0); + worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); + + esgs_lds_size = esgs_itemsize * worst_case_es_verts; + assert(esgs_lds_size <= max_lds_size); + } + + /* Now calculate remaining ESGS information. */ + if (esgs_lds_size) + es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts); + else + es_verts = max_es_verts; + + /* Vertices for adjacency primitives are not always reused, so restore + * it for ES_VERTS_PER_SUBGRP. + */ + min_es_verts = gs->gs_input_verts_per_prim; + + /* For normal primitives, the VGT only checks if they are past the ES + * verts per subgroup after allocating a full GS primitive and if they + * are, kick off a new subgroup. But if those additional ES verts are + * unique (e.g. not reused) we need to make sure there is enough LDS + * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP. 
+ */ + es_verts -= min_es_verts - 1; + + out->es_verts_per_subgroup = es_verts; + out->gs_prims_per_subgroup = gs_prims; + out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations; + out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup * gs->gs_max_out_vertices; + out->esgs_ring_size = 4 * esgs_lds_size; + + assert(out->max_prims_per_subgroup <= max_out_prims); } static void si_emit_shader_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - - if (!shader) - return; - - /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2 - * R_028A68_VGT_GSVS_RING_OFFSET_3 */ - radeon_opt_set_context_reg3(sctx, R_028A60_VGT_GSVS_RING_OFFSET_1, - SI_TRACKED_VGT_GSVS_RING_OFFSET_1, - shader->ctx_reg.gs.vgt_gsvs_ring_offset_1, - shader->ctx_reg.gs.vgt_gsvs_ring_offset_2, - shader->ctx_reg.gs.vgt_gsvs_ring_offset_3); - - /* R_028AB0_VGT_GSVS_RING_ITEMSIZE */ - radeon_opt_set_context_reg(sctx, R_028AB0_VGT_GSVS_RING_ITEMSIZE, - SI_TRACKED_VGT_GSVS_RING_ITEMSIZE, - shader->ctx_reg.gs.vgt_gsvs_ring_itemsize); - - /* R_028B38_VGT_GS_MAX_VERT_OUT */ - radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, - SI_TRACKED_VGT_GS_MAX_VERT_OUT, - shader->ctx_reg.gs.vgt_gs_max_vert_out); - - /* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1 - * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 */ - radeon_opt_set_context_reg4(sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE, - SI_TRACKED_VGT_GS_VERT_ITEMSIZE, - shader->ctx_reg.gs.vgt_gs_vert_itemsize, - shader->ctx_reg.gs.vgt_gs_vert_itemsize_1, - shader->ctx_reg.gs.vgt_gs_vert_itemsize_2, - shader->ctx_reg.gs.vgt_gs_vert_itemsize_3); - - /* R_028B90_VGT_GS_INSTANCE_CNT */ - radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, - SI_TRACKED_VGT_GS_INSTANCE_CNT, - shader->ctx_reg.gs.vgt_gs_instance_cnt); - - if (sctx->chip_class >= GFX9) { - /* R_028A44_VGT_GS_ONCHIP_CNTL */ - radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, - SI_TRACKED_VGT_GS_ONCHIP_CNTL, - shader->ctx_reg.gs.vgt_gs_onchip_cntl); - /* R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP */ - radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, - SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, - shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup); - /* R_028AAC_VGT_ESGS_RING_ITEMSIZE */ - radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, - SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, - shader->ctx_reg.gs.vgt_esgs_ring_itemsize); - - if (shader->key.part.gs.es->type == PIPE_SHADER_TESS_EVAL) - radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, - SI_TRACKED_VGT_TF_PARAM, - shader->vgt_tf_param); - if (shader->vgt_vertex_reuse_block_cntl) - radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, - SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, - shader->vgt_vertex_reuse_block_cntl); - } - - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; + struct si_shader *shader = sctx->queued.named.gs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + + if (!shader) + return; + + /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2 + * R_028A68_VGT_GSVS_RING_OFFSET_3 */ + radeon_opt_set_context_reg3( + sctx, R_028A60_VGT_GSVS_RING_OFFSET_1, SI_TRACKED_VGT_GSVS_RING_OFFSET_1, + shader->ctx_reg.gs.vgt_gsvs_ring_offset_1, shader->ctx_reg.gs.vgt_gsvs_ring_offset_2, + shader->ctx_reg.gs.vgt_gsvs_ring_offset_3); + + /* R_028AB0_VGT_GSVS_RING_ITEMSIZE */ + radeon_opt_set_context_reg(sctx, 
R_028AB0_VGT_GSVS_RING_ITEMSIZE,
+ SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
+ shader->ctx_reg.gs.vgt_gsvs_ring_itemsize);
+
+ /* R_028B38_VGT_GS_MAX_VERT_OUT */
+ radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
+ shader->ctx_reg.gs.vgt_gs_max_vert_out);
+
+ /* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1
+ * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 */
+ radeon_opt_set_context_reg4(
+ sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE, SI_TRACKED_VGT_GS_VERT_ITEMSIZE,
+ shader->ctx_reg.gs.vgt_gs_vert_itemsize, shader->ctx_reg.gs.vgt_gs_vert_itemsize_1,
+ shader->ctx_reg.gs.vgt_gs_vert_itemsize_2, shader->ctx_reg.gs.vgt_gs_vert_itemsize_3);
+
+ /* R_028B90_VGT_GS_INSTANCE_CNT */
+ radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT,
+ shader->ctx_reg.gs.vgt_gs_instance_cnt);
+
+ if (sctx->chip_class >= GFX9) {
+ /* R_028A44_VGT_GS_ONCHIP_CNTL */
+ radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL,
+ shader->ctx_reg.gs.vgt_gs_onchip_cntl);
+ /* R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP */
+ radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
+ SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
+ shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup);
+ /* R_028AAC_VGT_ESGS_RING_ITEMSIZE */
+ radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+ SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
+ shader->ctx_reg.gs.vgt_esgs_ring_itemsize);
+
+ if (shader->key.part.gs.es->type == PIPE_SHADER_TESS_EVAL)
+ radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
+ shader->vgt_tf_param);
+ if (shader->vgt_vertex_reuse_block_cntl)
+ radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
+ SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
+ shader->vgt_vertex_reuse_block_cntl);
+ }
+
+ if (initial_cdw != sctx->gfx_cs->current.cdw)
+ sctx->context_roll = true;
}

static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
{
- struct si_shader_selector *sel = shader->selector;
- const ubyte *num_components = sel->info.num_stream_output_components;
- unsigned gs_num_invocations = sel->gs_num_invocations;
- struct si_pm4_state *pm4;
- uint64_t va;
- unsigned max_stream = sel->max_gs_stream;
- unsigned offset;
-
- pm4 = si_get_shader_pm4_state(shader);
- if (!pm4)
- return;
-
- pm4->atom.emit = si_emit_shader_gs;
-
- offset = num_components[0] * sel->gs_max_out_vertices;
- shader->ctx_reg.gs.vgt_gsvs_ring_offset_1 = offset;
-
- if (max_stream >= 1)
- offset += num_components[1] * sel->gs_max_out_vertices;
- shader->ctx_reg.gs.vgt_gsvs_ring_offset_2 = offset;
-
- if (max_stream >= 2)
- offset += num_components[2] * sel->gs_max_out_vertices;
- shader->ctx_reg.gs.vgt_gsvs_ring_offset_3 = offset;
-
- if (max_stream >= 3)
- offset += num_components[3] * sel->gs_max_out_vertices;
- shader->ctx_reg.gs.vgt_gsvs_ring_itemsize = offset;
-
- /* The GSVS_RING_ITEMSIZE register takes 15 bits */
- assert(offset < (1 << 15));
-
- shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->gs_max_out_vertices;
-
- shader->ctx_reg.gs.vgt_gs_vert_itemsize = num_components[0];
- shader->ctx_reg.gs.vgt_gs_vert_itemsize_1 = (max_stream >= 1) ? num_components[1] : 0;
- shader->ctx_reg.gs.vgt_gs_vert_itemsize_2 = (max_stream >= 2) ? num_components[2] : 0;
- shader->ctx_reg.gs.vgt_gs_vert_itemsize_3 = (max_stream >= 3) ?
num_components[3] : 0; - - shader->ctx_reg.gs.vgt_gs_instance_cnt = S_028B90_CNT(MIN2(gs_num_invocations, 127)) | - S_028B90_ENABLE(gs_num_invocations > 0); - - va = shader->bo->gpu_address; - si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - - if (sscreen->info.chip_class >= GFX9) { - unsigned input_prim = sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; - unsigned es_type = shader->key.part.gs.es->type; - unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt; - - if (es_type == PIPE_SHADER_VERTEX) { - es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); - } else if (es_type == PIPE_SHADER_TESS_EVAL) - es_vgpr_comp_cnt = shader->key.part.gs.es->info.uses_primid ? 3 : 2; - else - unreachable("invalid shader selector type"); - - /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and - * VGPR[0:4] are always loaded. - */ - if (sel->info.uses_invocationid) - gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ - else if (sel->info.uses_primid) - gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */ - else if (input_prim >= PIPE_PRIM_TRIANGLES) - gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ - else - gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ - - unsigned num_user_sgprs; - if (es_type == PIPE_SHADER_VERTEX) - num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR); - else - num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; - - if (sscreen->info.chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40)); - } else { - si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, S_00B214_MEM_BASE(va >> 40)); - } - - uint32_t rsrc1 = - S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B228_DX10_CLAMP(1) | - S_00B228_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | - S_00B228_WGP_MODE(sscreen->info.chip_class >= GFX10) | - S_00B228_FLOAT_MODE(shader->config.float_mode) | - S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt); - uint32_t rsrc2 = - S_00B22C_USER_SGPR(num_user_sgprs) | - S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | - S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) | - S_00B22C_LDS_SIZE(shader->config.lds_size) | - S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); - - if (sscreen->info.chip_class >= GFX10) { - rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); - } else { - rsrc1 |= S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8); - rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); - } - - si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1); - si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); - - if (sscreen->info.chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | - S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0)); - } - - shader->ctx_reg.gs.vgt_gs_onchip_cntl = - S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) | - S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) | - S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->gs_info.gs_inst_prims_in_subgroup); - shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup = - S_028A94_MAX_PRIMS_PER_SUBGROUP(shader->gs_info.max_prims_per_subgroup); - shader->ctx_reg.gs.vgt_esgs_ring_itemsize = - shader->key.part.gs.es->esgs_itemsize / 4; - - if (es_type == PIPE_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4); - - 
polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, - NULL, pm4); - } else { - si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8); - si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(va >> 40)); - - si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, - S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) | - S_00B228_DX10_CLAMP(1) | - S_00B228_FLOAT_MODE(shader->config.float_mode)); - si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, - S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) | - S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); - } + struct si_shader_selector *sel = shader->selector; + const ubyte *num_components = sel->info.num_stream_output_components; + unsigned gs_num_invocations = sel->gs_num_invocations; + struct si_pm4_state *pm4; + uint64_t va; + unsigned max_stream = sel->max_gs_stream; + unsigned offset; + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + pm4->atom.emit = si_emit_shader_gs; + + offset = num_components[0] * sel->gs_max_out_vertices; + shader->ctx_reg.gs.vgt_gsvs_ring_offset_1 = offset; + + if (max_stream >= 1) + offset += num_components[1] * sel->gs_max_out_vertices; + shader->ctx_reg.gs.vgt_gsvs_ring_offset_2 = offset; + + if (max_stream >= 2) + offset += num_components[2] * sel->gs_max_out_vertices; + shader->ctx_reg.gs.vgt_gsvs_ring_offset_3 = offset; + + if (max_stream >= 3) + offset += num_components[3] * sel->gs_max_out_vertices; + shader->ctx_reg.gs.vgt_gsvs_ring_itemsize = offset; + + /* The GSVS_RING_ITEMSIZE register takes 15 bits */ + assert(offset < (1 << 15)); + + shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->gs_max_out_vertices; + + shader->ctx_reg.gs.vgt_gs_vert_itemsize = num_components[0]; + shader->ctx_reg.gs.vgt_gs_vert_itemsize_1 = (max_stream >= 1) ? num_components[1] : 0; + shader->ctx_reg.gs.vgt_gs_vert_itemsize_2 = (max_stream >= 2) ? num_components[2] : 0; + shader->ctx_reg.gs.vgt_gs_vert_itemsize_3 = (max_stream >= 3) ? num_components[3] : 0; + + shader->ctx_reg.gs.vgt_gs_instance_cnt = + S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0); + + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + + if (sscreen->info.chip_class >= GFX9) { + unsigned input_prim = sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; + unsigned es_type = shader->key.part.gs.es->type; + unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt; + + if (es_type == PIPE_SHADER_VERTEX) { + es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); + } else if (es_type == PIPE_SHADER_TESS_EVAL) + es_vgpr_comp_cnt = shader->key.part.gs.es->info.uses_primid ? 3 : 2; + else + unreachable("invalid shader selector type"); + + /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and + * VGPR[0:4] are always loaded. + */ + if (sel->info.uses_invocationid) + gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ + else if (sel->info.uses_primid) + gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. 
*/ + else if (input_prim >= PIPE_PRIM_TRIANGLES) + gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ + else + gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ + + unsigned num_user_sgprs; + if (es_type == PIPE_SHADER_VERTEX) + num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR); + else + num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; + + if (sscreen->info.chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); + si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40)); + } else { + si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8); + si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, S_00B214_MEM_BASE(va >> 40)); + } + + uint32_t rsrc1 = S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_DX10_CLAMP(1) | + S_00B228_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | + S_00B228_WGP_MODE(sscreen->info.chip_class >= GFX10) | + S_00B228_FLOAT_MODE(shader->config.float_mode) | + S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt); + uint32_t rsrc2 = S_00B22C_USER_SGPR(num_user_sgprs) | + S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | + S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) | + S_00B22C_LDS_SIZE(shader->config.lds_size) | + S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + + if (sscreen->info.chip_class >= GFX10) { + rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); + } else { + rsrc1 |= S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8); + rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); + } + + si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1); + si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); + + if (sscreen->info.chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0)); + } + + shader->ctx_reg.gs.vgt_gs_onchip_cntl = + S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) | + S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) | + S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->gs_info.gs_inst_prims_in_subgroup); + shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup = + S_028A94_MAX_PRIMS_PER_SUBGROUP(shader->gs_info.max_prims_per_subgroup); + shader->ctx_reg.gs.vgt_esgs_ring_itemsize = shader->key.part.gs.es->esgs_itemsize / 4; + + if (es_type == PIPE_SHADER_TESS_EVAL) + si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4); + + polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, NULL, pm4); + } else { + si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8); + si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(va >> 40)); + + si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, + S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) | + S_00B228_DX10_CLAMP(1) | S_00B228_FLOAT_MODE(shader->config.float_mode)); + si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, + S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) | + S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); + } } static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value) { - enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC; + enum si_tracked_reg reg = SI_TRACKED_GE_PC_ALLOC; - if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || - sctx->tracked_regs.reg_value[reg] != value) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || + sctx->tracked_regs.reg_value[reg] != value) { + 
struct radeon_cmdbuf *cs = sctx->gfx_cs; - if (sctx->family == CHIP_NAVI10 || - sctx->family == CHIP_NAVI12 || - sctx->family == CHIP_NAVI14) { - /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0)); - } + if (sctx->family == CHIP_NAVI10 || sctx->family == CHIP_NAVI12 || + sctx->family == CHIP_NAVI14) { + /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0)); + } - radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value); + radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value); - sctx->tracked_regs.reg_saved |= 0x1ull << reg; - sctx->tracked_regs.reg_value[reg] = value; - } + sctx->tracked_regs.reg_saved |= 0x1ull << reg; + sctx->tracked_regs.reg_value[reg] = value; + } } /* Common tail code for NGG primitive shaders. */ -static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, - struct si_shader *shader, - unsigned initial_cdw) -{ - radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, - SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP, - shader->ctx_reg.ngg.ge_max_output_per_subgroup); - radeon_opt_set_context_reg(sctx, R_028B4C_GE_NGG_SUBGRP_CNTL, - SI_TRACKED_GE_NGG_SUBGRP_CNTL, - shader->ctx_reg.ngg.ge_ngg_subgrp_cntl); - radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, - SI_TRACKED_VGT_PRIMITIVEID_EN, - shader->ctx_reg.ngg.vgt_primitiveid_en); - radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, - SI_TRACKED_VGT_GS_ONCHIP_CNTL, - shader->ctx_reg.ngg.vgt_gs_onchip_cntl); - radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, - SI_TRACKED_VGT_GS_INSTANCE_CNT, - shader->ctx_reg.ngg.vgt_gs_instance_cnt); - radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, - SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, - shader->ctx_reg.ngg.vgt_esgs_ring_itemsize); - radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, - SI_TRACKED_SPI_VS_OUT_CONFIG, - shader->ctx_reg.ngg.spi_vs_out_config); - radeon_opt_set_context_reg2(sctx, R_028708_SPI_SHADER_IDX_FORMAT, - SI_TRACKED_SPI_SHADER_IDX_FORMAT, - shader->ctx_reg.ngg.spi_shader_idx_format, - shader->ctx_reg.ngg.spi_shader_pos_format); - radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, - SI_TRACKED_PA_CL_VTE_CNTL, - shader->ctx_reg.ngg.pa_cl_vte_cntl); - radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL, - SI_TRACKED_PA_CL_NGG_CNTL, - shader->ctx_reg.ngg.pa_cl_ngg_cntl); - - radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, - shader->pa_cl_vs_out_cntl, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); - - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; - - /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. 
*/ - gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc); +static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader *shader, + unsigned initial_cdw) +{ + radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, + SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP, + shader->ctx_reg.ngg.ge_max_output_per_subgroup); + radeon_opt_set_context_reg(sctx, R_028B4C_GE_NGG_SUBGRP_CNTL, SI_TRACKED_GE_NGG_SUBGRP_CNTL, + shader->ctx_reg.ngg.ge_ngg_subgrp_cntl); + radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN, + shader->ctx_reg.ngg.vgt_primitiveid_en); + radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL, + shader->ctx_reg.ngg.vgt_gs_onchip_cntl); + radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT, + shader->ctx_reg.ngg.vgt_gs_instance_cnt); + radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, + SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, + shader->ctx_reg.ngg.vgt_esgs_ring_itemsize); + radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG, + shader->ctx_reg.ngg.spi_vs_out_config); + radeon_opt_set_context_reg2( + sctx, R_028708_SPI_SHADER_IDX_FORMAT, SI_TRACKED_SPI_SHADER_IDX_FORMAT, + shader->ctx_reg.ngg.spi_shader_idx_format, shader->ctx_reg.ngg.spi_shader_pos_format); + radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL, + shader->ctx_reg.ngg.pa_cl_vte_cntl); + radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL, + shader->ctx_reg.ngg.pa_cl_ngg_cntl); + + radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, + SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl, + SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; + + /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. 
*/ + gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc); } static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; + struct si_shader *shader = sctx->queued.named.gs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; - if (!shader) - return; + if (!shader) + return; - gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); + gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); } static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; + struct si_shader *shader = sctx->queued.named.gs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; - if (!shader) - return; + if (!shader) + return; - radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, - SI_TRACKED_VGT_TF_PARAM, - shader->vgt_tf_param); + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); - gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); + gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); } static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; + struct si_shader *shader = sctx->queued.named.gs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; - if (!shader) - return; + if (!shader) + return; - radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, - SI_TRACKED_VGT_GS_MAX_VERT_OUT, - shader->ctx_reg.ngg.vgt_gs_max_vert_out); + radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT, + shader->ctx_reg.ngg.vgt_gs_max_vert_out); - gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); + gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); } static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.gs->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; + struct si_shader *shader = sctx->queued.named.gs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; - if (!shader) - return; + if (!shader) + return; - radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, - SI_TRACKED_VGT_GS_MAX_VERT_OUT, - shader->ctx_reg.ngg.vgt_gs_max_vert_out); - radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, - SI_TRACKED_VGT_TF_PARAM, - shader->vgt_tf_param); + radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT, + shader->ctx_reg.ngg.vgt_gs_max_vert_out); + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); - gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); + gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); } unsigned si_get_input_prim(const struct si_shader_selector *gs) { - if (gs->type == PIPE_SHADER_GEOMETRY) - return gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; - - if (gs->type == PIPE_SHADER_TESS_EVAL) { - if (gs->info.properties[TGSI_PROPERTY_TES_POINT_MODE]) - return PIPE_PRIM_POINTS; - if (gs->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES) - return PIPE_PRIM_LINES; - return PIPE_PRIM_TRIANGLES; - } - - /* TODO: Set this correctly if the primitive type is set in the shader key. 
*/ - return PIPE_PRIM_TRIANGLES; /* worst case for all callers */ + if (gs->type == PIPE_SHADER_GEOMETRY) + return gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; + + if (gs->type == PIPE_SHADER_TESS_EVAL) { + if (gs->info.properties[TGSI_PROPERTY_TES_POINT_MODE]) + return PIPE_PRIM_POINTS; + if (gs->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES) + return PIPE_PRIM_LINES; + return PIPE_PRIM_TRIANGLES; + } + + /* TODO: Set this correctly if the primitive type is set in the shader key. */ + return PIPE_PRIM_TRIANGLES; /* worst case for all callers */ } static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel, bool ngg) { - bool misc_vec_ena = - sel->info.writes_psize || (sel->info.writes_edgeflag && !ngg) || - sel->info.writes_layer || sel->info.writes_viewport_index; - return S_02881C_USE_VTX_POINT_SIZE(sel->info.writes_psize) | - S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag && !ngg) | - S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) | - S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) | - S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) | - S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena); + bool misc_vec_ena = sel->info.writes_psize || (sel->info.writes_edgeflag && !ngg) || + sel->info.writes_layer || sel->info.writes_viewport_index; + return S_02881C_USE_VTX_POINT_SIZE(sel->info.writes_psize) | + S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag && !ngg) | + S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) | + S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) | + S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) | + S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena); } /** @@ -1136,305 +1061,279 @@ static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel, bool ng */ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader) { - const struct si_shader_selector *gs_sel = shader->selector; - const struct si_shader_info *gs_info = &gs_sel->info; - enum pipe_shader_type gs_type = shader->selector->type; - const struct si_shader_selector *es_sel = - shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector; - const struct si_shader_info *es_info = &es_sel->info; - enum pipe_shader_type es_type = es_sel->type; - unsigned num_user_sgprs; - unsigned nparams, es_vgpr_comp_cnt, gs_vgpr_comp_cnt; - uint64_t va; - unsigned window_space = - gs_info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; - bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid; - unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1); - unsigned input_prim = si_get_input_prim(gs_sel); - bool break_wave_at_eoi = false; - struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader); - if (!pm4) - return; - - if (es_type == PIPE_SHADER_TESS_EVAL) { - pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_tess_gs - : gfx10_emit_shader_ngg_tess_nogs; - } else { - pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? 
gfx10_emit_shader_ngg_notess_gs - : gfx10_emit_shader_ngg_notess_nogs; - } - - va = shader->bo->gpu_address; - si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - - if (es_type == PIPE_SHADER_VERTEX) { - es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); - - if (es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { - num_user_sgprs = SI_SGPR_VS_BLIT_DATA + - es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; - } else { - num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR); - } - } else { - assert(es_type == PIPE_SHADER_TESS_EVAL); - es_vgpr_comp_cnt = es_enable_prim_id ? 3 : 2; - num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; - - if (es_enable_prim_id || gs_info->uses_primid) - break_wave_at_eoi = true; - } - - /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and - * VGPR[0:4] are always loaded. - * - * Vertex shaders always need to load VGPR3, because they need to - * pass edge flags for decomposed primitives (such as quads) to the PA - * for the GL_LINE polygon mode to skip rendering lines on inner edges. - */ - if (gs_info->uses_invocationid || - (gs_type == PIPE_SHADER_VERTEX && !gfx10_is_ngg_passthrough(shader))) - gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */ - else if ((gs_type == PIPE_SHADER_GEOMETRY && gs_info->uses_primid) || - (gs_type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)) - gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */ - else if (input_prim >= PIPE_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader)) - gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ - else - gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ - - si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); - si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40); - si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, - S_00B228_VGPRS((shader->config.num_vgprs - 1) / - (sscreen->ge_wave_size == 32 ? 8 : 4)) | - S_00B228_FLOAT_MODE(shader->config.float_mode) | - S_00B228_DX10_CLAMP(1) | - S_00B228_MEM_ORDERED(1) | - S_00B228_WGP_MODE(1) | - S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt)); - si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, - S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) | - S_00B22C_USER_SGPR(num_user_sgprs) | - S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | - S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) | - S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) | - S_00B22C_LDS_SIZE(shader->config.lds_size)); - - /* Determine LATE_ALLOC_GS. */ - unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; - unsigned late_alloc_wave64; /* The limit is per SH. */ - - /* For Wave32, the hw will launch twice the number of late - * alloc waves, so 1 == 2x wave32. - * - * Don't use late alloc for NGG on Navi14 due to a hw bug. - */ - if (sscreen->info.family == CHIP_NAVI14 || !sscreen->info.use_late_alloc) - late_alloc_wave64 = 0; - else if (num_cu_per_sh <= 6) - late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */ - else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) - late_alloc_wave64 = (num_cu_per_sh - 2) * 6; - else - late_alloc_wave64 = (num_cu_per_sh - 2) * 4; - - /* Limit LATE_ALLOC_GS for prevent a hang (hw bug). 
*/ - if (sscreen->info.family == CHIP_NAVI10 || - sscreen->info.family == CHIP_NAVI12 || - sscreen->info.family == CHIP_NAVI14) - late_alloc_wave64 = MIN2(late_alloc_wave64, 64); - - si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | - S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64)); - - nparams = MAX2(shader->info.nr_param_exports, 1); - shader->ctx_reg.ngg.spi_vs_out_config = - S_0286C4_VS_EXPORT_COUNT(nparams - 1) | - S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0); - - shader->ctx_reg.ngg.spi_shader_idx_format = - S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP); - shader->ctx_reg.ngg.spi_shader_pos_format = - S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | - S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? - V_02870C_SPI_SHADER_4COMP : - V_02870C_SPI_SHADER_NONE) | - S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? - V_02870C_SPI_SHADER_4COMP : - V_02870C_SPI_SHADER_NONE) | - S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? - V_02870C_SPI_SHADER_4COMP : - V_02870C_SPI_SHADER_NONE); - - shader->ctx_reg.ngg.vgt_primitiveid_en = - S_028A84_PRIMITIVEID_EN(es_enable_prim_id) | - S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.mono.u.vs_export_prim_id || - gs_sel->info.writes_primid); - - if (gs_type == PIPE_SHADER_GEOMETRY) { - shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4; - shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->gs_max_out_vertices; - } else { - shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1; - } - - if (es_type == PIPE_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, es_sel, pm4); - - shader->ctx_reg.ngg.vgt_gs_onchip_cntl = - S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) | - S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) | - S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations); - shader->ctx_reg.ngg.ge_max_output_per_subgroup = - S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts); - shader->ctx_reg.ngg.ge_ngg_subgrp_cntl = - S_028B4C_PRIM_AMP_FACTOR(shader->ngg.prim_amp_factor) | - S_028B4C_THDS_PER_SUBGRP(0); /* for fast launch */ - shader->ctx_reg.ngg.vgt_gs_instance_cnt = - S_028B90_CNT(gs_num_invocations) | - S_028B90_ENABLE(gs_num_invocations > 1) | - S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE( - shader->ngg.max_vert_out_per_gs_instance); - - /* Always output hw-generated edge flags and pass them via the prim - * export to prevent drawing lines on internal edges of decomposed - * primitives (such as quads) with polygon mode = lines. Only VS needs - * this. - */ - shader->ctx_reg.ngg.pa_cl_ngg_cntl = - S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_type == PIPE_SHADER_VERTEX); - shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(gs_sel, true); - - /* Oversubscribe PC. This improves performance when there are too many varyings. */ - float oversub_pc_factor = 0.25; - - if (shader->key.opt.ngg_culling) { - /* Be more aggressive with NGG culling. 
*/ - if (shader->info.nr_param_exports > 4) - oversub_pc_factor = 1; - else if (shader->info.nr_param_exports > 2) - oversub_pc_factor = 0.75; - else - oversub_pc_factor = 0.5; - } - - unsigned oversub_pc_lines = sscreen->info.pc_lines * oversub_pc_factor; - shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) | - S_030980_NUM_PC_LINES(oversub_pc_lines - 1); - - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { - shader->ge_cntl = - S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3); - } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { - shader->ge_cntl = - S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2); - } else { - shader->ge_cntl = - S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */ - S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); - - /* Bug workaround for a possible hang with non-tessellation cases. - * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0 - * - * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5 - */ - if ((sscreen->info.family == CHIP_NAVI10 || - sscreen->info.family == CHIP_NAVI12 || - sscreen->info.family == CHIP_NAVI14) && - (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */ - shader->ngg.hw_max_esverts != 256) { - shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; - - if (shader->ngg.hw_max_esverts > 5) { - shader->ge_cntl |= - S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); - } - } - } - - if (window_space) { - shader->ctx_reg.ngg.pa_cl_vte_cntl = - S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1); - } else { - shader->ctx_reg.ngg.pa_cl_vte_cntl = - S_028818_VTX_W0_FMT(1) | - S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) | - S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | - S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); - } + const struct si_shader_selector *gs_sel = shader->selector; + const struct si_shader_info *gs_info = &gs_sel->info; + enum pipe_shader_type gs_type = shader->selector->type; + const struct si_shader_selector *es_sel = + shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector; + const struct si_shader_info *es_info = &es_sel->info; + enum pipe_shader_type es_type = es_sel->type; + unsigned num_user_sgprs; + unsigned nparams, es_vgpr_comp_cnt, gs_vgpr_comp_cnt; + uint64_t va; + unsigned window_space = gs_info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid; + unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1); + unsigned input_prim = si_get_input_prim(gs_sel); + bool break_wave_at_eoi = false; + struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + if (es_type == PIPE_SHADER_TESS_EVAL) { + pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_tess_gs + : gfx10_emit_shader_ngg_tess_nogs; + } else { + pm4->atom.emit = gs_type == PIPE_SHADER_GEOMETRY ? 
gfx10_emit_shader_ngg_notess_gs + : gfx10_emit_shader_ngg_notess_nogs; + } + + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + + if (es_type == PIPE_SHADER_VERTEX) { + es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); + + if (es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { + num_user_sgprs = + SI_SGPR_VS_BLIT_DATA + es_info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; + } else { + num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR); + } + } else { + assert(es_type == PIPE_SHADER_TESS_EVAL); + es_vgpr_comp_cnt = es_enable_prim_id ? 3 : 2; + num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; + + if (es_enable_prim_id || gs_info->uses_primid) + break_wave_at_eoi = true; + } + + /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and + * VGPR[0:4] are always loaded. + * + * Vertex shaders always need to load VGPR3, because they need to + * pass edge flags for decomposed primitives (such as quads) to the PA + * for the GL_LINE polygon mode to skip rendering lines on inner edges. + */ + if (gs_info->uses_invocationid || + (gs_type == PIPE_SHADER_VERTEX && !gfx10_is_ngg_passthrough(shader))) + gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */ + else if ((gs_type == PIPE_SHADER_GEOMETRY && gs_info->uses_primid) || + (gs_type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)) + gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */ + else if (input_prim >= PIPE_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader)) + gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ + else + gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ + + si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); + si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40); + si_pm4_set_reg( + pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, + S_00B228_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) | + S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_DX10_CLAMP(1) | + S_00B228_MEM_ORDERED(1) | S_00B228_WGP_MODE(1) | + S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt)); + si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, + S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) | + S_00B22C_USER_SGPR(num_user_sgprs) | + S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | + S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) | + S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) | + S_00B22C_LDS_SIZE(shader->config.lds_size)); + + /* Determine LATE_ALLOC_GS. */ + unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; + unsigned late_alloc_wave64; /* The limit is per SH. */ + + /* For Wave32, the hw will launch twice the number of late + * alloc waves, so 1 == 2x wave32. + * + * Don't use late alloc for NGG on Navi14 due to a hw bug. + */ + if (sscreen->info.family == CHIP_NAVI14 || !sscreen->info.use_late_alloc) + late_alloc_wave64 = 0; + else if (num_cu_per_sh <= 6) + late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */ + else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) + late_alloc_wave64 = (num_cu_per_sh - 2) * 6; + else + late_alloc_wave64 = (num_cu_per_sh - 2) * 4; + + /* Limit LATE_ALLOC_GS for prevent a hang (hw bug). 
*/ + if (sscreen->info.family == CHIP_NAVI10 || sscreen->info.family == CHIP_NAVI12 || + sscreen->info.family == CHIP_NAVI14) + late_alloc_wave64 = MIN2(late_alloc_wave64, 64); + + si_pm4_set_reg( + pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64)); + + nparams = MAX2(shader->info.nr_param_exports, 1); + shader->ctx_reg.ngg.spi_vs_out_config = + S_0286C4_VS_EXPORT_COUNT(nparams - 1) | + S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0); + + shader->ctx_reg.ngg.spi_shader_idx_format = + S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP); + shader->ctx_reg.ngg.spi_shader_pos_format = + S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | + S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE) | + S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE) | + S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE); + + shader->ctx_reg.ngg.vgt_primitiveid_en = + S_028A84_PRIMITIVEID_EN(es_enable_prim_id) | + S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.mono.u.vs_export_prim_id || + gs_sel->info.writes_primid); + + if (gs_type == PIPE_SHADER_GEOMETRY) { + shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4; + shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->gs_max_out_vertices; + } else { + shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1; + } + + if (es_type == PIPE_SHADER_TESS_EVAL) + si_set_tesseval_regs(sscreen, es_sel, pm4); + + shader->ctx_reg.ngg.vgt_gs_onchip_cntl = + S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) | + S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) | + S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations); + shader->ctx_reg.ngg.ge_max_output_per_subgroup = + S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts); + shader->ctx_reg.ngg.ge_ngg_subgrp_cntl = S_028B4C_PRIM_AMP_FACTOR(shader->ngg.prim_amp_factor) | + S_028B4C_THDS_PER_SUBGRP(0); /* for fast launch */ + shader->ctx_reg.ngg.vgt_gs_instance_cnt = + S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) | + S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.max_vert_out_per_gs_instance); + + /* Always output hw-generated edge flags and pass them via the prim + * export to prevent drawing lines on internal edges of decomposed + * primitives (such as quads) with polygon mode = lines. Only VS needs + * this. + */ + shader->ctx_reg.ngg.pa_cl_ngg_cntl = + S_028838_INDEX_BUF_EDGE_FLAG_ENA(gs_type == PIPE_SHADER_VERTEX); + shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(gs_sel, true); + + /* Oversubscribe PC. This improves performance when there are too many varyings. */ + float oversub_pc_factor = 0.25; + + if (shader->key.opt.ngg_culling) { + /* Be more aggressive with NGG culling. 
*/ + if (shader->info.nr_param_exports > 4) + oversub_pc_factor = 1; + else if (shader->info.nr_param_exports > 2) + oversub_pc_factor = 0.75; + else + oversub_pc_factor = 0.5; + } + + unsigned oversub_pc_lines = sscreen->info.pc_lines * oversub_pc_factor; + shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) | + S_030980_NUM_PC_LINES(oversub_pc_lines - 1); + + if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { + shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3); + } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { + shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2); + } else { + shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */ + S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); + + /* Bug workaround for a possible hang with non-tessellation cases. + * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0 + * + * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5 + */ + if ((sscreen->info.family == CHIP_NAVI10 || sscreen->info.family == CHIP_NAVI12 || + sscreen->info.family == CHIP_NAVI14) && + (es_type == PIPE_SHADER_VERTEX || gs_type == PIPE_SHADER_VERTEX) && /* = no tess */ + shader->ngg.hw_max_esverts != 256) { + shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; + + if (shader->ngg.hw_max_esverts > 5) { + shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); + } + } + } + + if (window_space) { + shader->ctx_reg.ngg.pa_cl_vte_cntl = S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1); + } else { + shader->ctx_reg.ngg.pa_cl_vte_cntl = + S_028818_VTX_W0_FMT(1) | S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) | + S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | + S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); + } } static void si_emit_shader_vs(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.vs->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - - if (!shader) - return; - - radeon_opt_set_context_reg(sctx, R_028A40_VGT_GS_MODE, - SI_TRACKED_VGT_GS_MODE, - shader->ctx_reg.vs.vgt_gs_mode); - radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, - SI_TRACKED_VGT_PRIMITIVEID_EN, - shader->ctx_reg.vs.vgt_primitiveid_en); - - if (sctx->chip_class <= GFX8) { - radeon_opt_set_context_reg(sctx, R_028AB4_VGT_REUSE_OFF, - SI_TRACKED_VGT_REUSE_OFF, - shader->ctx_reg.vs.vgt_reuse_off); - } - - radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, - SI_TRACKED_SPI_VS_OUT_CONFIG, - shader->ctx_reg.vs.spi_vs_out_config); - - radeon_opt_set_context_reg(sctx, R_02870C_SPI_SHADER_POS_FORMAT, - SI_TRACKED_SPI_SHADER_POS_FORMAT, - shader->ctx_reg.vs.spi_shader_pos_format); - - radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, - SI_TRACKED_PA_CL_VTE_CNTL, - shader->ctx_reg.vs.pa_cl_vte_cntl); - - if (shader->selector->type == PIPE_SHADER_TESS_EVAL) - radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, - SI_TRACKED_VGT_TF_PARAM, - shader->vgt_tf_param); - - if (shader->vgt_vertex_reuse_block_cntl) - radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, - SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, - shader->vgt_vertex_reuse_block_cntl); - - /* Required programming for tessellation. 
(legacy pipeline only) */ - if (sctx->chip_class == GFX10 && - shader->selector->type == PIPE_SHADER_TESS_EVAL) { - radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, - SI_TRACKED_VGT_GS_ONCHIP_CNTL, - S_028A44_ES_VERTS_PER_SUBGRP(250) | - S_028A44_GS_PRIMS_PER_SUBGRP(126) | - S_028A44_GS_INST_PRIMS_IN_SUBGRP(126)); - } - - if (sctx->chip_class >= GFX10) { - radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, - shader->pa_cl_vs_out_cntl, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); - } - - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; - - /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */ - if (sctx->chip_class >= GFX10) - gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc); + struct si_shader *shader = sctx->queued.named.vs->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + + if (!shader) + return; + + radeon_opt_set_context_reg(sctx, R_028A40_VGT_GS_MODE, SI_TRACKED_VGT_GS_MODE, + shader->ctx_reg.vs.vgt_gs_mode); + radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN, + shader->ctx_reg.vs.vgt_primitiveid_en); + + if (sctx->chip_class <= GFX8) { + radeon_opt_set_context_reg(sctx, R_028AB4_VGT_REUSE_OFF, SI_TRACKED_VGT_REUSE_OFF, + shader->ctx_reg.vs.vgt_reuse_off); + } + + radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG, + shader->ctx_reg.vs.spi_vs_out_config); + + radeon_opt_set_context_reg(sctx, R_02870C_SPI_SHADER_POS_FORMAT, + SI_TRACKED_SPI_SHADER_POS_FORMAT, + shader->ctx_reg.vs.spi_shader_pos_format); + + radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL, + shader->ctx_reg.vs.pa_cl_vte_cntl); + + if (shader->selector->type == PIPE_SHADER_TESS_EVAL) + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); + + if (shader->vgt_vertex_reuse_block_cntl) + radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, + SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + shader->vgt_vertex_reuse_block_cntl); + + /* Required programming for tessellation. (legacy pipeline only) */ + if (sctx->chip_class == GFX10 && shader->selector->type == PIPE_SHADER_TESS_EVAL) { + radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL, + S_028A44_ES_VERTS_PER_SUBGRP(250) | + S_028A44_GS_PRIMS_PER_SUBGRP(126) | + S_028A44_GS_INST_PRIMS_IN_SUBGRP(126)); + } + + if (sctx->chip_class >= GFX10) { + radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, + SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl, + SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); + } + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; + + /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. 
*/ + if (sctx->chip_class >= GFX10) + gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.vs.ge_pc_alloc); } /** @@ -1447,827 +1346,757 @@ static void si_emit_shader_vs(struct si_context *sctx) static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, struct si_shader_selector *gs) { - const struct si_shader_info *info = &shader->selector->info; - struct si_pm4_state *pm4; - unsigned num_user_sgprs, vgpr_comp_cnt; - uint64_t va; - unsigned nparams, oc_lds_en; - unsigned window_space = - info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; - bool enable_prim_id = shader->key.mono.u.vs_export_prim_id || info->uses_primid; - - pm4 = si_get_shader_pm4_state(shader); - if (!pm4) - return; - - pm4->atom.emit = si_emit_shader_vs; - - /* We always write VGT_GS_MODE in the VS state, because every switch - * between different shader pipelines involving a different GS or no - * GS at all involves a switch of the VS (different GS use different - * copy shaders). On the other hand, when the API switches from a GS to - * no GS and then back to the same GS used originally, the GS state is - * not sent again. - */ - if (!gs) { - unsigned mode = V_028A40_GS_OFF; - - /* PrimID needs GS scenario A. */ - if (enable_prim_id) - mode = V_028A40_GS_SCENARIO_A; - - shader->ctx_reg.vs.vgt_gs_mode = S_028A40_MODE(mode); - shader->ctx_reg.vs.vgt_primitiveid_en = enable_prim_id; - } else { - shader->ctx_reg.vs.vgt_gs_mode = ac_vgt_gs_mode(gs->gs_max_out_vertices, - sscreen->info.chip_class); - shader->ctx_reg.vs.vgt_primitiveid_en = 0; - } - - if (sscreen->info.chip_class <= GFX8) { - /* Reuse needs to be set off if we write oViewport. */ - shader->ctx_reg.vs.vgt_reuse_off = - S_028AB4_REUSE_OFF(info->writes_viewport_index); - } - - va = shader->bo->gpu_address; - si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - - if (gs) { - vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */ - num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR; - } else if (shader->selector->type == PIPE_SHADER_VERTEX) { - vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, enable_prim_id); - - if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { - num_user_sgprs = SI_SGPR_VS_BLIT_DATA + - info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; - } else { - num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR); - } - } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { - vgpr_comp_cnt = enable_prim_id ? 3 : 2; - num_user_sgprs = SI_TES_NUM_USER_SGPR; - } else - unreachable("invalid shader selector type"); - - /* VS is required to export at least one param. */ - nparams = MAX2(shader->info.nr_param_exports, 1); - shader->ctx_reg.vs.spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1); - - if (sscreen->info.chip_class >= GFX10) { - shader->ctx_reg.vs.spi_vs_out_config |= - S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0); - } - - shader->ctx_reg.vs.spi_shader_pos_format = - S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | - S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? - V_02870C_SPI_SHADER_4COMP : - V_02870C_SPI_SHADER_NONE) | - S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? - V_02870C_SPI_SHADER_4COMP : - V_02870C_SPI_SHADER_NONE) | - S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? 
- V_02870C_SPI_SHADER_4COMP : - V_02870C_SPI_SHADER_NONE); - shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) | - S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1); - shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, false); - - oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0; - - si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8); - si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, S_00B124_MEM_BASE(va >> 40)); - - uint32_t rsrc1 = S_00B128_VGPRS((shader->config.num_vgprs - 1) / - (sscreen->ge_wave_size == 32 ? 8 : 4)) | - S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) | - S_00B128_DX10_CLAMP(1) | - S_00B128_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | - S_00B128_FLOAT_MODE(shader->config.float_mode); - uint32_t rsrc2 = S_00B12C_USER_SGPR(num_user_sgprs) | - S_00B12C_OC_LDS_EN(oc_lds_en) | - S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); - - if (sscreen->info.chip_class >= GFX10) - rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); - else if (sscreen->info.chip_class == GFX9) - rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); - - if (sscreen->info.chip_class <= GFX9) - rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8); - - if (!sscreen->use_ngg_streamout) { - rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) | - S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) | - S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) | - S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) | - S_00B12C_SO_EN(!!shader->selector->so.num_outputs); - } - - si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1); - si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS, rsrc2); - - if (window_space) - shader->ctx_reg.vs.pa_cl_vte_cntl = - S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1); - else - shader->ctx_reg.vs.pa_cl_vte_cntl = - S_028818_VTX_W0_FMT(1) | - S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) | - S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | - S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); - - if (shader->selector->type == PIPE_SHADER_TESS_EVAL) - si_set_tesseval_regs(sscreen, shader->selector, pm4); - - polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); + const struct si_shader_info *info = &shader->selector->info; + struct si_pm4_state *pm4; + unsigned num_user_sgprs, vgpr_comp_cnt; + uint64_t va; + unsigned nparams, oc_lds_en; + unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + bool enable_prim_id = shader->key.mono.u.vs_export_prim_id || info->uses_primid; + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + pm4->atom.emit = si_emit_shader_vs; + + /* We always write VGT_GS_MODE in the VS state, because every switch + * between different shader pipelines involving a different GS or no + * GS at all involves a switch of the VS (different GS use different + * copy shaders). On the other hand, when the API switches from a GS to + * no GS and then back to the same GS used originally, the GS state is + * not sent again. + */ + if (!gs) { + unsigned mode = V_028A40_GS_OFF; + + /* PrimID needs GS scenario A. 
*/ + if (enable_prim_id) + mode = V_028A40_GS_SCENARIO_A; + + shader->ctx_reg.vs.vgt_gs_mode = S_028A40_MODE(mode); + shader->ctx_reg.vs.vgt_primitiveid_en = enable_prim_id; + } else { + shader->ctx_reg.vs.vgt_gs_mode = + ac_vgt_gs_mode(gs->gs_max_out_vertices, sscreen->info.chip_class); + shader->ctx_reg.vs.vgt_primitiveid_en = 0; + } + + if (sscreen->info.chip_class <= GFX8) { + /* Reuse needs to be set off if we write oViewport. */ + shader->ctx_reg.vs.vgt_reuse_off = S_028AB4_REUSE_OFF(info->writes_viewport_index); + } + + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + + if (gs) { + vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */ + num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR; + } else if (shader->selector->type == PIPE_SHADER_VERTEX) { + vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, enable_prim_id); + + if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) { + num_user_sgprs = SI_SGPR_VS_BLIT_DATA + info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; + } else { + num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR); + } + } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { + vgpr_comp_cnt = enable_prim_id ? 3 : 2; + num_user_sgprs = SI_TES_NUM_USER_SGPR; + } else + unreachable("invalid shader selector type"); + + /* VS is required to export at least one param. */ + nparams = MAX2(shader->info.nr_param_exports, 1); + shader->ctx_reg.vs.spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1); + + if (sscreen->info.chip_class >= GFX10) { + shader->ctx_reg.vs.spi_vs_out_config |= + S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0); + } + + shader->ctx_reg.vs.spi_shader_pos_format = + S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | + S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE) | + S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE) | + S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE); + shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) | + S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1); + shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, false); + + oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : 0; + + si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8); + si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, S_00B124_MEM_BASE(va >> 40)); + + uint32_t rsrc1 = + S_00B128_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 
8 : 4)) | + S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B128_DX10_CLAMP(1) | + S_00B128_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | + S_00B128_FLOAT_MODE(shader->config.float_mode); + uint32_t rsrc2 = S_00B12C_USER_SGPR(num_user_sgprs) | S_00B12C_OC_LDS_EN(oc_lds_en) | + S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + + if (sscreen->info.chip_class >= GFX10) + rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); + else if (sscreen->info.chip_class == GFX9) + rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); + + if (sscreen->info.chip_class <= GFX9) + rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8); + + if (!sscreen->use_ngg_streamout) { + rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) | + S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) | + S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) | + S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) | + S_00B12C_SO_EN(!!shader->selector->so.num_outputs); + } + + si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1); + si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS, rsrc2); + + if (window_space) + shader->ctx_reg.vs.pa_cl_vte_cntl = S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1); + else + shader->ctx_reg.vs.pa_cl_vte_cntl = + S_028818_VTX_W0_FMT(1) | S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) | + S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | + S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); + + if (shader->selector->type == PIPE_SHADER_TESS_EVAL) + si_set_tesseval_regs(sscreen, shader->selector, pm4); + + polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); } static unsigned si_get_ps_num_interp(struct si_shader *ps) { - struct si_shader_info *info = &ps->selector->info; - unsigned num_colors = !!(info->colors_read & 0x0f) + - !!(info->colors_read & 0xf0); - unsigned num_interp = ps->selector->info.num_inputs + - (ps->key.part.ps.prolog.color_two_side ? num_colors : 0); - - assert(num_interp <= 32); - return MIN2(num_interp, 32); + struct si_shader_info *info = &ps->selector->info; + unsigned num_colors = !!(info->colors_read & 0x0f) + !!(info->colors_read & 0xf0); + unsigned num_interp = + ps->selector->info.num_inputs + (ps->key.part.ps.prolog.color_two_side ? num_colors : 0); + + assert(num_interp <= 32); + return MIN2(num_interp, 32); } static unsigned si_get_spi_shader_col_format(struct si_shader *shader) { - unsigned value = shader->key.part.ps.epilog.spi_shader_col_format; - unsigned i, num_targets = (util_last_bit(value) + 3) / 4; + unsigned value = shader->key.part.ps.epilog.spi_shader_col_format; + unsigned i, num_targets = (util_last_bit(value) + 3) / 4; - /* If the i-th target format is set, all previous target formats must - * be non-zero to avoid hangs. - */ - for (i = 0; i < num_targets; i++) - if (!(value & (0xf << (i * 4)))) - value |= V_028714_SPI_SHADER_32_R << (i * 4); + /* If the i-th target format is set, all previous target formats must + * be non-zero to avoid hangs. 
+ */ + for (i = 0; i < num_targets; i++) + if (!(value & (0xf << (i * 4)))) + value |= V_028714_SPI_SHADER_32_R << (i * 4); - return value; + return value; } static void si_emit_shader_ps(struct si_context *sctx) { - struct si_shader *shader = sctx->queued.named.ps->shader; - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - - if (!shader) - return; - - /* R_0286CC_SPI_PS_INPUT_ENA, R_0286D0_SPI_PS_INPUT_ADDR*/ - radeon_opt_set_context_reg2(sctx, R_0286CC_SPI_PS_INPUT_ENA, - SI_TRACKED_SPI_PS_INPUT_ENA, - shader->ctx_reg.ps.spi_ps_input_ena, - shader->ctx_reg.ps.spi_ps_input_addr); - - radeon_opt_set_context_reg(sctx, R_0286E0_SPI_BARYC_CNTL, - SI_TRACKED_SPI_BARYC_CNTL, - shader->ctx_reg.ps.spi_baryc_cntl); - radeon_opt_set_context_reg(sctx, R_0286D8_SPI_PS_IN_CONTROL, - SI_TRACKED_SPI_PS_IN_CONTROL, - shader->ctx_reg.ps.spi_ps_in_control); - - /* R_028710_SPI_SHADER_Z_FORMAT, R_028714_SPI_SHADER_COL_FORMAT */ - radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT, - SI_TRACKED_SPI_SHADER_Z_FORMAT, - shader->ctx_reg.ps.spi_shader_z_format, - shader->ctx_reg.ps.spi_shader_col_format); - - radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK, - SI_TRACKED_CB_SHADER_MASK, - shader->ctx_reg.ps.cb_shader_mask); - - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; -} + struct si_shader *shader = sctx->queued.named.ps->shader; + unsigned initial_cdw = sctx->gfx_cs->current.cdw; -static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) -{ - struct si_shader_info *info = &shader->selector->info; - struct si_pm4_state *pm4; - unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask; - unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1); - uint64_t va; - unsigned input_ena = shader->config.spi_ps_input_ena; - - /* we need to enable at least one of them, otherwise we hang the GPU */ - assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) || - G_0286CC_PERSP_CENTER_ENA(input_ena) || - G_0286CC_PERSP_CENTROID_ENA(input_ena) || - G_0286CC_PERSP_PULL_MODEL_ENA(input_ena) || - G_0286CC_LINEAR_SAMPLE_ENA(input_ena) || - G_0286CC_LINEAR_CENTER_ENA(input_ena) || - G_0286CC_LINEAR_CENTROID_ENA(input_ena) || - G_0286CC_LINE_STIPPLE_TEX_ENA(input_ena)); - /* POS_W_FLOAT_ENA requires one of the perspective weights. */ - assert(!G_0286CC_POS_W_FLOAT_ENA(input_ena) || - G_0286CC_PERSP_SAMPLE_ENA(input_ena) || - G_0286CC_PERSP_CENTER_ENA(input_ena) || - G_0286CC_PERSP_CENTROID_ENA(input_ena) || - G_0286CC_PERSP_PULL_MODEL_ENA(input_ena)); - - /* Validate interpolation optimization flags (read as implications). 
*/ - assert(!shader->key.part.ps.prolog.bc_optimize_for_persp || - (G_0286CC_PERSP_CENTER_ENA(input_ena) && - G_0286CC_PERSP_CENTROID_ENA(input_ena))); - assert(!shader->key.part.ps.prolog.bc_optimize_for_linear || - (G_0286CC_LINEAR_CENTER_ENA(input_ena) && - G_0286CC_LINEAR_CENTROID_ENA(input_ena))); - assert(!shader->key.part.ps.prolog.force_persp_center_interp || - (!G_0286CC_PERSP_SAMPLE_ENA(input_ena) && - !G_0286CC_PERSP_CENTROID_ENA(input_ena))); - assert(!shader->key.part.ps.prolog.force_linear_center_interp || - (!G_0286CC_LINEAR_SAMPLE_ENA(input_ena) && - !G_0286CC_LINEAR_CENTROID_ENA(input_ena))); - assert(!shader->key.part.ps.prolog.force_persp_sample_interp || - (!G_0286CC_PERSP_CENTER_ENA(input_ena) && - !G_0286CC_PERSP_CENTROID_ENA(input_ena))); - assert(!shader->key.part.ps.prolog.force_linear_sample_interp || - (!G_0286CC_LINEAR_CENTER_ENA(input_ena) && - !G_0286CC_LINEAR_CENTROID_ENA(input_ena))); - - /* Validate cases when the optimizations are off (read as implications). */ - assert(shader->key.part.ps.prolog.bc_optimize_for_persp || - !G_0286CC_PERSP_CENTER_ENA(input_ena) || - !G_0286CC_PERSP_CENTROID_ENA(input_ena)); - assert(shader->key.part.ps.prolog.bc_optimize_for_linear || - !G_0286CC_LINEAR_CENTER_ENA(input_ena) || - !G_0286CC_LINEAR_CENTROID_ENA(input_ena)); - - pm4 = si_get_shader_pm4_state(shader); - if (!pm4) - return; - - pm4->atom.emit = si_emit_shader_ps; - - /* SPI_BARYC_CNTL.POS_FLOAT_LOCATION - * Possible vaules: - * 0 -> Position = pixel center - * 1 -> Position = pixel centroid - * 2 -> Position = at sample position - * - * From GLSL 4.5 specification, section 7.1: - * "The variable gl_FragCoord is available as an input variable from - * within fragment shaders and it holds the window relative coordinates - * (x, y, z, 1/w) values for the fragment. If multi-sampling, this - * value can be for any location within the pixel, or one of the - * fragment samples. The use of centroid does not further restrict - * this value to be inside the current primitive." - * - * Meaning that centroid has no effect and we can return anything within - * the pixel. Thus, return the value at sample position, because that's - * the most accurate one shaders can get. - */ - spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2); - - if (info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] == - TGSI_FS_COORD_PIXEL_CENTER_INTEGER) - spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1); - - spi_shader_col_format = si_get_spi_shader_col_format(shader); - cb_shader_mask = ac_get_cb_shader_mask(spi_shader_col_format); - - /* Ensure that some export memory is always allocated, for two reasons: - * - * 1) Correctness: The hardware ignores the EXEC mask if no export - * memory is allocated, so KILL and alpha test do not work correctly - * without this. - * 2) Performance: Every shader needs at least a NULL export, even when - * it writes no color/depth output. The NULL export instruction - * stalls without this setting. - * - * Don't add this to CB_SHADER_MASK. - * - * GFX10 supports pixel shaders without exports by setting both - * the color and Z formats to SPI_SHADER_ZERO. The hw will skip export - * instructions if any are present. 
- */ - if ((sscreen->info.chip_class <= GFX9 || - info->uses_kill || - shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) && - !spi_shader_col_format && - !info->writes_z && !info->writes_stencil && !info->writes_samplemask) - spi_shader_col_format = V_028714_SPI_SHADER_32_R; - - shader->ctx_reg.ps.spi_ps_input_ena = input_ena; - shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr; - - /* Set interpolation controls. */ - spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) | - S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32); - - shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl; - shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control; - shader->ctx_reg.ps.spi_shader_z_format = - ac_get_spi_shader_z_format(info->writes_z, - info->writes_stencil, - info->writes_samplemask); - shader->ctx_reg.ps.spi_shader_col_format = spi_shader_col_format; - shader->ctx_reg.ps.cb_shader_mask = cb_shader_mask; - - va = shader->bo->gpu_address; - si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8); - si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, S_00B024_MEM_BASE(va >> 40)); - - uint32_t rsrc1 = - S_00B028_VGPRS((shader->config.num_vgprs - 1) / - (sscreen->ps_wave_size == 32 ? 8 : 4)) | - S_00B028_DX10_CLAMP(1) | - S_00B028_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | - S_00B028_FLOAT_MODE(shader->config.float_mode); - - if (sscreen->info.chip_class < GFX10) { - rsrc1 |= S_00B028_SGPRS((shader->config.num_sgprs - 1) / 8); - } - - si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS, rsrc1); - si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS, - S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) | - S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) | - S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); + if (!shader) + return; + + /* R_0286CC_SPI_PS_INPUT_ENA, R_0286D0_SPI_PS_INPUT_ADDR*/ + radeon_opt_set_context_reg2(sctx, R_0286CC_SPI_PS_INPUT_ENA, SI_TRACKED_SPI_PS_INPUT_ENA, + shader->ctx_reg.ps.spi_ps_input_ena, + shader->ctx_reg.ps.spi_ps_input_addr); + + radeon_opt_set_context_reg(sctx, R_0286E0_SPI_BARYC_CNTL, SI_TRACKED_SPI_BARYC_CNTL, + shader->ctx_reg.ps.spi_baryc_cntl); + radeon_opt_set_context_reg(sctx, R_0286D8_SPI_PS_IN_CONTROL, SI_TRACKED_SPI_PS_IN_CONTROL, + shader->ctx_reg.ps.spi_ps_in_control); + + /* R_028710_SPI_SHADER_Z_FORMAT, R_028714_SPI_SHADER_COL_FORMAT */ + radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT, SI_TRACKED_SPI_SHADER_Z_FORMAT, + shader->ctx_reg.ps.spi_shader_z_format, + shader->ctx_reg.ps.spi_shader_col_format); + + radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK, SI_TRACKED_CB_SHADER_MASK, + shader->ctx_reg.ps.cb_shader_mask); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; } -static void si_shader_init_pm4_state(struct si_screen *sscreen, - struct si_shader *shader) +static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) { - switch (shader->selector->type) { - case PIPE_SHADER_VERTEX: - if (shader->key.as_ls) - si_shader_ls(sscreen, shader); - else if (shader->key.as_es) - si_shader_es(sscreen, shader); - else if (shader->key.as_ngg) - gfx10_shader_ngg(sscreen, shader); - else - si_shader_vs(sscreen, shader, NULL); - break; - case PIPE_SHADER_TESS_CTRL: - si_shader_hs(sscreen, shader); - break; - case PIPE_SHADER_TESS_EVAL: - if (shader->key.as_es) - si_shader_es(sscreen, shader); - else if (shader->key.as_ngg) - gfx10_shader_ngg(sscreen, 
shader); - else - si_shader_vs(sscreen, shader, NULL); - break; - case PIPE_SHADER_GEOMETRY: - if (shader->key.as_ngg) - gfx10_shader_ngg(sscreen, shader); - else - si_shader_gs(sscreen, shader); - break; - case PIPE_SHADER_FRAGMENT: - si_shader_ps(sscreen, shader); - break; - default: - assert(0); - } + struct si_shader_info *info = &shader->selector->info; + struct si_pm4_state *pm4; + unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask; + unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1); + uint64_t va; + unsigned input_ena = shader->config.spi_ps_input_ena; + + /* we need to enable at least one of them, otherwise we hang the GPU */ + assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) || G_0286CC_PERSP_CENTER_ENA(input_ena) || + G_0286CC_PERSP_CENTROID_ENA(input_ena) || G_0286CC_PERSP_PULL_MODEL_ENA(input_ena) || + G_0286CC_LINEAR_SAMPLE_ENA(input_ena) || G_0286CC_LINEAR_CENTER_ENA(input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(input_ena) || G_0286CC_LINE_STIPPLE_TEX_ENA(input_ena)); + /* POS_W_FLOAT_ENA requires one of the perspective weights. */ + assert(!G_0286CC_POS_W_FLOAT_ENA(input_ena) || G_0286CC_PERSP_SAMPLE_ENA(input_ena) || + G_0286CC_PERSP_CENTER_ENA(input_ena) || G_0286CC_PERSP_CENTROID_ENA(input_ena) || + G_0286CC_PERSP_PULL_MODEL_ENA(input_ena)); + + /* Validate interpolation optimization flags (read as implications). */ + assert(!shader->key.part.ps.prolog.bc_optimize_for_persp || + (G_0286CC_PERSP_CENTER_ENA(input_ena) && G_0286CC_PERSP_CENTROID_ENA(input_ena))); + assert(!shader->key.part.ps.prolog.bc_optimize_for_linear || + (G_0286CC_LINEAR_CENTER_ENA(input_ena) && G_0286CC_LINEAR_CENTROID_ENA(input_ena))); + assert(!shader->key.part.ps.prolog.force_persp_center_interp || + (!G_0286CC_PERSP_SAMPLE_ENA(input_ena) && !G_0286CC_PERSP_CENTROID_ENA(input_ena))); + assert(!shader->key.part.ps.prolog.force_linear_center_interp || + (!G_0286CC_LINEAR_SAMPLE_ENA(input_ena) && !G_0286CC_LINEAR_CENTROID_ENA(input_ena))); + assert(!shader->key.part.ps.prolog.force_persp_sample_interp || + (!G_0286CC_PERSP_CENTER_ENA(input_ena) && !G_0286CC_PERSP_CENTROID_ENA(input_ena))); + assert(!shader->key.part.ps.prolog.force_linear_sample_interp || + (!G_0286CC_LINEAR_CENTER_ENA(input_ena) && !G_0286CC_LINEAR_CENTROID_ENA(input_ena))); + + /* Validate cases when the optimizations are off (read as implications). */ + assert(shader->key.part.ps.prolog.bc_optimize_for_persp || + !G_0286CC_PERSP_CENTER_ENA(input_ena) || !G_0286CC_PERSP_CENTROID_ENA(input_ena)); + assert(shader->key.part.ps.prolog.bc_optimize_for_linear || + !G_0286CC_LINEAR_CENTER_ENA(input_ena) || !G_0286CC_LINEAR_CENTROID_ENA(input_ena)); + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + pm4->atom.emit = si_emit_shader_ps; + + /* SPI_BARYC_CNTL.POS_FLOAT_LOCATION + * Possible vaules: + * 0 -> Position = pixel center + * 1 -> Position = pixel centroid + * 2 -> Position = at sample position + * + * From GLSL 4.5 specification, section 7.1: + * "The variable gl_FragCoord is available as an input variable from + * within fragment shaders and it holds the window relative coordinates + * (x, y, z, 1/w) values for the fragment. If multi-sampling, this + * value can be for any location within the pixel, or one of the + * fragment samples. The use of centroid does not further restrict + * this value to be inside the current primitive." + * + * Meaning that centroid has no effect and we can return anything within + * the pixel. 
Thus, return the value at sample position, because that's + * the most accurate one shaders can get. + */ + spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2); + + if (info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] == TGSI_FS_COORD_PIXEL_CENTER_INTEGER) + spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1); + + spi_shader_col_format = si_get_spi_shader_col_format(shader); + cb_shader_mask = ac_get_cb_shader_mask(spi_shader_col_format); + + /* Ensure that some export memory is always allocated, for two reasons: + * + * 1) Correctness: The hardware ignores the EXEC mask if no export + * memory is allocated, so KILL and alpha test do not work correctly + * without this. + * 2) Performance: Every shader needs at least a NULL export, even when + * it writes no color/depth output. The NULL export instruction + * stalls without this setting. + * + * Don't add this to CB_SHADER_MASK. + * + * GFX10 supports pixel shaders without exports by setting both + * the color and Z formats to SPI_SHADER_ZERO. The hw will skip export + * instructions if any are present. + */ + if ((sscreen->info.chip_class <= GFX9 || info->uses_kill || + shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) && + !spi_shader_col_format && !info->writes_z && !info->writes_stencil && + !info->writes_samplemask) + spi_shader_col_format = V_028714_SPI_SHADER_32_R; + + shader->ctx_reg.ps.spi_ps_input_ena = input_ena; + shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr; + + /* Set interpolation controls. */ + spi_ps_in_control = S_0286D8_NUM_INTERP(si_get_ps_num_interp(shader)) | + S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32); + + shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl; + shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control; + shader->ctx_reg.ps.spi_shader_z_format = + ac_get_spi_shader_z_format(info->writes_z, info->writes_stencil, info->writes_samplemask); + shader->ctx_reg.ps.spi_shader_col_format = spi_shader_col_format; + shader->ctx_reg.ps.cb_shader_mask = cb_shader_mask; + + va = shader->bo->gpu_address; + si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8); + si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, S_00B024_MEM_BASE(va >> 40)); + + uint32_t rsrc1 = + S_00B028_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ps_wave_size == 32 ? 
8 : 4)) | + S_00B028_DX10_CLAMP(1) | S_00B028_MEM_ORDERED(sscreen->info.chip_class >= GFX10) | + S_00B028_FLOAT_MODE(shader->config.float_mode); + + if (sscreen->info.chip_class < GFX10) { + rsrc1 |= S_00B028_SGPRS((shader->config.num_sgprs - 1) / 8); + } + + si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS, rsrc1); + si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS, + S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) | + S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) | + S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); +} + +static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader *shader) +{ + switch (shader->selector->type) { + case PIPE_SHADER_VERTEX: + if (shader->key.as_ls) + si_shader_ls(sscreen, shader); + else if (shader->key.as_es) + si_shader_es(sscreen, shader); + else if (shader->key.as_ngg) + gfx10_shader_ngg(sscreen, shader); + else + si_shader_vs(sscreen, shader, NULL); + break; + case PIPE_SHADER_TESS_CTRL: + si_shader_hs(sscreen, shader); + break; + case PIPE_SHADER_TESS_EVAL: + if (shader->key.as_es) + si_shader_es(sscreen, shader); + else if (shader->key.as_ngg) + gfx10_shader_ngg(sscreen, shader); + else + si_shader_vs(sscreen, shader, NULL); + break; + case PIPE_SHADER_GEOMETRY: + if (shader->key.as_ngg) + gfx10_shader_ngg(sscreen, shader); + else + si_shader_gs(sscreen, shader); + break; + case PIPE_SHADER_FRAGMENT: + si_shader_ps(sscreen, shader); + break; + default: + assert(0); + } } static unsigned si_get_alpha_test_func(struct si_context *sctx) { - /* Alpha-test should be disabled if colorbuffer 0 is integer. */ - return sctx->queued.named.dsa->alpha_func; + /* Alpha-test should be disabled if colorbuffer 0 is integer. */ + return sctx->queued.named.dsa->alpha_func; } -void si_shader_selector_key_vs(struct si_context *sctx, - struct si_shader_selector *vs, - struct si_shader_key *key, - struct si_vs_prolog_bits *prolog_key) +void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs, + struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key) { - if (!sctx->vertex_elements || - vs->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) - return; - - struct si_vertex_elements *elts = sctx->vertex_elements; - - prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one; - prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched; - prolog_key->unpack_instance_id_from_vertex_id = - sctx->prim_discard_cs_instancing; - - /* Prefer a monolithic shader to allow scheduling divisions around - * VBO loads. 
*/ - if (prolog_key->instance_divisor_is_fetched) - key->opt.prefer_mono = 1; - - unsigned count = MIN2(vs->info.num_inputs, elts->count); - unsigned count_mask = (1 << count) - 1; - unsigned fix = elts->fix_fetch_always & count_mask; - unsigned opencode = elts->fix_fetch_opencode & count_mask; - - if (sctx->vertex_buffer_unaligned & elts->vb_alignment_check_mask) { - uint32_t mask = elts->fix_fetch_unaligned & count_mask; - while (mask) { - unsigned i = u_bit_scan(&mask); - unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1); - unsigned vbidx = elts->vertex_buffer_index[i]; - struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbidx]; - unsigned align_mask = (1 << log_hw_load_size) - 1; - if (vb->buffer_offset & align_mask || - vb->stride & align_mask) { - fix |= 1 << i; - opencode |= 1 << i; - } - } - } - - while (fix) { - unsigned i = u_bit_scan(&fix); - key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i]; - } - key->mono.vs_fetch_opencode = opencode; -} + if (!sctx->vertex_elements || vs->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) + return; -static void si_shader_selector_key_hw_vs(struct si_context *sctx, - struct si_shader_selector *vs, - struct si_shader_key *key) -{ - struct si_shader_selector *ps = sctx->ps_shader.cso; - - key->opt.clip_disable = - sctx->queued.named.rasterizer->clip_plane_enable == 0 && - (vs->info.clipdist_writemask || - vs->info.writes_clipvertex) && - !vs->info.culldist_writemask; - - /* Find out if PS is disabled. */ - bool ps_disabled = true; - if (ps) { - bool ps_modifies_zs = ps->info.uses_kill || - ps->info.writes_z || - ps->info.writes_stencil || - ps->info.writes_samplemask || - sctx->queued.named.blend->alpha_to_coverage || - si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS; - unsigned ps_colormask = si_get_total_colormask(sctx); - - ps_disabled = sctx->queued.named.rasterizer->rasterizer_discard || - (!ps_colormask && - !ps_modifies_zs && - !ps->info.writes_memory); - } - - /* Find out which VS outputs aren't used by the PS. */ - uint64_t outputs_written = vs->outputs_written_before_ps; - uint64_t inputs_read = 0; - - /* Ignore outputs that are not passed from VS to PS. */ - outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0, true)) | - (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0, true)) | - (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_CLIPVERTEX, 0, true))); - - if (!ps_disabled) { - inputs_read = ps->inputs_read; - } - - uint64_t linked = outputs_written & inputs_read; - - key->opt.kill_outputs = ~linked & outputs_written; - key->opt.ngg_culling = sctx->ngg_culling; + struct si_vertex_elements *elts = sctx->vertex_elements; + + prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one; + prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched; + prolog_key->unpack_instance_id_from_vertex_id = sctx->prim_discard_cs_instancing; + + /* Prefer a monolithic shader to allow scheduling divisions around + * VBO loads. 
*/ + if (prolog_key->instance_divisor_is_fetched) + key->opt.prefer_mono = 1; + + unsigned count = MIN2(vs->info.num_inputs, elts->count); + unsigned count_mask = (1 << count) - 1; + unsigned fix = elts->fix_fetch_always & count_mask; + unsigned opencode = elts->fix_fetch_opencode & count_mask; + + if (sctx->vertex_buffer_unaligned & elts->vb_alignment_check_mask) { + uint32_t mask = elts->fix_fetch_unaligned & count_mask; + while (mask) { + unsigned i = u_bit_scan(&mask); + unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1); + unsigned vbidx = elts->vertex_buffer_index[i]; + struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbidx]; + unsigned align_mask = (1 << log_hw_load_size) - 1; + if (vb->buffer_offset & align_mask || vb->stride & align_mask) { + fix |= 1 << i; + opencode |= 1 << i; + } + } + } + + while (fix) { + unsigned i = u_bit_scan(&fix); + key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i]; + } + key->mono.vs_fetch_opencode = opencode; } -/* Compute the key for the hw shader variant */ -static inline void si_shader_selector_key(struct pipe_context *ctx, - struct si_shader_selector *sel, - union si_vgt_stages_key stages_key, - struct si_shader_key *key) +static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shader_selector *vs, + struct si_shader_key *key) { - struct si_context *sctx = (struct si_context *)ctx; - - memset(key, 0, sizeof(*key)); - - switch (sel->type) { - case PIPE_SHADER_VERTEX: - si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog); - - if (sctx->tes_shader.cso) - key->as_ls = 1; - else if (sctx->gs_shader.cso) { - key->as_es = 1; - key->as_ngg = stages_key.u.ngg; - } else { - key->as_ngg = stages_key.u.ngg; - si_shader_selector_key_hw_vs(sctx, sel, key); - - if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) - key->mono.u.vs_export_prim_id = 1; - } - break; - case PIPE_SHADER_TESS_CTRL: - if (sctx->chip_class >= GFX9) { - si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, - key, &key->part.tcs.ls_prolog); - key->part.tcs.ls = sctx->vs_shader.cso; - - /* When the LS VGPR fix is needed, monolithic shaders - * can: - * - avoid initializing EXEC in both the LS prolog - * and the LS main part when !vs_needs_prolog - * - remove the fixup for unused input VGPRs - */ - key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix; - - /* The LS output / HS input layout can be communicated - * directly instead of via user SGPRs for merged LS-HS. - * The LS VGPR fix prefers this too. 
- */ - key->opt.prefer_mono = 1; - } - - key->part.tcs.epilog.prim_mode = - sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; - key->part.tcs.epilog.invoc0_tess_factors_are_def = - sel->info.tessfactors_are_def_in_all_invocs; - key->part.tcs.epilog.tes_reads_tess_factors = - sctx->tes_shader.cso->info.reads_tess_factors; - - if (sel == sctx->fixed_func_tcs_shader.cso) - key->mono.u.ff_tcs_inputs_to_copy = sctx->vs_shader.cso->outputs_written; - break; - case PIPE_SHADER_TESS_EVAL: - key->as_ngg = stages_key.u.ngg; - - if (sctx->gs_shader.cso) - key->as_es = 1; - else { - si_shader_selector_key_hw_vs(sctx, sel, key); - - if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) - key->mono.u.vs_export_prim_id = 1; - } - break; - case PIPE_SHADER_GEOMETRY: - if (sctx->chip_class >= GFX9) { - if (sctx->tes_shader.cso) { - key->part.gs.es = sctx->tes_shader.cso; - } else { - si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, - key, &key->part.gs.vs_prolog); - key->part.gs.es = sctx->vs_shader.cso; - key->part.gs.prolog.gfx9_prev_is_vs = 1; - } - - key->as_ngg = stages_key.u.ngg; - - /* Merged ES-GS can have unbalanced wave usage. - * - * ES threads are per-vertex, while GS threads are - * per-primitive. So without any amplification, there - * are fewer GS threads than ES threads, which can result - * in empty (no-op) GS waves. With too much amplification, - * there are more GS threads than ES threads, which - * can result in empty (no-op) ES waves. - * - * Non-monolithic shaders are implemented by setting EXEC - * at the beginning of shader parts, and don't jump to - * the end if EXEC is 0. - * - * Monolithic shaders use conditional blocks, so they can - * jump and skip empty waves of ES or GS. So set this to - * always use optimized variants, which are monolithic. - */ - key->opt.prefer_mono = 1; - } - key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix; - break; - case PIPE_SHADER_FRAGMENT: { - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_state_blend *blend = sctx->queued.named.blend; - - if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && - sel->info.colors_written == 0x1) - key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; - - /* Select the shader color format based on whether - * blending or alpha are needed. - */ - key->part.ps.epilog.spi_shader_col_format = - (blend->blend_enable_4bit & blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_blend_alpha) | - (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_blend) | - (~blend->blend_enable_4bit & blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format_alpha) | - (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & - sctx->framebuffer.spi_shader_col_format); - key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit; - - /* The output for dual source blending should have - * the same format as the first output. - */ - if (blend->dual_src_blend) { - key->part.ps.epilog.spi_shader_col_format |= - (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4; - } - - /* If alpha-to-coverage is enabled, we have to export alpha - * even if there is no color buffer. 
- */ - if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && - blend->alpha_to_coverage) - key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; - - /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs - * to the range supported by the type if a channel has less - * than 16 bits and the export format is 16_ABGR. - */ - if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) { - key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8; - key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10; - } - - /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ - if (!key->part.ps.epilog.last_cbuf) { - key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit; - key->part.ps.epilog.color_is_int8 &= sel->info.colors_written; - key->part.ps.epilog.color_is_int10 &= sel->info.colors_written; - } - - bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); - bool is_line = util_prim_is_lines(sctx->current_rast_prim); - - key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read; - key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.colors_read; - - key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && - rs->multisample_enable; - - key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; - key->part.ps.epilog.poly_line_smoothing = ((is_poly && rs->poly_smooth) || - (is_line && rs->line_smooth)) && - sctx->framebuffer.nr_samples <= 1; - key->part.ps.epilog.clamp_color = rs->clamp_fragment_color; - - if (sctx->ps_iter_samples > 1 && - sel->info.reads_samplemask) { - key->part.ps.prolog.samplemask_log_ps_iter = - util_logbase2(sctx->ps_iter_samples); - } - - if (rs->force_persample_interp && - rs->multisample_enable && - sctx->framebuffer.nr_samples > 1 && - sctx->ps_iter_samples > 1) { - key->part.ps.prolog.force_persp_sample_interp = - sel->info.uses_persp_center || - sel->info.uses_persp_centroid; - - key->part.ps.prolog.force_linear_sample_interp = - sel->info.uses_linear_center || - sel->info.uses_linear_centroid; - } else if (rs->multisample_enable && - sctx->framebuffer.nr_samples > 1) { - key->part.ps.prolog.bc_optimize_for_persp = - sel->info.uses_persp_center && - sel->info.uses_persp_centroid; - key->part.ps.prolog.bc_optimize_for_linear = - sel->info.uses_linear_center && - sel->info.uses_linear_centroid; - } else { - /* Make sure SPI doesn't compute more than 1 pair - * of (i,j), which is the optimization here. */ - key->part.ps.prolog.force_persp_center_interp = - sel->info.uses_persp_center + - sel->info.uses_persp_centroid + - sel->info.uses_persp_sample > 1; - - key->part.ps.prolog.force_linear_center_interp = - sel->info.uses_linear_center + - sel->info.uses_linear_centroid + - sel->info.uses_linear_sample > 1; - - if (sel->info.uses_persp_opcode_interp_sample || - sel->info.uses_linear_opcode_interp_sample) - key->mono.u.ps.interpolate_at_sample_force_center = 1; - } - - key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx); - - /* ps_uses_fbfetch is true only if the color buffer is bound. */ - if (sctx->ps_uses_fbfetch && !sctx->blitter->running) { - struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; - struct pipe_resource *tex = cb0->texture; - - /* 1D textures are allocated and used as 2D on GFX9. 
*/ - key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1; - key->mono.u.ps.fbfetch_is_1D = sctx->chip_class != GFX9 && - (tex->target == PIPE_TEXTURE_1D || - tex->target == PIPE_TEXTURE_1D_ARRAY); - key->mono.u.ps.fbfetch_layered = tex->target == PIPE_TEXTURE_1D_ARRAY || - tex->target == PIPE_TEXTURE_2D_ARRAY || - tex->target == PIPE_TEXTURE_CUBE || - tex->target == PIPE_TEXTURE_CUBE_ARRAY || - tex->target == PIPE_TEXTURE_3D; - } - break; - } - default: - assert(0); - } - - if (unlikely(sctx->screen->debug_flags & DBG(NO_OPT_VARIANT))) - memset(&key->opt, 0, sizeof(key->opt)); + struct si_shader_selector *ps = sctx->ps_shader.cso; + + key->opt.clip_disable = sctx->queued.named.rasterizer->clip_plane_enable == 0 && + (vs->info.clipdist_writemask || vs->info.writes_clipvertex) && + !vs->info.culldist_writemask; + + /* Find out if PS is disabled. */ + bool ps_disabled = true; + if (ps) { + bool ps_modifies_zs = ps->info.uses_kill || ps->info.writes_z || ps->info.writes_stencil || + ps->info.writes_samplemask || + sctx->queued.named.blend->alpha_to_coverage || + si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS; + unsigned ps_colormask = si_get_total_colormask(sctx); + + ps_disabled = sctx->queued.named.rasterizer->rasterizer_discard || + (!ps_colormask && !ps_modifies_zs && !ps->info.writes_memory); + } + + /* Find out which VS outputs aren't used by the PS. */ + uint64_t outputs_written = vs->outputs_written_before_ps; + uint64_t inputs_read = 0; + + /* Ignore outputs that are not passed from VS to PS. */ + outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0, true)) | + (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0, true)) | + (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_CLIPVERTEX, 0, true))); + + if (!ps_disabled) { + inputs_read = ps->inputs_read; + } + + uint64_t linked = outputs_written & inputs_read; + + key->opt.kill_outputs = ~linked & outputs_written; + key->opt.ngg_culling = sctx->ngg_culling; } -static void si_build_shader_variant(struct si_shader *shader, - int thread_index, - bool low_priority) -{ - struct si_shader_selector *sel = shader->selector; - struct si_screen *sscreen = sel->screen; - struct ac_llvm_compiler *compiler; - struct pipe_debug_callback *debug = &shader->compiler_ctx_state.debug; - - if (thread_index >= 0) { - if (low_priority) { - assert(thread_index < ARRAY_SIZE(sscreen->compiler_lowp)); - compiler = &sscreen->compiler_lowp[thread_index]; - } else { - assert(thread_index < ARRAY_SIZE(sscreen->compiler)); - compiler = &sscreen->compiler[thread_index]; - } - if (!debug->async) - debug = NULL; - } else { - assert(!low_priority); - compiler = shader->compiler_ctx_state.compiler; - } - - if (!compiler->passes) - si_init_compiler(sscreen, compiler); - - if (unlikely(!si_create_shader_variant(sscreen, compiler, shader, debug))) { - PRINT_ERR("Failed to build shader variant (type=%u)\n", - sel->type); - shader->compilation_failed = true; - return; - } - - if (shader->compiler_ctx_state.is_debug_context) { - FILE *f = open_memstream(&shader->shader_log, - &shader->shader_log_size); - if (f) { - si_shader_dump(sscreen, shader, NULL, f, false); - fclose(f); - } - } - - si_shader_init_pm4_state(sscreen, shader); +/* Compute the key for the hw shader variant */ +static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel, + union si_vgt_stages_key stages_key, + struct si_shader_key *key) +{ + struct si_context *sctx = (struct si_context *)ctx; + + memset(key, 0, 
sizeof(*key)); + + switch (sel->type) { + case PIPE_SHADER_VERTEX: + si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog); + + if (sctx->tes_shader.cso) + key->as_ls = 1; + else if (sctx->gs_shader.cso) { + key->as_es = 1; + key->as_ngg = stages_key.u.ngg; + } else { + key->as_ngg = stages_key.u.ngg; + si_shader_selector_key_hw_vs(sctx, sel, key); + + if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) + key->mono.u.vs_export_prim_id = 1; + } + break; + case PIPE_SHADER_TESS_CTRL: + if (sctx->chip_class >= GFX9) { + si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, key, &key->part.tcs.ls_prolog); + key->part.tcs.ls = sctx->vs_shader.cso; + + /* When the LS VGPR fix is needed, monolithic shaders + * can: + * - avoid initializing EXEC in both the LS prolog + * and the LS main part when !vs_needs_prolog + * - remove the fixup for unused input VGPRs + */ + key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix; + + /* The LS output / HS input layout can be communicated + * directly instead of via user SGPRs for merged LS-HS. + * The LS VGPR fix prefers this too. + */ + key->opt.prefer_mono = 1; + } + + key->part.tcs.epilog.prim_mode = + sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; + key->part.tcs.epilog.invoc0_tess_factors_are_def = + sel->info.tessfactors_are_def_in_all_invocs; + key->part.tcs.epilog.tes_reads_tess_factors = sctx->tes_shader.cso->info.reads_tess_factors; + + if (sel == sctx->fixed_func_tcs_shader.cso) + key->mono.u.ff_tcs_inputs_to_copy = sctx->vs_shader.cso->outputs_written; + break; + case PIPE_SHADER_TESS_EVAL: + key->as_ngg = stages_key.u.ngg; + + if (sctx->gs_shader.cso) + key->as_es = 1; + else { + si_shader_selector_key_hw_vs(sctx, sel, key); + + if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) + key->mono.u.vs_export_prim_id = 1; + } + break; + case PIPE_SHADER_GEOMETRY: + if (sctx->chip_class >= GFX9) { + if (sctx->tes_shader.cso) { + key->part.gs.es = sctx->tes_shader.cso; + } else { + si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, key, &key->part.gs.vs_prolog); + key->part.gs.es = sctx->vs_shader.cso; + key->part.gs.prolog.gfx9_prev_is_vs = 1; + } + + key->as_ngg = stages_key.u.ngg; + + /* Merged ES-GS can have unbalanced wave usage. + * + * ES threads are per-vertex, while GS threads are + * per-primitive. So without any amplification, there + * are fewer GS threads than ES threads, which can result + * in empty (no-op) GS waves. With too much amplification, + * there are more GS threads than ES threads, which + * can result in empty (no-op) ES waves. + * + * Non-monolithic shaders are implemented by setting EXEC + * at the beginning of shader parts, and don't jump to + * the end if EXEC is 0. + * + * Monolithic shaders use conditional blocks, so they can + * jump and skip empty waves of ES or GS. So set this to + * always use optimized variants, which are monolithic. + */ + key->opt.prefer_mono = 1; + } + key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix; + break; + case PIPE_SHADER_FRAGMENT: { + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_state_blend *blend = sctx->queued.named.blend; + + if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && + sel->info.colors_written == 0x1) + key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; + + /* Select the shader color format based on whether + * blending or alpha are needed. 
+ */ + key->part.ps.epilog.spi_shader_col_format = + (blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend_alpha) | + (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend) | + (~blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_alpha) | + (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format); + key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit; + + /* The output for dual source blending should have + * the same format as the first output. + */ + if (blend->dual_src_blend) { + key->part.ps.epilog.spi_shader_col_format |= + (key->part.ps.epilog.spi_shader_col_format & 0xf) << 4; + } + + /* If alpha-to-coverage is enabled, we have to export alpha + * even if there is no color buffer. + */ + if (!(key->part.ps.epilog.spi_shader_col_format & 0xf) && blend->alpha_to_coverage) + key->part.ps.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; + + /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs + * to the range supported by the type if a channel has less + * than 16 bits and the export format is 16_ABGR. + */ + if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) { + key->part.ps.epilog.color_is_int8 = sctx->framebuffer.color_is_int8; + key->part.ps.epilog.color_is_int10 = sctx->framebuffer.color_is_int10; + } + + /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ + if (!key->part.ps.epilog.last_cbuf) { + key->part.ps.epilog.spi_shader_col_format &= sel->colors_written_4bit; + key->part.ps.epilog.color_is_int8 &= sel->info.colors_written; + key->part.ps.epilog.color_is_int10 &= sel->info.colors_written; + } + + bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); + bool is_line = util_prim_is_lines(sctx->current_rast_prim); + + key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read; + key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.colors_read; + + key->part.ps.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable; + + key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; + key->part.ps.epilog.poly_line_smoothing = + ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) && + sctx->framebuffer.nr_samples <= 1; + key->part.ps.epilog.clamp_color = rs->clamp_fragment_color; + + if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) { + key->part.ps.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples); + } + + if (rs->force_persample_interp && rs->multisample_enable && + sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) { + key->part.ps.prolog.force_persp_sample_interp = + sel->info.uses_persp_center || sel->info.uses_persp_centroid; + + key->part.ps.prolog.force_linear_sample_interp = + sel->info.uses_linear_center || sel->info.uses_linear_centroid; + } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) { + key->part.ps.prolog.bc_optimize_for_persp = + sel->info.uses_persp_center && sel->info.uses_persp_centroid; + key->part.ps.prolog.bc_optimize_for_linear = + sel->info.uses_linear_center && sel->info.uses_linear_centroid; + } else { + /* Make sure SPI doesn't compute more than 1 pair + * of (i,j), which is the optimization here. 
*/ + key->part.ps.prolog.force_persp_center_interp = sel->info.uses_persp_center + + sel->info.uses_persp_centroid + + sel->info.uses_persp_sample > + 1; + + key->part.ps.prolog.force_linear_center_interp = sel->info.uses_linear_center + + sel->info.uses_linear_centroid + + sel->info.uses_linear_sample > + 1; + + if (sel->info.uses_persp_opcode_interp_sample || + sel->info.uses_linear_opcode_interp_sample) + key->mono.u.ps.interpolate_at_sample_force_center = 1; + } + + key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx); + + /* ps_uses_fbfetch is true only if the color buffer is bound. */ + if (sctx->ps_uses_fbfetch && !sctx->blitter->running) { + struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; + struct pipe_resource *tex = cb0->texture; + + /* 1D textures are allocated and used as 2D on GFX9. */ + key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1; + key->mono.u.ps.fbfetch_is_1D = + sctx->chip_class != GFX9 && + (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY); + key->mono.u.ps.fbfetch_layered = + tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY || + tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY || + tex->target == PIPE_TEXTURE_3D; + } + break; + } + default: + assert(0); + } + + if (unlikely(sctx->screen->debug_flags & DBG(NO_OPT_VARIANT))) + memset(&key->opt, 0, sizeof(key->opt)); +} + +static void si_build_shader_variant(struct si_shader *shader, int thread_index, bool low_priority) +{ + struct si_shader_selector *sel = shader->selector; + struct si_screen *sscreen = sel->screen; + struct ac_llvm_compiler *compiler; + struct pipe_debug_callback *debug = &shader->compiler_ctx_state.debug; + + if (thread_index >= 0) { + if (low_priority) { + assert(thread_index < ARRAY_SIZE(sscreen->compiler_lowp)); + compiler = &sscreen->compiler_lowp[thread_index]; + } else { + assert(thread_index < ARRAY_SIZE(sscreen->compiler)); + compiler = &sscreen->compiler[thread_index]; + } + if (!debug->async) + debug = NULL; + } else { + assert(!low_priority); + compiler = shader->compiler_ctx_state.compiler; + } + + if (!compiler->passes) + si_init_compiler(sscreen, compiler); + + if (unlikely(!si_create_shader_variant(sscreen, compiler, shader, debug))) { + PRINT_ERR("Failed to build shader variant (type=%u)\n", sel->type); + shader->compilation_failed = true; + return; + } + + if (shader->compiler_ctx_state.is_debug_context) { + FILE *f = open_memstream(&shader->shader_log, &shader->shader_log_size); + if (f) { + si_shader_dump(sscreen, shader, NULL, f, false); + fclose(f); + } + } + + si_shader_init_pm4_state(sscreen, shader); } static void si_build_shader_variant_low_priority(void *job, int thread_index) { - struct si_shader *shader = (struct si_shader *)job; + struct si_shader *shader = (struct si_shader *)job; - assert(thread_index >= 0); + assert(thread_index >= 0); - si_build_shader_variant(shader, thread_index, true); + si_build_shader_variant(shader, thread_index, true); } static const struct si_shader_key zeroed; -static bool si_check_missing_main_part(struct si_screen *sscreen, - struct si_shader_selector *sel, - struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key) +static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shader_selector *sel, + struct si_compiler_ctx_state *compiler_state, + struct si_shader_key *key) { - struct si_shader **mainp = si_get_main_shader_part(sel, key); - - if (!*mainp) { - struct si_shader *main_part = 
CALLOC_STRUCT(si_shader); - - if (!main_part) - return false; - - /* We can leave the fence as permanently signaled because the - * main part becomes visible globally only after it has been - * compiled. */ - util_queue_fence_init(&main_part->ready); - - main_part->selector = sel; - main_part->key.as_es = key->as_es; - main_part->key.as_ls = key->as_ls; - main_part->key.as_ngg = key->as_ngg; - main_part->is_monolithic = false; - - if (!si_compile_shader(sscreen, compiler_state->compiler, - main_part, &compiler_state->debug)) { - FREE(main_part); - return false; - } - *mainp = main_part; - } - return true; + struct si_shader **mainp = si_get_main_shader_part(sel, key); + + if (!*mainp) { + struct si_shader *main_part = CALLOC_STRUCT(si_shader); + + if (!main_part) + return false; + + /* We can leave the fence as permanently signaled because the + * main part becomes visible globally only after it has been + * compiled. */ + util_queue_fence_init(&main_part->ready); + + main_part->selector = sel; + main_part->key.as_es = key->as_es; + main_part->key.as_ls = key->as_ls; + main_part->key.as_ngg = key->as_ngg; + main_part->is_monolithic = false; + + if (!si_compile_shader(sscreen, compiler_state->compiler, main_part, + &compiler_state->debug)) { + FREE(main_part); + return false; + } + *mainp = main_part; + } + return true; } /** @@ -2277,283 +2106,264 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, * the compilation isn't finished, don't select any * shader and return an error. */ -int si_shader_select_with_key(struct si_screen *sscreen, - struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key, - int thread_index, - bool optimized_or_none) +int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, + struct si_compiler_ctx_state *compiler_state, + struct si_shader_key *key, int thread_index, bool optimized_or_none) { - struct si_shader_selector *sel = state->cso; - struct si_shader_selector *previous_stage_sel = NULL; - struct si_shader *current = state->current; - struct si_shader *iter, *shader = NULL; + struct si_shader_selector *sel = state->cso; + struct si_shader_selector *previous_stage_sel = NULL; + struct si_shader *current = state->current; + struct si_shader *iter, *shader = NULL; again: - /* Check if we don't need to change anything. - * This path is also used for most shaders that don't need multiple - * variants, it will cost just a computation of the key and this - * test. */ - if (likely(current && - memcmp(¤t->key, key, sizeof(*key)) == 0)) { - if (unlikely(!util_queue_fence_is_signalled(¤t->ready))) { - if (current->is_optimized) { - if (optimized_or_none) - return -1; - - memset(&key->opt, 0, sizeof(key->opt)); - goto current_not_ready; - } - - util_queue_fence_wait(¤t->ready); - } - - return current->compilation_failed ? -1 : 0; - } + /* Check if we don't need to change anything. + * This path is also used for most shaders that don't need multiple + * variants, it will cost just a computation of the key and this + * test. */ + if (likely(current && memcmp(¤t->key, key, sizeof(*key)) == 0)) { + if (unlikely(!util_queue_fence_is_signalled(¤t->ready))) { + if (current->is_optimized) { + if (optimized_or_none) + return -1; + + memset(&key->opt, 0, sizeof(key->opt)); + goto current_not_ready; + } + + util_queue_fence_wait(¤t->ready); + } + + return current->compilation_failed ? 
-1 : 0; + } current_not_ready: - /* This must be done before the mutex is locked, because async GS - * compilation calls this function too, and therefore must enter - * the mutex first. - * - * Only wait if we are in a draw call. Don't wait if we are - * in a compiler thread. - */ - if (thread_index < 0) - util_queue_fence_wait(&sel->ready); - - simple_mtx_lock(&sel->mutex); - - /* Find the shader variant. */ - for (iter = sel->first_variant; iter; iter = iter->next_variant) { - /* Don't check the "current" shader. We checked it above. */ - if (current != iter && - memcmp(&iter->key, key, sizeof(*key)) == 0) { - simple_mtx_unlock(&sel->mutex); - - if (unlikely(!util_queue_fence_is_signalled(&iter->ready))) { - /* If it's an optimized shader and its compilation has - * been started but isn't done, use the unoptimized - * shader so as not to cause a stall due to compilation. - */ - if (iter->is_optimized) { - if (optimized_or_none) - return -1; - memset(&key->opt, 0, sizeof(key->opt)); - goto again; - } - - util_queue_fence_wait(&iter->ready); - } - - if (iter->compilation_failed) { - return -1; /* skip the draw call */ - } - - state->current = iter; - return 0; - } - } - - /* Build a new shader. */ - shader = CALLOC_STRUCT(si_shader); - if (!shader) { - simple_mtx_unlock(&sel->mutex); - return -ENOMEM; - } - - util_queue_fence_init(&shader->ready); - - shader->selector = sel; - shader->key = *key; - shader->compiler_ctx_state = *compiler_state; - - /* If this is a merged shader, get the first shader's selector. */ - if (sscreen->info.chip_class >= GFX9) { - if (sel->type == PIPE_SHADER_TESS_CTRL) - previous_stage_sel = key->part.tcs.ls; - else if (sel->type == PIPE_SHADER_GEOMETRY) - previous_stage_sel = key->part.gs.es; - - /* We need to wait for the previous shader. */ - if (previous_stage_sel && thread_index < 0) - util_queue_fence_wait(&previous_stage_sel->ready); - } - - bool is_pure_monolithic = - sscreen->use_monolithic_shaders || - memcmp(&key->mono, &zeroed.mono, sizeof(key->mono)) != 0; - - /* Compile the main shader part if it doesn't exist. This can happen - * if the initial guess was wrong. - * - * The prim discard CS doesn't need the main shader part. - */ - if (!is_pure_monolithic && - !key->opt.vs_as_prim_discard_cs) { - bool ok = true; - - /* Make sure the main shader part is present. This is needed - * for shaders that can be compiled as VS, LS, or ES, and only - * one of them is compiled at creation. - * - * It is also needed for GS, which can be compiled as non-NGG - * and NGG. - * - * For merged shaders, check that the starting shader's main - * part is present. - */ - if (previous_stage_sel) { - struct si_shader_key shader1_key = zeroed; - - if (sel->type == PIPE_SHADER_TESS_CTRL) { - shader1_key.as_ls = 1; - } else if (sel->type == PIPE_SHADER_GEOMETRY) { - shader1_key.as_es = 1; - shader1_key.as_ngg = key->as_ngg; /* for Wave32 vs Wave64 */ - } else { - assert(0); - } - - simple_mtx_lock(&previous_stage_sel->mutex); - ok = si_check_missing_main_part(sscreen, - previous_stage_sel, - compiler_state, &shader1_key); - simple_mtx_unlock(&previous_stage_sel->mutex); - } - - if (ok) { - ok = si_check_missing_main_part(sscreen, sel, - compiler_state, key); - } - - if (!ok) { - FREE(shader); - simple_mtx_unlock(&sel->mutex); - return -ENOMEM; /* skip the draw call */ - } - } - - /* Keep the reference to the 1st shader of merged shaders, so that - * Gallium can't destroy it before we destroy the 2nd shader. 
- * - * Set sctx = NULL, because it's unused if we're not releasing - * the shader, and we don't have any sctx here. - */ - si_shader_selector_reference(NULL, &shader->previous_stage_sel, - previous_stage_sel); - - /* Monolithic-only shaders don't make a distinction between optimized - * and unoptimized. */ - shader->is_monolithic = - is_pure_monolithic || - memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; - - /* The prim discard CS is always optimized. */ - shader->is_optimized = - (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) && - memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; - - /* If it's an optimized shader, compile it asynchronously. */ - if (shader->is_optimized && thread_index < 0) { - /* Compile it asynchronously. */ - util_queue_add_job(&sscreen->shader_compiler_queue_low_priority, - shader, &shader->ready, - si_build_shader_variant_low_priority, NULL, - 0); - - /* Add only after the ready fence was reset, to guard against a - * race with si_bind_XX_shader. */ - if (!sel->last_variant) { - sel->first_variant = shader; - sel->last_variant = shader; - } else { - sel->last_variant->next_variant = shader; - sel->last_variant = shader; - } - - /* Use the default (unoptimized) shader for now. */ - memset(&key->opt, 0, sizeof(key->opt)); - simple_mtx_unlock(&sel->mutex); - - if (sscreen->options.sync_compile) - util_queue_fence_wait(&shader->ready); - - if (optimized_or_none) - return -1; - goto again; - } - - /* Reset the fence before adding to the variant list. */ - util_queue_fence_reset(&shader->ready); - - if (!sel->last_variant) { - sel->first_variant = shader; - sel->last_variant = shader; - } else { - sel->last_variant->next_variant = shader; - sel->last_variant = shader; - } - - simple_mtx_unlock(&sel->mutex); - - assert(!shader->is_optimized); - si_build_shader_variant(shader, thread_index, false); - - util_queue_fence_signal(&shader->ready); - - if (!shader->compilation_failed) - state->current = shader; - - return shader->compilation_failed ? -1 : 0; -} - -static int si_shader_select(struct pipe_context *ctx, - struct si_shader_ctx_state *state, - union si_vgt_stages_key stages_key, - struct si_compiler_ctx_state *compiler_state) -{ - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_key key; - - si_shader_selector_key(ctx, state->cso, stages_key, &key); - return si_shader_select_with_key(sctx->screen, state, compiler_state, - &key, -1, false); -} - -static void si_parse_next_shader_property(const struct si_shader_info *info, - bool streamout, - struct si_shader_key *key) -{ - unsigned next_shader = info->properties[TGSI_PROPERTY_NEXT_SHADER]; - - switch (info->processor) { - case PIPE_SHADER_VERTEX: - switch (next_shader) { - case PIPE_SHADER_GEOMETRY: - key->as_es = 1; - break; - case PIPE_SHADER_TESS_CTRL: - case PIPE_SHADER_TESS_EVAL: - key->as_ls = 1; - break; - default: - /* If POSITION isn't written, it can only be a HW VS - * if streamout is used. If streamout isn't used, - * assume that it's a HW LS. (the next shader is TCS) - * This heuristic is needed for separate shader objects. - */ - if (!info->writes_position && !streamout) - key->as_ls = 1; - } - break; - - case PIPE_SHADER_TESS_EVAL: - if (next_shader == PIPE_SHADER_GEOMETRY || - !info->writes_position) - key->as_es = 1; - break; - } + /* This must be done before the mutex is locked, because async GS + * compilation calls this function too, and therefore must enter + * the mutex first. + * + * Only wait if we are in a draw call. 
Don't wait if we are + * in a compiler thread. + */ + if (thread_index < 0) + util_queue_fence_wait(&sel->ready); + + simple_mtx_lock(&sel->mutex); + + /* Find the shader variant. */ + for (iter = sel->first_variant; iter; iter = iter->next_variant) { + /* Don't check the "current" shader. We checked it above. */ + if (current != iter && memcmp(&iter->key, key, sizeof(*key)) == 0) { + simple_mtx_unlock(&sel->mutex); + + if (unlikely(!util_queue_fence_is_signalled(&iter->ready))) { + /* If it's an optimized shader and its compilation has + * been started but isn't done, use the unoptimized + * shader so as not to cause a stall due to compilation. + */ + if (iter->is_optimized) { + if (optimized_or_none) + return -1; + memset(&key->opt, 0, sizeof(key->opt)); + goto again; + } + + util_queue_fence_wait(&iter->ready); + } + + if (iter->compilation_failed) { + return -1; /* skip the draw call */ + } + + state->current = iter; + return 0; + } + } + + /* Build a new shader. */ + shader = CALLOC_STRUCT(si_shader); + if (!shader) { + simple_mtx_unlock(&sel->mutex); + return -ENOMEM; + } + + util_queue_fence_init(&shader->ready); + + shader->selector = sel; + shader->key = *key; + shader->compiler_ctx_state = *compiler_state; + + /* If this is a merged shader, get the first shader's selector. */ + if (sscreen->info.chip_class >= GFX9) { + if (sel->type == PIPE_SHADER_TESS_CTRL) + previous_stage_sel = key->part.tcs.ls; + else if (sel->type == PIPE_SHADER_GEOMETRY) + previous_stage_sel = key->part.gs.es; + + /* We need to wait for the previous shader. */ + if (previous_stage_sel && thread_index < 0) + util_queue_fence_wait(&previous_stage_sel->ready); + } + + bool is_pure_monolithic = + sscreen->use_monolithic_shaders || memcmp(&key->mono, &zeroed.mono, sizeof(key->mono)) != 0; + + /* Compile the main shader part if it doesn't exist. This can happen + * if the initial guess was wrong. + * + * The prim discard CS doesn't need the main shader part. + */ + if (!is_pure_monolithic && !key->opt.vs_as_prim_discard_cs) { + bool ok = true; + + /* Make sure the main shader part is present. This is needed + * for shaders that can be compiled as VS, LS, or ES, and only + * one of them is compiled at creation. + * + * It is also needed for GS, which can be compiled as non-NGG + * and NGG. + * + * For merged shaders, check that the starting shader's main + * part is present. + */ + if (previous_stage_sel) { + struct si_shader_key shader1_key = zeroed; + + if (sel->type == PIPE_SHADER_TESS_CTRL) { + shader1_key.as_ls = 1; + } else if (sel->type == PIPE_SHADER_GEOMETRY) { + shader1_key.as_es = 1; + shader1_key.as_ngg = key->as_ngg; /* for Wave32 vs Wave64 */ + } else { + assert(0); + } + + simple_mtx_lock(&previous_stage_sel->mutex); + ok = si_check_missing_main_part(sscreen, previous_stage_sel, compiler_state, &shader1_key); + simple_mtx_unlock(&previous_stage_sel->mutex); + } + + if (ok) { + ok = si_check_missing_main_part(sscreen, sel, compiler_state, key); + } + + if (!ok) { + FREE(shader); + simple_mtx_unlock(&sel->mutex); + return -ENOMEM; /* skip the draw call */ + } + } + + /* Keep the reference to the 1st shader of merged shaders, so that + * Gallium can't destroy it before we destroy the 2nd shader. + * + * Set sctx = NULL, because it's unused if we're not releasing + * the shader, and we don't have any sctx here. + */ + si_shader_selector_reference(NULL, &shader->previous_stage_sel, previous_stage_sel); + + /* Monolithic-only shaders don't make a distinction between optimized + * and unoptimized. 
*/ + shader->is_monolithic = + is_pure_monolithic || memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; + + /* The prim discard CS is always optimized. */ + shader->is_optimized = (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) && + memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; + + /* If it's an optimized shader, compile it asynchronously. */ + if (shader->is_optimized && thread_index < 0) { + /* Compile it asynchronously. */ + util_queue_add_job(&sscreen->shader_compiler_queue_low_priority, shader, &shader->ready, + si_build_shader_variant_low_priority, NULL, 0); + + /* Add only after the ready fence was reset, to guard against a + * race with si_bind_XX_shader. */ + if (!sel->last_variant) { + sel->first_variant = shader; + sel->last_variant = shader; + } else { + sel->last_variant->next_variant = shader; + sel->last_variant = shader; + } + + /* Use the default (unoptimized) shader for now. */ + memset(&key->opt, 0, sizeof(key->opt)); + simple_mtx_unlock(&sel->mutex); + + if (sscreen->options.sync_compile) + util_queue_fence_wait(&shader->ready); + + if (optimized_or_none) + return -1; + goto again; + } + + /* Reset the fence before adding to the variant list. */ + util_queue_fence_reset(&shader->ready); + + if (!sel->last_variant) { + sel->first_variant = shader; + sel->last_variant = shader; + } else { + sel->last_variant->next_variant = shader; + sel->last_variant = shader; + } + + simple_mtx_unlock(&sel->mutex); + + assert(!shader->is_optimized); + si_build_shader_variant(shader, thread_index, false); + + util_queue_fence_signal(&shader->ready); + + if (!shader->compilation_failed) + state->current = shader; + + return shader->compilation_failed ? -1 : 0; +} + +static int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state, + union si_vgt_stages_key stages_key, + struct si_compiler_ctx_state *compiler_state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_key key; + + si_shader_selector_key(ctx, state->cso, stages_key, &key); + return si_shader_select_with_key(sctx->screen, state, compiler_state, &key, -1, false); +} + +static void si_parse_next_shader_property(const struct si_shader_info *info, bool streamout, + struct si_shader_key *key) +{ + unsigned next_shader = info->properties[TGSI_PROPERTY_NEXT_SHADER]; + + switch (info->processor) { + case PIPE_SHADER_VERTEX: + switch (next_shader) { + case PIPE_SHADER_GEOMETRY: + key->as_es = 1; + break; + case PIPE_SHADER_TESS_CTRL: + case PIPE_SHADER_TESS_EVAL: + key->as_ls = 1; + break; + default: + /* If POSITION isn't written, it can only be a HW VS + * if streamout is used. If streamout isn't used, + * assume that it's a HW LS. (the next shader is TCS) + * This heuristic is needed for separate shader objects. 
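The selection logic above reduces to a find-or-create keyed on a memcmp() of the whole key struct: check the currently bound variant first, then walk the selector's list, and only then allocate and compile a new one. A minimal self-contained sketch of that pattern, using illustrative struct names rather than the driver's types (si_shader_key, si_shader, si_shader_selector), with the mutex and ready-fence handling elided:

#include <stdlib.h>
#include <string.h>

/* Illustrative stand-ins for the driver's key/variant/selector types. */
struct variant_key { unsigned as_ls, as_es, as_ngg, opt_flags; };

struct variant {
   struct variant_key key;
   struct variant *next;
};

struct selector {
   struct variant *first;
};

/* Return an existing variant whose key matches, or create one
 * (compilation itself is elided). */
static struct variant *select_variant(struct selector *sel, struct variant *current,
                                      const struct variant_key *key)
{
   /* Fast path: the currently bound variant already matches the key. */
   if (current && memcmp(&current->key, key, sizeof(*key)) == 0)
      return current;

   /* Otherwise search the selector's variant list. */
   for (struct variant *v = sel->first; v; v = v->next)
      if (memcmp(&v->key, key, sizeof(*key)) == 0)
         return v;

   /* Miss: allocate and link a new variant. */
   struct variant *v = calloc(1, sizeof(*v));
   if (!v)
      return NULL;
   v->key = *key;
   v->next = sel->first;
   sel->first = v;
   return v;
}

The real function additionally clears key->opt and retries when an optimized variant is still being compiled, so a draw call never stalls on the low-priority compile queue.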
+ */ + if (!info->writes_position && !streamout) + key->as_ls = 1; + } + break; + + case PIPE_SHADER_TESS_EVAL: + if (next_shader == PIPE_SHADER_GEOMETRY || !info->writes_position) + key->as_es = 1; + break; + } } /** @@ -2563,971 +2373,904 @@ static void si_parse_next_shader_property(const struct si_shader_info *info, */ static void si_init_shader_selector_async(void *job, int thread_index) { - struct si_shader_selector *sel = (struct si_shader_selector *)job; - struct si_screen *sscreen = sel->screen; - struct ac_llvm_compiler *compiler; - struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug; - - assert(!debug->debug_message || debug->async); - assert(thread_index >= 0); - assert(thread_index < ARRAY_SIZE(sscreen->compiler)); - compiler = &sscreen->compiler[thread_index]; - - if (!compiler->passes) - si_init_compiler(sscreen, compiler); - - /* Serialize NIR to save memory. Monolithic shader variants - * have to deserialize NIR before compilation. - */ - if (sel->nir) { - struct blob blob; - size_t size; - - blob_init(&blob); - /* true = remove optional debugging data to increase - * the likehood of getting more shader cache hits. - * It also drops variable names, so we'll save more memory. - */ - nir_serialize(&blob, sel->nir, true); - blob_finish_get_buffer(&blob, &sel->nir_binary, &size); - sel->nir_size = size; - } - - /* Compile the main shader part for use with a prolog and/or epilog. - * If this fails, the driver will try to compile a monolithic shader - * on demand. - */ - if (!sscreen->use_monolithic_shaders) { - struct si_shader *shader = CALLOC_STRUCT(si_shader); - unsigned char ir_sha1_cache_key[20]; - - if (!shader) { - fprintf(stderr, "radeonsi: can't allocate a main shader part\n"); - return; - } - - /* We can leave the fence signaled because use of the default - * main part is guarded by the selector's ready fence. */ - util_queue_fence_init(&shader->ready); - - shader->selector = sel; - shader->is_monolithic = false; - si_parse_next_shader_property(&sel->info, - sel->so.num_outputs != 0, - &shader->key); - - if (sscreen->use_ngg && - (!sel->so.num_outputs || sscreen->use_ngg_streamout) && - ((sel->type == PIPE_SHADER_VERTEX && !shader->key.as_ls) || - sel->type == PIPE_SHADER_TESS_EVAL || - sel->type == PIPE_SHADER_GEOMETRY)) - shader->key.as_ngg = 1; - - if (sel->nir) { - si_get_ir_cache_key(sel, shader->key.as_ngg, - shader->key.as_es, ir_sha1_cache_key); - } - - /* Try to load the shader from the shader cache. */ - simple_mtx_lock(&sscreen->shader_cache_mutex); - - if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) { - simple_mtx_unlock(&sscreen->shader_cache_mutex); - si_shader_dump_stats_for_shader_db(sscreen, shader, debug); - } else { - simple_mtx_unlock(&sscreen->shader_cache_mutex); - - /* Compile the shader if it hasn't been loaded from the cache. */ - if (!si_compile_shader(sscreen, compiler, shader, debug)) { - FREE(shader); - fprintf(stderr, "radeonsi: can't compile a main shader part\n"); - return; - } - - simple_mtx_lock(&sscreen->shader_cache_mutex); - si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, - shader, true); - simple_mtx_unlock(&sscreen->shader_cache_mutex); - } - - *si_get_main_shader_part(sel, &shader->key) = shader; - - /* Unset "outputs_written" flags for outputs converted to - * DEFAULT_VAL, so that later inter-shader optimizations don't - * try to eliminate outputs that don't exist in the final - * shader. - * - * This is only done if non-monolithic shaders are enabled. 
- */ - if ((sel->type == PIPE_SHADER_VERTEX || - sel->type == PIPE_SHADER_TESS_EVAL) && - !shader->key.as_ls && - !shader->key.as_es) { - unsigned i; - - for (i = 0; i < sel->info.num_outputs; i++) { - unsigned offset = shader->info.vs_output_param_offset[i]; - - if (offset <= AC_EXP_PARAM_OFFSET_31) - continue; - - unsigned name = sel->info.output_semantic_name[i]; - unsigned index = sel->info.output_semantic_index[i]; - unsigned id; - - switch (name) { - case TGSI_SEMANTIC_GENERIC: - /* don't process indices the function can't handle */ - if (index >= SI_MAX_IO_GENERIC) - break; - /* fall through */ - default: - id = si_shader_io_get_unique_index(name, index, true); - sel->outputs_written_before_ps &= ~(1ull << id); - break; - case TGSI_SEMANTIC_POSITION: /* ignore these */ - case TGSI_SEMANTIC_PSIZE: - case TGSI_SEMANTIC_CLIPVERTEX: - case TGSI_SEMANTIC_EDGEFLAG: - break; - } - } - } - } - - /* The GS copy shader is always pre-compiled. */ - if (sel->type == PIPE_SHADER_GEOMETRY && - (!sscreen->use_ngg || - !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */ - sel->tess_turns_off_ngg)) { - sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); - if (!sel->gs_copy_shader) { - fprintf(stderr, "radeonsi: can't create GS copy shader\n"); - return; - } - - si_shader_vs(sscreen, sel->gs_copy_shader, sel); - } - - /* Free NIR. We only keep serialized NIR after this point. */ - if (sel->nir) { - ralloc_free(sel->nir); - sel->nir = NULL; - } + struct si_shader_selector *sel = (struct si_shader_selector *)job; + struct si_screen *sscreen = sel->screen; + struct ac_llvm_compiler *compiler; + struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug; + + assert(!debug->debug_message || debug->async); + assert(thread_index >= 0); + assert(thread_index < ARRAY_SIZE(sscreen->compiler)); + compiler = &sscreen->compiler[thread_index]; + + if (!compiler->passes) + si_init_compiler(sscreen, compiler); + + /* Serialize NIR to save memory. Monolithic shader variants + * have to deserialize NIR before compilation. + */ + if (sel->nir) { + struct blob blob; + size_t size; + + blob_init(&blob); + /* true = remove optional debugging data to increase + * the likehood of getting more shader cache hits. + * It also drops variable names, so we'll save more memory. + */ + nir_serialize(&blob, sel->nir, true); + blob_finish_get_buffer(&blob, &sel->nir_binary, &size); + sel->nir_size = size; + } + + /* Compile the main shader part for use with a prolog and/or epilog. + * If this fails, the driver will try to compile a monolithic shader + * on demand. + */ + if (!sscreen->use_monolithic_shaders) { + struct si_shader *shader = CALLOC_STRUCT(si_shader); + unsigned char ir_sha1_cache_key[20]; + + if (!shader) { + fprintf(stderr, "radeonsi: can't allocate a main shader part\n"); + return; + } + + /* We can leave the fence signaled because use of the default + * main part is guarded by the selector's ready fence. 
*/ + util_queue_fence_init(&shader->ready); + + shader->selector = sel; + shader->is_monolithic = false; + si_parse_next_shader_property(&sel->info, sel->so.num_outputs != 0, &shader->key); + + if (sscreen->use_ngg && (!sel->so.num_outputs || sscreen->use_ngg_streamout) && + ((sel->type == PIPE_SHADER_VERTEX && !shader->key.as_ls) || + sel->type == PIPE_SHADER_TESS_EVAL || sel->type == PIPE_SHADER_GEOMETRY)) + shader->key.as_ngg = 1; + + if (sel->nir) { + si_get_ir_cache_key(sel, shader->key.as_ngg, shader->key.as_es, ir_sha1_cache_key); + } + + /* Try to load the shader from the shader cache. */ + simple_mtx_lock(&sscreen->shader_cache_mutex); + + if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) { + simple_mtx_unlock(&sscreen->shader_cache_mutex); + si_shader_dump_stats_for_shader_db(sscreen, shader, debug); + } else { + simple_mtx_unlock(&sscreen->shader_cache_mutex); + + /* Compile the shader if it hasn't been loaded from the cache. */ + if (!si_compile_shader(sscreen, compiler, shader, debug)) { + FREE(shader); + fprintf(stderr, "radeonsi: can't compile a main shader part\n"); + return; + } + + simple_mtx_lock(&sscreen->shader_cache_mutex); + si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, true); + simple_mtx_unlock(&sscreen->shader_cache_mutex); + } + + *si_get_main_shader_part(sel, &shader->key) = shader; + + /* Unset "outputs_written" flags for outputs converted to + * DEFAULT_VAL, so that later inter-shader optimizations don't + * try to eliminate outputs that don't exist in the final + * shader. + * + * This is only done if non-monolithic shaders are enabled. + */ + if ((sel->type == PIPE_SHADER_VERTEX || sel->type == PIPE_SHADER_TESS_EVAL) && + !shader->key.as_ls && !shader->key.as_es) { + unsigned i; + + for (i = 0; i < sel->info.num_outputs; i++) { + unsigned offset = shader->info.vs_output_param_offset[i]; + + if (offset <= AC_EXP_PARAM_OFFSET_31) + continue; + + unsigned name = sel->info.output_semantic_name[i]; + unsigned index = sel->info.output_semantic_index[i]; + unsigned id; + + switch (name) { + case TGSI_SEMANTIC_GENERIC: + /* don't process indices the function can't handle */ + if (index >= SI_MAX_IO_GENERIC) + break; + /* fall through */ + default: + id = si_shader_io_get_unique_index(name, index, true); + sel->outputs_written_before_ps &= ~(1ull << id); + break; + case TGSI_SEMANTIC_POSITION: /* ignore these */ + case TGSI_SEMANTIC_PSIZE: + case TGSI_SEMANTIC_CLIPVERTEX: + case TGSI_SEMANTIC_EDGEFLAG: + break; + } + } + } + } + + /* The GS copy shader is always pre-compiled. */ + if (sel->type == PIPE_SHADER_GEOMETRY && + (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */ + sel->tess_turns_off_ngg)) { + sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); + if (!sel->gs_copy_shader) { + fprintf(stderr, "radeonsi: can't create GS copy shader\n"); + return; + } + + si_shader_vs(sscreen, sel->gs_copy_shader, sel); + } + + /* Free NIR. We only keep serialized NIR after this point. 
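The cache handling above follows a common pattern: take the cache mutex only around the lookup and the insert, and run the expensive compile with the lock dropped so other threads are not serialized behind it. A self-contained sketch of that flow with a toy array cache and a pthread mutex; none of these names are the driver's:

#include <pthread.h>
#include <stdbool.h>
#include <string.h>

#define KEY_SIZE    20 /* same size as the ir_sha1_cache_key above */
#define MAX_ENTRIES 64

static pthread_mutex_t cache_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct { unsigned char key[KEY_SIZE]; int binary; bool used; } cache[MAX_ENTRIES];

static bool cache_lookup(const unsigned char *key, int *binary)
{
   for (int i = 0; i < MAX_ENTRIES; i++) {
      if (cache[i].used && memcmp(cache[i].key, key, KEY_SIZE) == 0) {
         *binary = cache[i].binary;
         return true;
      }
   }
   return false;
}

static void cache_insert(const unsigned char *key, int binary)
{
   for (int i = 0; i < MAX_ENTRIES; i++) {
      if (!cache[i].used) {
         memcpy(cache[i].key, key, KEY_SIZE);
         cache[i].binary = binary;
         cache[i].used = true;
         return;
      }
   }
}

static int get_or_compile(const unsigned char *key)
{
   int binary;

   pthread_mutex_lock(&cache_mutex);
   bool hit = cache_lookup(key, &binary);
   pthread_mutex_unlock(&cache_mutex);
   if (hit)
      return binary; /* cache hit: no compilation needed */

   binary = 42; /* stand-in for the expensive compile step */

   pthread_mutex_lock(&cache_mutex);
   cache_insert(key, binary); /* two threads may both miss and compile; redundant but harmless */
   pthread_mutex_unlock(&cache_mutex);
   return binary;
}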
*/ + if (sel->nir) { + ralloc_free(sel->nir); + sel->nir = NULL; + } } void si_schedule_initial_compile(struct si_context *sctx, unsigned processor, - struct util_queue_fence *ready_fence, - struct si_compiler_ctx_state *compiler_ctx_state, - void *job, util_queue_execute_func execute) + struct util_queue_fence *ready_fence, + struct si_compiler_ctx_state *compiler_ctx_state, void *job, + util_queue_execute_func execute) { - util_queue_fence_init(ready_fence); - - struct util_async_debug_callback async_debug; - bool debug = - (sctx->debug.debug_message && !sctx->debug.async) || - sctx->is_debug || - si_can_dump_shader(sctx->screen, processor); - - if (debug) { - u_async_debug_init(&async_debug); - compiler_ctx_state->debug = async_debug.base; - } - - util_queue_add_job(&sctx->screen->shader_compiler_queue, job, - ready_fence, execute, NULL, 0); - - if (debug) { - util_queue_fence_wait(ready_fence); - u_async_debug_drain(&async_debug, &sctx->debug); - u_async_debug_cleanup(&async_debug); - } - - if (sctx->screen->options.sync_compile) - util_queue_fence_wait(ready_fence); + util_queue_fence_init(ready_fence); + + struct util_async_debug_callback async_debug; + bool debug = (sctx->debug.debug_message && !sctx->debug.async) || sctx->is_debug || + si_can_dump_shader(sctx->screen, processor); + + if (debug) { + u_async_debug_init(&async_debug); + compiler_ctx_state->debug = async_debug.base; + } + + util_queue_add_job(&sctx->screen->shader_compiler_queue, job, ready_fence, execute, NULL, 0); + + if (debug) { + util_queue_fence_wait(ready_fence); + u_async_debug_drain(&async_debug, &sctx->debug); + u_async_debug_cleanup(&async_debug); + } + + if (sctx->screen->options.sync_compile) + util_queue_fence_wait(ready_fence); } /* Return descriptor slot usage masks from the given shader info. */ -void si_get_active_slot_masks(const struct si_shader_info *info, - uint32_t *const_and_shader_buffers, - uint64_t *samplers_and_images) -{ - unsigned start, num_shaderbufs, num_constbufs, num_images, num_msaa_images, num_samplers; - - num_shaderbufs = util_last_bit(info->shader_buffers_declared); - num_constbufs = util_last_bit(info->const_buffers_declared); - /* two 8-byte images share one 16-byte slot */ - num_images = align(util_last_bit(info->images_declared), 2); - num_msaa_images = align(util_last_bit(info->msaa_images_declared), 2); - num_samplers = util_last_bit(info->samplers_declared); - - /* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */ - start = si_get_shaderbuf_slot(num_shaderbufs - 1); - *const_and_shader_buffers = - u_bit_consecutive(start, num_shaderbufs + num_constbufs); - - /* The layout is: - * - fmask[last] ... fmask[0] go to [15-last .. 15] - * - image[last] ... image[0] go to [31-last .. 31] - * - sampler[0] ... sampler[last] go to [32 .. 32+last*2] - * - * FMASKs for images are placed separately, because MSAA images are rare, - * and so we can benefit from a better cache hit rate if we keep image - * descriptors together. 
- */ - if (num_msaa_images) - num_images = SI_NUM_IMAGES + num_msaa_images; /* add FMASK descriptors */ - - start = si_get_image_slot(num_images - 1) / 2; - *samplers_and_images = - u_bit_consecutive64(start, num_images / 2 + num_samplers); +void si_get_active_slot_masks(const struct si_shader_info *info, uint32_t *const_and_shader_buffers, + uint64_t *samplers_and_images) +{ + unsigned start, num_shaderbufs, num_constbufs, num_images, num_msaa_images, num_samplers; + + num_shaderbufs = util_last_bit(info->shader_buffers_declared); + num_constbufs = util_last_bit(info->const_buffers_declared); + /* two 8-byte images share one 16-byte slot */ + num_images = align(util_last_bit(info->images_declared), 2); + num_msaa_images = align(util_last_bit(info->msaa_images_declared), 2); + num_samplers = util_last_bit(info->samplers_declared); + + /* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */ + start = si_get_shaderbuf_slot(num_shaderbufs - 1); + *const_and_shader_buffers = u_bit_consecutive(start, num_shaderbufs + num_constbufs); + + /* The layout is: + * - fmask[last] ... fmask[0] go to [15-last .. 15] + * - image[last] ... image[0] go to [31-last .. 31] + * - sampler[0] ... sampler[last] go to [32 .. 32+last*2] + * + * FMASKs for images are placed separately, because MSAA images are rare, + * and so we can benefit from a better cache hit rate if we keep image + * descriptors together. + */ + if (num_msaa_images) + num_images = SI_NUM_IMAGES + num_msaa_images; /* add FMASK descriptors */ + + start = si_get_image_slot(num_images - 1) / 2; + *samplers_and_images = u_bit_consecutive64(start, num_images / 2 + num_samplers); } static void *si_create_shader_selector(struct pipe_context *ctx, - const struct pipe_shader_state *state) -{ - struct si_screen *sscreen = (struct si_screen *)ctx->screen; - struct si_context *sctx = (struct si_context*)ctx; - struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector); - int i; - - if (!sel) - return NULL; - - sel->screen = sscreen; - sel->compiler_ctx_state.debug = sctx->debug; - sel->compiler_ctx_state.is_debug_context = sctx->is_debug; - - sel->so = state->stream_output; - - if (state->type == PIPE_SHADER_IR_TGSI) { - sel->nir = tgsi_to_nir(state->tokens, ctx->screen); - } else { - assert(state->type == PIPE_SHADER_IR_NIR); - sel->nir = state->ir.nir; - } - - si_nir_scan_shader(sel->nir, &sel->info); - si_nir_adjust_driver_locations(sel->nir); - - sel->type = sel->info.processor; - p_atomic_inc(&sscreen->num_shaders_created); - si_get_active_slot_masks(&sel->info, - &sel->active_const_and_shader_buffers, - &sel->active_samplers_and_images); - - /* Record which streamout buffers are enabled. */ - for (i = 0; i < sel->so.num_outputs; i++) { - sel->enabled_streamout_buffer_mask |= - (1 << sel->so.output[i].output_buffer) << - (sel->so.output[i].stream * 4); - } - - sel->num_vs_inputs = sel->type == PIPE_SHADER_VERTEX && - !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] ? - sel->info.num_inputs : 0; - sel->num_vbos_in_user_sgprs = - MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs); - - /* The prolog is a no-op if there are no inputs. 
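si_get_active_slot_masks above just turns "how many of each resource type is declared" into runs of consecutive bits at fixed slot ranges. A standalone illustration of that bit arithmetic; bit_consecutive() mirrors u_bit_consecutive(), and the slot numbers are invented for the example rather than the driver's real layout:

#include <stdint.h>
#include <stdio.h>

/* Set `count` consecutive bits starting at `start`. */
static uint32_t bit_consecutive(unsigned start, unsigned count)
{
   return (count >= 32 ? 0xffffffffu : ((1u << count) - 1u)) << start;
}

int main(void)
{
   /* Example: 2 shader buffers and 3 const buffers. Assume, for the sake of
    * the illustration, that shader-buffer slots count down and end at 15,
    * with const-buffer slots following at 16, 17, ... */
   unsigned num_shaderbufs = 2, num_constbufs = 3;
   unsigned start = 16 - num_shaderbufs; /* slot of sb[num_shaderbufs - 1] */

   uint32_t mask = bit_consecutive(start, num_shaderbufs + num_constbufs);
   printf("const_and_shader_buffers = 0x%08x\n", mask); /* prints 0x0007c000 */
   return 0;
}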
*/ - sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX && - sel->info.num_inputs && - !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; - - sel->prim_discard_cs_allowed = - sel->type == PIPE_SHADER_VERTEX && - !sel->info.uses_bindless_images && - !sel->info.uses_bindless_samplers && - !sel->info.writes_memory && - !sel->info.writes_viewport_index && - !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] && - !sel->so.num_outputs; - - switch (sel->type) { - case PIPE_SHADER_GEOMETRY: - sel->gs_output_prim = - sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM]; - - /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ - sel->rast_prim = sel->gs_output_prim; - if (util_rast_prim_is_triangles(sel->rast_prim)) - sel->rast_prim = PIPE_PRIM_TRIANGLES; - - sel->gs_max_out_vertices = - sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES]; - sel->gs_num_invocations = - sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS]; - sel->gsvs_vertex_size = sel->info.num_outputs * 16; - sel->max_gsvs_emit_size = sel->gsvs_vertex_size * - sel->gs_max_out_vertices; - - sel->max_gs_stream = 0; - for (i = 0; i < sel->so.num_outputs; i++) - sel->max_gs_stream = MAX2(sel->max_gs_stream, - sel->so.output[i].stream); - - sel->gs_input_verts_per_prim = - u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]); - - /* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tesselation. */ - sel->tess_turns_off_ngg = - sscreen->info.chip_class == GFX10 && - sel->gs_num_invocations * sel->gs_max_out_vertices > 256; - break; - - case PIPE_SHADER_TESS_CTRL: - /* Always reserve space for these. */ - sel->patch_outputs_written |= - (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0)) | - (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0)); - /* fall through */ - case PIPE_SHADER_VERTEX: - case PIPE_SHADER_TESS_EVAL: - for (i = 0; i < sel->info.num_outputs; i++) { - unsigned name = sel->info.output_semantic_name[i]; - unsigned index = sel->info.output_semantic_index[i]; - - switch (name) { - case TGSI_SEMANTIC_TESSINNER: - case TGSI_SEMANTIC_TESSOUTER: - case TGSI_SEMANTIC_PATCH: - sel->patch_outputs_written |= - 1ull << si_shader_io_get_unique_index_patch(name, index); - break; - - case TGSI_SEMANTIC_GENERIC: - /* don't process indices the function can't handle */ - if (index >= SI_MAX_IO_GENERIC) - break; - /* fall through */ - default: - sel->outputs_written |= - 1ull << si_shader_io_get_unique_index(name, index, false); - sel->outputs_written_before_ps |= - 1ull << si_shader_io_get_unique_index(name, index, true); - break; - case TGSI_SEMANTIC_EDGEFLAG: - break; - } - } - sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16; - sel->lshs_vertex_stride = sel->esgs_itemsize; - - /* Add 1 dword to reduce LDS bank conflicts, so that each vertex - * will start on a different bank. (except for the maximum 32*16). - */ - if (sel->lshs_vertex_stride < 32*16) - sel->lshs_vertex_stride += 4; - - /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank - * conflicts, i.e. each vertex will start at a different bank. 
- */ - if (sctx->chip_class >= GFX9) - sel->esgs_itemsize += 4; - - assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0); - - /* Only for TES: */ - if (sel->info.properties[TGSI_PROPERTY_TES_POINT_MODE]) - sel->rast_prim = PIPE_PRIM_POINTS; - else if (sel->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES) - sel->rast_prim = PIPE_PRIM_LINE_STRIP; - else - sel->rast_prim = PIPE_PRIM_TRIANGLES; - break; - - case PIPE_SHADER_FRAGMENT: - for (i = 0; i < sel->info.num_inputs; i++) { - unsigned name = sel->info.input_semantic_name[i]; - unsigned index = sel->info.input_semantic_index[i]; - - switch (name) { - case TGSI_SEMANTIC_GENERIC: - /* don't process indices the function can't handle */ - if (index >= SI_MAX_IO_GENERIC) - break; - /* fall through */ - default: - sel->inputs_read |= - 1ull << si_shader_io_get_unique_index(name, index, true); - break; - case TGSI_SEMANTIC_PCOORD: /* ignore this */ - break; - } - } - - for (i = 0; i < 8; i++) - if (sel->info.colors_written & (1 << i)) - sel->colors_written_4bit |= 0xf << (4 * i); - - for (i = 0; i < sel->info.num_inputs; i++) { - if (sel->info.input_semantic_name[i] == TGSI_SEMANTIC_COLOR) { - int index = sel->info.input_semantic_index[i]; - sel->color_attr_index[index] = i; - } - } - break; - default:; - } - - sel->ngg_culling_allowed = - sscreen->info.chip_class == GFX10 && - sscreen->info.has_dedicated_vram && - sscreen->use_ngg_culling && - /* Disallow TES by default, because TessMark results are mixed. */ - (sel->type == PIPE_SHADER_VERTEX || - (sscreen->always_use_ngg_culling && sel->type == PIPE_SHADER_TESS_EVAL)) && - sel->info.writes_position && - !sel->info.writes_viewport_index && /* cull only against viewport 0 */ - !sel->info.writes_memory && - !sel->so.num_outputs && - !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] && - !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; - - /* PA_CL_VS_OUT_CNTL */ - if (sctx->chip_class <= GFX9) - sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, false); - - sel->clipdist_mask = sel->info.writes_clipvertex ? - SIX_BITS : sel->info.clipdist_writemask; - sel->culldist_mask = sel->info.culldist_writemask << - sel->info.num_written_clipdistance; - - /* DB_SHADER_CONTROL */ - sel->db_shader_control = - S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) | - S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) | - S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) | - S_02880C_KILL_ENABLE(sel->info.uses_kill); - - switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) { - case TGSI_FS_DEPTH_LAYOUT_GREATER: - sel->db_shader_control |= - S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z); - break; - case TGSI_FS_DEPTH_LAYOUT_LESS: - sel->db_shader_control |= - S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z); - break; - } - - /* Z_ORDER, EXEC_ON_HIER_FAIL and EXEC_ON_NOOP should be set as following: - * - * | early Z/S | writes_mem | allow_ReZ? | Z_ORDER | EXEC_ON_HIER_FAIL | EXEC_ON_NOOP - * --|-----------|------------|------------|--------------------|-------------------|------------- - * 1a| false | false | true | EarlyZ_Then_ReZ | 0 | 0 - * 1b| false | false | false | EarlyZ_Then_LateZ | 0 | 0 - * 2 | false | true | n/a | LateZ | 1 | 0 - * 3 | true | false | n/a | EarlyZ_Then_LateZ | 0 | 0 - * 4 | true | true | n/a | EarlyZ_Then_LateZ | 0 | 1 - * - * In cases 3 and 4, HW will force Z_ORDER to EarlyZ regardless of what's set in the register. - * In case 2, NOOP_CULL is a don't care field. 
In case 2, 3 and 4, ReZ doesn't make sense. - * - * Don't use ReZ without profiling !!! - * - * ReZ decreases performance by 15% in DiRT: Showdown on Ultra settings, which has pretty complex - * shaders. - */ - if (sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]) { - /* Cases 3, 4. */ - sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1) | - S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) | - S_02880C_EXEC_ON_NOOP(sel->info.writes_memory); - } else if (sel->info.writes_memory) { - /* Case 2. */ - sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z) | - S_02880C_EXEC_ON_HIER_FAIL(1); - } else { - /* Case 1. */ - sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z); - } - - if (sel->info.properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE]) - sel->db_shader_control |= S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(1); - - (void) simple_mtx_init(&sel->mutex, mtx_plain); - - si_schedule_initial_compile(sctx, sel->info.processor, &sel->ready, - &sel->compiler_ctx_state, sel, - si_init_shader_selector_async); - return sel; -} - -static void *si_create_shader(struct pipe_context *ctx, - const struct pipe_shader_state *state) -{ - struct si_screen *sscreen = (struct si_screen *)ctx->screen; - - return util_live_shader_cache_get(ctx, &sscreen->live_shader_cache, state); + const struct pipe_shader_state *state) +{ + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector); + int i; + + if (!sel) + return NULL; + + sel->screen = sscreen; + sel->compiler_ctx_state.debug = sctx->debug; + sel->compiler_ctx_state.is_debug_context = sctx->is_debug; + + sel->so = state->stream_output; + + if (state->type == PIPE_SHADER_IR_TGSI) { + sel->nir = tgsi_to_nir(state->tokens, ctx->screen); + } else { + assert(state->type == PIPE_SHADER_IR_NIR); + sel->nir = state->ir.nir; + } + + si_nir_scan_shader(sel->nir, &sel->info); + si_nir_adjust_driver_locations(sel->nir); + + sel->type = sel->info.processor; + p_atomic_inc(&sscreen->num_shaders_created); + si_get_active_slot_masks(&sel->info, &sel->active_const_and_shader_buffers, + &sel->active_samplers_and_images); + + /* Record which streamout buffers are enabled. */ + for (i = 0; i < sel->so.num_outputs; i++) { + sel->enabled_streamout_buffer_mask |= (1 << sel->so.output[i].output_buffer) + << (sel->so.output[i].stream * 4); + } + + sel->num_vs_inputs = + sel->type == PIPE_SHADER_VERTEX && !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] + ? sel->info.num_inputs + : 0; + sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, sscreen->num_vbos_in_user_sgprs); + + /* The prolog is a no-op if there are no inputs. 
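The enabled_streamout_buffer_mask computed in si_create_shader_selector packs one 4-bit group per vertex stream, with one bit per target buffer inside each group. A tiny standalone example with made-up outputs:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   /* (buffer, stream) pairs as they would come from so.output[]. */
   struct { unsigned buffer, stream; } outputs[] = {
      { 0, 0 }, /* buffer 0, stream 0 */
      { 2, 0 }, /* buffer 2, stream 0 */
      { 1, 3 }, /* buffer 1, stream 3 */
   };
   uint32_t mask = 0;

   for (unsigned i = 0; i < sizeof(outputs) / sizeof(outputs[0]); i++)
      mask |= (1u << outputs[i].buffer) << (outputs[i].stream * 4);

   printf("enabled_streamout_buffer_mask = 0x%04x\n", mask); /* prints 0x2005 */
   return 0;
}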
*/ + sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX && sel->info.num_inputs && + !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; + + sel->prim_discard_cs_allowed = + sel->type == PIPE_SHADER_VERTEX && !sel->info.uses_bindless_images && + !sel->info.uses_bindless_samplers && !sel->info.writes_memory && + !sel->info.writes_viewport_index && + !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] && !sel->so.num_outputs; + + switch (sel->type) { + case PIPE_SHADER_GEOMETRY: + sel->gs_output_prim = sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM]; + + /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ + sel->rast_prim = sel->gs_output_prim; + if (util_rast_prim_is_triangles(sel->rast_prim)) + sel->rast_prim = PIPE_PRIM_TRIANGLES; + + sel->gs_max_out_vertices = sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES]; + sel->gs_num_invocations = sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS]; + sel->gsvs_vertex_size = sel->info.num_outputs * 16; + sel->max_gsvs_emit_size = sel->gsvs_vertex_size * sel->gs_max_out_vertices; + + sel->max_gs_stream = 0; + for (i = 0; i < sel->so.num_outputs; i++) + sel->max_gs_stream = MAX2(sel->max_gs_stream, sel->so.output[i].stream); + + sel->gs_input_verts_per_prim = + u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]); + + /* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tesselation. */ + sel->tess_turns_off_ngg = sscreen->info.chip_class == GFX10 && + sel->gs_num_invocations * sel->gs_max_out_vertices > 256; + break; + + case PIPE_SHADER_TESS_CTRL: + /* Always reserve space for these. */ + sel->patch_outputs_written |= + (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0)) | + (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0)); + /* fall through */ + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_TESS_EVAL: + for (i = 0; i < sel->info.num_outputs; i++) { + unsigned name = sel->info.output_semantic_name[i]; + unsigned index = sel->info.output_semantic_index[i]; + + switch (name) { + case TGSI_SEMANTIC_TESSINNER: + case TGSI_SEMANTIC_TESSOUTER: + case TGSI_SEMANTIC_PATCH: + sel->patch_outputs_written |= 1ull << si_shader_io_get_unique_index_patch(name, index); + break; + + case TGSI_SEMANTIC_GENERIC: + /* don't process indices the function can't handle */ + if (index >= SI_MAX_IO_GENERIC) + break; + /* fall through */ + default: + sel->outputs_written |= 1ull << si_shader_io_get_unique_index(name, index, false); + sel->outputs_written_before_ps |= 1ull + << si_shader_io_get_unique_index(name, index, true); + break; + case TGSI_SEMANTIC_EDGEFLAG: + break; + } + } + sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16; + sel->lshs_vertex_stride = sel->esgs_itemsize; + + /* Add 1 dword to reduce LDS bank conflicts, so that each vertex + * will start on a different bank. (except for the maximum 32*16). + */ + if (sel->lshs_vertex_stride < 32 * 16) + sel->lshs_vertex_stride += 4; + + /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank + * conflicts, i.e. each vertex will start at a different bank. 
+ */ + if (sctx->chip_class >= GFX9) + sel->esgs_itemsize += 4; + + assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0); + + /* Only for TES: */ + if (sel->info.properties[TGSI_PROPERTY_TES_POINT_MODE]) + sel->rast_prim = PIPE_PRIM_POINTS; + else if (sel->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES) + sel->rast_prim = PIPE_PRIM_LINE_STRIP; + else + sel->rast_prim = PIPE_PRIM_TRIANGLES; + break; + + case PIPE_SHADER_FRAGMENT: + for (i = 0; i < sel->info.num_inputs; i++) { + unsigned name = sel->info.input_semantic_name[i]; + unsigned index = sel->info.input_semantic_index[i]; + + switch (name) { + case TGSI_SEMANTIC_GENERIC: + /* don't process indices the function can't handle */ + if (index >= SI_MAX_IO_GENERIC) + break; + /* fall through */ + default: + sel->inputs_read |= 1ull << si_shader_io_get_unique_index(name, index, true); + break; + case TGSI_SEMANTIC_PCOORD: /* ignore this */ + break; + } + } + + for (i = 0; i < 8; i++) + if (sel->info.colors_written & (1 << i)) + sel->colors_written_4bit |= 0xf << (4 * i); + + for (i = 0; i < sel->info.num_inputs; i++) { + if (sel->info.input_semantic_name[i] == TGSI_SEMANTIC_COLOR) { + int index = sel->info.input_semantic_index[i]; + sel->color_attr_index[index] = i; + } + } + break; + default:; + } + + sel->ngg_culling_allowed = + sscreen->info.chip_class == GFX10 && sscreen->info.has_dedicated_vram && + sscreen->use_ngg_culling && + /* Disallow TES by default, because TessMark results are mixed. */ + (sel->type == PIPE_SHADER_VERTEX || + (sscreen->always_use_ngg_culling && sel->type == PIPE_SHADER_TESS_EVAL)) && + sel->info.writes_position && + !sel->info.writes_viewport_index && /* cull only against viewport 0 */ + !sel->info.writes_memory && !sel->so.num_outputs && + !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] && + !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + + /* PA_CL_VS_OUT_CNTL */ + if (sctx->chip_class <= GFX9) + sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, false); + + sel->clipdist_mask = sel->info.writes_clipvertex ? SIX_BITS : sel->info.clipdist_writemask; + sel->culldist_mask = sel->info.culldist_writemask << sel->info.num_written_clipdistance; + + /* DB_SHADER_CONTROL */ + sel->db_shader_control = S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) | + S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) | + S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) | + S_02880C_KILL_ENABLE(sel->info.uses_kill); + + switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) { + case TGSI_FS_DEPTH_LAYOUT_GREATER: + sel->db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z); + break; + case TGSI_FS_DEPTH_LAYOUT_LESS: + sel->db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z); + break; + } + + /* Z_ORDER, EXEC_ON_HIER_FAIL and EXEC_ON_NOOP should be set as following: + * + * | early Z/S | writes_mem | allow_ReZ? | Z_ORDER | EXEC_ON_HIER_FAIL | EXEC_ON_NOOP + * --|-----------|------------|------------|--------------------|-------------------|------------- + * 1a| false | false | true | EarlyZ_Then_ReZ | 0 | 0 + * 1b| false | false | false | EarlyZ_Then_LateZ | 0 | 0 + * 2 | false | true | n/a | LateZ | 1 | 0 + * 3 | true | false | n/a | EarlyZ_Then_LateZ | 0 | 0 + * 4 | true | true | n/a | EarlyZ_Then_LateZ | 0 | 1 + * + * In cases 3 and 4, HW will force Z_ORDER to EarlyZ regardless of what's set in the register. + * In case 2, NOOP_CULL is a don't care field. In case 2, 3 and 4, ReZ doesn't make sense. 
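The DB_SHADER_CONTROL table above can be read as a three-way decision on (early Z/S forced, writes memory). Restated as a small self-contained function; the enum and struct here are illustrative only, while the real code ORs S_02880C_* register fields from sid.h instead:

enum z_order { EARLY_Z_THEN_LATE_Z, LATE_Z };

struct z_policy {
   enum z_order order;
   int exec_on_hier_fail;
   int exec_on_noop;
};

static struct z_policy pick_z_policy(int force_early_zs, int writes_memory)
{
   if (force_early_zs) /* cases 3 and 4 */
      return (struct z_policy){ EARLY_Z_THEN_LATE_Z, 0, writes_memory != 0 };
   if (writes_memory) /* case 2 */
      return (struct z_policy){ LATE_Z, 1, 0 };
   return (struct z_policy){ EARLY_Z_THEN_LATE_Z, 0, 0 }; /* cases 1a/1b; ReZ is never chosen */
}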
+ * + * Don't use ReZ without profiling !!! + * + * ReZ decreases performance by 15% in DiRT: Showdown on Ultra settings, which has pretty complex + * shaders. + */ + if (sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]) { + /* Cases 3, 4. */ + sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1) | + S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) | + S_02880C_EXEC_ON_NOOP(sel->info.writes_memory); + } else if (sel->info.writes_memory) { + /* Case 2. */ + sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z) | S_02880C_EXEC_ON_HIER_FAIL(1); + } else { + /* Case 1. */ + sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z); + } + + if (sel->info.properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE]) + sel->db_shader_control |= S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(1); + + (void)simple_mtx_init(&sel->mutex, mtx_plain); + + si_schedule_initial_compile(sctx, sel->info.processor, &sel->ready, &sel->compiler_ctx_state, + sel, si_init_shader_selector_async); + return sel; +} + +static void *si_create_shader(struct pipe_context *ctx, const struct pipe_shader_state *state) +{ + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + + return util_live_shader_cache_get(ctx, &sscreen->live_shader_cache, state); } static void si_update_streamout_state(struct si_context *sctx) { - struct si_shader_selector *shader_with_so = si_get_vs(sctx)->cso; + struct si_shader_selector *shader_with_so = si_get_vs(sctx)->cso; - if (!shader_with_so) - return; + if (!shader_with_so) + return; - sctx->streamout.enabled_stream_buffers_mask = - shader_with_so->enabled_streamout_buffer_mask; - sctx->streamout.stride_in_dw = shader_with_so->so.stride; + sctx->streamout.enabled_stream_buffers_mask = shader_with_so->enabled_streamout_buffer_mask; + sctx->streamout.stride_in_dw = shader_with_so->so.stride; } -static void si_update_clip_regs(struct si_context *sctx, - struct si_shader_selector *old_hw_vs, - struct si_shader *old_hw_vs_variant, - struct si_shader_selector *next_hw_vs, - struct si_shader *next_hw_vs_variant) +static void si_update_clip_regs(struct si_context *sctx, struct si_shader_selector *old_hw_vs, + struct si_shader *old_hw_vs_variant, + struct si_shader_selector *next_hw_vs, + struct si_shader *next_hw_vs_variant) { - if (next_hw_vs && - (!old_hw_vs || - old_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] != - next_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] || - old_hw_vs->pa_cl_vs_out_cntl != next_hw_vs->pa_cl_vs_out_cntl || - old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask || - old_hw_vs->culldist_mask != next_hw_vs->culldist_mask || - !old_hw_vs_variant || - !next_hw_vs_variant || - old_hw_vs_variant->key.opt.clip_disable != - next_hw_vs_variant->key.opt.clip_disable)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); + if (next_hw_vs && + (!old_hw_vs || + old_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] != + next_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] || + old_hw_vs->pa_cl_vs_out_cntl != next_hw_vs->pa_cl_vs_out_cntl || + old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask || + old_hw_vs->culldist_mask != next_hw_vs->culldist_mask || !old_hw_vs_variant || + !next_hw_vs_variant || + old_hw_vs_variant->key.opt.clip_disable != next_hw_vs_variant->key.opt.clip_disable)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); } static void si_update_common_shader_state(struct si_context *sctx) { - sctx->uses_bindless_samplers = - 
si_shader_uses_bindless_samplers(sctx->vs_shader.cso) || - si_shader_uses_bindless_samplers(sctx->gs_shader.cso) || - si_shader_uses_bindless_samplers(sctx->ps_shader.cso) || - si_shader_uses_bindless_samplers(sctx->tcs_shader.cso) || - si_shader_uses_bindless_samplers(sctx->tes_shader.cso); - sctx->uses_bindless_images = - si_shader_uses_bindless_images(sctx->vs_shader.cso) || - si_shader_uses_bindless_images(sctx->gs_shader.cso) || - si_shader_uses_bindless_images(sctx->ps_shader.cso) || - si_shader_uses_bindless_images(sctx->tcs_shader.cso) || - si_shader_uses_bindless_images(sctx->tes_shader.cso); - sctx->do_update_shaders = true; + sctx->uses_bindless_samplers = si_shader_uses_bindless_samplers(sctx->vs_shader.cso) || + si_shader_uses_bindless_samplers(sctx->gs_shader.cso) || + si_shader_uses_bindless_samplers(sctx->ps_shader.cso) || + si_shader_uses_bindless_samplers(sctx->tcs_shader.cso) || + si_shader_uses_bindless_samplers(sctx->tes_shader.cso); + sctx->uses_bindless_images = si_shader_uses_bindless_images(sctx->vs_shader.cso) || + si_shader_uses_bindless_images(sctx->gs_shader.cso) || + si_shader_uses_bindless_images(sctx->ps_shader.cso) || + si_shader_uses_bindless_images(sctx->tcs_shader.cso) || + si_shader_uses_bindless_images(sctx->tes_shader.cso); + sctx->do_update_shaders = true; } static void si_bind_vs_shader(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; - struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx); - struct si_shader_selector *sel = state; - - if (sctx->vs_shader.cso == sel) - return; - - sctx->vs_shader.cso = sel; - sctx->vs_shader.current = sel ? sel->first_variant : NULL; - sctx->num_vs_blit_sgprs = sel ? sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] : 0; - - if (si_update_ngg(sctx)) - si_shader_change_notify(sctx); - - si_update_common_shader_state(sctx); - si_update_vs_viewport_state(sctx); - si_set_active_descriptors_for_shader(sctx, sel); - si_update_streamout_state(sctx); - si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, - si_get_vs(sctx)->cso, si_get_vs_state(sctx)); + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; + struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx); + struct si_shader_selector *sel = state; + + if (sctx->vs_shader.cso == sel) + return; + + sctx->vs_shader.cso = sel; + sctx->vs_shader.current = sel ? sel->first_variant : NULL; + sctx->num_vs_blit_sgprs = sel ? 
sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] : 0; + + if (si_update_ngg(sctx)) + si_shader_change_notify(sctx); + + si_update_common_shader_state(sctx); + si_update_vs_viewport_state(sctx); + si_set_active_descriptors_for_shader(sctx, sel); + si_update_streamout_state(sctx); + si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso, + si_get_vs_state(sctx)); } static void si_update_tess_uses_prim_id(struct si_context *sctx) { - sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id = - (sctx->tes_shader.cso && - sctx->tes_shader.cso->info.uses_primid) || - (sctx->tcs_shader.cso && - sctx->tcs_shader.cso->info.uses_primid) || - (sctx->gs_shader.cso && - sctx->gs_shader.cso->info.uses_primid) || - (sctx->ps_shader.cso && !sctx->gs_shader.cso && - sctx->ps_shader.cso->info.uses_primid); + sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id = + (sctx->tes_shader.cso && sctx->tes_shader.cso->info.uses_primid) || + (sctx->tcs_shader.cso && sctx->tcs_shader.cso->info.uses_primid) || + (sctx->gs_shader.cso && sctx->gs_shader.cso->info.uses_primid) || + (sctx->ps_shader.cso && !sctx->gs_shader.cso && sctx->ps_shader.cso->info.uses_primid); } bool si_update_ngg(struct si_context *sctx) { - if (!sctx->screen->use_ngg) { - assert(!sctx->ngg); - return false; - } - - bool new_ngg = true; - - if (sctx->gs_shader.cso && sctx->tes_shader.cso && - sctx->gs_shader.cso->tess_turns_off_ngg) { - new_ngg = false; - } else if (!sctx->screen->use_ngg_streamout) { - struct si_shader_selector *last = si_get_vs(sctx)->cso; - - if ((last && last->so.num_outputs) || - sctx->streamout.prims_gen_query_enabled) - new_ngg = false; - } - - if (new_ngg != sctx->ngg) { - /* Transitioning from NGG to legacy GS requires VGT_FLUSH on Navi10-14. - * VGT_FLUSH is also emitted at the beginning of IBs when legacy GS ring - * pointers are set. - */ - if ((sctx->family == CHIP_NAVI10 || - sctx->family == CHIP_NAVI12 || - sctx->family == CHIP_NAVI14) && - !new_ngg) - sctx->flags |= SI_CONTEXT_VGT_FLUSH; - - sctx->ngg = new_ngg; - sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ - return true; - } - return false; + if (!sctx->screen->use_ngg) { + assert(!sctx->ngg); + return false; + } + + bool new_ngg = true; + + if (sctx->gs_shader.cso && sctx->tes_shader.cso && sctx->gs_shader.cso->tess_turns_off_ngg) { + new_ngg = false; + } else if (!sctx->screen->use_ngg_streamout) { + struct si_shader_selector *last = si_get_vs(sctx)->cso; + + if ((last && last->so.num_outputs) || sctx->streamout.prims_gen_query_enabled) + new_ngg = false; + } + + if (new_ngg != sctx->ngg) { + /* Transitioning from NGG to legacy GS requires VGT_FLUSH on Navi10-14. + * VGT_FLUSH is also emitted at the beginning of IBs when legacy GS ring + * pointers are set. + */ + if ((sctx->family == CHIP_NAVI10 || sctx->family == CHIP_NAVI12 || + sctx->family == CHIP_NAVI14) && + !new_ngg) + sctx->flags |= SI_CONTEXT_VGT_FLUSH; + + sctx->ngg = new_ngg; + sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ + return true; + } + return false; } static void si_bind_gs_shader(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; - struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx); - struct si_shader_selector *sel = state; - bool enable_changed = !!sctx->gs_shader.cso != !!sel; - bool ngg_changed; - - if (sctx->gs_shader.cso == sel) - return; - - sctx->gs_shader.cso = sel; - sctx->gs_shader.current = sel ? 
sel->first_variant : NULL; - sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL; - - si_update_common_shader_state(sctx); - sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ - - ngg_changed = si_update_ngg(sctx); - if (ngg_changed || enable_changed) - si_shader_change_notify(sctx); - if (enable_changed) { - if (sctx->ia_multi_vgt_param_key.u.uses_tess) - si_update_tess_uses_prim_id(sctx); - } - si_update_vs_viewport_state(sctx); - si_set_active_descriptors_for_shader(sctx, sel); - si_update_streamout_state(sctx); - si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, - si_get_vs(sctx)->cso, si_get_vs_state(sctx)); + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; + struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx); + struct si_shader_selector *sel = state; + bool enable_changed = !!sctx->gs_shader.cso != !!sel; + bool ngg_changed; + + if (sctx->gs_shader.cso == sel) + return; + + sctx->gs_shader.cso = sel; + sctx->gs_shader.current = sel ? sel->first_variant : NULL; + sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL; + + si_update_common_shader_state(sctx); + sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ + + ngg_changed = si_update_ngg(sctx); + if (ngg_changed || enable_changed) + si_shader_change_notify(sctx); + if (enable_changed) { + if (sctx->ia_multi_vgt_param_key.u.uses_tess) + si_update_tess_uses_prim_id(sctx); + } + si_update_vs_viewport_state(sctx); + si_set_active_descriptors_for_shader(sctx, sel); + si_update_streamout_state(sctx); + si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso, + si_get_vs_state(sctx)); } static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *sel = state; - bool enable_changed = !!sctx->tcs_shader.cso != !!sel; + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = state; + bool enable_changed = !!sctx->tcs_shader.cso != !!sel; - if (sctx->tcs_shader.cso == sel) - return; + if (sctx->tcs_shader.cso == sel) + return; - sctx->tcs_shader.cso = sel; - sctx->tcs_shader.current = sel ? sel->first_variant : NULL; - si_update_tess_uses_prim_id(sctx); + sctx->tcs_shader.cso = sel; + sctx->tcs_shader.current = sel ? sel->first_variant : NULL; + si_update_tess_uses_prim_id(sctx); - si_update_common_shader_state(sctx); + si_update_common_shader_state(sctx); - if (enable_changed) - sctx->last_tcs = NULL; /* invalidate derived tess state */ + if (enable_changed) + sctx->last_tcs = NULL; /* invalidate derived tess state */ - si_set_active_descriptors_for_shader(sctx, sel); + si_set_active_descriptors_for_shader(sctx, sel); } static void si_bind_tes_shader(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; - struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx); - struct si_shader_selector *sel = state; - bool enable_changed = !!sctx->tes_shader.cso != !!sel; - - if (sctx->tes_shader.cso == sel) - return; - - sctx->tes_shader.cso = sel; - sctx->tes_shader.current = sel ? 
sel->first_variant : NULL; - sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL; - si_update_tess_uses_prim_id(sctx); - - si_update_common_shader_state(sctx); - sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ - - bool ngg_changed = si_update_ngg(sctx); - if (ngg_changed || enable_changed) - si_shader_change_notify(sctx); - if (enable_changed) - sctx->last_tes_sh_base = -1; /* invalidate derived tess state */ - si_update_vs_viewport_state(sctx); - si_set_active_descriptors_for_shader(sctx, sel); - si_update_streamout_state(sctx); - si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, - si_get_vs(sctx)->cso, si_get_vs_state(sctx)); + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; + struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx); + struct si_shader_selector *sel = state; + bool enable_changed = !!sctx->tes_shader.cso != !!sel; + + if (sctx->tes_shader.cso == sel) + return; + + sctx->tes_shader.cso = sel; + sctx->tes_shader.current = sel ? sel->first_variant : NULL; + sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL; + si_update_tess_uses_prim_id(sctx); + + si_update_common_shader_state(sctx); + sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ + + bool ngg_changed = si_update_ngg(sctx); + if (ngg_changed || enable_changed) + si_shader_change_notify(sctx); + if (enable_changed) + sctx->last_tes_sh_base = -1; /* invalidate derived tess state */ + si_update_vs_viewport_state(sctx); + si_set_active_descriptors_for_shader(sctx, sel); + si_update_streamout_state(sctx); + si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso, + si_get_vs_state(sctx)); } static void si_bind_ps_shader(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *old_sel = sctx->ps_shader.cso; - struct si_shader_selector *sel = state; - - /* skip if supplied shader is one already in use */ - if (old_sel == sel) - return; - - sctx->ps_shader.cso = sel; - sctx->ps_shader.current = sel ? sel->first_variant : NULL; - - si_update_common_shader_state(sctx); - if (sel) { - if (sctx->ia_multi_vgt_param_key.u.uses_tess) - si_update_tess_uses_prim_id(sctx); - - if (!old_sel || - old_sel->info.colors_written != sel->info.colors_written) - si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - - if (sctx->screen->has_out_of_order_rast && - (!old_sel || - old_sel->info.writes_memory != sel->info.writes_memory || - old_sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] != - sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - } - si_set_active_descriptors_for_shader(sctx, sel); - si_update_ps_colorbuf0_slot(sctx); + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_sel = sctx->ps_shader.cso; + struct si_shader_selector *sel = state; + + /* skip if supplied shader is one already in use */ + if (old_sel == sel) + return; + + sctx->ps_shader.cso = sel; + sctx->ps_shader.current = sel ? 
sel->first_variant : NULL; + + si_update_common_shader_state(sctx); + if (sel) { + if (sctx->ia_multi_vgt_param_key.u.uses_tess) + si_update_tess_uses_prim_id(sctx); + + if (!old_sel || old_sel->info.colors_written != sel->info.colors_written) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); + + if (sctx->screen->has_out_of_order_rast && + (!old_sel || old_sel->info.writes_memory != sel->info.writes_memory || + old_sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] != + sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + } + si_set_active_descriptors_for_shader(sctx, sel); + si_update_ps_colorbuf0_slot(sctx); } static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) { - if (shader->is_optimized) { - util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority, - &shader->ready); - } - - util_queue_fence_destroy(&shader->ready); - - if (shader->pm4) { - /* If destroyed shaders were not unbound, the next compiled - * shader variant could get the same pointer address and so - * binding it to the same shader stage would be considered - * a no-op, causing random behavior. - */ - switch (shader->selector->type) { - case PIPE_SHADER_VERTEX: - if (shader->key.as_ls) { - assert(sctx->chip_class <= GFX8); - si_pm4_delete_state(sctx, ls, shader->pm4); - } else if (shader->key.as_es) { - assert(sctx->chip_class <= GFX8); - si_pm4_delete_state(sctx, es, shader->pm4); - } else if (shader->key.as_ngg) { - si_pm4_delete_state(sctx, gs, shader->pm4); - } else { - si_pm4_delete_state(sctx, vs, shader->pm4); - } - break; - case PIPE_SHADER_TESS_CTRL: - si_pm4_delete_state(sctx, hs, shader->pm4); - break; - case PIPE_SHADER_TESS_EVAL: - if (shader->key.as_es) { - assert(sctx->chip_class <= GFX8); - si_pm4_delete_state(sctx, es, shader->pm4); - } else if (shader->key.as_ngg) { - si_pm4_delete_state(sctx, gs, shader->pm4); - } else { - si_pm4_delete_state(sctx, vs, shader->pm4); - } - break; - case PIPE_SHADER_GEOMETRY: - if (shader->is_gs_copy_shader) - si_pm4_delete_state(sctx, vs, shader->pm4); - else - si_pm4_delete_state(sctx, gs, shader->pm4); - break; - case PIPE_SHADER_FRAGMENT: - si_pm4_delete_state(sctx, ps, shader->pm4); - break; - default:; - } - } - - si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL); - si_shader_destroy(shader); - free(shader); + if (shader->is_optimized) { + util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority, &shader->ready); + } + + util_queue_fence_destroy(&shader->ready); + + if (shader->pm4) { + /* If destroyed shaders were not unbound, the next compiled + * shader variant could get the same pointer address and so + * binding it to the same shader stage would be considered + * a no-op, causing random behavior. 
+ */ + switch (shader->selector->type) { + case PIPE_SHADER_VERTEX: + if (shader->key.as_ls) { + assert(sctx->chip_class <= GFX8); + si_pm4_delete_state(sctx, ls, shader->pm4); + } else if (shader->key.as_es) { + assert(sctx->chip_class <= GFX8); + si_pm4_delete_state(sctx, es, shader->pm4); + } else if (shader->key.as_ngg) { + si_pm4_delete_state(sctx, gs, shader->pm4); + } else { + si_pm4_delete_state(sctx, vs, shader->pm4); + } + break; + case PIPE_SHADER_TESS_CTRL: + si_pm4_delete_state(sctx, hs, shader->pm4); + break; + case PIPE_SHADER_TESS_EVAL: + if (shader->key.as_es) { + assert(sctx->chip_class <= GFX8); + si_pm4_delete_state(sctx, es, shader->pm4); + } else if (shader->key.as_ngg) { + si_pm4_delete_state(sctx, gs, shader->pm4); + } else { + si_pm4_delete_state(sctx, vs, shader->pm4); + } + break; + case PIPE_SHADER_GEOMETRY: + if (shader->is_gs_copy_shader) + si_pm4_delete_state(sctx, vs, shader->pm4); + else + si_pm4_delete_state(sctx, gs, shader->pm4); + break; + case PIPE_SHADER_FRAGMENT: + si_pm4_delete_state(sctx, ps, shader->pm4); + break; + default:; + } + } + + si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL); + si_shader_destroy(shader); + free(shader); } static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_shader_selector *sel = (struct si_shader_selector *)cso; - struct si_shader *p = sel->first_variant, *c; - struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = { - [PIPE_SHADER_VERTEX] = &sctx->vs_shader, - [PIPE_SHADER_TESS_CTRL] = &sctx->tcs_shader, - [PIPE_SHADER_TESS_EVAL] = &sctx->tes_shader, - [PIPE_SHADER_GEOMETRY] = &sctx->gs_shader, - [PIPE_SHADER_FRAGMENT] = &sctx->ps_shader, - }; - - util_queue_drop_job(&sctx->screen->shader_compiler_queue, &sel->ready); - - if (current_shader[sel->type]->cso == sel) { - current_shader[sel->type]->cso = NULL; - current_shader[sel->type]->current = NULL; - } - - while (p) { - c = p->next_variant; - si_delete_shader(sctx, p); - p = c; - } - - if (sel->main_shader_part) - si_delete_shader(sctx, sel->main_shader_part); - if (sel->main_shader_part_ls) - si_delete_shader(sctx, sel->main_shader_part_ls); - if (sel->main_shader_part_es) - si_delete_shader(sctx, sel->main_shader_part_es); - if (sel->main_shader_part_ngg) - si_delete_shader(sctx, sel->main_shader_part_ngg); - if (sel->gs_copy_shader) - si_delete_shader(sctx, sel->gs_copy_shader); - - util_queue_fence_destroy(&sel->ready); - simple_mtx_destroy(&sel->mutex); - ralloc_free(sel->nir); - free(sel->nir_binary); - free(sel); + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = (struct si_shader_selector *)cso; + struct si_shader *p = sel->first_variant, *c; + struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = { + [PIPE_SHADER_VERTEX] = &sctx->vs_shader, [PIPE_SHADER_TESS_CTRL] = &sctx->tcs_shader, + [PIPE_SHADER_TESS_EVAL] = &sctx->tes_shader, [PIPE_SHADER_GEOMETRY] = &sctx->gs_shader, + [PIPE_SHADER_FRAGMENT] = &sctx->ps_shader, + }; + + util_queue_drop_job(&sctx->screen->shader_compiler_queue, &sel->ready); + + if (current_shader[sel->type]->cso == sel) { + current_shader[sel->type]->cso = NULL; + current_shader[sel->type]->current = NULL; + } + + while (p) { + c = p->next_variant; + si_delete_shader(sctx, p); + p = c; + } + + if (sel->main_shader_part) + si_delete_shader(sctx, sel->main_shader_part); + if (sel->main_shader_part_ls) + si_delete_shader(sctx, sel->main_shader_part_ls); + if 
(sel->main_shader_part_es) + si_delete_shader(sctx, sel->main_shader_part_es); + if (sel->main_shader_part_ngg) + si_delete_shader(sctx, sel->main_shader_part_ngg); + if (sel->gs_copy_shader) + si_delete_shader(sctx, sel->gs_copy_shader); + + util_queue_fence_destroy(&sel->ready); + simple_mtx_destroy(&sel->mutex); + ralloc_free(sel->nir); + free(sel->nir_binary); + free(sel); } static void si_delete_shader_selector(struct pipe_context *ctx, void *state) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_shader_selector *sel = (struct si_shader_selector *)state; - - si_shader_selector_reference(sctx, &sel, NULL); -} - -static unsigned si_get_ps_input_cntl(struct si_context *sctx, - struct si_shader *vs, unsigned name, - unsigned index, unsigned interpolate) -{ - struct si_shader_info *vsinfo = &vs->selector->info; - unsigned j, offset, ps_input_cntl = 0; - - if (interpolate == TGSI_INTERPOLATE_CONSTANT || - (interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade) || - name == TGSI_SEMANTIC_PRIMID) - ps_input_cntl |= S_028644_FLAT_SHADE(1); - - if (name == TGSI_SEMANTIC_PCOORD || - (name == TGSI_SEMANTIC_TEXCOORD && - sctx->sprite_coord_enable & (1 << index))) { - ps_input_cntl |= S_028644_PT_SPRITE_TEX(1); - } - - for (j = 0; j < vsinfo->num_outputs; j++) { - if (name == vsinfo->output_semantic_name[j] && - index == vsinfo->output_semantic_index[j]) { - offset = vs->info.vs_output_param_offset[j]; - - if (offset <= AC_EXP_PARAM_OFFSET_31) { - /* The input is loaded from parameter memory. */ - ps_input_cntl |= S_028644_OFFSET(offset); - } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { - if (offset == AC_EXP_PARAM_UNDEFINED) { - /* This can happen with depth-only rendering. */ - offset = 0; - } else { - /* The input is a DEFAULT_VAL constant. */ - assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && - offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); - offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; - } - - ps_input_cntl = S_028644_OFFSET(0x20) | - S_028644_DEFAULT_VAL(offset); - } - break; - } - } - - if (j == vsinfo->num_outputs && name == TGSI_SEMANTIC_PRIMID) - /* PrimID is written after the last output when HW VS is used. */ - ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]); - else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) { - /* No corresponding output found, load defaults into input. - * Don't set any other bits. - * (FLAT_SHADE=1 completely changes behavior) */ - ps_input_cntl = S_028644_OFFSET(0x20); - /* D3D 9 behaviour. 
GL is undefined */ - if (name == TGSI_SEMANTIC_COLOR && index == 0) - ps_input_cntl |= S_028644_DEFAULT_VAL(3); - } - return ps_input_cntl; + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = (struct si_shader_selector *)state; + + si_shader_selector_reference(sctx, &sel, NULL); +} + +static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader *vs, unsigned name, + unsigned index, unsigned interpolate) +{ + struct si_shader_info *vsinfo = &vs->selector->info; + unsigned j, offset, ps_input_cntl = 0; + + if (interpolate == TGSI_INTERPOLATE_CONSTANT || + (interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade) || name == TGSI_SEMANTIC_PRIMID) + ps_input_cntl |= S_028644_FLAT_SHADE(1); + + if (name == TGSI_SEMANTIC_PCOORD || + (name == TGSI_SEMANTIC_TEXCOORD && sctx->sprite_coord_enable & (1 << index))) { + ps_input_cntl |= S_028644_PT_SPRITE_TEX(1); + } + + for (j = 0; j < vsinfo->num_outputs; j++) { + if (name == vsinfo->output_semantic_name[j] && index == vsinfo->output_semantic_index[j]) { + offset = vs->info.vs_output_param_offset[j]; + + if (offset <= AC_EXP_PARAM_OFFSET_31) { + /* The input is loaded from parameter memory. */ + ps_input_cntl |= S_028644_OFFSET(offset); + } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { + if (offset == AC_EXP_PARAM_UNDEFINED) { + /* This can happen with depth-only rendering. */ + offset = 0; + } else { + /* The input is a DEFAULT_VAL constant. */ + assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && + offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); + offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; + } + + ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset); + } + break; + } + } + + if (j == vsinfo->num_outputs && name == TGSI_SEMANTIC_PRIMID) + /* PrimID is written after the last output when HW VS is used. */ + ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]); + else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) { + /* No corresponding output found, load defaults into input. + * Don't set any other bits. + * (FLAT_SHADE=1 completely changes behavior) */ + ps_input_cntl = S_028644_OFFSET(0x20); + /* D3D 9 behaviour. GL is undefined */ + if (name == TGSI_SEMANTIC_COLOR && index == 0) + ps_input_cntl |= S_028644_DEFAULT_VAL(3); + } + return ps_input_cntl; } static void si_emit_spi_map(struct si_context *sctx) { - struct si_shader *ps = sctx->ps_shader.current; - struct si_shader *vs = si_get_vs_state(sctx); - struct si_shader_info *psinfo = ps ? 
&ps->selector->info : NULL; - unsigned i, num_interp, num_written = 0, bcol_interp[2]; - unsigned spi_ps_input_cntl[32]; - - if (!ps || !ps->selector->info.num_inputs) - return; - - num_interp = si_get_ps_num_interp(ps); - assert(num_interp > 0); - - for (i = 0; i < psinfo->num_inputs; i++) { - unsigned name = psinfo->input_semantic_name[i]; - unsigned index = psinfo->input_semantic_index[i]; - unsigned interpolate = psinfo->input_interpolate[i]; - - spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, name, - index, interpolate); - - if (name == TGSI_SEMANTIC_COLOR) { - assert(index < ARRAY_SIZE(bcol_interp)); - bcol_interp[index] = interpolate; - } - } - - if (ps->key.part.ps.prolog.color_two_side) { - unsigned bcol = TGSI_SEMANTIC_BCOLOR; - - for (i = 0; i < 2; i++) { - if (!(psinfo->colors_read & (0xf << (i * 4)))) - continue; - - spi_ps_input_cntl[num_written++] = - si_get_ps_input_cntl(sctx, vs, bcol, i, bcol_interp[i]); - - } - } - assert(num_interp == num_written); - - /* R_028644_SPI_PS_INPUT_CNTL_0 */ - /* Dota 2: Only ~16% of SPI map updates set different values. */ - /* Talos: Only ~9% of SPI map updates set different values. */ - unsigned initial_cdw = sctx->gfx_cs->current.cdw; - radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, - spi_ps_input_cntl, - sctx->tracked_regs.spi_ps_input_cntl, num_interp); - - if (initial_cdw != sctx->gfx_cs->current.cdw) - sctx->context_roll = true; + struct si_shader *ps = sctx->ps_shader.current; + struct si_shader *vs = si_get_vs_state(sctx); + struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL; + unsigned i, num_interp, num_written = 0, bcol_interp[2]; + unsigned spi_ps_input_cntl[32]; + + if (!ps || !ps->selector->info.num_inputs) + return; + + num_interp = si_get_ps_num_interp(ps); + assert(num_interp > 0); + + for (i = 0; i < psinfo->num_inputs; i++) { + unsigned name = psinfo->input_semantic_name[i]; + unsigned index = psinfo->input_semantic_index[i]; + unsigned interpolate = psinfo->input_interpolate[i]; + + spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, name, index, interpolate); + + if (name == TGSI_SEMANTIC_COLOR) { + assert(index < ARRAY_SIZE(bcol_interp)); + bcol_interp[index] = interpolate; + } + } + + if (ps->key.part.ps.prolog.color_two_side) { + unsigned bcol = TGSI_SEMANTIC_BCOLOR; + + for (i = 0; i < 2; i++) { + if (!(psinfo->colors_read & (0xf << (i * 4)))) + continue; + + spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, bcol, i, bcol_interp[i]); + } + } + assert(num_interp == num_written); + + /* R_028644_SPI_PS_INPUT_CNTL_0 */ + /* Dota 2: Only ~16% of SPI map updates set different values. */ + /* Talos: Only ~9% of SPI map updates set different values. */ + unsigned initial_cdw = sctx->gfx_cs->current.cdw; + radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl, + sctx->tracked_regs.spi_ps_input_cntl, num_interp); + + if (initial_cdw != sctx->gfx_cs->current.cdw) + sctx->context_roll = true; } /** @@ -3535,169 +3278,150 @@ static void si_emit_spi_map(struct si_context *sctx) */ static void si_init_config_add_vgt_flush(struct si_context *sctx) { - if (sctx->init_config_has_vgt_flush) - return; - - /* Done by Vulkan before VGT_FLUSH. */ - si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE); - si_pm4_cmd_add(sctx->init_config, - EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - si_pm4_cmd_end(sctx->init_config, false); - - /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. 
*/ - si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE); - si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); - si_pm4_cmd_end(sctx->init_config, false); - sctx->init_config_has_vgt_flush = true; + if (sctx->init_config_has_vgt_flush) + return; + + /* Done by Vulkan before VGT_FLUSH. */ + si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE); + si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + si_pm4_cmd_end(sctx->init_config, false); + + /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */ + si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE); + si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + si_pm4_cmd_end(sctx->init_config, false); + sctx->init_config_has_vgt_flush = true; } /* Initialize state related to ESGS / GSVS ring buffers */ static bool si_update_gs_ring_buffers(struct si_context *sctx) { - struct si_shader_selector *es = - sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso; - struct si_shader_selector *gs = sctx->gs_shader.cso; - struct si_pm4_state *pm4; - - /* Chip constants. */ - unsigned num_se = sctx->screen->info.max_se; - unsigned wave_size = 64; - unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */ - /* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16. - * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2). - */ - unsigned gs_vertex_reuse = (sctx->chip_class >= GFX8 ? 32 : 16) * num_se; - unsigned alignment = 256 * num_se; - /* The maximum size is 63.999 MB per SE. */ - unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se; - - /* Calculate the minimum size. */ - unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse * - wave_size, alignment); - - /* These are recommended sizes, not minimum sizes. */ - unsigned esgs_ring_size = max_gs_waves * 2 * wave_size * - es->esgs_itemsize * gs->gs_input_verts_per_prim; - unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * - gs->max_gsvs_emit_size; - - min_esgs_ring_size = align(min_esgs_ring_size, alignment); - esgs_ring_size = align(esgs_ring_size, alignment); - gsvs_ring_size = align(gsvs_ring_size, alignment); - - esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size); - gsvs_ring_size = MIN2(gsvs_ring_size, max_size); - - /* Some rings don't have to be allocated if shaders don't use them. - * (e.g. no varyings between ES and GS or GS and VS) - * - * GFX9 doesn't have the ESGS ring. - */ - bool update_esgs = sctx->chip_class <= GFX8 && - esgs_ring_size && - (!sctx->esgs_ring || - sctx->esgs_ring->width0 < esgs_ring_size); - bool update_gsvs = gsvs_ring_size && - (!sctx->gsvs_ring || - sctx->gsvs_ring->width0 < gsvs_ring_size); - - if (!update_esgs && !update_gsvs) - return true; - - if (update_esgs) { - pipe_resource_reference(&sctx->esgs_ring, NULL); - sctx->esgs_ring = - pipe_aligned_buffer_create(sctx->b.screen, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - esgs_ring_size, - sctx->screen->info.pte_fragment_size); - if (!sctx->esgs_ring) - return false; - } - - if (update_gsvs) { - pipe_resource_reference(&sctx->gsvs_ring, NULL); - sctx->gsvs_ring = - pipe_aligned_buffer_create(sctx->b.screen, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - gsvs_ring_size, - sctx->screen->info.pte_fragment_size); - if (!sctx->gsvs_ring) - return false; - } - - /* Create the "init_config_gs_rings" state. 
*/ - pm4 = CALLOC_STRUCT(si_pm4_state); - if (!pm4) - return false; - - if (sctx->chip_class >= GFX7) { - if (sctx->esgs_ring) { - assert(sctx->chip_class <= GFX8); - si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE, - sctx->esgs_ring->width0 / 256); - } - if (sctx->gsvs_ring) - si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE, - sctx->gsvs_ring->width0 / 256); - } else { - if (sctx->esgs_ring) - si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE, - sctx->esgs_ring->width0 / 256); - if (sctx->gsvs_ring) - si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE, - sctx->gsvs_ring->width0 / 256); - } - - /* Set the state. */ - if (sctx->init_config_gs_rings) - si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0); - sctx->init_config_gs_rings = pm4; - - if (!sctx->init_config_has_vgt_flush) { - si_init_config_add_vgt_flush(sctx); - si_pm4_upload_indirect_buffer(sctx, sctx->init_config); - } - - /* Flush the context to re-emit both init_config states. */ - sctx->initial_gfx_cs_size = 0; /* force flush */ - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - - /* Set ring bindings. */ - if (sctx->esgs_ring) { - assert(sctx->chip_class <= GFX8); - si_set_ring_buffer(sctx, SI_ES_RING_ESGS, - sctx->esgs_ring, 0, sctx->esgs_ring->width0, - true, true, 4, 64, 0); - si_set_ring_buffer(sctx, SI_GS_RING_ESGS, - sctx->esgs_ring, 0, sctx->esgs_ring->width0, - false, false, 0, 0, 0); - } - if (sctx->gsvs_ring) { - si_set_ring_buffer(sctx, SI_RING_GSVS, - sctx->gsvs_ring, 0, sctx->gsvs_ring->width0, - false, false, 0, 0, 0); - } - - return true; + struct si_shader_selector *es = + sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso; + struct si_shader_selector *gs = sctx->gs_shader.cso; + struct si_pm4_state *pm4; + + /* Chip constants. */ + unsigned num_se = sctx->screen->info.max_se; + unsigned wave_size = 64; + unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */ + /* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16. + * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2). + */ + unsigned gs_vertex_reuse = (sctx->chip_class >= GFX8 ? 32 : 16) * num_se; + unsigned alignment = 256 * num_se; + /* The maximum size is 63.999 MB per SE. */ + unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se; + + /* Calculate the minimum size. */ + unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse * wave_size, alignment); + + /* These are recommended sizes, not minimum sizes. */ + unsigned esgs_ring_size = + max_gs_waves * 2 * wave_size * es->esgs_itemsize * gs->gs_input_verts_per_prim; + unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs->max_gsvs_emit_size; + + min_esgs_ring_size = align(min_esgs_ring_size, alignment); + esgs_ring_size = align(esgs_ring_size, alignment); + gsvs_ring_size = align(gsvs_ring_size, alignment); + + esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size); + gsvs_ring_size = MIN2(gsvs_ring_size, max_size); + + /* Some rings don't have to be allocated if shaders don't use them. + * (e.g. no varyings between ES and GS or GS and VS) + * + * GFX9 doesn't have the ESGS ring. 
+ */ + bool update_esgs = sctx->chip_class <= GFX8 && esgs_ring_size && + (!sctx->esgs_ring || sctx->esgs_ring->width0 < esgs_ring_size); + bool update_gsvs = + gsvs_ring_size && (!sctx->gsvs_ring || sctx->gsvs_ring->width0 < gsvs_ring_size); + + if (!update_esgs && !update_gsvs) + return true; + + if (update_esgs) { + pipe_resource_reference(&sctx->esgs_ring, NULL); + sctx->esgs_ring = + pipe_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, + esgs_ring_size, sctx->screen->info.pte_fragment_size); + if (!sctx->esgs_ring) + return false; + } + + if (update_gsvs) { + pipe_resource_reference(&sctx->gsvs_ring, NULL); + sctx->gsvs_ring = + pipe_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, + gsvs_ring_size, sctx->screen->info.pte_fragment_size); + if (!sctx->gsvs_ring) + return false; + } + + /* Create the "init_config_gs_rings" state. */ + pm4 = CALLOC_STRUCT(si_pm4_state); + if (!pm4) + return false; + + if (sctx->chip_class >= GFX7) { + if (sctx->esgs_ring) { + assert(sctx->chip_class <= GFX8); + si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256); + } + if (sctx->gsvs_ring) + si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256); + } else { + if (sctx->esgs_ring) + si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256); + if (sctx->gsvs_ring) + si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256); + } + + /* Set the state. */ + if (sctx->init_config_gs_rings) + si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0); + sctx->init_config_gs_rings = pm4; + + if (!sctx->init_config_has_vgt_flush) { + si_init_config_add_vgt_flush(sctx); + si_pm4_upload_indirect_buffer(sctx, sctx->init_config); + } + + /* Flush the context to re-emit both init_config states. */ + sctx->initial_gfx_cs_size = 0; /* force flush */ + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + + /* Set ring bindings. 
*/ + if (sctx->esgs_ring) { + assert(sctx->chip_class <= GFX8); + si_set_ring_buffer(sctx, SI_ES_RING_ESGS, sctx->esgs_ring, 0, sctx->esgs_ring->width0, true, + true, 4, 64, 0); + si_set_ring_buffer(sctx, SI_GS_RING_ESGS, sctx->esgs_ring, 0, sctx->esgs_ring->width0, false, + false, 0, 0, 0); + } + if (sctx->gsvs_ring) { + si_set_ring_buffer(sctx, SI_RING_GSVS, sctx->gsvs_ring, 0, sctx->gsvs_ring->width0, false, + false, 0, 0, 0); + } + + return true; } static void si_shader_lock(struct si_shader *shader) { - simple_mtx_lock(&shader->selector->mutex); - if (shader->previous_stage_sel) { - assert(shader->previous_stage_sel != shader->selector); - simple_mtx_lock(&shader->previous_stage_sel->mutex); - } + simple_mtx_lock(&shader->selector->mutex); + if (shader->previous_stage_sel) { + assert(shader->previous_stage_sel != shader->selector); + simple_mtx_lock(&shader->previous_stage_sel->mutex); + } } static void si_shader_unlock(struct si_shader *shader) { - if (shader->previous_stage_sel) - simple_mtx_unlock(&shader->previous_stage_sel->mutex); - simple_mtx_unlock(&shader->selector->mutex); + if (shader->previous_stage_sel) + simple_mtx_unlock(&shader->previous_stage_sel->mutex); + simple_mtx_unlock(&shader->selector->mutex); } /** @@ -3705,578 +3429,545 @@ static void si_shader_unlock(struct si_shader *shader) * 0 if not * < 0 if there was a failure */ -static int si_update_scratch_buffer(struct si_context *sctx, - struct si_shader *shader) +static int si_update_scratch_buffer(struct si_context *sctx, struct si_shader *shader) { - uint64_t scratch_va = sctx->scratch_buffer->gpu_address; + uint64_t scratch_va = sctx->scratch_buffer->gpu_address; - if (!shader) - return 0; + if (!shader) + return 0; - /* This shader doesn't need a scratch buffer */ - if (shader->config.scratch_bytes_per_wave == 0) - return 0; + /* This shader doesn't need a scratch buffer */ + if (shader->config.scratch_bytes_per_wave == 0) + return 0; - /* Prevent race conditions when updating: - * - si_shader::scratch_bo - * - si_shader::binary::code - * - si_shader::previous_stage::binary::code. - */ - si_shader_lock(shader); + /* Prevent race conditions when updating: + * - si_shader::scratch_bo + * - si_shader::binary::code + * - si_shader::previous_stage::binary::code. + */ + si_shader_lock(shader); - /* This shader is already configured to use the current - * scratch buffer. */ - if (shader->scratch_bo == sctx->scratch_buffer) { - si_shader_unlock(shader); - return 0; - } + /* This shader is already configured to use the current + * scratch buffer. */ + if (shader->scratch_bo == sctx->scratch_buffer) { + si_shader_unlock(shader); + return 0; + } - assert(sctx->scratch_buffer); + assert(sctx->scratch_buffer); - /* Replace the shader bo with a new bo that has the relocs applied. */ - if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) { - si_shader_unlock(shader); - return -1; - } + /* Replace the shader bo with a new bo that has the relocs applied. */ + if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) { + si_shader_unlock(shader); + return -1; + } - /* Update the shader state to use the new shader bo. */ - si_shader_init_pm4_state(sctx->screen, shader); + /* Update the shader state to use the new shader bo. 
*/ + si_shader_init_pm4_state(sctx->screen, shader); - si_resource_reference(&shader->scratch_bo, sctx->scratch_buffer); + si_resource_reference(&shader->scratch_bo, sctx->scratch_buffer); - si_shader_unlock(shader); - return 1; + si_shader_unlock(shader); + return 1; } static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader) { - return shader ? shader->config.scratch_bytes_per_wave : 0; + return shader ? shader->config.scratch_bytes_per_wave : 0; } static struct si_shader *si_get_tcs_current(struct si_context *sctx) { - if (!sctx->tes_shader.cso) - return NULL; /* tessellation disabled */ + if (!sctx->tes_shader.cso) + return NULL; /* tessellation disabled */ - return sctx->tcs_shader.cso ? sctx->tcs_shader.current : - sctx->fixed_func_tcs_shader.current; + return sctx->tcs_shader.cso ? sctx->tcs_shader.current : sctx->fixed_func_tcs_shader.current; } static bool si_update_scratch_relocs(struct si_context *sctx) { - struct si_shader *tcs = si_get_tcs_current(sctx); - int r; - - /* Update the shaders, so that they are using the latest scratch. - * The scratch buffer may have been changed since these shaders were - * last used, so we still need to try to update them, even if they - * require scratch buffers smaller than the current size. - */ - r = si_update_scratch_buffer(sctx, sctx->ps_shader.current); - if (r < 0) - return false; - if (r == 1) - si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4); - - r = si_update_scratch_buffer(sctx, sctx->gs_shader.current); - if (r < 0) - return false; - if (r == 1) - si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4); - - r = si_update_scratch_buffer(sctx, tcs); - if (r < 0) - return false; - if (r == 1) - si_pm4_bind_state(sctx, hs, tcs->pm4); - - /* VS can be bound as LS, ES, or VS. */ - r = si_update_scratch_buffer(sctx, sctx->vs_shader.current); - if (r < 0) - return false; - if (r == 1) { - if (sctx->vs_shader.current->key.as_ls) - si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4); - else if (sctx->vs_shader.current->key.as_es) - si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4); - else if (sctx->vs_shader.current->key.as_ngg) - si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4); - else - si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4); - } - - /* TES can be bound as ES or VS. */ - r = si_update_scratch_buffer(sctx, sctx->tes_shader.current); - if (r < 0) - return false; - if (r == 1) { - if (sctx->tes_shader.current->key.as_es) - si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4); - else if (sctx->tes_shader.current->key.as_ngg) - si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4); - else - si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4); - } - - return true; + struct si_shader *tcs = si_get_tcs_current(sctx); + int r; + + /* Update the shaders, so that they are using the latest scratch. + * The scratch buffer may have been changed since these shaders were + * last used, so we still need to try to update them, even if they + * require scratch buffers smaller than the current size. 
+ */ + r = si_update_scratch_buffer(sctx, sctx->ps_shader.current); + if (r < 0) + return false; + if (r == 1) + si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4); + + r = si_update_scratch_buffer(sctx, sctx->gs_shader.current); + if (r < 0) + return false; + if (r == 1) + si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4); + + r = si_update_scratch_buffer(sctx, tcs); + if (r < 0) + return false; + if (r == 1) + si_pm4_bind_state(sctx, hs, tcs->pm4); + + /* VS can be bound as LS, ES, or VS. */ + r = si_update_scratch_buffer(sctx, sctx->vs_shader.current); + if (r < 0) + return false; + if (r == 1) { + if (sctx->vs_shader.current->key.as_ls) + si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4); + else if (sctx->vs_shader.current->key.as_es) + si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4); + else if (sctx->vs_shader.current->key.as_ngg) + si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4); + else + si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4); + } + + /* TES can be bound as ES or VS. */ + r = si_update_scratch_buffer(sctx, sctx->tes_shader.current); + if (r < 0) + return false; + if (r == 1) { + if (sctx->tes_shader.current->key.as_es) + si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4); + else if (sctx->tes_shader.current->key.as_ngg) + si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4); + else + si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4); + } + + return true; } static bool si_update_spi_tmpring_size(struct si_context *sctx) { - /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer. - * There are 2 cases to handle: - * - * - If the current needed size is less than the maximum seen size, - * use the maximum seen size, so that WAVESIZE remains the same. - * - * - If the current needed size is greater than the maximum seen size, - * the scratch buffer is reallocated, so we can increase WAVESIZE. - * - * Shaders that set SCRATCH_EN=0 don't allocate scratch space. - * Otherwise, the number of waves that can use scratch is - * SPI_TMPRING_SIZE.WAVES. - */ - unsigned bytes = 0; - - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current)); - - if (sctx->tes_shader.cso) { - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(si_get_tcs_current(sctx))); - } - - sctx->max_seen_scratch_bytes_per_wave = - MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes); - - unsigned scratch_needed_size = - sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves; - unsigned spi_tmpring_size; - - if (scratch_needed_size > 0) { - if (!sctx->scratch_buffer || - scratch_needed_size > sctx->scratch_buffer->b.b.width0) { - /* Create a bigger scratch buffer */ - si_resource_reference(&sctx->scratch_buffer, NULL); - - sctx->scratch_buffer = - si_aligned_buffer_create(&sctx->screen->b, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - scratch_needed_size, - sctx->screen->info.pte_fragment_size); - if (!sctx->scratch_buffer) - return false; - - si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); - si_context_add_resource_size(sctx, - &sctx->scratch_buffer->b.b); - } - - if (!si_update_scratch_relocs(sctx)) - return false; - } - - /* The LLVM shader backend should be reporting aligned scratch_sizes. 
*/ - assert((scratch_needed_size & ~0x3FF) == scratch_needed_size && - "scratch size should already be aligned correctly."); - - spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) | - S_0286E8_WAVESIZE(sctx->max_seen_scratch_bytes_per_wave >> 10); - if (spi_tmpring_size != sctx->spi_tmpring_size) { - sctx->spi_tmpring_size = spi_tmpring_size; - si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); - } - return true; + /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer. + * There are 2 cases to handle: + * + * - If the current needed size is less than the maximum seen size, + * use the maximum seen size, so that WAVESIZE remains the same. + * + * - If the current needed size is greater than the maximum seen size, + * the scratch buffer is reallocated, so we can increase WAVESIZE. + * + * Shaders that set SCRATCH_EN=0 don't allocate scratch space. + * Otherwise, the number of waves that can use scratch is + * SPI_TMPRING_SIZE.WAVES. + */ + unsigned bytes = 0; + + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current)); + + if (sctx->tes_shader.cso) { + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current)); + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(si_get_tcs_current(sctx))); + } + + sctx->max_seen_scratch_bytes_per_wave = MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes); + + unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves; + unsigned spi_tmpring_size; + + if (scratch_needed_size > 0) { + if (!sctx->scratch_buffer || scratch_needed_size > sctx->scratch_buffer->b.b.width0) { + /* Create a bigger scratch buffer */ + si_resource_reference(&sctx->scratch_buffer, NULL); + + sctx->scratch_buffer = si_aligned_buffer_create( + &sctx->screen->b, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, scratch_needed_size, + sctx->screen->info.pte_fragment_size); + if (!sctx->scratch_buffer) + return false; + + si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); + si_context_add_resource_size(sctx, &sctx->scratch_buffer->b.b); + } + + if (!si_update_scratch_relocs(sctx)) + return false; + } + + /* The LLVM shader backend should be reporting aligned scratch_sizes. */ + assert((scratch_needed_size & ~0x3FF) == scratch_needed_size && + "scratch size should already be aligned correctly."); + + spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) | + S_0286E8_WAVESIZE(sctx->max_seen_scratch_bytes_per_wave >> 10); + if (spi_tmpring_size != sctx->spi_tmpring_size) { + sctx->spi_tmpring_size = spi_tmpring_size; + si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); + } + return true; } static void si_init_tess_factor_ring(struct si_context *sctx) { - assert(!sctx->tess_rings); - assert(((sctx->screen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0); - - /* The address must be aligned to 2^19, because the shader only - * receives the high 13 bits. 
- */ - sctx->tess_rings = pipe_aligned_buffer_create(sctx->b.screen, - SI_RESOURCE_FLAG_32BIT, - PIPE_USAGE_DEFAULT, - sctx->screen->tess_offchip_ring_size + - sctx->screen->tess_factor_ring_size, - 1 << 19); - if (!sctx->tess_rings) - return; - - si_init_config_add_vgt_flush(sctx); - - si_pm4_add_bo(sctx->init_config, si_resource(sctx->tess_rings), - RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS); - - uint64_t factor_va = si_resource(sctx->tess_rings)->gpu_address + - sctx->screen->tess_offchip_ring_size; - - /* Append these registers to the init config state. */ - if (sctx->chip_class >= GFX7) { - si_pm4_set_reg(sctx->init_config, R_030938_VGT_TF_RING_SIZE, - S_030938_SIZE(sctx->screen->tess_factor_ring_size / 4)); - si_pm4_set_reg(sctx->init_config, R_030940_VGT_TF_MEMORY_BASE, - factor_va >> 8); - if (sctx->chip_class >= GFX10) - si_pm4_set_reg(sctx->init_config, R_030984_VGT_TF_MEMORY_BASE_HI_UMD, - S_030984_BASE_HI(factor_va >> 40)); - else if (sctx->chip_class == GFX9) - si_pm4_set_reg(sctx->init_config, R_030944_VGT_TF_MEMORY_BASE_HI, - S_030944_BASE_HI(factor_va >> 40)); - si_pm4_set_reg(sctx->init_config, R_03093C_VGT_HS_OFFCHIP_PARAM, - sctx->screen->vgt_hs_offchip_param); - } else { - si_pm4_set_reg(sctx->init_config, R_008988_VGT_TF_RING_SIZE, - S_008988_SIZE(sctx->screen->tess_factor_ring_size / 4)); - si_pm4_set_reg(sctx->init_config, R_0089B8_VGT_TF_MEMORY_BASE, - factor_va >> 8); - si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM, - sctx->screen->vgt_hs_offchip_param); - } - - /* Flush the context to re-emit the init_config state. - * This is done only once in a lifetime of a context. - */ - si_pm4_upload_indirect_buffer(sctx, sctx->init_config); - sctx->initial_gfx_cs_size = 0; /* force flush */ - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + assert(!sctx->tess_rings); + assert(((sctx->screen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0); + + /* The address must be aligned to 2^19, because the shader only + * receives the high 13 bits. + */ + sctx->tess_rings = pipe_aligned_buffer_create( + sctx->b.screen, SI_RESOURCE_FLAG_32BIT, PIPE_USAGE_DEFAULT, + sctx->screen->tess_offchip_ring_size + sctx->screen->tess_factor_ring_size, 1 << 19); + if (!sctx->tess_rings) + return; + + si_init_config_add_vgt_flush(sctx); + + si_pm4_add_bo(sctx->init_config, si_resource(sctx->tess_rings), RADEON_USAGE_READWRITE, + RADEON_PRIO_SHADER_RINGS); + + uint64_t factor_va = + si_resource(sctx->tess_rings)->gpu_address + sctx->screen->tess_offchip_ring_size; + + /* Append these registers to the init config state. 
*/ + if (sctx->chip_class >= GFX7) { + si_pm4_set_reg(sctx->init_config, R_030938_VGT_TF_RING_SIZE, + S_030938_SIZE(sctx->screen->tess_factor_ring_size / 4)); + si_pm4_set_reg(sctx->init_config, R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8); + if (sctx->chip_class >= GFX10) + si_pm4_set_reg(sctx->init_config, R_030984_VGT_TF_MEMORY_BASE_HI_UMD, + S_030984_BASE_HI(factor_va >> 40)); + else if (sctx->chip_class == GFX9) + si_pm4_set_reg(sctx->init_config, R_030944_VGT_TF_MEMORY_BASE_HI, + S_030944_BASE_HI(factor_va >> 40)); + si_pm4_set_reg(sctx->init_config, R_03093C_VGT_HS_OFFCHIP_PARAM, + sctx->screen->vgt_hs_offchip_param); + } else { + si_pm4_set_reg(sctx->init_config, R_008988_VGT_TF_RING_SIZE, + S_008988_SIZE(sctx->screen->tess_factor_ring_size / 4)); + si_pm4_set_reg(sctx->init_config, R_0089B8_VGT_TF_MEMORY_BASE, factor_va >> 8); + si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM, + sctx->screen->vgt_hs_offchip_param); + } + + /* Flush the context to re-emit the init_config state. + * This is done only once in a lifetime of a context. + */ + si_pm4_upload_indirect_buffer(sctx, sctx->init_config); + sctx->initial_gfx_cs_size = 0; /* force flush */ + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); } static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, - union si_vgt_stages_key key) + union si_vgt_stages_key key) { - struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); - uint32_t stages = 0; - - if (key.u.tess) { - stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | - S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1); - - if (key.u.gs) - stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | - S_028B54_GS_EN(1); - else if (key.u.ngg) - stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS); - else - stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS); - } else if (key.u.gs) { - stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | - S_028B54_GS_EN(1); - } else if (key.u.ngg) { - stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL); - } - - if (key.u.ngg) { - stages |= S_028B54_PRIMGEN_EN(1) | - S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) | - S_028B54_NGG_WAVE_ID_EN(key.u.streamout) | - S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough); - } else if (key.u.gs) - stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER); - - if (screen->info.chip_class >= GFX9) - stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2); - - if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) { - stages |= S_028B54_HS_W32_EN(1) | - S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */ - S_028B54_VS_W32_EN(1); - } - - si_pm4_set_reg(pm4, R_028B54_VGT_SHADER_STAGES_EN, stages); - return pm4; + struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); + uint32_t stages = 0; + + if (key.u.tess) { + stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1); + + if (key.u.gs) + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | S_028B54_GS_EN(1); + else if (key.u.ngg) + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS); + else + stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS); + } else if (key.u.gs) { + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | S_028B54_GS_EN(1); + } else if (key.u.ngg) { + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL); + } + + if (key.u.ngg) { + stages |= S_028B54_PRIMGEN_EN(1) | S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) | + S_028B54_NGG_WAVE_ID_EN(key.u.streamout) | + S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough); + } else if (key.u.gs) + stages |= 
S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER); + + if (screen->info.chip_class >= GFX9) + stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2); + + if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) { + stages |= S_028B54_HS_W32_EN(1) | + S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */ + S_028B54_VS_W32_EN(1); + } + + si_pm4_set_reg(pm4, R_028B54_VGT_SHADER_STAGES_EN, stages); + return pm4; } -static void si_update_vgt_shader_config(struct si_context *sctx, - union si_vgt_stages_key key) +static void si_update_vgt_shader_config(struct si_context *sctx, union si_vgt_stages_key key) { - struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index]; + struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index]; - if (unlikely(!*pm4)) - *pm4 = si_build_vgt_shader_config(sctx->screen, key); - si_pm4_bind_state(sctx, vgt_shader_config, *pm4); + if (unlikely(!*pm4)) + *pm4 = si_build_vgt_shader_config(sctx->screen, key); + si_pm4_bind_state(sctx, vgt_shader_config, *pm4); } bool si_update_shaders(struct si_context *sctx) { - struct pipe_context *ctx = (struct pipe_context*)sctx; - struct si_compiler_ctx_state compiler_state; - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_shader *old_vs = si_get_vs_state(sctx); - bool old_clip_disable = old_vs ? old_vs->key.opt.clip_disable : false; - struct si_shader *old_ps = sctx->ps_shader.current; - union si_vgt_stages_key key; - unsigned old_spi_shader_col_format = - old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0; - int r; - - if (!sctx->compiler.passes) - si_init_compiler(sctx->screen, &sctx->compiler); - - compiler_state.compiler = &sctx->compiler; - compiler_state.debug = sctx->debug; - compiler_state.is_debug_context = sctx->is_debug; - - key.index = 0; - - if (sctx->tes_shader.cso) - key.u.tess = 1; - if (sctx->gs_shader.cso) - key.u.gs = 1; - - if (sctx->ngg) { - key.u.ngg = 1; - key.u.streamout = !!si_get_vs(sctx)->cso->so.num_outputs; - } - - /* Update TCS and TES. */ - if (sctx->tes_shader.cso) { - if (!sctx->tess_rings) { - si_init_tess_factor_ring(sctx); - if (!sctx->tess_rings) - return false; - } - - if (sctx->tcs_shader.cso) { - r = si_shader_select(ctx, &sctx->tcs_shader, key, - &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4); - } else { - if (!sctx->fixed_func_tcs_shader.cso) { - sctx->fixed_func_tcs_shader.cso = - si_create_fixed_func_tcs(sctx); - if (!sctx->fixed_func_tcs_shader.cso) - return false; - } - - r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader, - key, &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, hs, - sctx->fixed_func_tcs_shader.current->pm4); - } - - if (!sctx->gs_shader.cso || sctx->chip_class <= GFX8) { - r = si_shader_select(ctx, &sctx->tes_shader, key, &compiler_state); - if (r) - return false; - - if (sctx->gs_shader.cso) { - /* TES as ES */ - assert(sctx->chip_class <= GFX8); - si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4); - } else if (key.u.ngg) { - si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4); - } else { - si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4); - } - } - } else { - if (sctx->chip_class <= GFX8) - si_pm4_bind_state(sctx, ls, NULL); - si_pm4_bind_state(sctx, hs, NULL); - } - - /* Update GS. 
*/ - if (sctx->gs_shader.cso) { - r = si_shader_select(ctx, &sctx->gs_shader, key, &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4); - if (!key.u.ngg) { - si_pm4_bind_state(sctx, vs, sctx->gs_shader.cso->gs_copy_shader->pm4); - - if (!si_update_gs_ring_buffers(sctx)) - return false; - } else { - si_pm4_bind_state(sctx, vs, NULL); - } - } else { - if (!key.u.ngg) { - si_pm4_bind_state(sctx, gs, NULL); - if (sctx->chip_class <= GFX8) - si_pm4_bind_state(sctx, es, NULL); - } - } - - /* Update VS. */ - if ((!key.u.tess && !key.u.gs) || sctx->chip_class <= GFX8) { - r = si_shader_select(ctx, &sctx->vs_shader, key, &compiler_state); - if (r) - return false; - - if (!key.u.tess && !key.u.gs) { - if (key.u.ngg) { - si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4); - si_pm4_bind_state(sctx, vs, NULL); - } else { - si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4); - } - } else if (sctx->tes_shader.cso) { - si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4); - } else { - assert(sctx->gs_shader.cso); - si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4); - } - } - - /* This must be done after the shader variant is selected. */ - if (sctx->ngg) { - struct si_shader *vs = si_get_vs(sctx)->current; - - key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs); - key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling & - SI_NGG_CULL_GS_FAST_LAUNCH_ALL); - } - - si_update_vgt_shader_config(sctx, key); - - if (old_clip_disable != si_get_vs_state(sctx)->key.opt.clip_disable) - si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); - - if (sctx->ps_shader.cso) { - unsigned db_shader_control; - - r = si_shader_select(ctx, &sctx->ps_shader, key, &compiler_state); - if (r) - return false; - si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4); - - db_shader_control = - sctx->ps_shader.cso->db_shader_control | - S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS); - - if (si_pm4_state_changed(sctx, ps) || - si_pm4_state_changed(sctx, vs) || - (key.u.ngg && si_pm4_state_changed(sctx, gs)) || - sctx->sprite_coord_enable != rs->sprite_coord_enable || - sctx->flatshade != rs->flatshade) { - sctx->sprite_coord_enable = rs->sprite_coord_enable; - sctx->flatshade = rs->flatshade; - si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); - } - - if (sctx->screen->info.rbplus_allowed && - si_pm4_state_changed(sctx, ps) && - (!old_ps || - old_spi_shader_col_format != - sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - - if (sctx->ps_db_shader_control != db_shader_control) { - sctx->ps_db_shader_control = db_shader_control; - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - if (sctx->screen->dpbb_allowed) - si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); - } - - if (sctx->smoothing_enabled != sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing) { - sctx->smoothing_enabled = sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing; - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); - - if (sctx->chip_class == GFX6) - si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); - - if (sctx->framebuffer.nr_samples <= 1) - si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); - } - } - - if (si_pm4_state_enabled_and_changed(sctx, ls) || - si_pm4_state_enabled_and_changed(sctx, hs) || - si_pm4_state_enabled_and_changed(sctx, es) || - si_pm4_state_enabled_and_changed(sctx, gs) || - 
si_pm4_state_enabled_and_changed(sctx, vs) || - si_pm4_state_enabled_and_changed(sctx, ps)) { - if (!si_update_spi_tmpring_size(sctx)) - return false; - } - - if (sctx->chip_class >= GFX7) { - if (si_pm4_state_enabled_and_changed(sctx, ls)) - sctx->prefetch_L2_mask |= SI_PREFETCH_LS; - else if (!sctx->queued.named.ls) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS; - - if (si_pm4_state_enabled_and_changed(sctx, hs)) - sctx->prefetch_L2_mask |= SI_PREFETCH_HS; - else if (!sctx->queued.named.hs) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS; - - if (si_pm4_state_enabled_and_changed(sctx, es)) - sctx->prefetch_L2_mask |= SI_PREFETCH_ES; - else if (!sctx->queued.named.es) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES; - - if (si_pm4_state_enabled_and_changed(sctx, gs)) - sctx->prefetch_L2_mask |= SI_PREFETCH_GS; - else if (!sctx->queued.named.gs) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS; - - if (si_pm4_state_enabled_and_changed(sctx, vs)) - sctx->prefetch_L2_mask |= SI_PREFETCH_VS; - else if (!sctx->queued.named.vs) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; - - if (si_pm4_state_enabled_and_changed(sctx, ps)) - sctx->prefetch_L2_mask |= SI_PREFETCH_PS; - else if (!sctx->queued.named.ps) - sctx->prefetch_L2_mask &= ~SI_PREFETCH_PS; - } - - sctx->do_update_shaders = false; - return true; + struct pipe_context *ctx = (struct pipe_context *)sctx; + struct si_compiler_ctx_state compiler_state; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_shader *old_vs = si_get_vs_state(sctx); + bool old_clip_disable = old_vs ? old_vs->key.opt.clip_disable : false; + struct si_shader *old_ps = sctx->ps_shader.current; + union si_vgt_stages_key key; + unsigned old_spi_shader_col_format = + old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0; + int r; + + if (!sctx->compiler.passes) + si_init_compiler(sctx->screen, &sctx->compiler); + + compiler_state.compiler = &sctx->compiler; + compiler_state.debug = sctx->debug; + compiler_state.is_debug_context = sctx->is_debug; + + key.index = 0; + + if (sctx->tes_shader.cso) + key.u.tess = 1; + if (sctx->gs_shader.cso) + key.u.gs = 1; + + if (sctx->ngg) { + key.u.ngg = 1; + key.u.streamout = !!si_get_vs(sctx)->cso->so.num_outputs; + } + + /* Update TCS and TES. 
*/ + if (sctx->tes_shader.cso) { + if (!sctx->tess_rings) { + si_init_tess_factor_ring(sctx); + if (!sctx->tess_rings) + return false; + } + + if (sctx->tcs_shader.cso) { + r = si_shader_select(ctx, &sctx->tcs_shader, key, &compiler_state); + if (r) + return false; + si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4); + } else { + if (!sctx->fixed_func_tcs_shader.cso) { + sctx->fixed_func_tcs_shader.cso = si_create_fixed_func_tcs(sctx); + if (!sctx->fixed_func_tcs_shader.cso) + return false; + } + + r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader, key, &compiler_state); + if (r) + return false; + si_pm4_bind_state(sctx, hs, sctx->fixed_func_tcs_shader.current->pm4); + } + + if (!sctx->gs_shader.cso || sctx->chip_class <= GFX8) { + r = si_shader_select(ctx, &sctx->tes_shader, key, &compiler_state); + if (r) + return false; + + if (sctx->gs_shader.cso) { + /* TES as ES */ + assert(sctx->chip_class <= GFX8); + si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4); + } else if (key.u.ngg) { + si_pm4_bind_state(sctx, gs, sctx->tes_shader.current->pm4); + } else { + si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4); + } + } + } else { + if (sctx->chip_class <= GFX8) + si_pm4_bind_state(sctx, ls, NULL); + si_pm4_bind_state(sctx, hs, NULL); + } + + /* Update GS. */ + if (sctx->gs_shader.cso) { + r = si_shader_select(ctx, &sctx->gs_shader, key, &compiler_state); + if (r) + return false; + si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4); + if (!key.u.ngg) { + si_pm4_bind_state(sctx, vs, sctx->gs_shader.cso->gs_copy_shader->pm4); + + if (!si_update_gs_ring_buffers(sctx)) + return false; + } else { + si_pm4_bind_state(sctx, vs, NULL); + } + } else { + if (!key.u.ngg) { + si_pm4_bind_state(sctx, gs, NULL); + if (sctx->chip_class <= GFX8) + si_pm4_bind_state(sctx, es, NULL); + } + } + + /* Update VS. */ + if ((!key.u.tess && !key.u.gs) || sctx->chip_class <= GFX8) { + r = si_shader_select(ctx, &sctx->vs_shader, key, &compiler_state); + if (r) + return false; + + if (!key.u.tess && !key.u.gs) { + if (key.u.ngg) { + si_pm4_bind_state(sctx, gs, sctx->vs_shader.current->pm4); + si_pm4_bind_state(sctx, vs, NULL); + } else { + si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4); + } + } else if (sctx->tes_shader.cso) { + si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4); + } else { + assert(sctx->gs_shader.cso); + si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4); + } + } + + /* This must be done after the shader variant is selected. 
*/ + if (sctx->ngg) { + struct si_shader *vs = si_get_vs(sctx)->current; + + key.u.ngg_passthrough = gfx10_is_ngg_passthrough(vs); + key.u.ngg_gs_fast_launch = !!(vs->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); + } + + si_update_vgt_shader_config(sctx, key); + + if (old_clip_disable != si_get_vs_state(sctx)->key.opt.clip_disable) + si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); + + if (sctx->ps_shader.cso) { + unsigned db_shader_control; + + r = si_shader_select(ctx, &sctx->ps_shader, key, &compiler_state); + if (r) + return false; + si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4); + + db_shader_control = sctx->ps_shader.cso->db_shader_control | + S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS); + + if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) || + (key.u.ngg && si_pm4_state_changed(sctx, gs)) || + sctx->sprite_coord_enable != rs->sprite_coord_enable || + sctx->flatshade != rs->flatshade) { + sctx->sprite_coord_enable = rs->sprite_coord_enable; + sctx->flatshade = rs->flatshade; + si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); + } + + if (sctx->screen->info.rbplus_allowed && si_pm4_state_changed(sctx, ps) && + (!old_ps || old_spi_shader_col_format != + sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); + + if (sctx->ps_db_shader_control != db_shader_control) { + sctx->ps_db_shader_control = db_shader_control; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + if (sctx->screen->dpbb_allowed) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + } + + if (sctx->smoothing_enabled != + sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing) { + sctx->smoothing_enabled = sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing; + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + + if (sctx->chip_class == GFX6) + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + + if (sctx->framebuffer.nr_samples <= 1) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); + } + } + + if (si_pm4_state_enabled_and_changed(sctx, ls) || si_pm4_state_enabled_and_changed(sctx, hs) || + si_pm4_state_enabled_and_changed(sctx, es) || si_pm4_state_enabled_and_changed(sctx, gs) || + si_pm4_state_enabled_and_changed(sctx, vs) || si_pm4_state_enabled_and_changed(sctx, ps)) { + if (!si_update_spi_tmpring_size(sctx)) + return false; + } + + if (sctx->chip_class >= GFX7) { + if (si_pm4_state_enabled_and_changed(sctx, ls)) + sctx->prefetch_L2_mask |= SI_PREFETCH_LS; + else if (!sctx->queued.named.ls) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS; + + if (si_pm4_state_enabled_and_changed(sctx, hs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_HS; + else if (!sctx->queued.named.hs) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS; + + if (si_pm4_state_enabled_and_changed(sctx, es)) + sctx->prefetch_L2_mask |= SI_PREFETCH_ES; + else if (!sctx->queued.named.es) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES; + + if (si_pm4_state_enabled_and_changed(sctx, gs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_GS; + else if (!sctx->queued.named.gs) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS; + + if (si_pm4_state_enabled_and_changed(sctx, vs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_VS; + else if (!sctx->queued.named.vs) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; + + if (si_pm4_state_enabled_and_changed(sctx, ps)) + sctx->prefetch_L2_mask |= SI_PREFETCH_PS; + else if (!sctx->queued.named.ps) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_PS; + } + + 
sctx->do_update_shaders = false; + return true; } static void si_emit_scratch_state(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->gfx_cs; - radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, - sctx->spi_tmpring_size); + radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, sctx->spi_tmpring_size); - if (sctx->scratch_buffer) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - sctx->scratch_buffer, RADEON_USAGE_READWRITE, - RADEON_PRIO_SCRATCH_BUFFER); - } + if (sctx->scratch_buffer) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->scratch_buffer, RADEON_USAGE_READWRITE, + RADEON_PRIO_SCRATCH_BUFFER); + } } void si_init_screen_live_shader_cache(struct si_screen *sscreen) { - util_live_shader_cache_init(&sscreen->live_shader_cache, - si_create_shader_selector, - si_destroy_shader_selector); + util_live_shader_cache_init(&sscreen->live_shader_cache, si_create_shader_selector, + si_destroy_shader_selector); } void si_init_shader_functions(struct si_context *sctx) { - sctx->atoms.s.spi_map.emit = si_emit_spi_map; - sctx->atoms.s.scratch_state.emit = si_emit_scratch_state; - - sctx->b.create_vs_state = si_create_shader; - sctx->b.create_tcs_state = si_create_shader; - sctx->b.create_tes_state = si_create_shader; - sctx->b.create_gs_state = si_create_shader; - sctx->b.create_fs_state = si_create_shader; - - sctx->b.bind_vs_state = si_bind_vs_shader; - sctx->b.bind_tcs_state = si_bind_tcs_shader; - sctx->b.bind_tes_state = si_bind_tes_shader; - sctx->b.bind_gs_state = si_bind_gs_shader; - sctx->b.bind_fs_state = si_bind_ps_shader; - - sctx->b.delete_vs_state = si_delete_shader_selector; - sctx->b.delete_tcs_state = si_delete_shader_selector; - sctx->b.delete_tes_state = si_delete_shader_selector; - sctx->b.delete_gs_state = si_delete_shader_selector; - sctx->b.delete_fs_state = si_delete_shader_selector; + sctx->atoms.s.spi_map.emit = si_emit_spi_map; + sctx->atoms.s.scratch_state.emit = si_emit_scratch_state; + + sctx->b.create_vs_state = si_create_shader; + sctx->b.create_tcs_state = si_create_shader; + sctx->b.create_tes_state = si_create_shader; + sctx->b.create_gs_state = si_create_shader; + sctx->b.create_fs_state = si_create_shader; + + sctx->b.bind_vs_state = si_bind_vs_shader; + sctx->b.bind_tcs_state = si_bind_tcs_shader; + sctx->b.bind_tes_state = si_bind_tes_shader; + sctx->b.bind_gs_state = si_bind_gs_shader; + sctx->b.bind_fs_state = si_bind_ps_shader; + + sctx->b.delete_vs_state = si_delete_shader_selector; + sctx->b.delete_tcs_state = si_delete_shader_selector; + sctx->b.delete_tes_state = si_delete_shader_selector; + sctx->b.delete_gs_state = si_delete_shader_selector; + sctx->b.delete_fs_state = si_delete_shader_selector; } diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index 85ac4a119c5..2ce8de0ccde 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -23,395 +23,372 @@ */ #include "si_build_pm4.h" - #include "util/u_memory.h" #include "util/u_suballoc.h" static void si_set_streamout_enable(struct si_context *sctx, bool enable); static inline void si_so_target_reference(struct si_streamout_target **dst, - struct pipe_stream_output_target *src) + struct pipe_stream_output_target *src) { - pipe_so_target_reference((struct pipe_stream_output_target**)dst, src); + pipe_so_target_reference((struct pipe_stream_output_target **)dst, src); } -static struct pipe_stream_output_target * 
-si_create_so_target(struct pipe_context *ctx, - struct pipe_resource *buffer, - unsigned buffer_offset, - unsigned buffer_size) +static struct pipe_stream_output_target *si_create_so_target(struct pipe_context *ctx, + struct pipe_resource *buffer, + unsigned buffer_offset, + unsigned buffer_size) { - struct si_context *sctx = (struct si_context *)ctx; - struct si_streamout_target *t; - struct si_resource *buf = si_resource(buffer); - - t = CALLOC_STRUCT(si_streamout_target); - if (!t) { - return NULL; - } - - unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4; - u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4, - &t->buf_filled_size_offset, - (struct pipe_resource**)&t->buf_filled_size); - if (!t->buf_filled_size) { - FREE(t); - return NULL; - } - - t->b.reference.count = 1; - t->b.context = ctx; - pipe_resource_reference(&t->b.buffer, buffer); - t->b.buffer_offset = buffer_offset; - t->b.buffer_size = buffer_size; - - util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset, - buffer_offset + buffer_size); - return &t->b; + struct si_context *sctx = (struct si_context *)ctx; + struct si_streamout_target *t; + struct si_resource *buf = si_resource(buffer); + + t = CALLOC_STRUCT(si_streamout_target); + if (!t) { + return NULL; + } + + unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4; + u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4, + &t->buf_filled_size_offset, (struct pipe_resource **)&t->buf_filled_size); + if (!t->buf_filled_size) { + FREE(t); + return NULL; + } + + t->b.reference.count = 1; + t->b.context = ctx; + pipe_resource_reference(&t->b.buffer, buffer); + t->b.buffer_offset = buffer_offset; + t->b.buffer_size = buffer_size; + + util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset, buffer_offset + buffer_size); + return &t->b; } -static void si_so_target_destroy(struct pipe_context *ctx, - struct pipe_stream_output_target *target) +static void si_so_target_destroy(struct pipe_context *ctx, struct pipe_stream_output_target *target) { - struct si_streamout_target *t = (struct si_streamout_target*)target; - pipe_resource_reference(&t->b.buffer, NULL); - si_resource_reference(&t->buf_filled_size, NULL); - FREE(t); + struct si_streamout_target *t = (struct si_streamout_target *)target; + pipe_resource_reference(&t->b.buffer, NULL); + si_resource_reference(&t->buf_filled_size, NULL); + FREE(t); } void si_streamout_buffers_dirty(struct si_context *sctx) { - if (!sctx->streamout.enabled_mask) - return; + if (!sctx->streamout.enabled_mask) + return; - si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin); - si_set_streamout_enable(sctx, true); + si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin); + si_set_streamout_enable(sctx, true); } -static void si_set_streamout_targets(struct pipe_context *ctx, - unsigned num_targets, - struct pipe_stream_output_target **targets, - const unsigned *offsets) +static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targets, + struct pipe_stream_output_target **targets, + const unsigned *offsets) { - struct si_context *sctx = (struct si_context *)ctx; - unsigned old_num_targets = sctx->streamout.num_targets; - unsigned i; - bool wait_now = false; - - /* We are going to unbind the buffers. Mark which caches need to be flushed. 
*/ - if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) { - /* Since streamout uses vector writes which go through TC L2 - * and most other clients can use TC L2 as well, we don't need - * to flush it. - * - * The only cases which requires flushing it is VGT DMA index - * fetching (on <= GFX7) and indirect draw data, which are rare - * cases. Thus, flag the TC L2 dirtiness in the resource and - * handle it at draw call time. - */ - for (i = 0; i < sctx->streamout.num_targets; i++) - if (sctx->streamout.targets[i]) - si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true; - - /* Invalidate the scalar cache in case a streamout buffer is - * going to be used as a constant buffer. - * - * Invalidate vL1, because streamout bypasses it (done by - * setting GLC=1 in the store instruction), but vL1 in other - * CUs can contain outdated data of streamout buffers. - * - * VS_PARTIAL_FLUSH is required if the buffers are going to be - * used as an input immediately. - */ - sctx->flags |= SI_CONTEXT_INV_SCACHE | - SI_CONTEXT_INV_VCACHE; - - /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */ - if (sctx->screen->use_ngg_streamout) { - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; - - /* Wait now. This is needed to make sure that GDS is not - * busy at the end of IBs. - * - * Also, the next streamout operation will overwrite GDS, - * so we need to make sure that it's idle. - */ - wait_now = true; - } else { - sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH; - } - } - - /* All readers of the streamout targets need to be finished before we can - * start writing to the targets. - */ - if (num_targets) { - if (sctx->screen->use_ngg_streamout) - si_allocate_gds(sctx); - - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH; - } - - /* Streamout buffers must be bound in 2 places: - * 1) in VGT by setting the VGT_STRMOUT registers - * 2) as shader resources - */ - - /* Stop streamout. */ - if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) - si_emit_streamout_end(sctx); - - /* Set the new targets. */ - unsigned enabled_mask = 0, append_bitmask = 0; - for (i = 0; i < num_targets; i++) { - si_so_target_reference(&sctx->streamout.targets[i], targets[i]); - if (!targets[i]) - continue; - - si_context_add_resource_size(sctx, targets[i]->buffer); - enabled_mask |= 1 << i; - - if (offsets[i] == ((unsigned)-1)) - append_bitmask |= 1 << i; - } - - for (; i < sctx->streamout.num_targets; i++) - si_so_target_reference(&sctx->streamout.targets[i], NULL); - - sctx->streamout.enabled_mask = enabled_mask; - sctx->streamout.num_targets = num_targets; - sctx->streamout.append_bitmask = append_bitmask; - - /* Update dirty state bits. 
*/ - if (num_targets) { - si_streamout_buffers_dirty(sctx); - } else { - si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false); - si_set_streamout_enable(sctx, false); - } - - /* Set the shader resources.*/ - for (i = 0; i < num_targets; i++) { - if (targets[i]) { - struct pipe_shader_buffer sbuf; - sbuf.buffer = targets[i]->buffer; - - if (sctx->screen->use_ngg_streamout) { - sbuf.buffer_offset = targets[i]->buffer_offset; - sbuf.buffer_size = targets[i]->buffer_size; - } else { - sbuf.buffer_offset = 0; - sbuf.buffer_size = targets[i]->buffer_offset + - targets[i]->buffer_size; - } - - si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf); - si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT; - } else { - si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL); - } - } - for (; i < old_num_targets; i++) - si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL); - - if (wait_now) - sctx->emit_cache_flush(sctx); + struct si_context *sctx = (struct si_context *)ctx; + unsigned old_num_targets = sctx->streamout.num_targets; + unsigned i; + bool wait_now = false; + + /* We are going to unbind the buffers. Mark which caches need to be flushed. */ + if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) { + /* Since streamout uses vector writes which go through TC L2 + * and most other clients can use TC L2 as well, we don't need + * to flush it. + * + * The only cases which requires flushing it is VGT DMA index + * fetching (on <= GFX7) and indirect draw data, which are rare + * cases. Thus, flag the TC L2 dirtiness in the resource and + * handle it at draw call time. + */ + for (i = 0; i < sctx->streamout.num_targets; i++) + if (sctx->streamout.targets[i]) + si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true; + + /* Invalidate the scalar cache in case a streamout buffer is + * going to be used as a constant buffer. + * + * Invalidate vL1, because streamout bypasses it (done by + * setting GLC=1 in the store instruction), but vL1 in other + * CUs can contain outdated data of streamout buffers. + * + * VS_PARTIAL_FLUSH is required if the buffers are going to be + * used as an input immediately. + */ + sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE; + + /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */ + if (sctx->screen->use_ngg_streamout) { + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; + + /* Wait now. This is needed to make sure that GDS is not + * busy at the end of IBs. + * + * Also, the next streamout operation will overwrite GDS, + * so we need to make sure that it's idle. + */ + wait_now = true; + } else { + sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH; + } + } + + /* All readers of the streamout targets need to be finished before we can + * start writing to the targets. + */ + if (num_targets) { + if (sctx->screen->use_ngg_streamout) + si_allocate_gds(sctx); + + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; + } + + /* Streamout buffers must be bound in 2 places: + * 1) in VGT by setting the VGT_STRMOUT registers + * 2) as shader resources + */ + + /* Stop streamout. */ + if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) + si_emit_streamout_end(sctx); + + /* Set the new targets. 
*/ + unsigned enabled_mask = 0, append_bitmask = 0; + for (i = 0; i < num_targets; i++) { + si_so_target_reference(&sctx->streamout.targets[i], targets[i]); + if (!targets[i]) + continue; + + si_context_add_resource_size(sctx, targets[i]->buffer); + enabled_mask |= 1 << i; + + if (offsets[i] == ((unsigned)-1)) + append_bitmask |= 1 << i; + } + + for (; i < sctx->streamout.num_targets; i++) + si_so_target_reference(&sctx->streamout.targets[i], NULL); + + sctx->streamout.enabled_mask = enabled_mask; + sctx->streamout.num_targets = num_targets; + sctx->streamout.append_bitmask = append_bitmask; + + /* Update dirty state bits. */ + if (num_targets) { + si_streamout_buffers_dirty(sctx); + } else { + si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false); + si_set_streamout_enable(sctx, false); + } + + /* Set the shader resources.*/ + for (i = 0; i < num_targets; i++) { + if (targets[i]) { + struct pipe_shader_buffer sbuf; + sbuf.buffer = targets[i]->buffer; + + if (sctx->screen->use_ngg_streamout) { + sbuf.buffer_offset = targets[i]->buffer_offset; + sbuf.buffer_size = targets[i]->buffer_size; + } else { + sbuf.buffer_offset = 0; + sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size; + } + + si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf); + si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT; + } else { + si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL); + } + } + for (; i < old_num_targets; i++) + si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL); + + if (wait_now) + sctx->emit_cache_flush(sctx); } static void gfx10_emit_streamout_begin(struct si_context *sctx) { - struct si_streamout_target **t = sctx->streamout.targets; - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned last_target = 0; - - for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { - if (t[i]) - last_target = i; - } - - for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { - if (!t[i]) - continue; - - t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i]; - - bool append = sctx->streamout.append_bitmask & (1 << i); - uint64_t va = 0; - - if (append) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - t[i]->buf_filled_size, - RADEON_USAGE_READ, - RADEON_PRIO_SO_FILLED_SIZE); - - va = t[i]->buf_filled_size->gpu_address + - t[i]->buf_filled_size_offset; - } - - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, S_411_SRC_SEL(append ? 
V_411_SRC_ADDR_TC_L2 : V_411_DATA) | - S_411_DST_SEL(V_411_GDS) | - S_411_CP_SYNC(i == last_target)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, 4 * i); /* destination in GDS */ - radeon_emit(cs, 0); - radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) | - S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target)); - } - - sctx->streamout.begin_emitted = true; + struct si_streamout_target **t = sctx->streamout.targets; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned last_target = 0; + + for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { + if (t[i]) + last_target = i; + } + + for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { + if (!t[i]) + continue; + + t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i]; + + bool append = sctx->streamout.append_bitmask & (1 << i); + uint64_t va = 0; + + if (append) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ, + RADEON_PRIO_SO_FILLED_SIZE); + + va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; + } + + radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) | + S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, 4 * i); /* destination in GDS */ + radeon_emit(cs, 0); + radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) | S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target)); + } + + sctx->streamout.begin_emitted = true; } static void gfx10_emit_streamout_end(struct si_context *sctx) { - struct si_streamout_target **t = sctx->streamout.targets; + struct si_streamout_target **t = sctx->streamout.targets; - for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { - if (!t[i]) - continue; + for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { + if (!t[i]) + continue; - uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; + uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; - si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0, - EOP_DST_SEL_TC_L2, - EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, - EOP_DATA_SEL_GDS, - t[i]->buf_filled_size, va, - EOP_DATA_GDS(i, 1), 0); + si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2, + EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS, + t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0); - t[i]->buf_filled_size_valid = true; - } + t[i]->buf_filled_size_valid = true; + } - sctx->streamout.begin_emitted = false; + sctx->streamout.begin_emitted = false; } static void si_flush_vgt_streamout(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - unsigned reg_strmout_cntl; - - /* The register is at different places on different ASICs. 
*/ - if (sctx->chip_class >= GFX7) { - reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL; - radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0); - } else { - reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL; - radeon_set_config_reg(cs, reg_strmout_cntl, 0); - } - - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0)); - - radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); - radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ - radeon_emit(cs, reg_strmout_cntl >> 2); /* register */ - radeon_emit(cs, 0); - radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */ - radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */ - radeon_emit(cs, 4); /* poll interval */ + struct radeon_cmdbuf *cs = sctx->gfx_cs; + unsigned reg_strmout_cntl; + + /* The register is at different places on different ASICs. */ + if (sctx->chip_class >= GFX7) { + reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL; + radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0); + } else { + reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL; + radeon_set_config_reg(cs, reg_strmout_cntl, 0); + } + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0)); + + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(cs, + WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ + radeon_emit(cs, reg_strmout_cntl >> 2); /* register */ + radeon_emit(cs, 0); + radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */ + radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */ + radeon_emit(cs, 4); /* poll interval */ } static void si_emit_streamout_begin(struct si_context *sctx) { - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct si_streamout_target **t = sctx->streamout.targets; - uint16_t *stride_in_dw = sctx->streamout.stride_in_dw; - unsigned i; - - si_flush_vgt_streamout(sctx); - - for (i = 0; i < sctx->streamout.num_targets; i++) { - if (!t[i]) - continue; - - t[i]->stride_in_dw = stride_in_dw[i]; - - /* AMD GCN binds streamout buffers as shader resources. - * VGT only counts primitives and tells the shader - * through SGPRs what to do. */ - radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2); - radeon_emit(cs, (t[i]->b.buffer_offset + - t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ - radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ - - if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) { - uint64_t va = t[i]->buf_filled_size->gpu_address + - t[i]->buf_filled_size_offset; - - /* Append. */ - radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); - radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | - STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, va); /* src address lo */ - radeon_emit(cs, va >> 32); /* src address hi */ - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - t[i]->buf_filled_size, - RADEON_USAGE_READ, - RADEON_PRIO_SO_FILLED_SIZE); - } else { - /* Start from the beginning. 
*/ - radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); - radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | - STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */ - radeon_emit(cs, 0); /* unused */ - } - } - - sctx->streamout.begin_emitted = true; + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct si_streamout_target **t = sctx->streamout.targets; + uint16_t *stride_in_dw = sctx->streamout.stride_in_dw; + unsigned i; + + si_flush_vgt_streamout(sctx); + + for (i = 0; i < sctx->streamout.num_targets; i++) { + if (!t[i]) + continue; + + t[i]->stride_in_dw = stride_in_dw[i]; + + /* AMD GCN binds streamout buffers as shader resources. + * VGT only counts primitives and tells the shader + * through SGPRs what to do. */ + radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2); + radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ + radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ + + if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) { + uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; + + /* Append. */ + radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, va); /* src address lo */ + radeon_emit(cs, va >> 32); /* src address hi */ + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ, + RADEON_PRIO_SO_FILLED_SIZE); + } else { + /* Start from the beginning. */ + radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */ + radeon_emit(cs, 0); /* unused */ + } + } + + sctx->streamout.begin_emitted = true; } void si_emit_streamout_end(struct si_context *sctx) { - if (sctx->screen->use_ngg_streamout) { - gfx10_emit_streamout_end(sctx); - return; - } - - struct radeon_cmdbuf *cs = sctx->gfx_cs; - struct si_streamout_target **t = sctx->streamout.targets; - unsigned i; - uint64_t va; - - si_flush_vgt_streamout(sctx); - - for (i = 0; i < sctx->streamout.num_targets; i++) { - if (!t[i]) - continue; - - va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; - radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); - radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | - STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) | - STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */ - radeon_emit(cs, va); /* dst address lo */ - radeon_emit(cs, va >> 32); /* dst address hi */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, 0); /* unused */ - - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - t[i]->buf_filled_size, - RADEON_USAGE_WRITE, - RADEON_PRIO_SO_FILLED_SIZE); - - /* Zero the buffer size. The counters (primitives generated, - * primitives emitted) may be enabled even if there is not - * buffer bound. This ensures that the primitives-emitted query - * won't increment. 
*/ - radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0); - sctx->context_roll = true; - - t[i]->buf_filled_size_valid = true; - } - - sctx->streamout.begin_emitted = false; + if (sctx->screen->use_ngg_streamout) { + gfx10_emit_streamout_end(sctx); + return; + } + + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct si_streamout_target **t = sctx->streamout.targets; + unsigned i; + uint64_t va; + + si_flush_vgt_streamout(sctx); + + for (i = 0; i < sctx->streamout.num_targets; i++) { + if (!t[i]) + continue; + + va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; + radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) | + STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */ + radeon_emit(cs, va); /* dst address lo */ + radeon_emit(cs, va >> 32); /* dst address hi */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, 0); /* unused */ + + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_WRITE, + RADEON_PRIO_SO_FILLED_SIZE); + + /* Zero the buffer size. The counters (primitives generated, + * primitives emitted) may be enabled even if there is not + * buffer bound. This ensures that the primitives-emitted query + * won't increment. */ + radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0); + sctx->context_roll = true; + + t[i]->buf_filled_size_valid = true; + } + + sctx->streamout.begin_emitted = false; } /* STREAMOUT CONFIG DERIVED STATE @@ -423,71 +400,65 @@ void si_emit_streamout_end(struct si_context *sctx) static void si_emit_streamout_enable(struct si_context *sctx) { - assert(!sctx->screen->use_ngg_streamout); - - radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2); - radeon_emit(sctx->gfx_cs, - S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) | - S_028B94_RAST_STREAM(0) | - S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) | - S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) | - S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx))); - radeon_emit(sctx->gfx_cs, - sctx->streamout.hw_enabled_mask & - sctx->streamout.enabled_stream_buffers_mask); + assert(!sctx->screen->use_ngg_streamout); + + radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2); + radeon_emit(sctx->gfx_cs, S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) | + S_028B94_RAST_STREAM(0) | + S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) | + S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) | + S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx))); + radeon_emit(sctx->gfx_cs, + sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask); } static void si_set_streamout_enable(struct si_context *sctx, bool enable) { - bool old_strmout_en = si_get_strmout_en(sctx); - unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask; + bool old_strmout_en = si_get_strmout_en(sctx); + unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask; - sctx->streamout.streamout_enabled = enable; + sctx->streamout.streamout_enabled = enable; - sctx->streamout.hw_enabled_mask = sctx->streamout.enabled_mask | - (sctx->streamout.enabled_mask << 4) | - (sctx->streamout.enabled_mask << 8) | - (sctx->streamout.enabled_mask << 12); + sctx->streamout.hw_enabled_mask = + sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) | + (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12); - if (!sctx->screen->use_ngg_streamout && - ((old_strmout_en != si_get_strmout_en(sctx)) || - 
(old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))) - si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable); + if (!sctx->screen->use_ngg_streamout && + ((old_strmout_en != si_get_strmout_en(sctx)) || + (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))) + si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable); } -void si_update_prims_generated_query_state(struct si_context *sctx, - unsigned type, int diff) +void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff) { - if (!sctx->screen->use_ngg_streamout && - type == PIPE_QUERY_PRIMITIVES_GENERATED) { - bool old_strmout_en = si_get_strmout_en(sctx); + if (!sctx->screen->use_ngg_streamout && type == PIPE_QUERY_PRIMITIVES_GENERATED) { + bool old_strmout_en = si_get_strmout_en(sctx); - sctx->streamout.num_prims_gen_queries += diff; - assert(sctx->streamout.num_prims_gen_queries >= 0); + sctx->streamout.num_prims_gen_queries += diff; + assert(sctx->streamout.num_prims_gen_queries >= 0); - sctx->streamout.prims_gen_query_enabled = - sctx->streamout.num_prims_gen_queries != 0; + sctx->streamout.prims_gen_query_enabled = sctx->streamout.num_prims_gen_queries != 0; - if (old_strmout_en != si_get_strmout_en(sctx)) - si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable); + if (old_strmout_en != si_get_strmout_en(sctx)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable); - if (si_update_ngg(sctx)) { - si_shader_change_notify(sctx); - sctx->do_update_shaders = true; - } - } + if (si_update_ngg(sctx)) { + si_shader_change_notify(sctx); + sctx->do_update_shaders = true; + } + } } void si_init_streamout_functions(struct si_context *sctx) { - sctx->b.create_stream_output_target = si_create_so_target; - sctx->b.stream_output_target_destroy = si_so_target_destroy; - sctx->b.set_stream_output_targets = si_set_streamout_targets; - - if (sctx->screen->use_ngg_streamout) { - sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin; - } else { - sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin; - sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable; - } + sctx->b.create_stream_output_target = si_create_so_target; + sctx->b.stream_output_target_destroy = si_so_target_destroy; + sctx->b.set_stream_output_targets = si_set_streamout_targets; + + if (sctx->screen->use_ngg_streamout) { + sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin; + } else { + sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin; + sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable; + } } diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index 682f00d44a8..5149ee1c643 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -30,541 +30,512 @@ void si_update_ngg_small_prim_precision(struct si_context *ctx) { - if (!ctx->screen->use_ngg_culling) - return; - - /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. 
*/ - unsigned num_samples = ctx->framebuffer.nr_samples; - unsigned quant_mode = ctx->viewports.as_scissor[0].quant_mode; - float precision; - - if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) - precision = num_samples / 4096.0; - else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) - precision = num_samples / 1024.0; - else - precision = num_samples / 256.0; - - ctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION; - ctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(precision) >> 23); + if (!ctx->screen->use_ngg_culling) + return; + + /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. */ + unsigned num_samples = ctx->framebuffer.nr_samples; + unsigned quant_mode = ctx->viewports.as_scissor[0].quant_mode; + float precision; + + if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) + precision = num_samples / 4096.0; + else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) + precision = num_samples / 1024.0; + else + precision = num_samples / 256.0; + + ctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION; + ctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(precision) >> 23); } -void si_get_small_prim_cull_info(struct si_context *sctx, - struct si_small_prim_cull_info *out) +void si_get_small_prim_cull_info(struct si_context *sctx, struct si_small_prim_cull_info *out) { - /* This is needed by the small primitive culling, because it's done - * in screen space. - */ - struct si_small_prim_cull_info info; - unsigned num_samples = sctx->framebuffer.nr_samples; - assert(num_samples >= 1); - - info.scale[0] = sctx->viewports.states[0].scale[0]; - info.scale[1] = sctx->viewports.states[0].scale[1]; - info.translate[0] = sctx->viewports.states[0].translate[0]; - info.translate[1] = sctx->viewports.states[0].translate[1]; - - /* The viewport shouldn't flip the X axis for the small prim culling to work. */ - assert(-info.scale[0] + info.translate[0] <= info.scale[0] + info.translate[0]); - - /* If the Y axis is inverted (OpenGL default framebuffer), reverse it. - * This is because the viewport transformation inverts the clip space - * bounding box, so min becomes max, which breaks small primitive - * culling. - */ - if (sctx->viewports.y_inverted) { - info.scale[1] = -info.scale[1]; - info.translate[1] = -info.translate[1]; - } - - /* Scale the framebuffer up, so that samples become pixels and small - * primitive culling is the same for all sample counts. - * This only works with the standard DX sample positions, because - * the samples are evenly spaced on both X and Y axes. - */ - for (unsigned i = 0; i < 2; i++) { - info.scale[i] *= num_samples; - info.translate[i] *= num_samples; - } - *out = info; + /* This is needed by the small primitive culling, because it's done + * in screen space. + */ + struct si_small_prim_cull_info info; + unsigned num_samples = sctx->framebuffer.nr_samples; + assert(num_samples >= 1); + + info.scale[0] = sctx->viewports.states[0].scale[0]; + info.scale[1] = sctx->viewports.states[0].scale[1]; + info.translate[0] = sctx->viewports.states[0].translate[0]; + info.translate[1] = sctx->viewports.states[0].translate[1]; + + /* The viewport shouldn't flip the X axis for the small prim culling to work. */ + assert(-info.scale[0] + info.translate[0] <= info.scale[0] + info.translate[0]); + + /* If the Y axis is inverted (OpenGL default framebuffer), reverse it. 
+ * This is because the viewport transformation inverts the clip space + * bounding box, so min becomes max, which breaks small primitive + * culling. + */ + if (sctx->viewports.y_inverted) { + info.scale[1] = -info.scale[1]; + info.translate[1] = -info.translate[1]; + } + + /* Scale the framebuffer up, so that samples become pixels and small + * primitive culling is the same for all sample counts. + * This only works with the standard DX sample positions, because + * the samples are evenly spaced on both X and Y axes. + */ + for (unsigned i = 0; i < 2; i++) { + info.scale[i] *= num_samples; + info.translate[i] *= num_samples; + } + *out = info; } -static void si_set_scissor_states(struct pipe_context *pctx, - unsigned start_slot, - unsigned num_scissors, - const struct pipe_scissor_state *state) +static void si_set_scissor_states(struct pipe_context *pctx, unsigned start_slot, + unsigned num_scissors, const struct pipe_scissor_state *state) { - struct si_context *ctx = (struct si_context *)pctx; - int i; + struct si_context *ctx = (struct si_context *)pctx; + int i; - for (i = 0; i < num_scissors; i++) - ctx->scissors[start_slot + i] = state[i]; + for (i = 0; i < num_scissors; i++) + ctx->scissors[start_slot + i] = state[i]; - if (!ctx->queued.named.rasterizer->scissor_enable) - return; + if (!ctx->queued.named.rasterizer->scissor_enable) + return; - si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); + si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); } /* Since the guard band disables clipping, we have to clip per-pixel * using a scissor. */ static void si_get_scissor_from_viewport(struct si_context *ctx, - const struct pipe_viewport_state *vp, - struct si_signed_scissor *scissor) + const struct pipe_viewport_state *vp, + struct si_signed_scissor *scissor) { - float tmp, minx, miny, maxx, maxy; - - /* Convert (-1, -1) and (1, 1) from clip space into window space. */ - minx = -vp->scale[0] + vp->translate[0]; - miny = -vp->scale[1] + vp->translate[1]; - maxx = vp->scale[0] + vp->translate[0]; - maxy = vp->scale[1] + vp->translate[1]; - - /* Handle inverted viewports. */ - if (minx > maxx) { - tmp = minx; - minx = maxx; - maxx = tmp; - } - if (miny > maxy) { - tmp = miny; - miny = maxy; - maxy = tmp; - } - - /* Convert to integer and round up the max bounds. */ - scissor->minx = minx; - scissor->miny = miny; - scissor->maxx = ceilf(maxx); - scissor->maxy = ceilf(maxy); + float tmp, minx, miny, maxx, maxy; + + /* Convert (-1, -1) and (1, 1) from clip space into window space. */ + minx = -vp->scale[0] + vp->translate[0]; + miny = -vp->scale[1] + vp->translate[1]; + maxx = vp->scale[0] + vp->translate[0]; + maxy = vp->scale[1] + vp->translate[1]; + + /* Handle inverted viewports. */ + if (minx > maxx) { + tmp = minx; + minx = maxx; + maxx = tmp; + } + if (miny > maxy) { + tmp = miny; + miny = maxy; + maxy = tmp; + } + + /* Convert to integer and round up the max bounds. 
*/ + scissor->minx = minx; + scissor->miny = miny; + scissor->maxx = ceilf(maxx); + scissor->maxy = ceilf(maxy); } -static void si_clamp_scissor(struct si_context *ctx, - struct pipe_scissor_state *out, - struct si_signed_scissor *scissor) +static void si_clamp_scissor(struct si_context *ctx, struct pipe_scissor_state *out, + struct si_signed_scissor *scissor) { - out->minx = CLAMP(scissor->minx, 0, SI_MAX_SCISSOR); - out->miny = CLAMP(scissor->miny, 0, SI_MAX_SCISSOR); - out->maxx = CLAMP(scissor->maxx, 0, SI_MAX_SCISSOR); - out->maxy = CLAMP(scissor->maxy, 0, SI_MAX_SCISSOR); + out->minx = CLAMP(scissor->minx, 0, SI_MAX_SCISSOR); + out->miny = CLAMP(scissor->miny, 0, SI_MAX_SCISSOR); + out->maxx = CLAMP(scissor->maxx, 0, SI_MAX_SCISSOR); + out->maxy = CLAMP(scissor->maxy, 0, SI_MAX_SCISSOR); } -static void si_clip_scissor(struct pipe_scissor_state *out, - struct pipe_scissor_state *clip) +static void si_clip_scissor(struct pipe_scissor_state *out, struct pipe_scissor_state *clip) { - out->minx = MAX2(out->minx, clip->minx); - out->miny = MAX2(out->miny, clip->miny); - out->maxx = MIN2(out->maxx, clip->maxx); - out->maxy = MIN2(out->maxy, clip->maxy); + out->minx = MAX2(out->minx, clip->minx); + out->miny = MAX2(out->miny, clip->miny); + out->maxx = MIN2(out->maxx, clip->maxx); + out->maxy = MIN2(out->maxy, clip->maxy); } -static void si_scissor_make_union(struct si_signed_scissor *out, - struct si_signed_scissor *in) +static void si_scissor_make_union(struct si_signed_scissor *out, struct si_signed_scissor *in) { - out->minx = MIN2(out->minx, in->minx); - out->miny = MIN2(out->miny, in->miny); - out->maxx = MAX2(out->maxx, in->maxx); - out->maxy = MAX2(out->maxy, in->maxy); - out->quant_mode = MIN2(out->quant_mode, in->quant_mode); + out->minx = MIN2(out->minx, in->minx); + out->miny = MIN2(out->miny, in->miny); + out->maxx = MAX2(out->maxx, in->maxx); + out->maxy = MAX2(out->maxy, in->maxy); + out->quant_mode = MIN2(out->quant_mode, in->quant_mode); } -static void si_emit_one_scissor(struct si_context *ctx, - struct radeon_cmdbuf *cs, - struct si_signed_scissor *vp_scissor, - struct pipe_scissor_state *scissor) +static void si_emit_one_scissor(struct si_context *ctx, struct radeon_cmdbuf *cs, + struct si_signed_scissor *vp_scissor, + struct pipe_scissor_state *scissor) { - struct pipe_scissor_state final; - - if (ctx->vs_disables_clipping_viewport) { - final.minx = final.miny = 0; - final.maxx = final.maxy = SI_MAX_SCISSOR; - } else { - si_clamp_scissor(ctx, &final, vp_scissor); - } - - if (scissor) - si_clip_scissor(&final, scissor); - - /* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_- - * SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0. - */ - if (ctx->chip_class == GFX6 && (final.maxx == 0 || final.maxy == 0)) { - radeon_emit(cs, S_028250_TL_X(1) | - S_028250_TL_Y(1) | - S_028250_WINDOW_OFFSET_DISABLE(1)); - radeon_emit(cs, S_028254_BR_X(1) | - S_028254_BR_Y(1)); - return; - } - - radeon_emit(cs, S_028250_TL_X(final.minx) | - S_028250_TL_Y(final.miny) | - S_028250_WINDOW_OFFSET_DISABLE(1)); - radeon_emit(cs, S_028254_BR_X(final.maxx) | - S_028254_BR_Y(final.maxy)); + struct pipe_scissor_state final; + + if (ctx->vs_disables_clipping_viewport) { + final.minx = final.miny = 0; + final.maxx = final.maxy = SI_MAX_SCISSOR; + } else { + si_clamp_scissor(ctx, &final, vp_scissor); + } + + if (scissor) + si_clip_scissor(&final, scissor); + + /* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_- + * SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0. 
+ */ + if (ctx->chip_class == GFX6 && (final.maxx == 0 || final.maxy == 0)) { + radeon_emit(cs, S_028250_TL_X(1) | S_028250_TL_Y(1) | S_028250_WINDOW_OFFSET_DISABLE(1)); + radeon_emit(cs, S_028254_BR_X(1) | S_028254_BR_Y(1)); + return; + } + + radeon_emit(cs, S_028250_TL_X(final.minx) | S_028250_TL_Y(final.miny) | + S_028250_WINDOW_OFFSET_DISABLE(1)); + radeon_emit(cs, S_028254_BR_X(final.maxx) | S_028254_BR_Y(final.maxy)); } #define MAX_PA_SU_HARDWARE_SCREEN_OFFSET 8176 static void si_emit_guardband(struct si_context *ctx) { - const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer; - struct si_signed_scissor vp_as_scissor; - struct pipe_viewport_state vp; - float left, top, right, bottom, max_range, guardband_x, guardband_y; - float discard_x, discard_y; - - if (ctx->vs_writes_viewport_index) { - /* Shaders can draw to any viewport. Make a union of all - * viewports. */ - vp_as_scissor = ctx->viewports.as_scissor[0]; - for (unsigned i = 1; i < SI_MAX_VIEWPORTS; i++) { - si_scissor_make_union(&vp_as_scissor, - &ctx->viewports.as_scissor[i]); - } - } else { - vp_as_scissor = ctx->viewports.as_scissor[0]; - } - - /* Blits don't set the viewport state. The vertex shader determines - * the viewport size by scaling the coordinates, so we don't know - * how large the viewport is. Assume the worst case. - */ - if (ctx->vs_disables_clipping_viewport) - vp_as_scissor.quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH; - - /* Determine the optimal hardware screen offset to center the viewport - * within the viewport range in order to maximize the guardband size. - */ - int hw_screen_offset_x = (vp_as_scissor.maxx + vp_as_scissor.minx) / 2; - int hw_screen_offset_y = (vp_as_scissor.maxy + vp_as_scissor.miny) / 2; - - /* GFX6-GFX7 need to align the offset to an ubertile consisting of all SEs. */ - const unsigned hw_screen_offset_alignment = - ctx->chip_class >= GFX8 ? 16 : MAX2(ctx->screen->se_tile_repeat, 16); - - /* Indexed by quantization modes */ - static int max_viewport_size[] = {65535, 16383, 4095}; - - /* Ensure that the whole viewport stays representable in - * absolute coordinates. - * See comment in si_set_viewport_states. - */ - assert(vp_as_scissor.maxx <= max_viewport_size[vp_as_scissor.quant_mode] && - vp_as_scissor.maxy <= max_viewport_size[vp_as_scissor.quant_mode]); - - hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); - hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); - - /* Align the screen offset by dropping the low bits. */ - hw_screen_offset_x &= ~(hw_screen_offset_alignment - 1); - hw_screen_offset_y &= ~(hw_screen_offset_alignment - 1); - - /* Apply the offset to center the viewport and maximize the guardband. */ - vp_as_scissor.minx -= hw_screen_offset_x; - vp_as_scissor.maxx -= hw_screen_offset_x; - vp_as_scissor.miny -= hw_screen_offset_y; - vp_as_scissor.maxy -= hw_screen_offset_y; - - /* Reconstruct the viewport transformation from the scissor. */ - vp.translate[0] = (vp_as_scissor.minx + vp_as_scissor.maxx) / 2.0; - vp.translate[1] = (vp_as_scissor.miny + vp_as_scissor.maxy) / 2.0; - vp.scale[0] = vp_as_scissor.maxx - vp.translate[0]; - vp.scale[1] = vp_as_scissor.maxy - vp.translate[1]; - - /* Treat a 0x0 viewport as 1x1 to prevent division by zero. */ - if (vp_as_scissor.minx == vp_as_scissor.maxx) - vp.scale[0] = 0.5; - if (vp_as_scissor.miny == vp_as_scissor.maxy) - vp.scale[1] = 0.5; - - /* Find the biggest guard band that is inside the supported viewport - * range. 
The guard band is specified as a horizontal and vertical - * distance from (0,0) in clip space. - * - * This is done by applying the inverse viewport transformation - * on the viewport limits to get those limits in clip space. - * - * The viewport range is [-max_viewport_size/2, max_viewport_size/2]. - */ - assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size)); - max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2; - left = (-max_range - vp.translate[0]) / vp.scale[0]; - right = ( max_range - vp.translate[0]) / vp.scale[0]; - top = (-max_range - vp.translate[1]) / vp.scale[1]; - bottom = ( max_range - vp.translate[1]) / vp.scale[1]; - - assert(left <= -1 && top <= -1 && right >= 1 && bottom >= 1); - - guardband_x = MIN2(-left, right); - guardband_y = MIN2(-top, bottom); - - discard_x = 1.0; - discard_y = 1.0; - - if (unlikely(util_prim_is_points_or_lines(ctx->current_rast_prim))) { - /* When rendering wide points or lines, we need to be more - * conservative about when to discard them entirely. */ - float pixels; - - if (ctx->current_rast_prim == PIPE_PRIM_POINTS) - pixels = rs->max_point_size; - else - pixels = rs->line_width; - - /* Add half the point size / line width */ - discard_x += pixels / (2.0 * vp.scale[0]); - discard_y += pixels / (2.0 * vp.scale[1]); - - /* Discard primitives that would lie entirely outside the clip - * region. */ - discard_x = MIN2(discard_x, guardband_x); - discard_y = MIN2(discard_y, guardband_y); - } - - /* If any of the GB registers is updated, all of them must be updated. - * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ - * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ - */ - unsigned initial_cdw = ctx->gfx_cs->current.cdw; - radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, - SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, - fui(guardband_y), fui(discard_y), - fui(guardband_x), fui(discard_x)); - radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, - SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET, - S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) | - S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4)); - radeon_opt_set_context_reg(ctx, R_028BE4_PA_SU_VTX_CNTL, - SI_TRACKED_PA_SU_VTX_CNTL, - S_028BE4_PIX_CENTER(rs->half_pixel_center) | - S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + - vp_as_scissor.quant_mode)); - if (initial_cdw != ctx->gfx_cs->current.cdw) - ctx->context_roll = true; - - si_update_ngg_small_prim_precision(ctx); + const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer; + struct si_signed_scissor vp_as_scissor; + struct pipe_viewport_state vp; + float left, top, right, bottom, max_range, guardband_x, guardband_y; + float discard_x, discard_y; + + if (ctx->vs_writes_viewport_index) { + /* Shaders can draw to any viewport. Make a union of all + * viewports. */ + vp_as_scissor = ctx->viewports.as_scissor[0]; + for (unsigned i = 1; i < SI_MAX_VIEWPORTS; i++) { + si_scissor_make_union(&vp_as_scissor, &ctx->viewports.as_scissor[i]); + } + } else { + vp_as_scissor = ctx->viewports.as_scissor[0]; + } + + /* Blits don't set the viewport state. The vertex shader determines + * the viewport size by scaling the coordinates, so we don't know + * how large the viewport is. Assume the worst case. 
+ */ + if (ctx->vs_disables_clipping_viewport) + vp_as_scissor.quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH; + + /* Determine the optimal hardware screen offset to center the viewport + * within the viewport range in order to maximize the guardband size. + */ + int hw_screen_offset_x = (vp_as_scissor.maxx + vp_as_scissor.minx) / 2; + int hw_screen_offset_y = (vp_as_scissor.maxy + vp_as_scissor.miny) / 2; + + /* GFX6-GFX7 need to align the offset to an ubertile consisting of all SEs. */ + const unsigned hw_screen_offset_alignment = + ctx->chip_class >= GFX8 ? 16 : MAX2(ctx->screen->se_tile_repeat, 16); + + /* Indexed by quantization modes */ + static int max_viewport_size[] = {65535, 16383, 4095}; + + /* Ensure that the whole viewport stays representable in + * absolute coordinates. + * See comment in si_set_viewport_states. + */ + assert(vp_as_scissor.maxx <= max_viewport_size[vp_as_scissor.quant_mode] && + vp_as_scissor.maxy <= max_viewport_size[vp_as_scissor.quant_mode]); + + hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); + hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); + + /* Align the screen offset by dropping the low bits. */ + hw_screen_offset_x &= ~(hw_screen_offset_alignment - 1); + hw_screen_offset_y &= ~(hw_screen_offset_alignment - 1); + + /* Apply the offset to center the viewport and maximize the guardband. */ + vp_as_scissor.minx -= hw_screen_offset_x; + vp_as_scissor.maxx -= hw_screen_offset_x; + vp_as_scissor.miny -= hw_screen_offset_y; + vp_as_scissor.maxy -= hw_screen_offset_y; + + /* Reconstruct the viewport transformation from the scissor. */ + vp.translate[0] = (vp_as_scissor.minx + vp_as_scissor.maxx) / 2.0; + vp.translate[1] = (vp_as_scissor.miny + vp_as_scissor.maxy) / 2.0; + vp.scale[0] = vp_as_scissor.maxx - vp.translate[0]; + vp.scale[1] = vp_as_scissor.maxy - vp.translate[1]; + + /* Treat a 0x0 viewport as 1x1 to prevent division by zero. */ + if (vp_as_scissor.minx == vp_as_scissor.maxx) + vp.scale[0] = 0.5; + if (vp_as_scissor.miny == vp_as_scissor.maxy) + vp.scale[1] = 0.5; + + /* Find the biggest guard band that is inside the supported viewport + * range. The guard band is specified as a horizontal and vertical + * distance from (0,0) in clip space. + * + * This is done by applying the inverse viewport transformation + * on the viewport limits to get those limits in clip space. + * + * The viewport range is [-max_viewport_size/2, max_viewport_size/2]. + */ + assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size)); + max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2; + left = (-max_range - vp.translate[0]) / vp.scale[0]; + right = (max_range - vp.translate[0]) / vp.scale[0]; + top = (-max_range - vp.translate[1]) / vp.scale[1]; + bottom = (max_range - vp.translate[1]) / vp.scale[1]; + + assert(left <= -1 && top <= -1 && right >= 1 && bottom >= 1); + + guardband_x = MIN2(-left, right); + guardband_y = MIN2(-top, bottom); + + discard_x = 1.0; + discard_y = 1.0; + + if (unlikely(util_prim_is_points_or_lines(ctx->current_rast_prim))) { + /* When rendering wide points or lines, we need to be more + * conservative about when to discard them entirely. 
*/ + float pixels; + + if (ctx->current_rast_prim == PIPE_PRIM_POINTS) + pixels = rs->max_point_size; + else + pixels = rs->line_width; + + /* Add half the point size / line width */ + discard_x += pixels / (2.0 * vp.scale[0]); + discard_y += pixels / (2.0 * vp.scale[1]); + + /* Discard primitives that would lie entirely outside the clip + * region. */ + discard_x = MIN2(discard_x, guardband_x); + discard_y = MIN2(discard_y, guardband_y); + } + + /* If any of the GB registers is updated, all of them must be updated. + * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ + * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ + */ + unsigned initial_cdw = ctx->gfx_cs->current.cdw; + radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, + SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, fui(guardband_y), fui(discard_y), + fui(guardband_x), fui(discard_x)); + radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, + SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET, + S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) | + S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4)); + radeon_opt_set_context_reg( + ctx, R_028BE4_PA_SU_VTX_CNTL, SI_TRACKED_PA_SU_VTX_CNTL, + S_028BE4_PIX_CENTER(rs->half_pixel_center) | + S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode)); + if (initial_cdw != ctx->gfx_cs->current.cdw) + ctx->context_roll = true; + + si_update_ngg_small_prim_precision(ctx); } static void si_emit_scissors(struct si_context *ctx) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - struct pipe_scissor_state *states = ctx->scissors; - bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable; - - /* The simple case: Only 1 viewport is active. */ - if (!ctx->vs_writes_viewport_index) { - struct si_signed_scissor *vp = &ctx->viewports.as_scissor[0]; - - radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2); - si_emit_one_scissor(ctx, cs, vp, scissor_enabled ? &states[0] : NULL); - return; - } - - /* All registers in the array need to be updated if any of them is changed. - * This is a hardware requirement. - */ - radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, - SI_MAX_VIEWPORTS * 2); - for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) { - si_emit_one_scissor(ctx, cs, &ctx->viewports.as_scissor[i], - scissor_enabled ? &states[i] : NULL); - } + struct radeon_cmdbuf *cs = ctx->gfx_cs; + struct pipe_scissor_state *states = ctx->scissors; + bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable; + + /* The simple case: Only 1 viewport is active. */ + if (!ctx->vs_writes_viewport_index) { + struct si_signed_scissor *vp = &ctx->viewports.as_scissor[0]; + + radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2); + si_emit_one_scissor(ctx, cs, vp, scissor_enabled ? &states[0] : NULL); + return; + } + + /* All registers in the array need to be updated if any of them is changed. + * This is a hardware requirement. + */ + radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, SI_MAX_VIEWPORTS * 2); + for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) { + si_emit_one_scissor(ctx, cs, &ctx->viewports.as_scissor[i], + scissor_enabled ? 
&states[i] : NULL); + } } -static void si_set_viewport_states(struct pipe_context *pctx, - unsigned start_slot, - unsigned num_viewports, - const struct pipe_viewport_state *state) +static void si_set_viewport_states(struct pipe_context *pctx, unsigned start_slot, + unsigned num_viewports, const struct pipe_viewport_state *state) { - struct si_context *ctx = (struct si_context *)pctx; - int i; - - for (i = 0; i < num_viewports; i++) { - unsigned index = start_slot + i; - struct si_signed_scissor *scissor = &ctx->viewports.as_scissor[index]; - - ctx->viewports.states[index] = state[i]; - - si_get_scissor_from_viewport(ctx, &state[i], scissor); - - unsigned w = scissor->maxx - scissor->minx; - unsigned h = scissor->maxy - scissor->miny; - unsigned max_extent = MAX2(w, h); - - int max_corner = MAX2(scissor->maxx, scissor->maxy); - - unsigned center_x = (scissor->maxx + scissor->minx) / 2; - unsigned center_y = (scissor->maxy + scissor->miny) / 2; - unsigned max_center = MAX2(center_x, center_y); - - /* PA_SU_HARDWARE_SCREEN_OFFSET can't center viewports whose - * center start farther than MAX_PA_SU_HARDWARE_SCREEN_OFFSET. - * (for example, a 1x1 viewport in the lower right corner of - * 16Kx16K) Such viewports need a greater guardband, so they - * have to use a worse quantization mode. - */ - unsigned distance_off_center = - MAX2(0, (int)max_center - MAX_PA_SU_HARDWARE_SCREEN_OFFSET); - max_extent += distance_off_center; - - /* Determine the best quantization mode (subpixel precision), - * but also leave enough space for the guardband. - * - * Note that primitive binning requires QUANT_MODE == 16_8 on Vega10 - * and Raven1 for line and rectangle primitive types to work correctly. - * Always use 16_8 if primitive binning is possible to occur. - */ - if ((ctx->family == CHIP_VEGA10 || ctx->family == CHIP_RAVEN) && - ctx->screen->dpbb_allowed) - max_extent = 16384; /* Use QUANT_MODE == 16_8. */ - - /* Another constraint is that all coordinates in the viewport - * are representable in fixed point with respect to the - * surface origin. - * - * It means that PA_SU_HARDWARE_SCREEN_OFFSET can't be given - * an offset that would make the upper corner of the viewport - * greater than the maximum representable number post - * quantization, ie 2^quant_bits. - * - * This does not matter for 14.10 and 16.8 formats since the - * offset is already limited at 8k, but it means we can't use - * 12.12 if we are drawing to some pixels outside the lower - * 4k x 4k of the render target. 
- */ - - if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline area for guardband */ - scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH; - else if (max_extent <= 4096) /* 16K scanline area for guardband */ - scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH; - else /* 64K scanline area for guardband */ - scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH; - } - - if (start_slot == 0) { - ctx->viewports.y_inverted = - -state->scale[1] + state->translate[1] > - state->scale[1] + state->translate[1]; - } - - si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); - si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); - si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); + struct si_context *ctx = (struct si_context *)pctx; + int i; + + for (i = 0; i < num_viewports; i++) { + unsigned index = start_slot + i; + struct si_signed_scissor *scissor = &ctx->viewports.as_scissor[index]; + + ctx->viewports.states[index] = state[i]; + + si_get_scissor_from_viewport(ctx, &state[i], scissor); + + unsigned w = scissor->maxx - scissor->minx; + unsigned h = scissor->maxy - scissor->miny; + unsigned max_extent = MAX2(w, h); + + int max_corner = MAX2(scissor->maxx, scissor->maxy); + + unsigned center_x = (scissor->maxx + scissor->minx) / 2; + unsigned center_y = (scissor->maxy + scissor->miny) / 2; + unsigned max_center = MAX2(center_x, center_y); + + /* PA_SU_HARDWARE_SCREEN_OFFSET can't center viewports whose + * center start farther than MAX_PA_SU_HARDWARE_SCREEN_OFFSET. + * (for example, a 1x1 viewport in the lower right corner of + * 16Kx16K) Such viewports need a greater guardband, so they + * have to use a worse quantization mode. + */ + unsigned distance_off_center = MAX2(0, (int)max_center - MAX_PA_SU_HARDWARE_SCREEN_OFFSET); + max_extent += distance_off_center; + + /* Determine the best quantization mode (subpixel precision), + * but also leave enough space for the guardband. + * + * Note that primitive binning requires QUANT_MODE == 16_8 on Vega10 + * and Raven1 for line and rectangle primitive types to work correctly. + * Always use 16_8 if primitive binning is possible to occur. + */ + if ((ctx->family == CHIP_VEGA10 || ctx->family == CHIP_RAVEN) && ctx->screen->dpbb_allowed) + max_extent = 16384; /* Use QUANT_MODE == 16_8. */ + + /* Another constraint is that all coordinates in the viewport + * are representable in fixed point with respect to the + * surface origin. + * + * It means that PA_SU_HARDWARE_SCREEN_OFFSET can't be given + * an offset that would make the upper corner of the viewport + * greater than the maximum representable number post + * quantization, ie 2^quant_bits. + * + * This does not matter for 14.10 and 16.8 formats since the + * offset is already limited at 8k, but it means we can't use + * 12.12 if we are drawing to some pixels outside the lower + * 4k x 4k of the render target. 
+ */ + + if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline area for guardband */ + scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH; + else if (max_extent <= 4096) /* 16K scanline area for guardband */ + scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH; + else /* 64K scanline area for guardband */ + scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH; + } + + if (start_slot == 0) { + ctx->viewports.y_inverted = + -state->scale[1] + state->translate[1] > state->scale[1] + state->translate[1]; + } + + si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); + si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); + si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); } -static void si_emit_one_viewport(struct si_context *ctx, - struct pipe_viewport_state *state) +static void si_emit_one_viewport(struct si_context *ctx, struct pipe_viewport_state *state) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - - radeon_emit(cs, fui(state->scale[0])); - radeon_emit(cs, fui(state->translate[0])); - radeon_emit(cs, fui(state->scale[1])); - radeon_emit(cs, fui(state->translate[1])); - radeon_emit(cs, fui(state->scale[2])); - radeon_emit(cs, fui(state->translate[2])); + struct radeon_cmdbuf *cs = ctx->gfx_cs; + + radeon_emit(cs, fui(state->scale[0])); + radeon_emit(cs, fui(state->translate[0])); + radeon_emit(cs, fui(state->scale[1])); + radeon_emit(cs, fui(state->translate[1])); + radeon_emit(cs, fui(state->scale[2])); + radeon_emit(cs, fui(state->translate[2])); } static void si_emit_viewports(struct si_context *ctx) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - struct pipe_viewport_state *states = ctx->viewports.states; - - if (ctx->screen->use_ngg_culling) { - /* Set the viewport info for small primitive culling. */ - struct si_small_prim_cull_info info; - si_get_small_prim_cull_info(ctx, &info); - - if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) { - unsigned offset = 0; - - /* Align to 256, because the address is shifted by 8 bits. */ - u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256, - &info, &offset, - (struct pipe_resource**)&ctx->small_prim_cull_info_buf); - - ctx->small_prim_cull_info_address = - ctx->small_prim_cull_info_buf->gpu_address + offset; - ctx->last_small_prim_cull_info = info; - ctx->small_prim_cull_info_dirty = true; - } - - if (ctx->small_prim_cull_info_dirty) { - /* This will end up in SGPR6 as (value << 8), shifted by the hw. */ - radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->small_prim_cull_info_buf, - RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); - radeon_set_sh_reg(ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS, - ctx->small_prim_cull_info_address >> 8); - ctx->small_prim_cull_info_dirty = false; - } - } - - /* The simple case: Only 1 viewport is active. */ - if (!ctx->vs_writes_viewport_index) { - radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6); - si_emit_one_viewport(ctx, &states[0]); - return; - } - - /* All registers in the array need to be updated if any of them is changed. - * This is a hardware requirement. - */ - radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE + - 0, SI_MAX_VIEWPORTS * 6); - for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) - si_emit_one_viewport(ctx, &states[i]); + struct radeon_cmdbuf *cs = ctx->gfx_cs; + struct pipe_viewport_state *states = ctx->viewports.states; + + if (ctx->screen->use_ngg_culling) { + /* Set the viewport info for small primitive culling. 
*/ + struct si_small_prim_cull_info info; + si_get_small_prim_cull_info(ctx, &info); + + if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) { + unsigned offset = 0; + + /* Align to 256, because the address is shifted by 8 bits. */ + u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256, &info, &offset, + (struct pipe_resource **)&ctx->small_prim_cull_info_buf); + + ctx->small_prim_cull_info_address = ctx->small_prim_cull_info_buf->gpu_address + offset; + ctx->last_small_prim_cull_info = info; + ctx->small_prim_cull_info_dirty = true; + } + + if (ctx->small_prim_cull_info_dirty) { + /* This will end up in SGPR6 as (value << 8), shifted by the hw. */ + radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->small_prim_cull_info_buf, + RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); + radeon_set_sh_reg(ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS, + ctx->small_prim_cull_info_address >> 8); + ctx->small_prim_cull_info_dirty = false; + } + } + + /* The simple case: Only 1 viewport is active. */ + if (!ctx->vs_writes_viewport_index) { + radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6); + si_emit_one_viewport(ctx, &states[0]); + return; + } + + /* All registers in the array need to be updated if any of them is changed. + * This is a hardware requirement. + */ + radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE + 0, SI_MAX_VIEWPORTS * 6); + for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) + si_emit_one_viewport(ctx, &states[i]); } -static inline void -si_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz, - bool window_space_position, float *zmin, float *zmax) +static inline void si_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz, + bool window_space_position, float *zmin, float *zmax) { - if (window_space_position) { - *zmin = 0; - *zmax = 1; - return; - } - util_viewport_zmin_zmax(vp, halfz, zmin, zmax); + if (window_space_position) { + *zmin = 0; + *zmax = 1; + return; + } + util_viewport_zmin_zmax(vp, halfz, zmin, zmax); } static void si_emit_depth_ranges(struct si_context *ctx) { - struct radeon_cmdbuf *cs = ctx->gfx_cs; - struct pipe_viewport_state *states = ctx->viewports.states; - bool clip_halfz = ctx->queued.named.rasterizer->clip_halfz; - bool window_space = ctx->vs_disables_clipping_viewport; - float zmin, zmax; - - /* The simple case: Only 1 viewport is active. */ - if (!ctx->vs_writes_viewport_index) { - si_viewport_zmin_zmax(&states[0], clip_halfz, window_space, - &zmin, &zmax); - - radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, 2); - radeon_emit(cs, fui(zmin)); - radeon_emit(cs, fui(zmax)); - return; - } - - /* All registers in the array need to be updated if any of them is changed. - * This is a hardware requirement. - */ - radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, - SI_MAX_VIEWPORTS * 2); - for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) { - si_viewport_zmin_zmax(&states[i], clip_halfz, window_space, - &zmin, &zmax); - radeon_emit(cs, fui(zmin)); - radeon_emit(cs, fui(zmax)); - } + struct radeon_cmdbuf *cs = ctx->gfx_cs; + struct pipe_viewport_state *states = ctx->viewports.states; + bool clip_halfz = ctx->queued.named.rasterizer->clip_halfz; + bool window_space = ctx->vs_disables_clipping_viewport; + float zmin, zmax; + + /* The simple case: Only 1 viewport is active. 
*/ + if (!ctx->vs_writes_viewport_index) { + si_viewport_zmin_zmax(&states[0], clip_halfz, window_space, &zmin, &zmax); + + radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, 2); + radeon_emit(cs, fui(zmin)); + radeon_emit(cs, fui(zmax)); + return; + } + + /* All registers in the array need to be updated if any of them is changed. + * This is a hardware requirement. + */ + radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, SI_MAX_VIEWPORTS * 2); + for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) { + si_viewport_zmin_zmax(&states[i], clip_halfz, window_space, &zmin, &zmax); + radeon_emit(cs, fui(zmin)); + radeon_emit(cs, fui(zmax)); + } } static void si_emit_viewport_states(struct si_context *ctx) { - si_emit_viewports(ctx); - si_emit_depth_ranges(ctx); + si_emit_viewports(ctx); + si_emit_depth_ranges(ctx); } /** @@ -579,128 +550,112 @@ static void si_emit_viewport_states(struct si_context *ctx) */ void si_update_vs_viewport_state(struct si_context *ctx) { - struct si_shader_info *info = si_get_vs_info(ctx); - bool vs_window_space; - - if (!info) - return; - - /* When the VS disables clipping and viewport transformation. */ - vs_window_space = - info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; - - if (ctx->vs_disables_clipping_viewport != vs_window_space) { - ctx->vs_disables_clipping_viewport = vs_window_space; - si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); - si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); - } - - /* Viewport index handling. */ - if (ctx->vs_writes_viewport_index == info->writes_viewport_index) - return; - - /* This changes how the guardband is computed. */ - ctx->vs_writes_viewport_index = info->writes_viewport_index; - si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); - - /* Emit scissors and viewports that were enabled by having - * the ViewportIndex output. - */ - if (info->writes_viewport_index) { - si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); - si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); - } + struct si_shader_info *info = si_get_vs_info(ctx); + bool vs_window_space; + + if (!info) + return; + + /* When the VS disables clipping and viewport transformation. */ + vs_window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + + if (ctx->vs_disables_clipping_viewport != vs_window_space) { + ctx->vs_disables_clipping_viewport = vs_window_space; + si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); + si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); + } + + /* Viewport index handling. */ + if (ctx->vs_writes_viewport_index == info->writes_viewport_index) + return; + + /* This changes how the guardband is computed. */ + ctx->vs_writes_viewport_index = info->writes_viewport_index; + si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); + + /* Emit scissors and viewports that were enabled by having + * the ViewportIndex output. + */ + if (info->writes_viewport_index) { + si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); + si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); + } } static void si_emit_window_rectangles(struct si_context *sctx) { - /* There are four clipping rectangles. Their corner coordinates are inclusive. - * Every pixel is assigned a number from 0 and 15 by setting bits 0-3 depending - * on whether the pixel is inside cliprects 0-3, respectively. For example, - * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned - * the number 3 (binary 0011). - * - * If CLIPRECT_RULE & (1 << number), the pixel is rasterized. 
- */ - struct radeon_cmdbuf *cs = sctx->gfx_cs; - static const unsigned outside[4] = { - /* outside rectangle 0 */ - V_02820C_OUT | - V_02820C_IN_1 | - V_02820C_IN_2 | - V_02820C_IN_21 | - V_02820C_IN_3 | - V_02820C_IN_31 | - V_02820C_IN_32 | - V_02820C_IN_321, - /* outside rectangles 0, 1 */ - V_02820C_OUT | - V_02820C_IN_2 | - V_02820C_IN_3 | - V_02820C_IN_32, - /* outside rectangles 0, 1, 2 */ - V_02820C_OUT | - V_02820C_IN_3, - /* outside rectangles 0, 1, 2, 3 */ - V_02820C_OUT, - }; - const unsigned disabled = 0xffff; /* all inside and outside cases */ - unsigned num_rectangles = sctx->num_window_rectangles; - struct pipe_scissor_state *rects = sctx->window_rectangles; - unsigned rule; - - assert(num_rectangles <= 4); - - if (num_rectangles == 0) - rule = disabled; - else if (sctx->window_rectangles_include) - rule = ~outside[num_rectangles - 1]; - else - rule = outside[num_rectangles - 1]; - - radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE, - SI_TRACKED_PA_SC_CLIPRECT_RULE, rule); - if (num_rectangles == 0) - return; - - radeon_set_context_reg_seq(cs, R_028210_PA_SC_CLIPRECT_0_TL, - num_rectangles * 2); - for (unsigned i = 0; i < num_rectangles; i++) { - radeon_emit(cs, S_028210_TL_X(rects[i].minx) | - S_028210_TL_Y(rects[i].miny)); - radeon_emit(cs, S_028214_BR_X(rects[i].maxx) | - S_028214_BR_Y(rects[i].maxy)); - } + /* There are four clipping rectangles. Their corner coordinates are inclusive. + * Every pixel is assigned a number from 0 and 15 by setting bits 0-3 depending + * on whether the pixel is inside cliprects 0-3, respectively. For example, + * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned + * the number 3 (binary 0011). + * + * If CLIPRECT_RULE & (1 << number), the pixel is rasterized. + */ + struct radeon_cmdbuf *cs = sctx->gfx_cs; + static const unsigned outside[4] = { + /* outside rectangle 0 */ + V_02820C_OUT | V_02820C_IN_1 | V_02820C_IN_2 | V_02820C_IN_21 | V_02820C_IN_3 | + V_02820C_IN_31 | V_02820C_IN_32 | V_02820C_IN_321, + /* outside rectangles 0, 1 */ + V_02820C_OUT | V_02820C_IN_2 | V_02820C_IN_3 | V_02820C_IN_32, + /* outside rectangles 0, 1, 2 */ + V_02820C_OUT | V_02820C_IN_3, + /* outside rectangles 0, 1, 2, 3 */ + V_02820C_OUT, + }; + const unsigned disabled = 0xffff; /* all inside and outside cases */ + unsigned num_rectangles = sctx->num_window_rectangles; + struct pipe_scissor_state *rects = sctx->window_rectangles; + unsigned rule; + + assert(num_rectangles <= 4); + + if (num_rectangles == 0) + rule = disabled; + else if (sctx->window_rectangles_include) + rule = ~outside[num_rectangles - 1]; + else + rule = outside[num_rectangles - 1]; + + radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE, SI_TRACKED_PA_SC_CLIPRECT_RULE, + rule); + if (num_rectangles == 0) + return; + + radeon_set_context_reg_seq(cs, R_028210_PA_SC_CLIPRECT_0_TL, num_rectangles * 2); + for (unsigned i = 0; i < num_rectangles; i++) { + radeon_emit(cs, S_028210_TL_X(rects[i].minx) | S_028210_TL_Y(rects[i].miny)); + radeon_emit(cs, S_028214_BR_X(rects[i].maxx) | S_028214_BR_Y(rects[i].maxy)); + } } -static void si_set_window_rectangles(struct pipe_context *ctx, - bool include, - unsigned num_rectangles, - const struct pipe_scissor_state *rects) +static void si_set_window_rectangles(struct pipe_context *ctx, bool include, + unsigned num_rectangles, + const struct pipe_scissor_state *rects) { - struct si_context *sctx = (struct si_context *)ctx; + struct si_context *sctx = (struct si_context *)ctx; - sctx->num_window_rectangles = 
num_rectangles; - sctx->window_rectangles_include = include; - if (num_rectangles) { - memcpy(sctx->window_rectangles, rects, - sizeof(*rects) * num_rectangles); - } + sctx->num_window_rectangles = num_rectangles; + sctx->window_rectangles_include = include; + if (num_rectangles) { + memcpy(sctx->window_rectangles, rects, sizeof(*rects) * num_rectangles); + } - si_mark_atom_dirty(sctx, &sctx->atoms.s.window_rectangles); + si_mark_atom_dirty(sctx, &sctx->atoms.s.window_rectangles); } void si_init_viewport_functions(struct si_context *ctx) { - ctx->atoms.s.guardband.emit = si_emit_guardband; - ctx->atoms.s.scissors.emit = si_emit_scissors; - ctx->atoms.s.viewports.emit = si_emit_viewport_states; - ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles; + ctx->atoms.s.guardband.emit = si_emit_guardband; + ctx->atoms.s.scissors.emit = si_emit_scissors; + ctx->atoms.s.viewports.emit = si_emit_viewport_states; + ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles; - ctx->b.set_scissor_states = si_set_scissor_states; - ctx->b.set_viewport_states = si_set_viewport_states; - ctx->b.set_window_rectangles = si_set_window_rectangles; + ctx->b.set_scissor_states = si_set_scissor_states; + ctx->b.set_viewport_states = si_set_viewport_states; + ctx->b.set_window_rectangles = si_set_window_rectangles; - for (unsigned i = 0; i < 16; i++) - ctx->viewports.as_scissor[i].quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH; + for (unsigned i = 0; i < 16; i++) + ctx->viewports.as_scissor[i].quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH; } diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c index f803448cfc6..7b4ecedbcba 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma.c +++ b/src/gallium/drivers/radeonsi/si_test_dma.c @@ -26,8 +26,8 @@ /* This file implements randomized SDMA texture blit tests. */ #include "si_pipe.h" -#include "util/u_surface.h" #include "util/rand_xor.h" +#include "util/u_surface.h" static uint64_t seed_xorshift128plus[2]; @@ -36,382 +36,356 @@ static uint64_t seed_xorshift128plus[2]; /* The GPU blits are emulated on the CPU using these CPU textures. 
*/ struct cpu_texture { - uint8_t *ptr; - uint64_t size; - uint64_t layer_stride; - unsigned stride; + uint8_t *ptr; + uint64_t size; + uint64_t layer_stride; + unsigned stride; }; -static void alloc_cpu_texture(struct cpu_texture *tex, - struct pipe_resource *templ) +static void alloc_cpu_texture(struct cpu_texture *tex, struct pipe_resource *templ) { - tex->stride = align(util_format_get_stride(templ->format, templ->width0), - RAND_NUM_SIZE); - tex->layer_stride = (uint64_t)tex->stride * templ->height0; - tex->size = tex->layer_stride * templ->array_size; - tex->ptr = malloc(tex->size); - assert(tex->ptr); + tex->stride = align(util_format_get_stride(templ->format, templ->width0), RAND_NUM_SIZE); + tex->layer_stride = (uint64_t)tex->stride * templ->height0; + tex->size = tex->layer_stride * templ->array_size; + tex->ptr = malloc(tex->size); + assert(tex->ptr); } -static void set_random_pixels(struct pipe_context *ctx, - struct pipe_resource *tex, - struct cpu_texture *cpu) +static void set_random_pixels(struct pipe_context *ctx, struct pipe_resource *tex, + struct cpu_texture *cpu) { - struct pipe_transfer *t; - uint8_t *map; - int x,y,z; - - map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_WRITE, - 0, 0, 0, tex->width0, tex->height0, - tex->array_size, &t); - assert(map); - - for (z = 0; z < tex->array_size; z++) { - for (y = 0; y < tex->height0; y++) { - uint64_t *ptr = (uint64_t*) - (map + t->layer_stride*z + t->stride*y); - uint64_t *ptr_cpu = (uint64_t*) - (cpu->ptr + cpu->layer_stride*z + cpu->stride*y); - unsigned size = cpu->stride / RAND_NUM_SIZE; - - assert(t->stride % RAND_NUM_SIZE == 0); - assert(cpu->stride % RAND_NUM_SIZE == 0); - - for (x = 0; x < size; x++) { - *ptr++ = *ptr_cpu++ = - rand_xorshift128plus(seed_xorshift128plus); - } - } - } - - pipe_transfer_unmap(ctx, t); + struct pipe_transfer *t; + uint8_t *map; + int x, y, z; + + map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_WRITE, 0, 0, 0, tex->width0, tex->height0, + tex->array_size, &t); + assert(map); + + for (z = 0; z < tex->array_size; z++) { + for (y = 0; y < tex->height0; y++) { + uint64_t *ptr = (uint64_t *)(map + t->layer_stride * z + t->stride * y); + uint64_t *ptr_cpu = (uint64_t *)(cpu->ptr + cpu->layer_stride * z + cpu->stride * y); + unsigned size = cpu->stride / RAND_NUM_SIZE; + + assert(t->stride % RAND_NUM_SIZE == 0); + assert(cpu->stride % RAND_NUM_SIZE == 0); + + for (x = 0; x < size; x++) { + *ptr++ = *ptr_cpu++ = rand_xorshift128plus(seed_xorshift128plus); + } + } + } + + pipe_transfer_unmap(ctx, t); } -static bool compare_textures(struct pipe_context *ctx, - struct pipe_resource *tex, - struct cpu_texture *cpu) +static bool compare_textures(struct pipe_context *ctx, struct pipe_resource *tex, + struct cpu_texture *cpu) { - struct pipe_transfer *t; - uint8_t *map; - int y,z; - bool pass = true; - unsigned stride = util_format_get_stride(tex->format, tex->width0); - - map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_READ, - 0, 0, 0, tex->width0, tex->height0, - tex->array_size, &t); - assert(map); - - for (z = 0; z < tex->array_size; z++) { - for (y = 0; y < tex->height0; y++) { - uint8_t *ptr = map + t->layer_stride*z + t->stride*y; - uint8_t *cpu_ptr = cpu->ptr + - cpu->layer_stride*z + cpu->stride*y; - - if (memcmp(ptr, cpu_ptr, stride)) { - pass = false; - goto done; - } - } - } + struct pipe_transfer *t; + uint8_t *map; + int y, z; + bool pass = true; + unsigned stride = util_format_get_stride(tex->format, tex->width0); + + map = pipe_transfer_map_3d(ctx, tex, 0, 
PIPE_TRANSFER_READ, 0, 0, 0, tex->width0, tex->height0, + tex->array_size, &t); + assert(map); + + for (z = 0; z < tex->array_size; z++) { + for (y = 0; y < tex->height0; y++) { + uint8_t *ptr = map + t->layer_stride * z + t->stride * y; + uint8_t *cpu_ptr = cpu->ptr + cpu->layer_stride * z + cpu->stride * y; + + if (memcmp(ptr, cpu_ptr, stride)) { + pass = false; + goto done; + } + } + } done: - pipe_transfer_unmap(ctx, t); - return pass; + pipe_transfer_unmap(ctx, t); + return pass; } static enum pipe_format choose_format() { - enum pipe_format formats[] = { - PIPE_FORMAT_R8_UINT, - PIPE_FORMAT_R16_UINT, - PIPE_FORMAT_R32_UINT, - PIPE_FORMAT_R32G32_UINT, - PIPE_FORMAT_R32G32B32A32_UINT, - PIPE_FORMAT_G8R8_B8R8_UNORM, - }; - return formats[rand() % ARRAY_SIZE(formats)]; + enum pipe_format formats[] = { + PIPE_FORMAT_R8_UINT, PIPE_FORMAT_R16_UINT, PIPE_FORMAT_R32_UINT, + PIPE_FORMAT_R32G32_UINT, PIPE_FORMAT_R32G32B32A32_UINT, PIPE_FORMAT_G8R8_B8R8_UNORM, + }; + return formats[rand() % ARRAY_SIZE(formats)]; } -static const char *array_mode_to_string(struct si_screen *sscreen, - struct radeon_surf *surf) +static const char *array_mode_to_string(struct si_screen *sscreen, struct radeon_surf *surf) { - if (sscreen->info.chip_class >= GFX9) { - switch (surf->u.gfx9.surf.swizzle_mode) { - case 0: - return " LINEAR"; - case 21: - return " 4KB_S_X"; - case 22: - return " 4KB_D_X"; - case 25: - return "64KB_S_X"; - case 26: - return "64KB_D_X"; - default: - printf("Unhandled swizzle mode = %u\n", - surf->u.gfx9.surf.swizzle_mode); - return " UNKNOWN"; - } - } else { - switch (surf->u.legacy.level[0].mode) { - case RADEON_SURF_MODE_LINEAR_ALIGNED: - return "LINEAR_ALIGNED"; - case RADEON_SURF_MODE_1D: - return "1D_TILED_THIN1"; - case RADEON_SURF_MODE_2D: - return "2D_TILED_THIN1"; - default: - assert(0); - return " UNKNOWN"; - } - } + if (sscreen->info.chip_class >= GFX9) { + switch (surf->u.gfx9.surf.swizzle_mode) { + case 0: + return " LINEAR"; + case 21: + return " 4KB_S_X"; + case 22: + return " 4KB_D_X"; + case 25: + return "64KB_S_X"; + case 26: + return "64KB_D_X"; + default: + printf("Unhandled swizzle mode = %u\n", surf->u.gfx9.surf.swizzle_mode); + return " UNKNOWN"; + } + } else { + switch (surf->u.legacy.level[0].mode) { + case RADEON_SURF_MODE_LINEAR_ALIGNED: + return "LINEAR_ALIGNED"; + case RADEON_SURF_MODE_1D: + return "1D_TILED_THIN1"; + case RADEON_SURF_MODE_2D: + return "2D_TILED_THIN1"; + default: + assert(0); + return " UNKNOWN"; + } + } } static unsigned generate_max_tex_side(unsigned max_tex_side) { - switch (rand() % 4) { - case 0: - /* Try to hit large sizes in 1/4 of the cases. */ - return max_tex_side; - case 1: - /* Try to hit 1D tiling in 1/4 of the cases. */ - return 128; - default: - /* Try to hit common sizes in 2/4 of the cases. */ - return 2048; - } + switch (rand() % 4) { + case 0: + /* Try to hit large sizes in 1/4 of the cases. */ + return max_tex_side; + case 1: + /* Try to hit 1D tiling in 1/4 of the cases. */ + return 128; + default: + /* Try to hit common sizes in 2/4 of the cases. */ + return 2048; + } } void si_test_dma(struct si_screen *sscreen) { - struct pipe_screen *screen = &sscreen->b; - struct pipe_context *ctx = screen->context_create(screen, NULL, 0); - struct si_context *sctx = (struct si_context*)ctx; - uint64_t max_alloc_size; - unsigned i, iterations, num_partial_copies, max_tex_side; - unsigned num_pass = 0, num_fail = 0; - - max_tex_side = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_SIZE); - - /* Max 128 MB allowed for both textures. 
*/ - max_alloc_size = 128 * 1024 * 1024; - - /* the seed for random test parameters */ - srand(0x9b47d95b); - /* the seed for random pixel data */ - s_rand_xorshift128plus(seed_xorshift128plus, false); - - iterations = 1000000000; /* just kill it when you are bored */ - num_partial_copies = 30; - - /* These parameters are randomly generated per test: - * - whether to do one whole-surface copy or N partial copies per test - * - which tiling modes to use (LINEAR_ALIGNED, 1D, 2D) - * - which texture dimensions to use - * - whether to use VRAM (all tiling modes) and GTT (staging, linear - * only) allocations - * - random initial pixels in src - * - generate random subrectangle copies for partial blits - */ - for (i = 0; i < iterations; i++) { - struct pipe_resource tsrc = {}, tdst = {}, *src, *dst; - struct si_texture *sdst; - struct si_texture *ssrc; - struct cpu_texture src_cpu, dst_cpu; - unsigned max_width, max_height, max_depth, j, num; - unsigned gfx_blits = 0, dma_blits = 0, cs_blits = 0, max_tex_side_gen; - unsigned max_tex_layers; - bool pass; - bool do_partial_copies = rand() & 1; - - /* generate a random test case */ - tsrc.target = tdst.target = PIPE_TEXTURE_2D_ARRAY; - tsrc.depth0 = tdst.depth0 = 1; - - tsrc.format = tdst.format = choose_format(); - - max_tex_side_gen = generate_max_tex_side(max_tex_side); - max_tex_layers = rand() % 4 ? 1 : 5; - - tsrc.width0 = (rand() % max_tex_side_gen) + 1; - tsrc.height0 = (rand() % max_tex_side_gen) + 1; - tsrc.array_size = (rand() % max_tex_layers) + 1; - - if (tsrc.format == PIPE_FORMAT_G8R8_B8R8_UNORM) - tsrc.width0 = align(tsrc.width0, 2); - - /* Have a 1/4 chance of getting power-of-two dimensions. */ - if (rand() % 4 == 0) { - tsrc.width0 = util_next_power_of_two(tsrc.width0); - tsrc.height0 = util_next_power_of_two(tsrc.height0); - } - - if (!do_partial_copies) { - /* whole-surface copies only, same dimensions */ - tdst = tsrc; - } else { - max_tex_side_gen = generate_max_tex_side(max_tex_side); - max_tex_layers = rand() % 4 ? 1 : 5; - - /* many partial copies, dimensions can be different */ - tdst.width0 = (rand() % max_tex_side_gen) + 1; - tdst.height0 = (rand() % max_tex_side_gen) + 1; - tdst.array_size = (rand() % max_tex_layers) + 1; - - /* Have a 1/4 chance of getting power-of-two dimensions. */ - if (rand() % 4 == 0) { - tdst.width0 = util_next_power_of_two(tdst.width0); - tdst.height0 = util_next_power_of_two(tdst.height0); - } - } - - /* check texture sizes */ - if ((uint64_t) util_format_get_nblocks(tsrc.format, tsrc.width0, tsrc.height0) - * tsrc.array_size * util_format_get_blocksize(tsrc.format) + - (uint64_t) util_format_get_nblocks(tdst.format, tdst.width0, tdst.height0) - * tdst.array_size * util_format_get_blocksize(tdst.format) > - max_alloc_size) { - /* too large, try again */ - i--; - continue; - } - - /* VRAM + the tiling mode depends on dimensions (3/4 of cases), - * or GTT + linear only (1/4 of cases) - */ - tsrc.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING; - tdst.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING; - - /* Allocate textures (both the GPU and CPU copies). - * The CPU will emulate what the GPU should be doing. 
- */ - src = screen->resource_create(screen, &tsrc); - dst = screen->resource_create(screen, &tdst); - assert(src); - assert(dst); - sdst = (struct si_texture*)dst; - ssrc = (struct si_texture*)src; - alloc_cpu_texture(&src_cpu, &tsrc); - alloc_cpu_texture(&dst_cpu, &tdst); - - printf("%4u: dst = (%5u x %5u x %u, %s), " - " src = (%5u x %5u x %u, %s), format = %s, ", - i, tdst.width0, tdst.height0, tdst.array_size, - array_mode_to_string(sscreen, &sdst->surface), - tsrc.width0, tsrc.height0, tsrc.array_size, - array_mode_to_string(sscreen, &ssrc->surface), - util_format_description(tsrc.format)->name); - fflush(stdout); - - /* set src pixels */ - set_random_pixels(ctx, src, &src_cpu); - - /* clear dst pixels */ - uint32_t zero = 0; - si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4, - SI_COHERENCY_SHADER, false); - memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size); - - /* preparation */ - max_width = MIN2(tsrc.width0, tdst.width0); - max_height = MIN2(tsrc.height0, tdst.height0); - max_depth = MIN2(tsrc.array_size, tdst.array_size); - - num = do_partial_copies ? num_partial_copies : 1; - for (j = 0; j < num; j++) { - int width, height, depth; - int srcx, srcy, srcz, dstx, dsty, dstz; - struct pipe_box box; - unsigned old_num_draw_calls = sctx->num_draw_calls; - unsigned old_num_dma_calls = sctx->num_dma_calls; - unsigned old_num_cs_calls = sctx->num_compute_calls; - - if (!do_partial_copies) { - /* copy whole src to dst */ - width = max_width; - height = max_height; - depth = max_depth; - - srcx = srcy = srcz = dstx = dsty = dstz = 0; - } else { - /* random sub-rectangle copies from src to dst */ - depth = (rand() % max_depth) + 1; - srcz = rand() % (tsrc.array_size - depth + 1); - dstz = rand() % (tdst.array_size - depth + 1); - - /* special code path to hit the tiled partial copies */ - if (!ssrc->surface.is_linear && - !sdst->surface.is_linear && - rand() & 1) { - if (max_width < 8 || max_height < 8) - continue; - width = ((rand() % (max_width / 8)) + 1) * 8; - height = ((rand() % (max_height / 8)) + 1) * 8; - - srcx = rand() % (tsrc.width0 - width + 1) & ~0x7; - srcy = rand() % (tsrc.height0 - height + 1) & ~0x7; - - dstx = rand() % (tdst.width0 - width + 1) & ~0x7; - dsty = rand() % (tdst.height0 - height + 1) & ~0x7; - } else { - /* just make sure that it doesn't divide by zero */ - assert(max_width > 0 && max_height > 0); - - width = (rand() % max_width) + 1; - height = (rand() % max_height) + 1; - - srcx = rand() % (tsrc.width0 - width + 1); - srcy = rand() % (tsrc.height0 - height + 1); - - dstx = rand() % (tdst.width0 - width + 1); - dsty = rand() % (tdst.height0 - height + 1); - } - - /* special code path to hit out-of-bounds reads in L2T */ - if (ssrc->surface.is_linear && - !sdst->surface.is_linear && - rand() % 4 == 0) { - srcx = 0; - srcy = 0; - srcz = 0; - } - } - - /* GPU copy */ - u_box_3d(srcx, srcy, srcz, width, height, depth, &box); - sctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box); - - /* See which engine was used. 
*/ - gfx_blits += sctx->num_draw_calls > old_num_draw_calls; - dma_blits += sctx->num_dma_calls > old_num_dma_calls; - cs_blits += sctx->num_compute_calls > old_num_cs_calls; - - /* CPU copy */ - util_copy_box(dst_cpu.ptr, tdst.format, dst_cpu.stride, - dst_cpu.layer_stride, - dstx, dsty, dstz, width, height, depth, - src_cpu.ptr, src_cpu.stride, - src_cpu.layer_stride, - srcx, srcy, srcz); - } - - pass = compare_textures(ctx, dst, &dst_cpu); - if (pass) - num_pass++; - else - num_fail++; - - printf("BLITs: GFX = %2u, DMA = %2u, CS = %2u, %s [%u/%u]\n", - gfx_blits, dma_blits, cs_blits, pass ? "pass" : "fail", - num_pass, num_pass+num_fail); - - /* cleanup */ - pipe_resource_reference(&src, NULL); - pipe_resource_reference(&dst, NULL); - free(src_cpu.ptr); - free(dst_cpu.ptr); - } - - ctx->destroy(ctx); - exit(0); + struct pipe_screen *screen = &sscreen->b; + struct pipe_context *ctx = screen->context_create(screen, NULL, 0); + struct si_context *sctx = (struct si_context *)ctx; + uint64_t max_alloc_size; + unsigned i, iterations, num_partial_copies, max_tex_side; + unsigned num_pass = 0, num_fail = 0; + + max_tex_side = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_SIZE); + + /* Max 128 MB allowed for both textures. */ + max_alloc_size = 128 * 1024 * 1024; + + /* the seed for random test parameters */ + srand(0x9b47d95b); + /* the seed for random pixel data */ + s_rand_xorshift128plus(seed_xorshift128plus, false); + + iterations = 1000000000; /* just kill it when you are bored */ + num_partial_copies = 30; + + /* These parameters are randomly generated per test: + * - whether to do one whole-surface copy or N partial copies per test + * - which tiling modes to use (LINEAR_ALIGNED, 1D, 2D) + * - which texture dimensions to use + * - whether to use VRAM (all tiling modes) and GTT (staging, linear + * only) allocations + * - random initial pixels in src + * - generate random subrectangle copies for partial blits + */ + for (i = 0; i < iterations; i++) { + struct pipe_resource tsrc = {}, tdst = {}, *src, *dst; + struct si_texture *sdst; + struct si_texture *ssrc; + struct cpu_texture src_cpu, dst_cpu; + unsigned max_width, max_height, max_depth, j, num; + unsigned gfx_blits = 0, dma_blits = 0, cs_blits = 0, max_tex_side_gen; + unsigned max_tex_layers; + bool pass; + bool do_partial_copies = rand() & 1; + + /* generate a random test case */ + tsrc.target = tdst.target = PIPE_TEXTURE_2D_ARRAY; + tsrc.depth0 = tdst.depth0 = 1; + + tsrc.format = tdst.format = choose_format(); + + max_tex_side_gen = generate_max_tex_side(max_tex_side); + max_tex_layers = rand() % 4 ? 1 : 5; + + tsrc.width0 = (rand() % max_tex_side_gen) + 1; + tsrc.height0 = (rand() % max_tex_side_gen) + 1; + tsrc.array_size = (rand() % max_tex_layers) + 1; + + if (tsrc.format == PIPE_FORMAT_G8R8_B8R8_UNORM) + tsrc.width0 = align(tsrc.width0, 2); + + /* Have a 1/4 chance of getting power-of-two dimensions. */ + if (rand() % 4 == 0) { + tsrc.width0 = util_next_power_of_two(tsrc.width0); + tsrc.height0 = util_next_power_of_two(tsrc.height0); + } + + if (!do_partial_copies) { + /* whole-surface copies only, same dimensions */ + tdst = tsrc; + } else { + max_tex_side_gen = generate_max_tex_side(max_tex_side); + max_tex_layers = rand() % 4 ? 1 : 5; + + /* many partial copies, dimensions can be different */ + tdst.width0 = (rand() % max_tex_side_gen) + 1; + tdst.height0 = (rand() % max_tex_side_gen) + 1; + tdst.array_size = (rand() % max_tex_layers) + 1; + + /* Have a 1/4 chance of getting power-of-two dimensions. 
*/ + if (rand() % 4 == 0) { + tdst.width0 = util_next_power_of_two(tdst.width0); + tdst.height0 = util_next_power_of_two(tdst.height0); + } + } + + /* check texture sizes */ + if ((uint64_t)util_format_get_nblocks(tsrc.format, tsrc.width0, tsrc.height0) * + tsrc.array_size * util_format_get_blocksize(tsrc.format) + + (uint64_t)util_format_get_nblocks(tdst.format, tdst.width0, tdst.height0) * + tdst.array_size * util_format_get_blocksize(tdst.format) > + max_alloc_size) { + /* too large, try again */ + i--; + continue; + } + + /* VRAM + the tiling mode depends on dimensions (3/4 of cases), + * or GTT + linear only (1/4 of cases) + */ + tsrc.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING; + tdst.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING; + + /* Allocate textures (both the GPU and CPU copies). + * The CPU will emulate what the GPU should be doing. + */ + src = screen->resource_create(screen, &tsrc); + dst = screen->resource_create(screen, &tdst); + assert(src); + assert(dst); + sdst = (struct si_texture *)dst; + ssrc = (struct si_texture *)src; + alloc_cpu_texture(&src_cpu, &tsrc); + alloc_cpu_texture(&dst_cpu, &tdst); + + printf("%4u: dst = (%5u x %5u x %u, %s), " + " src = (%5u x %5u x %u, %s), format = %s, ", + i, tdst.width0, tdst.height0, tdst.array_size, + array_mode_to_string(sscreen, &sdst->surface), tsrc.width0, tsrc.height0, + tsrc.array_size, array_mode_to_string(sscreen, &ssrc->surface), + util_format_description(tsrc.format)->name); + fflush(stdout); + + /* set src pixels */ + set_random_pixels(ctx, src, &src_cpu); + + /* clear dst pixels */ + uint32_t zero = 0; + si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4, SI_COHERENCY_SHADER, false); + memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size); + + /* preparation */ + max_width = MIN2(tsrc.width0, tdst.width0); + max_height = MIN2(tsrc.height0, tdst.height0); + max_depth = MIN2(tsrc.array_size, tdst.array_size); + + num = do_partial_copies ? 
num_partial_copies : 1; + for (j = 0; j < num; j++) { + int width, height, depth; + int srcx, srcy, srcz, dstx, dsty, dstz; + struct pipe_box box; + unsigned old_num_draw_calls = sctx->num_draw_calls; + unsigned old_num_dma_calls = sctx->num_dma_calls; + unsigned old_num_cs_calls = sctx->num_compute_calls; + + if (!do_partial_copies) { + /* copy whole src to dst */ + width = max_width; + height = max_height; + depth = max_depth; + + srcx = srcy = srcz = dstx = dsty = dstz = 0; + } else { + /* random sub-rectangle copies from src to dst */ + depth = (rand() % max_depth) + 1; + srcz = rand() % (tsrc.array_size - depth + 1); + dstz = rand() % (tdst.array_size - depth + 1); + + /* special code path to hit the tiled partial copies */ + if (!ssrc->surface.is_linear && !sdst->surface.is_linear && rand() & 1) { + if (max_width < 8 || max_height < 8) + continue; + width = ((rand() % (max_width / 8)) + 1) * 8; + height = ((rand() % (max_height / 8)) + 1) * 8; + + srcx = rand() % (tsrc.width0 - width + 1) & ~0x7; + srcy = rand() % (tsrc.height0 - height + 1) & ~0x7; + + dstx = rand() % (tdst.width0 - width + 1) & ~0x7; + dsty = rand() % (tdst.height0 - height + 1) & ~0x7; + } else { + /* just make sure that it doesn't divide by zero */ + assert(max_width > 0 && max_height > 0); + + width = (rand() % max_width) + 1; + height = (rand() % max_height) + 1; + + srcx = rand() % (tsrc.width0 - width + 1); + srcy = rand() % (tsrc.height0 - height + 1); + + dstx = rand() % (tdst.width0 - width + 1); + dsty = rand() % (tdst.height0 - height + 1); + } + + /* special code path to hit out-of-bounds reads in L2T */ + if (ssrc->surface.is_linear && !sdst->surface.is_linear && rand() % 4 == 0) { + srcx = 0; + srcy = 0; + srcz = 0; + } + } + + /* GPU copy */ + u_box_3d(srcx, srcy, srcz, width, height, depth, &box); + sctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box); + + /* See which engine was used. */ + gfx_blits += sctx->num_draw_calls > old_num_draw_calls; + dma_blits += sctx->num_dma_calls > old_num_dma_calls; + cs_blits += sctx->num_compute_calls > old_num_cs_calls; + + /* CPU copy */ + util_copy_box(dst_cpu.ptr, tdst.format, dst_cpu.stride, dst_cpu.layer_stride, dstx, dsty, + dstz, width, height, depth, src_cpu.ptr, src_cpu.stride, + src_cpu.layer_stride, srcx, srcy, srcz); + } + + pass = compare_textures(ctx, dst, &dst_cpu); + if (pass) + num_pass++; + else + num_fail++; + + printf("BLITs: GFX = %2u, DMA = %2u, CS = %2u, %s [%u/%u]\n", gfx_blits, dma_blits, cs_blits, + pass ? 
"pass" : "fail", num_pass, num_pass + num_fail); + + /* cleanup */ + pipe_resource_reference(&src, NULL); + pipe_resource_reference(&dst, NULL); + free(src_cpu.ptr); + free(dst_cpu.ptr); + } + + ctx->destroy(ctx); + exit(0); } diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c index 4eec3d12459..116bfe69069 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c @@ -28,451 +28,444 @@ #include "si_pipe.h" #include "si_query.h" -#define MIN_SIZE 512 -#define MAX_SIZE (128 * 1024 * 1024) -#define SIZE_SHIFT 1 -#define NUM_RUNS 128 +#define MIN_SIZE 512 +#define MAX_SIZE (128 * 1024 * 1024) +#define SIZE_SHIFT 1 +#define NUM_RUNS 128 static double get_MBps_rate(unsigned num_bytes, unsigned ns) { - return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0); + return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0); } void si_test_dma_perf(struct si_screen *sscreen) { - struct pipe_screen *screen = &sscreen->b; - struct pipe_context *ctx = screen->context_create(screen, NULL, 0); - struct si_context *sctx = (struct si_context*)ctx; - const uint32_t clear_value = 0x12345678; - static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1}; - static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0}; + struct pipe_screen *screen = &sscreen->b; + struct pipe_context *ctx = screen->context_create(screen, NULL, 0); + struct si_context *sctx = (struct si_context *)ctx; + const uint32_t clear_value = 0x12345678; + static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1}; + static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0}; #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list) -#define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list)) - - static const char *method_str[] = { - "CP MC ", - "CP L2 ", - "CP L2 ", - "SDMA ", - }; - static const char *placement_str[] = { - /* Clear */ - "fill->VRAM", - "fill->GTT ", - /* Copy */ - "VRAM->VRAM", - "VRAM->GTT ", - "GTT ->VRAM", - }; - - printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n"); - printf("Heap ,Method ,L2p,Wa,"); - for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { - if (size >= 1024) - printf("%6uKB,", size / 1024); - else - printf(" %6uB,", size); - } - printf("\n"); - - /* results[log2(size)][placement][method][] */ - struct si_result { - bool is_valid; - bool is_cp; - bool is_sdma; - bool is_cs; - unsigned cache_policy; - unsigned dwords_per_thread; - unsigned waves_per_sh; - unsigned score; - unsigned index; /* index in results[x][y][index] */ - } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {}; - - /* Run benchmarks. */ - for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) { - bool is_copy = placement >= 2; - - printf("-----------,--------,---,--,"); - for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) - printf("--------,"); - printf("\n"); - - for (unsigned method = 0; method < NUM_METHODS; method++) { - bool test_cp = method <= 2; - bool test_sdma = method == 3; - bool test_cs = method >= 4; - unsigned cs_method = method - 4; - STATIC_ASSERT(L2_STREAM + 1 == L2_LRU); - unsigned cs_waves_per_sh = - test_cs ? cs_waves_per_sh_list[cs_method / (2*NUM_SHADERS)] : 0; - cs_method %= 2*NUM_SHADERS; - unsigned cache_policy = test_cp ? method % 3 : - test_cs ? 
L2_STREAM + (cs_method / NUM_SHADERS) : 0; - unsigned cs_dwords_per_thread = - test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0; - - if (test_sdma && !sctx->sdma_cs) - continue; - - if (sctx->chip_class == GFX6) { - /* GFX6 doesn't support CP DMA operations through L2. */ - if (test_cp && cache_policy != L2_BYPASS) - continue; - /* WAVES_PER_SH is in multiples of 16 on GFX6. */ - if (test_cs && cs_waves_per_sh % 16 != 0) - continue; - } - - printf("%s ,", placement_str[placement]); - if (test_cs) { - printf("CS x%-4u,%3s,", cs_dwords_per_thread, - cache_policy == L2_LRU ? "LRU" : - cache_policy == L2_STREAM ? "Str" : ""); - } else { - printf("%s,%3s,", method_str[method], - method == L2_LRU ? "LRU" : - method == L2_STREAM ? "Str" : ""); - } - if (test_cs && cs_waves_per_sh) - printf("%2u,", cs_waves_per_sh); - else - printf(" ,"); - - double score = 0; - for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { - /* Don't test bigger sizes if it's too slow. Print 0. */ - if (size >= 512*1024 && - score < 400 * (size / (4*1024*1024))) { - printf("%7.0f ,", 0.0); - continue; - } - - enum pipe_resource_usage dst_usage, src_usage; - struct pipe_resource *dst, *src; - struct pipe_query *q[NUM_RUNS]; - unsigned query_type = PIPE_QUERY_TIME_ELAPSED; - - if (test_sdma) { - if (sctx->chip_class == GFX6) - query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI; - else - query_type = SI_QUERY_TIME_ELAPSED_SDMA; - } - - if (placement == 0 || placement == 2 || placement == 4) - dst_usage = PIPE_USAGE_DEFAULT; - else - dst_usage = PIPE_USAGE_STREAM; - - if (placement == 2 || placement == 3) - src_usage = PIPE_USAGE_DEFAULT; - else - src_usage = PIPE_USAGE_STREAM; - - dst = pipe_buffer_create(screen, 0, dst_usage, size); - src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL; - - /* Run tests. */ - for (unsigned iter = 0; iter < NUM_RUNS; iter++) { - q[iter] = ctx->create_query(ctx, query_type, 0); - ctx->begin_query(ctx, q[iter]); - - if (test_cp) { - /* CP DMA */ - if (is_copy) { - si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, - SI_COHERENCY_NONE, cache_policy); - } else { - si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, - clear_value, 0, - SI_COHERENCY_NONE, cache_policy); - } - } else if (test_sdma) { - /* SDMA */ - if (is_copy) { - si_sdma_copy_buffer(sctx, dst, src, 0, 0, size); - } else { - si_sdma_clear_buffer(sctx, dst, 0, size, clear_value); - } - } else { - /* Compute */ - /* The memory accesses are coalesced, meaning that the 1st instruction writes - * the 1st contiguous block of data for the whole wave, the 2nd instruction - * writes the 2nd contiguous block of data, etc. 
- */ - unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4); - unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread; - unsigned dwords_per_wave = cs_dwords_per_thread * 64; - - unsigned num_dwords = size / 4; - unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); - - void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread, - cache_policy == L2_STREAM, is_copy); - - struct pipe_grid_info info = {}; - info.block[0] = MIN2(64, num_instructions); - info.block[1] = 1; - info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave); - info.grid[1] = 1; - info.grid[2] = 1; - - struct pipe_shader_buffer sb[2] = {}; - sb[0].buffer = dst; - sb[0].buffer_size = size; - - if (is_copy) { - sb[1].buffer = src; - sb[1].buffer_size = size; - } else { - for (unsigned i = 0; i < 4; i++) - sctx->cs_user_data[i] = clear_value; - } - - sctx->flags |= SI_CONTEXT_INV_VCACHE | - SI_CONTEXT_INV_SCACHE; - - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, - is_copy ? 2 : 1, sb, 0x1); - ctx->bind_compute_state(ctx, cs); - sctx->cs_max_waves_per_sh = cs_waves_per_sh; - - ctx->launch_grid(ctx, &info); - - ctx->bind_compute_state(ctx, NULL); - ctx->delete_compute_state(ctx, cs); - sctx->cs_max_waves_per_sh = 0; /* disable the limit */ - - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; - } - - /* Flush L2, so that we don't just test L2 cache performance. */ - if (!test_sdma) { - sctx->flags |= SI_CONTEXT_WB_L2; - sctx->emit_cache_flush(sctx); - } - - ctx->end_query(ctx, q[iter]); - ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC); - } - pipe_resource_reference(&dst, NULL); - pipe_resource_reference(&src, NULL); - - /* Get results. */ - uint64_t min = ~0ull, max = 0, total = 0; - - for (unsigned iter = 0; iter < NUM_RUNS; iter++) { - union pipe_query_result result; - - ctx->get_query_result(ctx, q[iter], true, &result); - ctx->destroy_query(ctx, q[iter]); - - min = MIN2(min, result.u64); - max = MAX2(max, result.u64); - total += result.u64; - } - - score = get_MBps_rate(size, total / (double)NUM_RUNS); - printf("%7.0f ,", score); - fflush(stdout); - - struct si_result *r = &results[util_logbase2(size)][placement][method]; - r->is_valid = true; - r->is_cp = test_cp; - r->is_sdma = test_sdma; - r->is_cs = test_cs; - r->cache_policy = cache_policy; - r->dwords_per_thread = cs_dwords_per_thread; - r->waves_per_sh = cs_waves_per_sh; - r->score = score; - r->index = method; - } - puts(""); - } - } - - puts(""); - puts("static struct si_method"); - printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool cached)\n", - sctx->screen->info.name); - puts("{"); - puts(" unsigned size = MIN2(size64, UINT_MAX);\n"); - - /* Analyze results and find the best methods. 
*/ - for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) { - if (placement == 0) - puts(" if (dst == RADEON_DOMAIN_VRAM) {"); - else if (placement == 1) - puts(" } else { /* GTT */"); - else if (placement == 2) { - puts("}"); - puts(""); - puts("static struct si_method"); - printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n", - sctx->screen->info.name); - printf(" uint64_t size64, bool async, bool cached)\n"); - puts("{"); - puts(" unsigned size = MIN2(size64, UINT_MAX);\n"); - puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {"); - } else if (placement == 3) - puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {"); - else - puts(" } else { /* GTT -> VRAM */"); - - for (unsigned mode = 0; mode < 3; mode++) { - bool async = mode == 0; - bool cached = mode == 1; - - if (async) - puts(" if (async) { /* SDMA or async compute */"); - else if (cached) - puts(" if (cached) { /* gfx ring */"); - else - puts(" } else { /* gfx ring - uncached */"); - - /* The list of best chosen methods. */ - struct si_result *methods[32]; - unsigned method_max_size[32]; - unsigned num_methods = 0; - - for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { - /* Find the best method. */ - struct si_result *best = NULL; - - for (unsigned i = 0; i < NUM_METHODS; i++) { - struct si_result *r = &results[util_logbase2(size)][placement][i]; - - if (!r->is_valid) - continue; - - /* Ban CP DMA clears via MC on <= GFX8. They are super slow - * on GTT, which we can get due to BO evictions. - */ - if (sctx->chip_class <= GFX8 && placement == 1 && - r->is_cp && r->cache_policy == L2_BYPASS) - continue; - - if (async) { - /* The following constraints for compute IBs try to limit - * resource usage so as not to decrease the performance - * of gfx IBs too much. - */ - - /* Don't use CP DMA on asynchronous rings, because - * the engine is shared with gfx IBs. - */ - if (r->is_cp) - continue; - - /* Don't use L2 caching on asynchronous rings to minimize - * L2 usage. - */ - if (r->cache_policy == L2_LRU) - continue; - - /* Asynchronous compute recommends waves_per_sh != 0 - * to limit CU usage. */ - if (r->is_cs && r->waves_per_sh == 0) - continue; - } else { - /* SDMA is always asynchronous */ - if (r->is_sdma) - continue; - - if (cached && r->cache_policy == L2_BYPASS) - continue; - if (!cached && r->cache_policy == L2_LRU) - continue; - } - - if (!best) { - best = r; - continue; - } - - /* Assume some measurement error. Earlier methods occupy fewer - * resources, so the next method is always more greedy, and we - * don't want to select it due to a measurement error. - */ - double min_improvement = 1.03; - - if (best->score * min_improvement < r->score) - best = r; - } - - if (num_methods > 0) { - unsigned prev_index = num_methods - 1; - struct si_result *prev = methods[prev_index]; - struct si_result *prev_this_size = &results[util_logbase2(size)][placement][prev->index]; - - /* If the best one is also the best for the previous size, - * just bump the size for the previous one. - * - * If there is no best, it means all methods were too slow - * for this size and were not tested. Use the best one for - * the previous size. 
- */ - if (!best || - /* If it's the same method as for the previous size: */ - (prev->is_cp == best->is_cp && - prev->is_sdma == best->is_sdma && - prev->is_cs == best->is_cs && - prev->cache_policy == best->cache_policy && - prev->dwords_per_thread == best->dwords_per_thread && - prev->waves_per_sh == best->waves_per_sh) || - /* If the method for the previous size is also the best - * for this size: */ - (prev_this_size->is_valid && - prev_this_size->score * 1.03 > best->score)) { - method_max_size[prev_index] = size; - continue; - } - } - - /* Add it to the list. */ - assert(num_methods < ARRAY_SIZE(methods)); - methods[num_methods] = best; - method_max_size[num_methods] = size; - num_methods++; - } - - for (unsigned i = 0; i < num_methods; i++) { - struct si_result *best = methods[i]; - unsigned size = method_max_size[i]; - - /* The size threshold is between the current benchmarked - * size and the next benchmarked size. */ - if (i < num_methods - 1) - printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2); - else if (i > 0) - printf(" else "); - else - printf(" "); - printf("return "); - - assert(best); - if (best->is_cp) { - printf("CP_DMA(%s);\n", - best->cache_policy == L2_BYPASS ? "L2_BYPASS" : - best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM"); - } - if (best->is_sdma) - printf("SDMA;\n"); - if (best->is_cs) { - printf("COMPUTE(%s, %u, %u);\n", - best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM", - best->dwords_per_thread, - best->waves_per_sh); - } - } - } - puts(" }"); - } - puts(" }"); - puts("}"); - - ctx->destroy(ctx); - exit(0); +#define NUM_METHODS (4 + 2 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list)) + + static const char *method_str[] = { + "CP MC ", + "CP L2 ", + "CP L2 ", + "SDMA ", + }; + static const char *placement_str[] = { + /* Clear */ + "fill->VRAM", + "fill->GTT ", + /* Copy */ + "VRAM->VRAM", + "VRAM->GTT ", + "GTT ->VRAM", + }; + + printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n"); + printf("Heap ,Method ,L2p,Wa,"); + for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { + if (size >= 1024) + printf("%6uKB,", size / 1024); + else + printf(" %6uB,", size); + } + printf("\n"); + + /* results[log2(size)][placement][method][] */ + struct si_result { + bool is_valid; + bool is_cp; + bool is_sdma; + bool is_cs; + unsigned cache_policy; + unsigned dwords_per_thread; + unsigned waves_per_sh; + unsigned score; + unsigned index; /* index in results[x][y][index] */ + } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {}; + + /* Run benchmarks. */ + for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) { + bool is_copy = placement >= 2; + + printf("-----------,--------,---,--,"); + for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) + printf("--------,"); + printf("\n"); + + for (unsigned method = 0; method < NUM_METHODS; method++) { + bool test_cp = method <= 2; + bool test_sdma = method == 3; + bool test_cs = method >= 4; + unsigned cs_method = method - 4; + STATIC_ASSERT(L2_STREAM + 1 == L2_LRU); + unsigned cs_waves_per_sh = + test_cs ? cs_waves_per_sh_list[cs_method / (2 * NUM_SHADERS)] : 0; + cs_method %= 2 * NUM_SHADERS; + unsigned cache_policy = + test_cp ? method % 3 : test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0; + unsigned cs_dwords_per_thread = + test_cs ? 
cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0; + + if (test_sdma && !sctx->sdma_cs) + continue; + + if (sctx->chip_class == GFX6) { + /* GFX6 doesn't support CP DMA operations through L2. */ + if (test_cp && cache_policy != L2_BYPASS) + continue; + /* WAVES_PER_SH is in multiples of 16 on GFX6. */ + if (test_cs && cs_waves_per_sh % 16 != 0) + continue; + } + + printf("%s ,", placement_str[placement]); + if (test_cs) { + printf("CS x%-4u,%3s,", cs_dwords_per_thread, + cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : ""); + } else { + printf("%s,%3s,", method_str[method], + method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : ""); + } + if (test_cs && cs_waves_per_sh) + printf("%2u,", cs_waves_per_sh); + else + printf(" ,"); + + double score = 0; + for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { + /* Don't test bigger sizes if it's too slow. Print 0. */ + if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) { + printf("%7.0f ,", 0.0); + continue; + } + + enum pipe_resource_usage dst_usage, src_usage; + struct pipe_resource *dst, *src; + struct pipe_query *q[NUM_RUNS]; + unsigned query_type = PIPE_QUERY_TIME_ELAPSED; + + if (test_sdma) { + if (sctx->chip_class == GFX6) + query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI; + else + query_type = SI_QUERY_TIME_ELAPSED_SDMA; + } + + if (placement == 0 || placement == 2 || placement == 4) + dst_usage = PIPE_USAGE_DEFAULT; + else + dst_usage = PIPE_USAGE_STREAM; + + if (placement == 2 || placement == 3) + src_usage = PIPE_USAGE_DEFAULT; + else + src_usage = PIPE_USAGE_STREAM; + + dst = pipe_buffer_create(screen, 0, dst_usage, size); + src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL; + + /* Run tests. */ + for (unsigned iter = 0; iter < NUM_RUNS; iter++) { + q[iter] = ctx->create_query(ctx, query_type, 0); + ctx->begin_query(ctx, q[iter]); + + if (test_cp) { + /* CP DMA */ + if (is_copy) { + si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, SI_COHERENCY_NONE, + cache_policy); + } else { + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, clear_value, 0, + SI_COHERENCY_NONE, cache_policy); + } + } else if (test_sdma) { + /* SDMA */ + if (is_copy) { + si_sdma_copy_buffer(sctx, dst, src, 0, 0, size); + } else { + si_sdma_clear_buffer(sctx, dst, 0, size, clear_value); + } + } else { + /* Compute */ + /* The memory accesses are coalesced, meaning that the 1st instruction writes + * the 1st contiguous block of data for the whole wave, the 2nd instruction + * writes the 2nd contiguous block of data, etc. 
+ */ + unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4); + unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread; + unsigned dwords_per_wave = cs_dwords_per_thread * 64; + + unsigned num_dwords = size / 4; + unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); + + void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread, + cache_policy == L2_STREAM, is_copy); + + struct pipe_grid_info info = {}; + info.block[0] = MIN2(64, num_instructions); + info.block[1] = 1; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave); + info.grid[1] = 1; + info.grid[2] = 1; + + struct pipe_shader_buffer sb[2] = {}; + sb[0].buffer = dst; + sb[0].buffer_size = size; + + if (is_copy) { + sb[1].buffer = src; + sb[1].buffer_size = size; + } else { + for (unsigned i = 0; i < 4; i++) + sctx->cs_user_data[i] = clear_value; + } + + sctx->flags |= SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_SCACHE; + + ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1); + ctx->bind_compute_state(ctx, cs); + sctx->cs_max_waves_per_sh = cs_waves_per_sh; + + ctx->launch_grid(ctx, &info); + + ctx->bind_compute_state(ctx, NULL); + ctx->delete_compute_state(ctx, cs); + sctx->cs_max_waves_per_sh = 0; /* disable the limit */ + + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + } + + /* Flush L2, so that we don't just test L2 cache performance. */ + if (!test_sdma) { + sctx->flags |= SI_CONTEXT_WB_L2; + sctx->emit_cache_flush(sctx); + } + + ctx->end_query(ctx, q[iter]); + ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC); + } + pipe_resource_reference(&dst, NULL); + pipe_resource_reference(&src, NULL); + + /* Get results. */ + uint64_t min = ~0ull, max = 0, total = 0; + + for (unsigned iter = 0; iter < NUM_RUNS; iter++) { + union pipe_query_result result; + + ctx->get_query_result(ctx, q[iter], true, &result); + ctx->destroy_query(ctx, q[iter]); + + min = MIN2(min, result.u64); + max = MAX2(max, result.u64); + total += result.u64; + } + + score = get_MBps_rate(size, total / (double)NUM_RUNS); + printf("%7.0f ,", score); + fflush(stdout); + + struct si_result *r = &results[util_logbase2(size)][placement][method]; + r->is_valid = true; + r->is_cp = test_cp; + r->is_sdma = test_sdma; + r->is_cs = test_cs; + r->cache_policy = cache_policy; + r->dwords_per_thread = cs_dwords_per_thread; + r->waves_per_sh = cs_waves_per_sh; + r->score = score; + r->index = method; + } + puts(""); + } + } + + puts(""); + puts("static struct si_method"); + printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool " + "cached)\n", + sctx->screen->info.name); + puts("{"); + puts(" unsigned size = MIN2(size64, UINT_MAX);\n"); + + /* Analyze results and find the best methods. 
*/ + for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) { + if (placement == 0) + puts(" if (dst == RADEON_DOMAIN_VRAM) {"); + else if (placement == 1) + puts(" } else { /* GTT */"); + else if (placement == 2) { + puts("}"); + puts(""); + puts("static struct si_method"); + printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n", + sctx->screen->info.name); + printf(" uint64_t size64, bool async, bool cached)\n"); + puts("{"); + puts(" unsigned size = MIN2(size64, UINT_MAX);\n"); + puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {"); + } else if (placement == 3) + puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {"); + else + puts(" } else { /* GTT -> VRAM */"); + + for (unsigned mode = 0; mode < 3; mode++) { + bool async = mode == 0; + bool cached = mode == 1; + + if (async) + puts(" if (async) { /* SDMA or async compute */"); + else if (cached) + puts(" if (cached) { /* gfx ring */"); + else + puts(" } else { /* gfx ring - uncached */"); + + /* The list of best chosen methods. */ + struct si_result *methods[32]; + unsigned method_max_size[32]; + unsigned num_methods = 0; + + for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) { + /* Find the best method. */ + struct si_result *best = NULL; + + for (unsigned i = 0; i < NUM_METHODS; i++) { + struct si_result *r = &results[util_logbase2(size)][placement][i]; + + if (!r->is_valid) + continue; + + /* Ban CP DMA clears via MC on <= GFX8. They are super slow + * on GTT, which we can get due to BO evictions. + */ + if (sctx->chip_class <= GFX8 && placement == 1 && r->is_cp && + r->cache_policy == L2_BYPASS) + continue; + + if (async) { + /* The following constraints for compute IBs try to limit + * resource usage so as not to decrease the performance + * of gfx IBs too much. + */ + + /* Don't use CP DMA on asynchronous rings, because + * the engine is shared with gfx IBs. + */ + if (r->is_cp) + continue; + + /* Don't use L2 caching on asynchronous rings to minimize + * L2 usage. + */ + if (r->cache_policy == L2_LRU) + continue; + + /* Asynchronous compute recommends waves_per_sh != 0 + * to limit CU usage. */ + if (r->is_cs && r->waves_per_sh == 0) + continue; + } else { + /* SDMA is always asynchronous */ + if (r->is_sdma) + continue; + + if (cached && r->cache_policy == L2_BYPASS) + continue; + if (!cached && r->cache_policy == L2_LRU) + continue; + } + + if (!best) { + best = r; + continue; + } + + /* Assume some measurement error. Earlier methods occupy fewer + * resources, so the next method is always more greedy, and we + * don't want to select it due to a measurement error. + */ + double min_improvement = 1.03; + + if (best->score * min_improvement < r->score) + best = r; + } + + if (num_methods > 0) { + unsigned prev_index = num_methods - 1; + struct si_result *prev = methods[prev_index]; + struct si_result *prev_this_size = + &results[util_logbase2(size)][placement][prev->index]; + + /* If the best one is also the best for the previous size, + * just bump the size for the previous one. + * + * If there is no best, it means all methods were too slow + * for this size and were not tested. Use the best one for + * the previous size. 
+ */ + if (!best || + /* If it's the same method as for the previous size: */ + (prev->is_cp == best->is_cp && prev->is_sdma == best->is_sdma && + prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy && + prev->dwords_per_thread == best->dwords_per_thread && + prev->waves_per_sh == best->waves_per_sh) || + /* If the method for the previous size is also the best + * for this size: */ + (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) { + method_max_size[prev_index] = size; + continue; + } + } + + /* Add it to the list. */ + assert(num_methods < ARRAY_SIZE(methods)); + methods[num_methods] = best; + method_max_size[num_methods] = size; + num_methods++; + } + + for (unsigned i = 0; i < num_methods; i++) { + struct si_result *best = methods[i]; + unsigned size = method_max_size[i]; + + /* The size threshold is between the current benchmarked + * size and the next benchmarked size. */ + if (i < num_methods - 1) + printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2); + else if (i > 0) + printf(" else "); + else + printf(" "); + printf("return "); + + assert(best); + if (best->is_cp) { + printf("CP_DMA(%s);\n", + best->cache_policy == L2_BYPASS + ? "L2_BYPASS" + : best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM"); + } + if (best->is_sdma) + printf("SDMA;\n"); + if (best->is_cs) { + printf("COMPUTE(%s, %u, %u);\n", + best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM", + best->dwords_per_thread, best->waves_per_sh); + } + } + } + puts(" }"); + } + puts(" }"); + puts("}"); + + ctx->destroy(ctx); + exit(0); } diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c index bcf9187082b..4f7744a887d 100644 --- a/src/gallium/drivers/radeonsi/si_texture.c +++ b/src/gallium/drivers/radeonsi/si_texture.c @@ -23,462 +23,419 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "drm-uapi/drm_fourcc.h" #include "si_pipe.h" #include "si_query.h" +#include "sid.h" +#include "state_tracker/drm_driver.h" #include "util/format/u_format.h" +#include "util/os_time.h" #include "util/u_log.h" #include "util/u_memory.h" #include "util/u_pack_color.h" #include "util/u_resource.h" #include "util/u_surface.h" #include "util/u_transfer.h" -#include "util/os_time.h" + #include #include -#include "state_tracker/drm_driver.h" -#include "sid.h" -#include "amd/addrlib/inc/addrinterface.h" -#include "drm-uapi/drm_fourcc.h" -static enum radeon_surf_mode -si_choose_tiling(struct si_screen *sscreen, - const struct pipe_resource *templ, bool tc_compatible_htile); +#include "amd/addrlib/inc/addrinterface.h" +static enum radeon_surf_mode si_choose_tiling(struct si_screen *sscreen, + const struct pipe_resource *templ, + bool tc_compatible_htile); -bool si_prepare_for_dma_blit(struct si_context *sctx, - struct si_texture *dst, - unsigned dst_level, unsigned dstx, - unsigned dsty, unsigned dstz, - struct si_texture *src, - unsigned src_level, - const struct pipe_box *src_box) +bool si_prepare_for_dma_blit(struct si_context *sctx, struct si_texture *dst, unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, struct si_texture *src, + unsigned src_level, const struct pipe_box *src_box) { - if (!sctx->sdma_cs) - return false; - - if (dst->surface.bpe != src->surface.bpe) - return false; - - /* MSAA: Blits don't exist in the real world. */ - if (src->buffer.b.b.nr_samples > 1 || - dst->buffer.b.b.nr_samples > 1) - return false; - - /* Depth-stencil surfaces: - * When dst is linear, the DB->CB copy preserves HTILE. 
- * When dst is tiled, the 3D path must be used to update HTILE. - */ - if (src->is_depth || dst->is_depth) - return false; - - /* DCC as: - * src: Use the 3D path. DCC decompression is expensive. - * dst: Use the 3D path to compress the pixels with DCC. - */ - if (vi_dcc_enabled(src, src_level) || - vi_dcc_enabled(dst, dst_level)) - return false; - - /* CMASK as: - * src: Both texture and SDMA paths need decompression. Use SDMA. - * dst: If overwriting the whole texture, discard CMASK and use - * SDMA. Otherwise, use the 3D path. - */ - if (dst->cmask_buffer && dst->dirty_level_mask & (1 << dst_level)) { - /* The CMASK clear is only enabled for the first level. */ - assert(dst_level == 0); - if (!util_texrange_covers_whole_level(&dst->buffer.b.b, dst_level, - dstx, dsty, dstz, src_box->width, - src_box->height, src_box->depth)) - return false; - - si_texture_discard_cmask(sctx->screen, dst); - } - - /* All requirements are met. Prepare textures for SDMA. */ - if (src->cmask_buffer && src->dirty_level_mask & (1 << src_level)) - sctx->b.flush_resource(&sctx->b, &src->buffer.b.b); - - assert(!(src->dirty_level_mask & (1 << src_level))); - assert(!(dst->dirty_level_mask & (1 << dst_level))); - - return true; + if (!sctx->sdma_cs) + return false; + + if (dst->surface.bpe != src->surface.bpe) + return false; + + /* MSAA: Blits don't exist in the real world. */ + if (src->buffer.b.b.nr_samples > 1 || dst->buffer.b.b.nr_samples > 1) + return false; + + /* Depth-stencil surfaces: + * When dst is linear, the DB->CB copy preserves HTILE. + * When dst is tiled, the 3D path must be used to update HTILE. + */ + if (src->is_depth || dst->is_depth) + return false; + + /* DCC as: + * src: Use the 3D path. DCC decompression is expensive. + * dst: Use the 3D path to compress the pixels with DCC. + */ + if (vi_dcc_enabled(src, src_level) || vi_dcc_enabled(dst, dst_level)) + return false; + + /* CMASK as: + * src: Both texture and SDMA paths need decompression. Use SDMA. + * dst: If overwriting the whole texture, discard CMASK and use + * SDMA. Otherwise, use the 3D path. + */ + if (dst->cmask_buffer && dst->dirty_level_mask & (1 << dst_level)) { + /* The CMASK clear is only enabled for the first level. */ + assert(dst_level == 0); + if (!util_texrange_covers_whole_level(&dst->buffer.b.b, dst_level, dstx, dsty, dstz, + src_box->width, src_box->height, src_box->depth)) + return false; + + si_texture_discard_cmask(sctx->screen, dst); + } + + /* All requirements are met. Prepare textures for SDMA. */ + if (src->cmask_buffer && src->dirty_level_mask & (1 << src_level)) + sctx->b.flush_resource(&sctx->b, &src->buffer.b.b); + + assert(!(src->dirty_level_mask & (1 << src_level))); + assert(!(dst->dirty_level_mask & (1 << dst_level))); + + return true; } /* Same as resource_copy_region, except that both upsampling and downsampling are allowed. 
*/ -static void si_copy_region_with_blit(struct pipe_context *pipe, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box) +static void si_copy_region_with_blit(struct pipe_context *pipe, struct pipe_resource *dst, + unsigned dst_level, unsigned dstx, unsigned dsty, + unsigned dstz, struct pipe_resource *src, unsigned src_level, + const struct pipe_box *src_box) { - struct pipe_blit_info blit; - - memset(&blit, 0, sizeof(blit)); - blit.src.resource = src; - blit.src.format = src->format; - blit.src.level = src_level; - blit.src.box = *src_box; - blit.dst.resource = dst; - blit.dst.format = dst->format; - blit.dst.level = dst_level; - blit.dst.box.x = dstx; - blit.dst.box.y = dsty; - blit.dst.box.z = dstz; - blit.dst.box.width = src_box->width; - blit.dst.box.height = src_box->height; - blit.dst.box.depth = src_box->depth; - blit.mask = util_format_get_mask(dst->format); - blit.filter = PIPE_TEX_FILTER_NEAREST; - - if (blit.mask) { - pipe->blit(pipe, &blit); - } + struct pipe_blit_info blit; + + memset(&blit, 0, sizeof(blit)); + blit.src.resource = src; + blit.src.format = src->format; + blit.src.level = src_level; + blit.src.box = *src_box; + blit.dst.resource = dst; + blit.dst.format = dst->format; + blit.dst.level = dst_level; + blit.dst.box.x = dstx; + blit.dst.box.y = dsty; + blit.dst.box.z = dstz; + blit.dst.box.width = src_box->width; + blit.dst.box.height = src_box->height; + blit.dst.box.depth = src_box->depth; + blit.mask = util_format_get_mask(dst->format); + blit.filter = PIPE_TEX_FILTER_NEAREST; + + if (blit.mask) { + pipe->blit(pipe, &blit); + } } /* Copy from a full GPU texture to a transfer's staging one. */ static void si_copy_to_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer) { - struct si_context *sctx = (struct si_context*)ctx; - struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer; - struct pipe_resource *dst = &stransfer->staging->b.b; - struct pipe_resource *src = transfer->resource; - - if (src->nr_samples > 1 || ((struct si_texture*)src)->is_depth) { - si_copy_region_with_blit(ctx, dst, 0, 0, 0, 0, - src, transfer->level, &transfer->box); - return; - } - - sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level, - &transfer->box); + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_transfer *transfer = (struct pipe_transfer *)stransfer; + struct pipe_resource *dst = &stransfer->staging->b.b; + struct pipe_resource *src = transfer->resource; + + if (src->nr_samples > 1 || ((struct si_texture *)src)->is_depth) { + si_copy_region_with_blit(ctx, dst, 0, 0, 0, 0, src, transfer->level, &transfer->box); + return; + } + + sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level, &transfer->box); } /* Copy from a transfer's staging texture to a full GPU one. 
*/ static void si_copy_from_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer) { - struct si_context *sctx = (struct si_context*)ctx; - struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer; - struct pipe_resource *dst = transfer->resource; - struct pipe_resource *src = &stransfer->staging->b.b; - struct pipe_box sbox; - - u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox); - - if (dst->nr_samples > 1 || ((struct si_texture*)dst)->is_depth) { - si_copy_region_with_blit(ctx, dst, transfer->level, - transfer->box.x, transfer->box.y, transfer->box.z, - src, 0, &sbox); - return; - } - - if (util_format_is_compressed(dst->format)) { - sbox.width = util_format_get_nblocksx(dst->format, sbox.width); - sbox.height = util_format_get_nblocksx(dst->format, sbox.height); - } - - sctx->dma_copy(ctx, dst, transfer->level, - transfer->box.x, transfer->box.y, transfer->box.z, - src, 0, &sbox); + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_transfer *transfer = (struct pipe_transfer *)stransfer; + struct pipe_resource *dst = transfer->resource; + struct pipe_resource *src = &stransfer->staging->b.b; + struct pipe_box sbox; + + u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox); + + if (dst->nr_samples > 1 || ((struct si_texture *)dst)->is_depth) { + si_copy_region_with_blit(ctx, dst, transfer->level, transfer->box.x, transfer->box.y, + transfer->box.z, src, 0, &sbox); + return; + } + + if (util_format_is_compressed(dst->format)) { + sbox.width = util_format_get_nblocksx(dst->format, sbox.width); + sbox.height = util_format_get_nblocksx(dst->format, sbox.height); + } + + sctx->dma_copy(ctx, dst, transfer->level, transfer->box.x, transfer->box.y, transfer->box.z, src, + 0, &sbox); } -static unsigned si_texture_get_offset(struct si_screen *sscreen, - struct si_texture *tex, unsigned level, - const struct pipe_box *box, - unsigned *stride, - unsigned *layer_stride) +static unsigned si_texture_get_offset(struct si_screen *sscreen, struct si_texture *tex, + unsigned level, const struct pipe_box *box, unsigned *stride, + unsigned *layer_stride) { - if (sscreen->info.chip_class >= GFX9) { - *stride = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe; - *layer_stride = tex->surface.u.gfx9.surf_slice_size; - - if (!box) - return 0; - - /* Each texture is an array of slices. Each slice is an array - * of mipmap levels. */ - return tex->surface.u.gfx9.surf_offset + - box->z * tex->surface.u.gfx9.surf_slice_size + - tex->surface.u.gfx9.offset[level] + - (box->y / tex->surface.blk_h * - tex->surface.u.gfx9.surf_pitch + - box->x / tex->surface.blk_w) * tex->surface.bpe; - } else { - *stride = tex->surface.u.legacy.level[level].nblk_x * - tex->surface.bpe; - assert((uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 <= UINT_MAX); - *layer_stride = (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4; - - if (!box) - return tex->surface.u.legacy.level[level].offset; - - /* Each texture is an array of mipmap levels. Each level is - * an array of slices. 
*/ - return tex->surface.u.legacy.level[level].offset + - box->z * (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 + - (box->y / tex->surface.blk_h * - tex->surface.u.legacy.level[level].nblk_x + - box->x / tex->surface.blk_w) * tex->surface.bpe; - } + if (sscreen->info.chip_class >= GFX9) { + *stride = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe; + *layer_stride = tex->surface.u.gfx9.surf_slice_size; + + if (!box) + return 0; + + /* Each texture is an array of slices. Each slice is an array + * of mipmap levels. */ + return tex->surface.u.gfx9.surf_offset + box->z * tex->surface.u.gfx9.surf_slice_size + + tex->surface.u.gfx9.offset[level] + + (box->y / tex->surface.blk_h * tex->surface.u.gfx9.surf_pitch + + box->x / tex->surface.blk_w) * + tex->surface.bpe; + } else { + *stride = tex->surface.u.legacy.level[level].nblk_x * tex->surface.bpe; + assert((uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 <= UINT_MAX); + *layer_stride = (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4; + + if (!box) + return tex->surface.u.legacy.level[level].offset; + + /* Each texture is an array of mipmap levels. Each level is + * an array of slices. */ + return tex->surface.u.legacy.level[level].offset + + box->z * (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 + + (box->y / tex->surface.blk_h * tex->surface.u.legacy.level[level].nblk_x + + box->x / tex->surface.blk_w) * + tex->surface.bpe; + } } -static int si_init_surface(struct si_screen *sscreen, - struct radeon_surf *surface, - const struct pipe_resource *ptex, - enum radeon_surf_mode array_mode, - unsigned pitch_in_bytes_override, - bool is_imported, - bool is_scanout, - bool is_flushed_depth, - bool tc_compatible_htile) +static int si_init_surface(struct si_screen *sscreen, struct radeon_surf *surface, + const struct pipe_resource *ptex, enum radeon_surf_mode array_mode, + unsigned pitch_in_bytes_override, bool is_imported, bool is_scanout, + bool is_flushed_depth, bool tc_compatible_htile) { - const struct util_format_description *desc = - util_format_description(ptex->format); - bool is_depth, is_stencil; - int r; - unsigned bpe, flags = 0; - - is_depth = util_format_has_depth(desc); - is_stencil = util_format_has_stencil(desc); - - if (!is_flushed_depth && - ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { - bpe = 4; /* stencil is allocated separately */ - } else { - bpe = util_format_get_blocksize(ptex->format); - assert(util_is_power_of_two_or_zero(bpe)); - } - - if (!is_flushed_depth && is_depth) { - flags |= RADEON_SURF_ZBUFFER; - - if (sscreen->debug_flags & DBG(NO_HYPERZ)) { - flags |= RADEON_SURF_NO_HTILE; - } else if (tc_compatible_htile && - (sscreen->info.chip_class >= GFX9 || - array_mode == RADEON_SURF_MODE_2D)) { - /* TC-compatible HTILE only supports Z32_FLOAT. - * GFX9 also supports Z16_UNORM. - * On GFX8, promote Z16 to Z32. DB->CB copies will convert - * the format for transfers. - */ - if (sscreen->info.chip_class == GFX8) - bpe = 4; - - flags |= RADEON_SURF_TC_COMPATIBLE_HTILE; - } - - if (is_stencil) - flags |= RADEON_SURF_SBUFFER; - } - - if (sscreen->info.chip_class >= GFX8 && - (ptex->flags & SI_RESOURCE_FLAG_DISABLE_DCC || - ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT || - (ptex->nr_samples >= 2 && !sscreen->dcc_msaa_allowed))) - flags |= RADEON_SURF_DISABLE_DCC; - - /* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. 
*/ - if (sscreen->info.family == CHIP_STONEY && - bpe == 16 && ptex->nr_samples >= 2) - flags |= RADEON_SURF_DISABLE_DCC; - - /* GFX8: DCC clear for 4x and 8x MSAA array textures unimplemented. */ - if (sscreen->info.chip_class == GFX8 && - ptex->nr_storage_samples >= 4 && - ptex->array_size > 1) - flags |= RADEON_SURF_DISABLE_DCC; - - /* GFX9: DCC clear for 4x and 8x MSAA textures unimplemented. */ - if (sscreen->info.chip_class == GFX9 && - (ptex->nr_storage_samples >= 4 || - (sscreen->info.family == CHIP_RAVEN && - ptex->nr_storage_samples >= 2 && bpe < 4))) - flags |= RADEON_SURF_DISABLE_DCC; - - /* TODO: GFX10: DCC causes corruption with MSAA. */ - if (sscreen->info.chip_class >= GFX10 && - ptex->nr_storage_samples >= 2) - flags |= RADEON_SURF_DISABLE_DCC; - - /* Shared textures must always set up DCC. - * If it's not present, it will be disabled by - * si_get_opaque_metadata later. - */ - if (!is_imported && (sscreen->debug_flags & DBG(NO_DCC))) - flags |= RADEON_SURF_DISABLE_DCC; - - if (is_scanout) { - /* This should catch bugs in gallium users setting incorrect flags. */ - assert(ptex->nr_samples <= 1 && - ptex->array_size == 1 && - ptex->depth0 == 1 && - ptex->last_level == 0 && - !(flags & RADEON_SURF_Z_OR_SBUFFER)); - - flags |= RADEON_SURF_SCANOUT; - } - - if (ptex->bind & PIPE_BIND_SHARED) - flags |= RADEON_SURF_SHAREABLE; - if (is_imported) - flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE; - if (!(ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) - flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE; - if (sscreen->debug_flags & DBG(NO_FMASK)) - flags |= RADEON_SURF_NO_FMASK; - - if (sscreen->info.chip_class == GFX9 && - (ptex->flags & SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE)) { - flags |= RADEON_SURF_FORCE_MICRO_TILE_MODE; - surface->micro_tile_mode = SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(ptex->flags); - } - - if (sscreen->info.chip_class >= GFX10 && - (ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) { - flags |= RADEON_SURF_FORCE_SWIZZLE_MODE; - surface->u.gfx9.surf.swizzle_mode = ADDR_SW_64KB_R_X; - } - - r = sscreen->ws->surface_init(sscreen->ws, ptex, flags, bpe, - array_mode, surface); - if (r) { - return r; - } - - unsigned pitch = pitch_in_bytes_override / bpe; - - if (sscreen->info.chip_class >= GFX9) { - if (pitch) { - surface->u.gfx9.surf_pitch = pitch; - if (ptex->last_level == 0) - surface->u.gfx9.surf.epitch = pitch - 1; - surface->u.gfx9.surf_slice_size = - (uint64_t)pitch * surface->u.gfx9.surf_height * bpe; - } - } else { - if (pitch) { - surface->u.legacy.level[0].nblk_x = pitch; - surface->u.legacy.level[0].slice_size_dw = - ((uint64_t)pitch * surface->u.legacy.level[0].nblk_y * bpe) / 4; - } - } - return 0; + const struct util_format_description *desc = util_format_description(ptex->format); + bool is_depth, is_stencil; + int r; + unsigned bpe, flags = 0; + + is_depth = util_format_has_depth(desc); + is_stencil = util_format_has_stencil(desc); + + if (!is_flushed_depth && ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { + bpe = 4; /* stencil is allocated separately */ + } else { + bpe = util_format_get_blocksize(ptex->format); + assert(util_is_power_of_two_or_zero(bpe)); + } + + if (!is_flushed_depth && is_depth) { + flags |= RADEON_SURF_ZBUFFER; + + if (sscreen->debug_flags & DBG(NO_HYPERZ)) { + flags |= RADEON_SURF_NO_HTILE; + } else if (tc_compatible_htile && + (sscreen->info.chip_class >= GFX9 || array_mode == RADEON_SURF_MODE_2D)) { + /* TC-compatible HTILE only supports Z32_FLOAT. + * GFX9 also supports Z16_UNORM. + * On GFX8, promote Z16 to Z32. 
DB->CB copies will convert + * the format for transfers. + */ + if (sscreen->info.chip_class == GFX8) + bpe = 4; + + flags |= RADEON_SURF_TC_COMPATIBLE_HTILE; + } + + if (is_stencil) + flags |= RADEON_SURF_SBUFFER; + } + + if (sscreen->info.chip_class >= GFX8 && + (ptex->flags & SI_RESOURCE_FLAG_DISABLE_DCC || ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT || + (ptex->nr_samples >= 2 && !sscreen->dcc_msaa_allowed))) + flags |= RADEON_SURF_DISABLE_DCC; + + /* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. */ + if (sscreen->info.family == CHIP_STONEY && bpe == 16 && ptex->nr_samples >= 2) + flags |= RADEON_SURF_DISABLE_DCC; + + /* GFX8: DCC clear for 4x and 8x MSAA array textures unimplemented. */ + if (sscreen->info.chip_class == GFX8 && ptex->nr_storage_samples >= 4 && ptex->array_size > 1) + flags |= RADEON_SURF_DISABLE_DCC; + + /* GFX9: DCC clear for 4x and 8x MSAA textures unimplemented. */ + if (sscreen->info.chip_class == GFX9 && + (ptex->nr_storage_samples >= 4 || + (sscreen->info.family == CHIP_RAVEN && ptex->nr_storage_samples >= 2 && bpe < 4))) + flags |= RADEON_SURF_DISABLE_DCC; + + /* TODO: GFX10: DCC causes corruption with MSAA. */ + if (sscreen->info.chip_class >= GFX10 && ptex->nr_storage_samples >= 2) + flags |= RADEON_SURF_DISABLE_DCC; + + /* Shared textures must always set up DCC. + * If it's not present, it will be disabled by + * si_get_opaque_metadata later. + */ + if (!is_imported && (sscreen->debug_flags & DBG(NO_DCC))) + flags |= RADEON_SURF_DISABLE_DCC; + + if (is_scanout) { + /* This should catch bugs in gallium users setting incorrect flags. */ + assert(ptex->nr_samples <= 1 && ptex->array_size == 1 && ptex->depth0 == 1 && + ptex->last_level == 0 && !(flags & RADEON_SURF_Z_OR_SBUFFER)); + + flags |= RADEON_SURF_SCANOUT; + } + + if (ptex->bind & PIPE_BIND_SHARED) + flags |= RADEON_SURF_SHAREABLE; + if (is_imported) + flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE; + if (!(ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) + flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE; + if (sscreen->debug_flags & DBG(NO_FMASK)) + flags |= RADEON_SURF_NO_FMASK; + + if (sscreen->info.chip_class == GFX9 && (ptex->flags & SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE)) { + flags |= RADEON_SURF_FORCE_MICRO_TILE_MODE; + surface->micro_tile_mode = SI_RESOURCE_FLAG_MICRO_TILE_MODE_GET(ptex->flags); + } + + if (sscreen->info.chip_class >= GFX10 && (ptex->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING)) { + flags |= RADEON_SURF_FORCE_SWIZZLE_MODE; + surface->u.gfx9.surf.swizzle_mode = ADDR_SW_64KB_R_X; + } + + r = sscreen->ws->surface_init(sscreen->ws, ptex, flags, bpe, array_mode, surface); + if (r) { + return r; + } + + unsigned pitch = pitch_in_bytes_override / bpe; + + if (sscreen->info.chip_class >= GFX9) { + if (pitch) { + surface->u.gfx9.surf_pitch = pitch; + if (ptex->last_level == 0) + surface->u.gfx9.surf.epitch = pitch - 1; + surface->u.gfx9.surf_slice_size = (uint64_t)pitch * surface->u.gfx9.surf_height * bpe; + } + } else { + if (pitch) { + surface->u.legacy.level[0].nblk_x = pitch; + surface->u.legacy.level[0].slice_size_dw = + ((uint64_t)pitch * surface->u.legacy.level[0].nblk_y * bpe) / 4; + } + } + return 0; } -static void si_get_display_metadata(struct si_screen *sscreen, - struct radeon_surf *surf, - struct radeon_bo_metadata *metadata, - enum radeon_surf_mode *array_mode, - bool *is_scanout) +static void si_get_display_metadata(struct si_screen *sscreen, struct radeon_surf *surf, + struct radeon_bo_metadata *metadata, + enum radeon_surf_mode *array_mode, bool 
*is_scanout) { - if (sscreen->info.chip_class >= GFX9) { - if (metadata->u.gfx9.swizzle_mode > 0) - *array_mode = RADEON_SURF_MODE_2D; - else - *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; - - surf->u.gfx9.surf.swizzle_mode = metadata->u.gfx9.swizzle_mode; - *is_scanout = metadata->u.gfx9.scanout; - - if (metadata->u.gfx9.dcc_offset_256B) { - surf->u.gfx9.display_dcc_pitch_max = metadata->u.gfx9.dcc_pitch_max; - assert(metadata->u.gfx9.dcc_independent_64B == 1); - } - } else { - surf->u.legacy.pipe_config = metadata->u.legacy.pipe_config; - surf->u.legacy.bankw = metadata->u.legacy.bankw; - surf->u.legacy.bankh = metadata->u.legacy.bankh; - surf->u.legacy.tile_split = metadata->u.legacy.tile_split; - surf->u.legacy.mtilea = metadata->u.legacy.mtilea; - surf->u.legacy.num_banks = metadata->u.legacy.num_banks; - - if (metadata->u.legacy.macrotile == RADEON_LAYOUT_TILED) - *array_mode = RADEON_SURF_MODE_2D; - else if (metadata->u.legacy.microtile == RADEON_LAYOUT_TILED) - *array_mode = RADEON_SURF_MODE_1D; - else - *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; - - *is_scanout = metadata->u.legacy.scanout; - } + if (sscreen->info.chip_class >= GFX9) { + if (metadata->u.gfx9.swizzle_mode > 0) + *array_mode = RADEON_SURF_MODE_2D; + else + *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + + surf->u.gfx9.surf.swizzle_mode = metadata->u.gfx9.swizzle_mode; + *is_scanout = metadata->u.gfx9.scanout; + + if (metadata->u.gfx9.dcc_offset_256B) { + surf->u.gfx9.display_dcc_pitch_max = metadata->u.gfx9.dcc_pitch_max; + assert(metadata->u.gfx9.dcc_independent_64B == 1); + } + } else { + surf->u.legacy.pipe_config = metadata->u.legacy.pipe_config; + surf->u.legacy.bankw = metadata->u.legacy.bankw; + surf->u.legacy.bankh = metadata->u.legacy.bankh; + surf->u.legacy.tile_split = metadata->u.legacy.tile_split; + surf->u.legacy.mtilea = metadata->u.legacy.mtilea; + surf->u.legacy.num_banks = metadata->u.legacy.num_banks; + + if (metadata->u.legacy.macrotile == RADEON_LAYOUT_TILED) + *array_mode = RADEON_SURF_MODE_2D; + else if (metadata->u.legacy.microtile == RADEON_LAYOUT_TILED) + *array_mode = RADEON_SURF_MODE_1D; + else + *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + + *is_scanout = metadata->u.legacy.scanout; + } } -void si_eliminate_fast_color_clear(struct si_context *sctx, - struct si_texture *tex) +void si_eliminate_fast_color_clear(struct si_context *sctx, struct si_texture *tex) { - struct si_screen *sscreen = sctx->screen; - struct pipe_context *ctx = &sctx->b; + struct si_screen *sscreen = sctx->screen; + struct pipe_context *ctx = &sctx->b; - if (ctx == sscreen->aux_context) - simple_mtx_lock(&sscreen->aux_context_lock); + if (ctx == sscreen->aux_context) + simple_mtx_lock(&sscreen->aux_context_lock); - unsigned n = sctx->num_decompress_calls; - ctx->flush_resource(ctx, &tex->buffer.b.b); + unsigned n = sctx->num_decompress_calls; + ctx->flush_resource(ctx, &tex->buffer.b.b); - /* Flush only if any fast clear elimination took place. */ - if (n != sctx->num_decompress_calls) - ctx->flush(ctx, NULL, 0); + /* Flush only if any fast clear elimination took place. 
*/ + if (n != sctx->num_decompress_calls) + ctx->flush(ctx, NULL, 0); - if (ctx == sscreen->aux_context) - simple_mtx_unlock(&sscreen->aux_context_lock); + if (ctx == sscreen->aux_context) + simple_mtx_unlock(&sscreen->aux_context_lock); } -void si_texture_discard_cmask(struct si_screen *sscreen, - struct si_texture *tex) +void si_texture_discard_cmask(struct si_screen *sscreen, struct si_texture *tex) { - if (!tex->cmask_buffer) - return; + if (!tex->cmask_buffer) + return; - assert(tex->buffer.b.b.nr_samples <= 1); + assert(tex->buffer.b.b.nr_samples <= 1); - /* Disable CMASK. */ - tex->cmask_base_address_reg = tex->buffer.gpu_address >> 8; - tex->dirty_level_mask = 0; + /* Disable CMASK. */ + tex->cmask_base_address_reg = tex->buffer.gpu_address >> 8; + tex->dirty_level_mask = 0; - tex->cb_color_info &= ~S_028C70_FAST_CLEAR(1); + tex->cb_color_info &= ~S_028C70_FAST_CLEAR(1); - if (tex->cmask_buffer != &tex->buffer) - si_resource_reference(&tex->cmask_buffer, NULL); + if (tex->cmask_buffer != &tex->buffer) + si_resource_reference(&tex->cmask_buffer, NULL); - tex->cmask_buffer = NULL; + tex->cmask_buffer = NULL; - /* Notify all contexts about the change. */ - p_atomic_inc(&sscreen->dirty_tex_counter); - p_atomic_inc(&sscreen->compressed_colortex_counter); + /* Notify all contexts about the change. */ + p_atomic_inc(&sscreen->dirty_tex_counter); + p_atomic_inc(&sscreen->compressed_colortex_counter); } static bool si_can_disable_dcc(struct si_texture *tex) { - /* We can't disable DCC if it can be written by another process. */ - return tex->surface.dcc_offset && - (!tex->buffer.b.is_shared || - !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE)); + /* We can't disable DCC if it can be written by another process. */ + return tex->surface.dcc_offset && + (!tex->buffer.b.is_shared || + !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE)); } static void si_texture_zero_dcc_fields(struct si_texture *tex) { - tex->surface.dcc_offset = 0; - tex->surface.display_dcc_offset = 0; - tex->surface.dcc_retile_map_offset = 0; + tex->surface.dcc_offset = 0; + tex->surface.display_dcc_offset = 0; + tex->surface.dcc_retile_map_offset = 0; } -static bool si_texture_discard_dcc(struct si_screen *sscreen, - struct si_texture *tex) +static bool si_texture_discard_dcc(struct si_screen *sscreen, struct si_texture *tex) { - if (!si_can_disable_dcc(tex)) - return false; + if (!si_can_disable_dcc(tex)) + return false; - assert(tex->dcc_separate_buffer == NULL); + assert(tex->dcc_separate_buffer == NULL); - /* Disable DCC. */ - si_texture_zero_dcc_fields(tex); + /* Disable DCC. */ + si_texture_zero_dcc_fields(tex); - /* Notify all contexts about the change. */ - p_atomic_inc(&sscreen->dirty_tex_counter); - return true; + /* Notify all contexts about the change. */ + p_atomic_inc(&sscreen->dirty_tex_counter); + return true; } /** @@ -502,783 +459,726 @@ static bool si_texture_discard_dcc(struct si_screen *sscreen, * \param sctx the current context if you have one, or sscreen->aux_context * if you don't. 
*/ -bool si_texture_disable_dcc(struct si_context *sctx, - struct si_texture *tex) +bool si_texture_disable_dcc(struct si_context *sctx, struct si_texture *tex) { - struct si_screen *sscreen = sctx->screen; + struct si_screen *sscreen = sctx->screen; - if (!sctx->has_graphics) - return si_texture_discard_dcc(sscreen, tex); + if (!sctx->has_graphics) + return si_texture_discard_dcc(sscreen, tex); - if (!si_can_disable_dcc(tex)) - return false; + if (!si_can_disable_dcc(tex)) + return false; - if (&sctx->b == sscreen->aux_context) - simple_mtx_lock(&sscreen->aux_context_lock); + if (&sctx->b == sscreen->aux_context) + simple_mtx_lock(&sscreen->aux_context_lock); - /* Decompress DCC. */ - si_decompress_dcc(sctx, tex); - sctx->b.flush(&sctx->b, NULL, 0); + /* Decompress DCC. */ + si_decompress_dcc(sctx, tex); + sctx->b.flush(&sctx->b, NULL, 0); - if (&sctx->b == sscreen->aux_context) - simple_mtx_unlock(&sscreen->aux_context_lock); + if (&sctx->b == sscreen->aux_context) + simple_mtx_unlock(&sscreen->aux_context_lock); - return si_texture_discard_dcc(sscreen, tex); + return si_texture_discard_dcc(sscreen, tex); } -static void si_reallocate_texture_inplace(struct si_context *sctx, - struct si_texture *tex, - unsigned new_bind_flag, - bool invalidate_storage) +static void si_reallocate_texture_inplace(struct si_context *sctx, struct si_texture *tex, + unsigned new_bind_flag, bool invalidate_storage) { - struct pipe_screen *screen = sctx->b.screen; - struct si_texture *new_tex; - struct pipe_resource templ = tex->buffer.b.b; - unsigned i; - - templ.bind |= new_bind_flag; - - if (tex->buffer.b.is_shared || tex->num_planes > 1) - return; - - if (new_bind_flag == PIPE_BIND_LINEAR) { - if (tex->surface.is_linear) - return; - - /* This fails with MSAA, depth, and compressed textures. */ - if (si_choose_tiling(sctx->screen, &templ, false) != - RADEON_SURF_MODE_LINEAR_ALIGNED) - return; - } - - new_tex = (struct si_texture*)screen->resource_create(screen, &templ); - if (!new_tex) - return; - - /* Copy the pixels to the new texture. */ - if (!invalidate_storage) { - for (i = 0; i <= templ.last_level; i++) { - struct pipe_box box; - - u_box_3d(0, 0, 0, - u_minify(templ.width0, i), u_minify(templ.height0, i), - util_num_layers(&templ, i), &box); - - sctx->dma_copy(&sctx->b, &new_tex->buffer.b.b, i, 0, 0, 0, - &tex->buffer.b.b, i, &box); - } - } - - if (new_bind_flag == PIPE_BIND_LINEAR) { - si_texture_discard_cmask(sctx->screen, tex); - si_texture_discard_dcc(sctx->screen, tex); - } - - /* Replace the structure fields of tex. 
*/ - tex->buffer.b.b.bind = templ.bind; - pb_reference(&tex->buffer.buf, new_tex->buffer.buf); - tex->buffer.gpu_address = new_tex->buffer.gpu_address; - tex->buffer.vram_usage = new_tex->buffer.vram_usage; - tex->buffer.gart_usage = new_tex->buffer.gart_usage; - tex->buffer.bo_size = new_tex->buffer.bo_size; - tex->buffer.bo_alignment = new_tex->buffer.bo_alignment; - tex->buffer.domains = new_tex->buffer.domains; - tex->buffer.flags = new_tex->buffer.flags; - - tex->surface = new_tex->surface; - si_texture_reference(&tex->flushed_depth_texture, - new_tex->flushed_depth_texture); - - tex->surface.fmask_offset = new_tex->surface.fmask_offset; - tex->surface.cmask_offset = new_tex->surface.cmask_offset; - tex->cmask_base_address_reg = new_tex->cmask_base_address_reg; - - if (tex->cmask_buffer == &tex->buffer) - tex->cmask_buffer = NULL; - else - si_resource_reference(&tex->cmask_buffer, NULL); - - if (new_tex->cmask_buffer == &new_tex->buffer) - tex->cmask_buffer = &tex->buffer; - else - si_resource_reference(&tex->cmask_buffer, new_tex->cmask_buffer); - - tex->surface.dcc_offset = new_tex->surface.dcc_offset; - tex->cb_color_info = new_tex->cb_color_info; - memcpy(tex->color_clear_value, new_tex->color_clear_value, - sizeof(tex->color_clear_value)); - tex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode; - - tex->surface.htile_offset = new_tex->surface.htile_offset; - tex->depth_clear_value = new_tex->depth_clear_value; - tex->dirty_level_mask = new_tex->dirty_level_mask; - tex->stencil_dirty_level_mask = new_tex->stencil_dirty_level_mask; - tex->db_render_format = new_tex->db_render_format; - tex->stencil_clear_value = new_tex->stencil_clear_value; - tex->tc_compatible_htile = new_tex->tc_compatible_htile; - tex->depth_cleared = new_tex->depth_cleared; - tex->stencil_cleared = new_tex->stencil_cleared; - tex->upgraded_depth = new_tex->upgraded_depth; - tex->db_compatible = new_tex->db_compatible; - tex->can_sample_z = new_tex->can_sample_z; - tex->can_sample_s = new_tex->can_sample_s; - - tex->separate_dcc_dirty = new_tex->separate_dcc_dirty; - tex->displayable_dcc_dirty = new_tex->displayable_dcc_dirty; - tex->dcc_gather_statistics = new_tex->dcc_gather_statistics; - si_resource_reference(&tex->dcc_separate_buffer, - new_tex->dcc_separate_buffer); - si_resource_reference(&tex->last_dcc_separate_buffer, - new_tex->last_dcc_separate_buffer); - - if (new_bind_flag == PIPE_BIND_LINEAR) { - assert(!tex->surface.htile_offset); - assert(!tex->cmask_buffer); - assert(!tex->surface.fmask_size); - assert(!tex->surface.dcc_offset); - assert(!tex->is_depth); - } - - si_texture_reference(&new_tex, NULL); - - p_atomic_inc(&sctx->screen->dirty_tex_counter); + struct pipe_screen *screen = sctx->b.screen; + struct si_texture *new_tex; + struct pipe_resource templ = tex->buffer.b.b; + unsigned i; + + templ.bind |= new_bind_flag; + + if (tex->buffer.b.is_shared || tex->num_planes > 1) + return; + + if (new_bind_flag == PIPE_BIND_LINEAR) { + if (tex->surface.is_linear) + return; + + /* This fails with MSAA, depth, and compressed textures. */ + if (si_choose_tiling(sctx->screen, &templ, false) != RADEON_SURF_MODE_LINEAR_ALIGNED) + return; + } + + new_tex = (struct si_texture *)screen->resource_create(screen, &templ); + if (!new_tex) + return; + + /* Copy the pixels to the new texture. 
*/ + if (!invalidate_storage) { + for (i = 0; i <= templ.last_level; i++) { + struct pipe_box box; + + u_box_3d(0, 0, 0, u_minify(templ.width0, i), u_minify(templ.height0, i), + util_num_layers(&templ, i), &box); + + sctx->dma_copy(&sctx->b, &new_tex->buffer.b.b, i, 0, 0, 0, &tex->buffer.b.b, i, &box); + } + } + + if (new_bind_flag == PIPE_BIND_LINEAR) { + si_texture_discard_cmask(sctx->screen, tex); + si_texture_discard_dcc(sctx->screen, tex); + } + + /* Replace the structure fields of tex. */ + tex->buffer.b.b.bind = templ.bind; + pb_reference(&tex->buffer.buf, new_tex->buffer.buf); + tex->buffer.gpu_address = new_tex->buffer.gpu_address; + tex->buffer.vram_usage = new_tex->buffer.vram_usage; + tex->buffer.gart_usage = new_tex->buffer.gart_usage; + tex->buffer.bo_size = new_tex->buffer.bo_size; + tex->buffer.bo_alignment = new_tex->buffer.bo_alignment; + tex->buffer.domains = new_tex->buffer.domains; + tex->buffer.flags = new_tex->buffer.flags; + + tex->surface = new_tex->surface; + si_texture_reference(&tex->flushed_depth_texture, new_tex->flushed_depth_texture); + + tex->surface.fmask_offset = new_tex->surface.fmask_offset; + tex->surface.cmask_offset = new_tex->surface.cmask_offset; + tex->cmask_base_address_reg = new_tex->cmask_base_address_reg; + + if (tex->cmask_buffer == &tex->buffer) + tex->cmask_buffer = NULL; + else + si_resource_reference(&tex->cmask_buffer, NULL); + + if (new_tex->cmask_buffer == &new_tex->buffer) + tex->cmask_buffer = &tex->buffer; + else + si_resource_reference(&tex->cmask_buffer, new_tex->cmask_buffer); + + tex->surface.dcc_offset = new_tex->surface.dcc_offset; + tex->cb_color_info = new_tex->cb_color_info; + memcpy(tex->color_clear_value, new_tex->color_clear_value, sizeof(tex->color_clear_value)); + tex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode; + + tex->surface.htile_offset = new_tex->surface.htile_offset; + tex->depth_clear_value = new_tex->depth_clear_value; + tex->dirty_level_mask = new_tex->dirty_level_mask; + tex->stencil_dirty_level_mask = new_tex->stencil_dirty_level_mask; + tex->db_render_format = new_tex->db_render_format; + tex->stencil_clear_value = new_tex->stencil_clear_value; + tex->tc_compatible_htile = new_tex->tc_compatible_htile; + tex->depth_cleared = new_tex->depth_cleared; + tex->stencil_cleared = new_tex->stencil_cleared; + tex->upgraded_depth = new_tex->upgraded_depth; + tex->db_compatible = new_tex->db_compatible; + tex->can_sample_z = new_tex->can_sample_z; + tex->can_sample_s = new_tex->can_sample_s; + + tex->separate_dcc_dirty = new_tex->separate_dcc_dirty; + tex->displayable_dcc_dirty = new_tex->displayable_dcc_dirty; + tex->dcc_gather_statistics = new_tex->dcc_gather_statistics; + si_resource_reference(&tex->dcc_separate_buffer, new_tex->dcc_separate_buffer); + si_resource_reference(&tex->last_dcc_separate_buffer, new_tex->last_dcc_separate_buffer); + + if (new_bind_flag == PIPE_BIND_LINEAR) { + assert(!tex->surface.htile_offset); + assert(!tex->cmask_buffer); + assert(!tex->surface.fmask_size); + assert(!tex->surface.dcc_offset); + assert(!tex->is_depth); + } + + si_texture_reference(&new_tex, NULL); + + p_atomic_inc(&sctx->screen->dirty_tex_counter); } static uint32_t si_get_bo_metadata_word1(struct si_screen *sscreen) { - return (ATI_VENDOR_ID << 16) | sscreen->info.pci_id; + return (ATI_VENDOR_ID << 16) | sscreen->info.pci_id; } -static void si_set_tex_bo_metadata(struct si_screen *sscreen, - struct si_texture *tex) +static void si_set_tex_bo_metadata(struct si_screen *sscreen, 
struct si_texture *tex) { - struct radeon_surf *surface = &tex->surface; - struct pipe_resource *res = &tex->buffer.b.b; - struct radeon_bo_metadata md; - - memset(&md, 0, sizeof(md)); - - if (sscreen->info.chip_class >= GFX9) { - md.u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode; - md.u.gfx9.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; - - if (tex->surface.dcc_offset && !tex->dcc_separate_buffer) { - uint64_t dcc_offset = - tex->surface.display_dcc_offset ? tex->surface.display_dcc_offset - : tex->surface.dcc_offset; - - assert((dcc_offset >> 8) != 0 && (dcc_offset >> 8) < (1 << 24)); - md.u.gfx9.dcc_offset_256B = dcc_offset >> 8; - md.u.gfx9.dcc_pitch_max = tex->surface.u.gfx9.display_dcc_pitch_max; - md.u.gfx9.dcc_independent_64B = 1; - } - } else { - md.u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D ? - RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; - md.u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D ? - RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; - md.u.legacy.pipe_config = surface->u.legacy.pipe_config; - md.u.legacy.bankw = surface->u.legacy.bankw; - md.u.legacy.bankh = surface->u.legacy.bankh; - md.u.legacy.tile_split = surface->u.legacy.tile_split; - md.u.legacy.mtilea = surface->u.legacy.mtilea; - md.u.legacy.num_banks = surface->u.legacy.num_banks; - md.u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe; - md.u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; - } - - assert(tex->dcc_separate_buffer == NULL); - assert(tex->surface.fmask_size == 0); - - /* Metadata image format format version 1: - * [0] = 1 (metadata format identifier) - * [1] = (VENDOR_ID << 16) | PCI_ID - * [2:9] = image descriptor for the whole resource - * [2] is always 0, because the base address is cleared - * [9] is the DCC offset bits [39:8] from the beginning of - * the buffer - * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level - */ - - md.metadata[0] = 1; /* metadata image format version 1 */ - - /* TILE_MODE_INDEX is ambiguous without a PCI ID. */ - md.metadata[1] = si_get_bo_metadata_word1(sscreen); - - static const unsigned char swizzle[] = { - PIPE_SWIZZLE_X, - PIPE_SWIZZLE_Y, - PIPE_SWIZZLE_Z, - PIPE_SWIZZLE_W - }; - bool is_array = util_texture_is_array(res->target); - uint32_t desc[8]; - - sscreen->make_texture_descriptor(sscreen, tex, true, - res->target, res->format, - swizzle, 0, res->last_level, 0, - is_array ? res->array_size - 1 : 0, - res->width0, res->height0, res->depth0, - desc, NULL); - - si_set_mutable_tex_desc_fields(sscreen, tex, &tex->surface.u.legacy.level[0], - 0, 0, tex->surface.blk_w, false, desc); - - /* Clear the base address and set the relative DCC offset. */ - desc[0] = 0; - desc[1] &= C_008F14_BASE_ADDRESS_HI; - - switch (sscreen->info.chip_class) { - case GFX6: - case GFX7: - break; - case GFX8: - desc[7] = tex->surface.dcc_offset >> 8; - break; - case GFX9: - desc[7] = tex->surface.dcc_offset >> 8; - desc[5] &= C_008F24_META_DATA_ADDRESS; - desc[5] |= S_008F24_META_DATA_ADDRESS(tex->surface.dcc_offset >> 40); - break; - case GFX10: - desc[6] &= C_00A018_META_DATA_ADDRESS_LO; - desc[6] |= S_00A018_META_DATA_ADDRESS_LO(tex->surface.dcc_offset >> 8); - desc[7] = tex->surface.dcc_offset >> 16; - break; - default: - assert(0); - } - - - /* Dwords [2:9] contain the image descriptor. */ - memcpy(&md.metadata[2], desc, sizeof(desc)); - md.size_metadata = 10 * 4; - - /* Dwords [10:..] contain the mipmap level offsets. 
*/ - if (sscreen->info.chip_class <= GFX8) { - for (unsigned i = 0; i <= res->last_level; i++) - md.metadata[10+i] = tex->surface.u.legacy.level[i].offset >> 8; - - md.size_metadata += (1 + res->last_level) * 4; - } - - sscreen->ws->buffer_set_metadata(tex->buffer.buf, &md); + struct radeon_surf *surface = &tex->surface; + struct pipe_resource *res = &tex->buffer.b.b; + struct radeon_bo_metadata md; + + memset(&md, 0, sizeof(md)); + + if (sscreen->info.chip_class >= GFX9) { + md.u.gfx9.swizzle_mode = surface->u.gfx9.surf.swizzle_mode; + md.u.gfx9.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; + + if (tex->surface.dcc_offset && !tex->dcc_separate_buffer) { + uint64_t dcc_offset = tex->surface.display_dcc_offset ? tex->surface.display_dcc_offset + : tex->surface.dcc_offset; + + assert((dcc_offset >> 8) != 0 && (dcc_offset >> 8) < (1 << 24)); + md.u.gfx9.dcc_offset_256B = dcc_offset >> 8; + md.u.gfx9.dcc_pitch_max = tex->surface.u.gfx9.display_dcc_pitch_max; + md.u.gfx9.dcc_independent_64B = 1; + } + } else { + md.u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D + ? RADEON_LAYOUT_TILED + : RADEON_LAYOUT_LINEAR; + md.u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D + ? RADEON_LAYOUT_TILED + : RADEON_LAYOUT_LINEAR; + md.u.legacy.pipe_config = surface->u.legacy.pipe_config; + md.u.legacy.bankw = surface->u.legacy.bankw; + md.u.legacy.bankh = surface->u.legacy.bankh; + md.u.legacy.tile_split = surface->u.legacy.tile_split; + md.u.legacy.mtilea = surface->u.legacy.mtilea; + md.u.legacy.num_banks = surface->u.legacy.num_banks; + md.u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe; + md.u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; + } + + assert(tex->dcc_separate_buffer == NULL); + assert(tex->surface.fmask_size == 0); + + /* Metadata image format format version 1: + * [0] = 1 (metadata format identifier) + * [1] = (VENDOR_ID << 16) | PCI_ID + * [2:9] = image descriptor for the whole resource + * [2] is always 0, because the base address is cleared + * [9] is the DCC offset bits [39:8] from the beginning of + * the buffer + * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level + */ + + md.metadata[0] = 1; /* metadata image format version 1 */ + + /* TILE_MODE_INDEX is ambiguous without a PCI ID. */ + md.metadata[1] = si_get_bo_metadata_word1(sscreen); + + static const unsigned char swizzle[] = {PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, + PIPE_SWIZZLE_W}; + bool is_array = util_texture_is_array(res->target); + uint32_t desc[8]; + + sscreen->make_texture_descriptor(sscreen, tex, true, res->target, res->format, swizzle, 0, + res->last_level, 0, is_array ? res->array_size - 1 : 0, + res->width0, res->height0, res->depth0, desc, NULL); + + si_set_mutable_tex_desc_fields(sscreen, tex, &tex->surface.u.legacy.level[0], 0, 0, + tex->surface.blk_w, false, desc); + + /* Clear the base address and set the relative DCC offset. 
*/ + desc[0] = 0; + desc[1] &= C_008F14_BASE_ADDRESS_HI; + + switch (sscreen->info.chip_class) { + case GFX6: + case GFX7: + break; + case GFX8: + desc[7] = tex->surface.dcc_offset >> 8; + break; + case GFX9: + desc[7] = tex->surface.dcc_offset >> 8; + desc[5] &= C_008F24_META_DATA_ADDRESS; + desc[5] |= S_008F24_META_DATA_ADDRESS(tex->surface.dcc_offset >> 40); + break; + case GFX10: + desc[6] &= C_00A018_META_DATA_ADDRESS_LO; + desc[6] |= S_00A018_META_DATA_ADDRESS_LO(tex->surface.dcc_offset >> 8); + desc[7] = tex->surface.dcc_offset >> 16; + break; + default: + assert(0); + } + + /* Dwords [2:9] contain the image descriptor. */ + memcpy(&md.metadata[2], desc, sizeof(desc)); + md.size_metadata = 10 * 4; + + /* Dwords [10:..] contain the mipmap level offsets. */ + if (sscreen->info.chip_class <= GFX8) { + for (unsigned i = 0; i <= res->last_level; i++) + md.metadata[10 + i] = tex->surface.u.legacy.level[i].offset >> 8; + + md.size_metadata += (1 + res->last_level) * 4; + } + + sscreen->ws->buffer_set_metadata(tex->buffer.buf, &md); } -static bool si_read_tex_bo_metadata(struct si_screen *sscreen, - struct si_texture *tex, - uint64_t offset, - struct radeon_bo_metadata *md) +static bool si_read_tex_bo_metadata(struct si_screen *sscreen, struct si_texture *tex, + uint64_t offset, struct radeon_bo_metadata *md) { - uint32_t *desc = &md->metadata[2]; - - if (offset || /* Non-zero planes ignore metadata. */ - md->size_metadata < 10 * 4 || /* at least 2(header) + 8(desc) dwords */ - md->metadata[0] == 0 || /* invalid version number */ - md->metadata[1] != si_get_bo_metadata_word1(sscreen)) /* invalid PCI ID */ { - /* Disable DCC because it might not be enabled. */ - si_texture_zero_dcc_fields(tex); - - /* Don't report an error if the texture comes from an incompatible driver, - * but this might not work. - */ - return true; - } - - /* Validate that sample counts and the number of mipmap levels match. */ - unsigned last_level = G_008F1C_LAST_LEVEL(desc[3]); - unsigned type = G_008F1C_TYPE(desc[3]); - - if (type == V_008F1C_SQ_RSRC_IMG_2D_MSAA || - type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { - unsigned log_samples = - util_logbase2(MAX2(1, tex->buffer.b.b.nr_storage_samples)); - - if (last_level != log_samples) { - fprintf(stderr, "radeonsi: invalid MSAA texture import, " - "metadata has log2(samples) = %u, the caller set %u\n", - last_level, log_samples); - return false; - } - } else { - if (last_level != tex->buffer.b.b.last_level) { - fprintf(stderr, "radeonsi: invalid mipmapped texture import, " - "metadata has last_level = %u, the caller set %u\n", - last_level, tex->buffer.b.b.last_level); - return false; - } - } - - if (sscreen->info.chip_class >= GFX8 && - G_008F28_COMPRESSION_EN(desc[6])) { - /* Read DCC information. */ - switch (sscreen->info.chip_class) { - case GFX8: - tex->surface.dcc_offset = (uint64_t)desc[7] << 8; - break; - - case GFX9: - tex->surface.dcc_offset = - ((uint64_t)desc[7] << 8) | - ((uint64_t)G_008F24_META_DATA_ADDRESS(desc[5]) << 40); - tex->surface.u.gfx9.dcc.pipe_aligned = - G_008F24_META_PIPE_ALIGNED(desc[5]); - tex->surface.u.gfx9.dcc.rb_aligned = - G_008F24_META_RB_ALIGNED(desc[5]); - - /* If DCC is unaligned, this can only be a displayable image. 
*/ - if (!tex->surface.u.gfx9.dcc.pipe_aligned && - !tex->surface.u.gfx9.dcc.rb_aligned) - assert(tex->surface.is_displayable); - break; - - case GFX10: - tex->surface.dcc_offset = - ((uint64_t)G_00A018_META_DATA_ADDRESS_LO(desc[6]) << 8) | - ((uint64_t)desc[7] << 16); - tex->surface.u.gfx9.dcc.pipe_aligned = - G_00A018_META_PIPE_ALIGNED(desc[6]); - break; - - default: - assert(0); - return false; - } - } else { - /* Disable DCC. dcc_offset is always set by texture_from_handle - * and must be cleared here. - */ - si_texture_zero_dcc_fields(tex); - } - - return true; + uint32_t *desc = &md->metadata[2]; + + if (offset || /* Non-zero planes ignore metadata. */ + md->size_metadata < 10 * 4 || /* at least 2(header) + 8(desc) dwords */ + md->metadata[0] == 0 || /* invalid version number */ + md->metadata[1] != si_get_bo_metadata_word1(sscreen)) /* invalid PCI ID */ { + /* Disable DCC because it might not be enabled. */ + si_texture_zero_dcc_fields(tex); + + /* Don't report an error if the texture comes from an incompatible driver, + * but this might not work. + */ + return true; + } + + /* Validate that sample counts and the number of mipmap levels match. */ + unsigned last_level = G_008F1C_LAST_LEVEL(desc[3]); + unsigned type = G_008F1C_TYPE(desc[3]); + + if (type == V_008F1C_SQ_RSRC_IMG_2D_MSAA || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { + unsigned log_samples = util_logbase2(MAX2(1, tex->buffer.b.b.nr_storage_samples)); + + if (last_level != log_samples) { + fprintf(stderr, + "radeonsi: invalid MSAA texture import, " + "metadata has log2(samples) = %u, the caller set %u\n", + last_level, log_samples); + return false; + } + } else { + if (last_level != tex->buffer.b.b.last_level) { + fprintf(stderr, + "radeonsi: invalid mipmapped texture import, " + "metadata has last_level = %u, the caller set %u\n", + last_level, tex->buffer.b.b.last_level); + return false; + } + } + + if (sscreen->info.chip_class >= GFX8 && G_008F28_COMPRESSION_EN(desc[6])) { + /* Read DCC information. */ + switch (sscreen->info.chip_class) { + case GFX8: + tex->surface.dcc_offset = (uint64_t)desc[7] << 8; + break; + + case GFX9: + tex->surface.dcc_offset = + ((uint64_t)desc[7] << 8) | ((uint64_t)G_008F24_META_DATA_ADDRESS(desc[5]) << 40); + tex->surface.u.gfx9.dcc.pipe_aligned = G_008F24_META_PIPE_ALIGNED(desc[5]); + tex->surface.u.gfx9.dcc.rb_aligned = G_008F24_META_RB_ALIGNED(desc[5]); + + /* If DCC is unaligned, this can only be a displayable image. */ + if (!tex->surface.u.gfx9.dcc.pipe_aligned && !tex->surface.u.gfx9.dcc.rb_aligned) + assert(tex->surface.is_displayable); + break; + + case GFX10: + tex->surface.dcc_offset = + ((uint64_t)G_00A018_META_DATA_ADDRESS_LO(desc[6]) << 8) | ((uint64_t)desc[7] << 16); + tex->surface.u.gfx9.dcc.pipe_aligned = G_00A018_META_PIPE_ALIGNED(desc[6]); + break; + + default: + assert(0); + return false; + } + } else { + /* Disable DCC. dcc_offset is always set by texture_from_handle + * and must be cleared here. + */ + si_texture_zero_dcc_fields(tex); + } + + return true; } static bool si_has_displayable_dcc(struct si_texture *tex) { - struct si_screen *sscreen = (struct si_screen*)tex->buffer.b.b.screen; - - if (sscreen->info.chip_class <= GFX8) - return false; - - /* This needs a cache flush before scanout. 
- * (it can't be scanned out and rendered to simultaneously) - */ - if (sscreen->info.use_display_dcc_unaligned && - tex->surface.dcc_offset && - !tex->surface.u.gfx9.dcc.pipe_aligned && - !tex->surface.u.gfx9.dcc.rb_aligned) - return true; - - /* This needs an explicit flush (flush_resource). */ - if (sscreen->info.use_display_dcc_with_retile_blit && - tex->surface.display_dcc_offset) - return true; - - return false; + struct si_screen *sscreen = (struct si_screen *)tex->buffer.b.b.screen; + + if (sscreen->info.chip_class <= GFX8) + return false; + + /* This needs a cache flush before scanout. + * (it can't be scanned out and rendered to simultaneously) + */ + if (sscreen->info.use_display_dcc_unaligned && tex->surface.dcc_offset && + !tex->surface.u.gfx9.dcc.pipe_aligned && !tex->surface.u.gfx9.dcc.rb_aligned) + return true; + + /* This needs an explicit flush (flush_resource). */ + if (sscreen->info.use_display_dcc_with_retile_blit && tex->surface.display_dcc_offset) + return true; + + return false; } -static bool si_resource_get_param(struct pipe_screen *screen, - struct pipe_context *context, - struct pipe_resource *resource, - unsigned plane, - unsigned layer, - enum pipe_resource_param param, - unsigned handle_usage, - uint64_t *value) +static bool si_resource_get_param(struct pipe_screen *screen, struct pipe_context *context, + struct pipe_resource *resource, unsigned plane, unsigned layer, + enum pipe_resource_param param, unsigned handle_usage, + uint64_t *value) { - for (unsigned i = 0; i < plane; i++) - resource = resource->next; - - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_texture *tex = (struct si_texture*)resource; - struct winsys_handle whandle; - - switch (param) { - case PIPE_RESOURCE_PARAM_NPLANES: - *value = resource->target == PIPE_BUFFER ? 
1 : tex->num_planes; - return true; - - case PIPE_RESOURCE_PARAM_STRIDE: - if (resource->target == PIPE_BUFFER) - *value = 0; - else if (sscreen->info.chip_class >= GFX9) - *value = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe; - else - *value = tex->surface.u.legacy.level[0].nblk_x * tex->surface.bpe; - return true; - - case PIPE_RESOURCE_PARAM_OFFSET: - if (resource->target == PIPE_BUFFER) - *value = 0; - else if (sscreen->info.chip_class >= GFX9) - *value = tex->surface.u.gfx9.surf_offset + - layer * tex->surface.u.gfx9.surf_slice_size; - else - *value = tex->surface.u.legacy.level[0].offset + - layer * (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4; - return true; - - case PIPE_RESOURCE_PARAM_MODIFIER: - *value = DRM_FORMAT_MOD_INVALID; - return true; - - case PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED: - case PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS: - case PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD: - memset(&whandle, 0, sizeof(whandle)); - - if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED) - whandle.type = WINSYS_HANDLE_TYPE_SHARED; - else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS) - whandle.type = WINSYS_HANDLE_TYPE_KMS; - else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD) - whandle.type = WINSYS_HANDLE_TYPE_FD; - - if (!screen->resource_get_handle(screen, context, resource, - &whandle, handle_usage)) - return false; - - *value = whandle.handle; - return true; - } - return false; + for (unsigned i = 0; i < plane; i++) + resource = resource->next; + + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_texture *tex = (struct si_texture *)resource; + struct winsys_handle whandle; + + switch (param) { + case PIPE_RESOURCE_PARAM_NPLANES: + *value = resource->target == PIPE_BUFFER ? 1 : tex->num_planes; + return true; + + case PIPE_RESOURCE_PARAM_STRIDE: + if (resource->target == PIPE_BUFFER) + *value = 0; + else if (sscreen->info.chip_class >= GFX9) + *value = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe; + else + *value = tex->surface.u.legacy.level[0].nblk_x * tex->surface.bpe; + return true; + + case PIPE_RESOURCE_PARAM_OFFSET: + if (resource->target == PIPE_BUFFER) + *value = 0; + else if (sscreen->info.chip_class >= GFX9) + *value = tex->surface.u.gfx9.surf_offset + layer * tex->surface.u.gfx9.surf_slice_size; + else + *value = tex->surface.u.legacy.level[0].offset + + layer * (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4; + return true; + + case PIPE_RESOURCE_PARAM_MODIFIER: + *value = DRM_FORMAT_MOD_INVALID; + return true; + + case PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED: + case PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS: + case PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD: + memset(&whandle, 0, sizeof(whandle)); + + if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED) + whandle.type = WINSYS_HANDLE_TYPE_SHARED; + else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS) + whandle.type = WINSYS_HANDLE_TYPE_KMS; + else if (param == PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD) + whandle.type = WINSYS_HANDLE_TYPE_FD; + + if (!screen->resource_get_handle(screen, context, resource, &whandle, handle_usage)) + return false; + + *value = whandle.handle; + return true; + } + return false; } -static void si_texture_get_info(struct pipe_screen* screen, - struct pipe_resource *resource, - unsigned *pstride, - unsigned *poffset) +static void si_texture_get_info(struct pipe_screen *screen, struct pipe_resource *resource, + unsigned *pstride, unsigned *poffset) { - uint64_t value; - - if (pstride) { - si_resource_get_param(screen, NULL, resource, 0, 0, - 
PIPE_RESOURCE_PARAM_STRIDE, 0, &value); - *pstride = value; - } - - if (poffset) { - si_resource_get_param(screen, NULL, resource, 0, 0, - PIPE_RESOURCE_PARAM_OFFSET, 0, &value); - *poffset = value; - } + uint64_t value; + + if (pstride) { + si_resource_get_param(screen, NULL, resource, 0, 0, PIPE_RESOURCE_PARAM_STRIDE, 0, &value); + *pstride = value; + } + + if (poffset) { + si_resource_get_param(screen, NULL, resource, 0, 0, PIPE_RESOURCE_PARAM_OFFSET, 0, &value); + *poffset = value; + } } -static bool si_texture_get_handle(struct pipe_screen* screen, - struct pipe_context *ctx, - struct pipe_resource *resource, - struct winsys_handle *whandle, - unsigned usage) +static bool si_texture_get_handle(struct pipe_screen *screen, struct pipe_context *ctx, + struct pipe_resource *resource, struct winsys_handle *whandle, + unsigned usage) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_context *sctx; - struct si_resource *res = si_resource(resource); - struct si_texture *tex = (struct si_texture*)resource; - bool update_metadata = false; - unsigned stride, offset, slice_size; - bool flush = false; - - ctx = threaded_context_unwrap_sync(ctx); - sctx = (struct si_context*)(ctx ? ctx : sscreen->aux_context); - - if (resource->target != PIPE_BUFFER) { - /* Individual planes are chained pipe_resource instances. */ - for (unsigned i = 0; i < whandle->plane; i++) { - resource = resource->next; - res = si_resource(resource); - tex = (struct si_texture*)resource; - } - - /* This is not supported now, but it might be required for OpenCL - * interop in the future. - */ - if (resource->nr_samples > 1 || tex->is_depth) - return false; - - /* Move a suballocated texture into a non-suballocated allocation. */ - if (sscreen->ws->buffer_is_suballocated(res->buf) || - tex->surface.tile_swizzle || - (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && - sscreen->info.has_local_buffers)) { - assert(!res->b.is_shared); - si_reallocate_texture_inplace(sctx, tex, - PIPE_BIND_SHARED, false); - flush = true; - assert(res->b.b.bind & PIPE_BIND_SHARED); - assert(res->flags & RADEON_FLAG_NO_SUBALLOC); - assert(!(res->flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)); - assert(tex->surface.tile_swizzle == 0); - } - - /* Since shader image stores don't support DCC on GFX8, - * disable it for external clients that want write - * access. - */ - if ((usage & PIPE_HANDLE_USAGE_SHADER_WRITE && tex->surface.dcc_offset) || - /* Displayable DCC requires an explicit flush. */ - (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && - si_has_displayable_dcc(tex))) { - if (si_texture_disable_dcc(sctx, tex)) { - update_metadata = true; - /* si_texture_disable_dcc flushes the context */ - flush = false; - } - } - - if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && - (tex->cmask_buffer || tex->surface.dcc_offset)) { - /* Eliminate fast clear (both CMASK and DCC) */ - si_eliminate_fast_color_clear(sctx, tex); - /* eliminate_fast_color_clear flushes the context */ - flush = false; - - /* Disable CMASK if flush_resource isn't going - * to be called. - */ - if (tex->cmask_buffer) - si_texture_discard_cmask(sscreen, tex); - } - - /* Set metadata. */ - if ((!res->b.is_shared || update_metadata) && whandle->offset == 0) - si_set_tex_bo_metadata(sscreen, tex); - - if (sscreen->info.chip_class >= GFX9) { - slice_size = tex->surface.u.gfx9.surf_slice_size; - } else { - slice_size = (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4; - } - } else { - /* Buffer exports are for the OpenCL interop. 
*/ - /* Move a suballocated buffer into a non-suballocated allocation. */ - if (sscreen->ws->buffer_is_suballocated(res->buf) || - /* A DMABUF export always fails if the BO is local. */ - (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && - sscreen->info.has_local_buffers)) { - assert(!res->b.is_shared); - - /* Allocate a new buffer with PIPE_BIND_SHARED. */ - struct pipe_resource templ = res->b.b; - templ.bind |= PIPE_BIND_SHARED; - - struct pipe_resource *newb = - screen->resource_create(screen, &templ); - if (!newb) - return false; - - /* Copy the old buffer contents to the new one. */ - struct pipe_box box; - u_box_1d(0, newb->width0, &box); - sctx->b.resource_copy_region(&sctx->b, newb, 0, 0, 0, 0, - &res->b.b, 0, &box); - flush = true; - /* Move the new buffer storage to the old pipe_resource. */ - si_replace_buffer_storage(&sctx->b, &res->b.b, newb); - pipe_resource_reference(&newb, NULL); - - assert(res->b.b.bind & PIPE_BIND_SHARED); - assert(res->flags & RADEON_FLAG_NO_SUBALLOC); - } - - /* Buffers */ - slice_size = 0; - } - - si_texture_get_info(screen, resource, &stride, &offset); - - if (flush) - sctx->b.flush(&sctx->b, NULL, 0); - - if (res->b.is_shared) { - /* USAGE_EXPLICIT_FLUSH must be cleared if at least one user - * doesn't set it. - */ - res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; - if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) - res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; - } else { - res->b.is_shared = true; - res->external_usage = usage; - } - - whandle->stride = stride; - whandle->offset = offset + slice_size * whandle->layer; - - return sscreen->ws->buffer_get_handle(sscreen->ws, res->buf, whandle); + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_context *sctx; + struct si_resource *res = si_resource(resource); + struct si_texture *tex = (struct si_texture *)resource; + bool update_metadata = false; + unsigned stride, offset, slice_size; + bool flush = false; + + ctx = threaded_context_unwrap_sync(ctx); + sctx = (struct si_context *)(ctx ? ctx : sscreen->aux_context); + + if (resource->target != PIPE_BUFFER) { + /* Individual planes are chained pipe_resource instances. */ + for (unsigned i = 0; i < whandle->plane; i++) { + resource = resource->next; + res = si_resource(resource); + tex = (struct si_texture *)resource; + } + + /* This is not supported now, but it might be required for OpenCL + * interop in the future. + */ + if (resource->nr_samples > 1 || tex->is_depth) + return false; + + /* Move a suballocated texture into a non-suballocated allocation. */ + if (sscreen->ws->buffer_is_suballocated(res->buf) || tex->surface.tile_swizzle || + (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && + sscreen->info.has_local_buffers)) { + assert(!res->b.is_shared); + si_reallocate_texture_inplace(sctx, tex, PIPE_BIND_SHARED, false); + flush = true; + assert(res->b.b.bind & PIPE_BIND_SHARED); + assert(res->flags & RADEON_FLAG_NO_SUBALLOC); + assert(!(res->flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)); + assert(tex->surface.tile_swizzle == 0); + } + + /* Since shader image stores don't support DCC on GFX8, + * disable it for external clients that want write + * access. + */ + if ((usage & PIPE_HANDLE_USAGE_SHADER_WRITE && tex->surface.dcc_offset) || + /* Displayable DCC requires an explicit flush. 
*/ + (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && si_has_displayable_dcc(tex))) { + if (si_texture_disable_dcc(sctx, tex)) { + update_metadata = true; + /* si_texture_disable_dcc flushes the context */ + flush = false; + } + } + + if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && + (tex->cmask_buffer || tex->surface.dcc_offset)) { + /* Eliminate fast clear (both CMASK and DCC) */ + si_eliminate_fast_color_clear(sctx, tex); + /* eliminate_fast_color_clear flushes the context */ + flush = false; + + /* Disable CMASK if flush_resource isn't going + * to be called. + */ + if (tex->cmask_buffer) + si_texture_discard_cmask(sscreen, tex); + } + + /* Set metadata. */ + if ((!res->b.is_shared || update_metadata) && whandle->offset == 0) + si_set_tex_bo_metadata(sscreen, tex); + + if (sscreen->info.chip_class >= GFX9) { + slice_size = tex->surface.u.gfx9.surf_slice_size; + } else { + slice_size = (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4; + } + } else { + /* Buffer exports are for the OpenCL interop. */ + /* Move a suballocated buffer into a non-suballocated allocation. */ + if (sscreen->ws->buffer_is_suballocated(res->buf) || + /* A DMABUF export always fails if the BO is local. */ + (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && + sscreen->info.has_local_buffers)) { + assert(!res->b.is_shared); + + /* Allocate a new buffer with PIPE_BIND_SHARED. */ + struct pipe_resource templ = res->b.b; + templ.bind |= PIPE_BIND_SHARED; + + struct pipe_resource *newb = screen->resource_create(screen, &templ); + if (!newb) + return false; + + /* Copy the old buffer contents to the new one. */ + struct pipe_box box; + u_box_1d(0, newb->width0, &box); + sctx->b.resource_copy_region(&sctx->b, newb, 0, 0, 0, 0, &res->b.b, 0, &box); + flush = true; + /* Move the new buffer storage to the old pipe_resource. */ + si_replace_buffer_storage(&sctx->b, &res->b.b, newb); + pipe_resource_reference(&newb, NULL); + + assert(res->b.b.bind & PIPE_BIND_SHARED); + assert(res->flags & RADEON_FLAG_NO_SUBALLOC); + } + + /* Buffers */ + slice_size = 0; + } + + si_texture_get_info(screen, resource, &stride, &offset); + + if (flush) + sctx->b.flush(&sctx->b, NULL, 0); + + if (res->b.is_shared) { + /* USAGE_EXPLICIT_FLUSH must be cleared if at least one user + * doesn't set it. 
+ */ + res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; + if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) + res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; + } else { + res->b.is_shared = true; + res->external_usage = usage; + } + + whandle->stride = stride; + whandle->offset = offset + slice_size * whandle->layer; + + return sscreen->ws->buffer_get_handle(sscreen->ws, res->buf, whandle); } -static void si_texture_destroy(struct pipe_screen *screen, - struct pipe_resource *ptex) +static void si_texture_destroy(struct pipe_screen *screen, struct pipe_resource *ptex) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_texture *tex = (struct si_texture*)ptex; - struct si_resource *resource = &tex->buffer; - - if (sscreen->info.chip_class >= GFX9) - free(tex->surface.u.gfx9.dcc_retile_map); - - si_texture_reference(&tex->flushed_depth_texture, NULL); - - if (tex->cmask_buffer != &tex->buffer) { - si_resource_reference(&tex->cmask_buffer, NULL); - } - pb_reference(&resource->buf, NULL); - si_resource_reference(&tex->dcc_separate_buffer, NULL); - si_resource_reference(&tex->last_dcc_separate_buffer, NULL); - FREE(tex); + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_texture *tex = (struct si_texture *)ptex; + struct si_resource *resource = &tex->buffer; + + if (sscreen->info.chip_class >= GFX9) + free(tex->surface.u.gfx9.dcc_retile_map); + + si_texture_reference(&tex->flushed_depth_texture, NULL); + + if (tex->cmask_buffer != &tex->buffer) { + si_resource_reference(&tex->cmask_buffer, NULL); + } + pb_reference(&resource->buf, NULL); + si_resource_reference(&tex->dcc_separate_buffer, NULL); + si_resource_reference(&tex->last_dcc_separate_buffer, NULL); + FREE(tex); } static const struct u_resource_vtbl si_texture_vtbl; -void si_print_texture_info(struct si_screen *sscreen, - struct si_texture *tex, struct u_log_context *log) +void si_print_texture_info(struct si_screen *sscreen, struct si_texture *tex, + struct u_log_context *log) { - int i; - - /* Common parameters. 
*/ - u_log_printf(log, " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, " - "blk_h=%u, array_size=%u, last_level=%u, " - "bpe=%u, nsamples=%u, flags=0x%x, %s\n", - tex->buffer.b.b.width0, tex->buffer.b.b.height0, - tex->buffer.b.b.depth0, tex->surface.blk_w, - tex->surface.blk_h, - tex->buffer.b.b.array_size, tex->buffer.b.b.last_level, - tex->surface.bpe, tex->buffer.b.b.nr_samples, - tex->surface.flags, util_format_short_name(tex->buffer.b.b.format)); - - if (sscreen->info.chip_class >= GFX9) { - u_log_printf(log, " Surf: size=%"PRIu64", slice_size=%"PRIu64", " - "alignment=%u, swmode=%u, epitch=%u, pitch=%u\n", - tex->surface.surf_size, - tex->surface.u.gfx9.surf_slice_size, - tex->surface.surf_alignment, - tex->surface.u.gfx9.surf.swizzle_mode, - tex->surface.u.gfx9.surf.epitch, - tex->surface.u.gfx9.surf_pitch); - - if (tex->surface.fmask_offset) { - u_log_printf(log, " FMASK: offset=%"PRIu64", size=%"PRIu64", " - "alignment=%u, swmode=%u, epitch=%u\n", - tex->surface.fmask_offset, - tex->surface.fmask_size, - tex->surface.fmask_alignment, - tex->surface.u.gfx9.fmask.swizzle_mode, - tex->surface.u.gfx9.fmask.epitch); - } - - if (tex->cmask_buffer) { - u_log_printf(log, " CMask: offset=%"PRIu64", size=%u, " - "alignment=%u, rb_aligned=%u, pipe_aligned=%u\n", - tex->surface.cmask_offset, - tex->surface.cmask_size, - tex->surface.cmask_alignment, - tex->surface.u.gfx9.cmask.rb_aligned, - tex->surface.u.gfx9.cmask.pipe_aligned); - } - - if (tex->surface.htile_offset) { - u_log_printf(log, " HTile: offset=%"PRIu64", size=%u, alignment=%u, " - "rb_aligned=%u, pipe_aligned=%u\n", - tex->surface.htile_offset, - tex->surface.htile_size, - tex->surface.htile_alignment, - tex->surface.u.gfx9.htile.rb_aligned, - tex->surface.u.gfx9.htile.pipe_aligned); - } - - if (tex->surface.dcc_offset) { - u_log_printf(log, " DCC: offset=%"PRIu64", size=%u, " - "alignment=%u, pitch_max=%u, num_dcc_levels=%u\n", - tex->surface.dcc_offset, tex->surface.dcc_size, - tex->surface.dcc_alignment, - tex->surface.u.gfx9.display_dcc_pitch_max, - tex->surface.num_dcc_levels); - } - - if (tex->surface.u.gfx9.stencil_offset) { - u_log_printf(log, " Stencil: offset=%"PRIu64", swmode=%u, epitch=%u\n", - tex->surface.u.gfx9.stencil_offset, - tex->surface.u.gfx9.stencil.swizzle_mode, - tex->surface.u.gfx9.stencil.epitch); - } - return; - } - - u_log_printf(log, " Layout: size=%"PRIu64", alignment=%u, bankw=%u, " - "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n", - tex->surface.surf_size, tex->surface.surf_alignment, tex->surface.u.legacy.bankw, - tex->surface.u.legacy.bankh, tex->surface.u.legacy.num_banks, tex->surface.u.legacy.mtilea, - tex->surface.u.legacy.tile_split, tex->surface.u.legacy.pipe_config, - (tex->surface.flags & RADEON_SURF_SCANOUT) != 0); - - if (tex->surface.fmask_offset) - u_log_printf(log, " FMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch_in_pixels=%u, " - "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n", - tex->surface.fmask_offset, tex->surface.fmask_size, tex->surface.fmask_alignment, - tex->surface.u.legacy.fmask.pitch_in_pixels, - tex->surface.u.legacy.fmask.bankh, - tex->surface.u.legacy.fmask.slice_tile_max, - tex->surface.u.legacy.fmask.tiling_index); - - if (tex->cmask_buffer) - u_log_printf(log, " CMask: offset=%"PRIu64", size=%u, alignment=%u, " - "slice_tile_max=%u\n", - tex->surface.cmask_offset, tex->surface.cmask_size, tex->surface.cmask_alignment, - tex->surface.u.legacy.cmask_slice_tile_max); - - if (tex->surface.htile_offset) - 
u_log_printf(log, " HTile: offset=%"PRIu64", size=%u, " - "alignment=%u, TC_compatible = %u\n", - tex->surface.htile_offset, tex->surface.htile_size, - tex->surface.htile_alignment, - tex->tc_compatible_htile); - - if (tex->surface.dcc_offset) { - u_log_printf(log, " DCC: offset=%"PRIu64", size=%u, alignment=%u\n", - tex->surface.dcc_offset, tex->surface.dcc_size, - tex->surface.dcc_alignment); - for (i = 0; i <= tex->buffer.b.b.last_level; i++) - u_log_printf(log, " DCCLevel[%i]: enabled=%u, offset=%u, " - "fast_clear_size=%u\n", - i, i < tex->surface.num_dcc_levels, - tex->surface.u.legacy.level[i].dcc_offset, - tex->surface.u.legacy.level[i].dcc_fast_clear_size); - } - - for (i = 0; i <= tex->buffer.b.b.last_level; i++) - u_log_printf(log, " Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", " - "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " - "mode=%u, tiling_index = %u\n", - i, tex->surface.u.legacy.level[i].offset, - (uint64_t)tex->surface.u.legacy.level[i].slice_size_dw * 4, - u_minify(tex->buffer.b.b.width0, i), - u_minify(tex->buffer.b.b.height0, i), - u_minify(tex->buffer.b.b.depth0, i), - tex->surface.u.legacy.level[i].nblk_x, - tex->surface.u.legacy.level[i].nblk_y, - tex->surface.u.legacy.level[i].mode, - tex->surface.u.legacy.tiling_index[i]); - - if (tex->surface.has_stencil) { - u_log_printf(log, " StencilLayout: tilesplit=%u\n", - tex->surface.u.legacy.stencil_tile_split); - for (i = 0; i <= tex->buffer.b.b.last_level; i++) { - u_log_printf(log, " StencilLevel[%i]: offset=%"PRIu64", " - "slice_size=%"PRIu64", npix_x=%u, " - "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " - "mode=%u, tiling_index = %u\n", - i, tex->surface.u.legacy.stencil_level[i].offset, - (uint64_t)tex->surface.u.legacy.stencil_level[i].slice_size_dw * 4, - u_minify(tex->buffer.b.b.width0, i), - u_minify(tex->buffer.b.b.height0, i), - u_minify(tex->buffer.b.b.depth0, i), - tex->surface.u.legacy.stencil_level[i].nblk_x, - tex->surface.u.legacy.stencil_level[i].nblk_y, - tex->surface.u.legacy.stencil_level[i].mode, - tex->surface.u.legacy.stencil_tiling_index[i]); - } - } + int i; + + /* Common parameters. 
*/ + u_log_printf(log, + " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, " + "blk_h=%u, array_size=%u, last_level=%u, " + "bpe=%u, nsamples=%u, flags=0x%x, %s\n", + tex->buffer.b.b.width0, tex->buffer.b.b.height0, tex->buffer.b.b.depth0, + tex->surface.blk_w, tex->surface.blk_h, tex->buffer.b.b.array_size, + tex->buffer.b.b.last_level, tex->surface.bpe, tex->buffer.b.b.nr_samples, + tex->surface.flags, util_format_short_name(tex->buffer.b.b.format)); + + if (sscreen->info.chip_class >= GFX9) { + u_log_printf(log, + " Surf: size=%" PRIu64 ", slice_size=%" PRIu64 ", " + "alignment=%u, swmode=%u, epitch=%u, pitch=%u\n", + tex->surface.surf_size, tex->surface.u.gfx9.surf_slice_size, + tex->surface.surf_alignment, tex->surface.u.gfx9.surf.swizzle_mode, + tex->surface.u.gfx9.surf.epitch, tex->surface.u.gfx9.surf_pitch); + + if (tex->surface.fmask_offset) { + u_log_printf(log, + " FMASK: offset=%" PRIu64 ", size=%" PRIu64 ", " + "alignment=%u, swmode=%u, epitch=%u\n", + tex->surface.fmask_offset, tex->surface.fmask_size, + tex->surface.fmask_alignment, tex->surface.u.gfx9.fmask.swizzle_mode, + tex->surface.u.gfx9.fmask.epitch); + } + + if (tex->cmask_buffer) { + u_log_printf(log, + " CMask: offset=%" PRIu64 ", size=%u, " + "alignment=%u, rb_aligned=%u, pipe_aligned=%u\n", + tex->surface.cmask_offset, tex->surface.cmask_size, + tex->surface.cmask_alignment, tex->surface.u.gfx9.cmask.rb_aligned, + tex->surface.u.gfx9.cmask.pipe_aligned); + } + + if (tex->surface.htile_offset) { + u_log_printf(log, + " HTile: offset=%" PRIu64 ", size=%u, alignment=%u, " + "rb_aligned=%u, pipe_aligned=%u\n", + tex->surface.htile_offset, tex->surface.htile_size, + tex->surface.htile_alignment, tex->surface.u.gfx9.htile.rb_aligned, + tex->surface.u.gfx9.htile.pipe_aligned); + } + + if (tex->surface.dcc_offset) { + u_log_printf(log, + " DCC: offset=%" PRIu64 ", size=%u, " + "alignment=%u, pitch_max=%u, num_dcc_levels=%u\n", + tex->surface.dcc_offset, tex->surface.dcc_size, tex->surface.dcc_alignment, + tex->surface.u.gfx9.display_dcc_pitch_max, tex->surface.num_dcc_levels); + } + + if (tex->surface.u.gfx9.stencil_offset) { + u_log_printf(log, " Stencil: offset=%" PRIu64 ", swmode=%u, epitch=%u\n", + tex->surface.u.gfx9.stencil_offset, tex->surface.u.gfx9.stencil.swizzle_mode, + tex->surface.u.gfx9.stencil.epitch); + } + return; + } + + u_log_printf(log, + " Layout: size=%" PRIu64 ", alignment=%u, bankw=%u, " + "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n", + tex->surface.surf_size, tex->surface.surf_alignment, tex->surface.u.legacy.bankw, + tex->surface.u.legacy.bankh, tex->surface.u.legacy.num_banks, + tex->surface.u.legacy.mtilea, tex->surface.u.legacy.tile_split, + tex->surface.u.legacy.pipe_config, (tex->surface.flags & RADEON_SURF_SCANOUT) != 0); + + if (tex->surface.fmask_offset) + u_log_printf( + log, + " FMask: offset=%" PRIu64 ", size=%" PRIu64 ", alignment=%u, pitch_in_pixels=%u, " + "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n", + tex->surface.fmask_offset, tex->surface.fmask_size, tex->surface.fmask_alignment, + tex->surface.u.legacy.fmask.pitch_in_pixels, tex->surface.u.legacy.fmask.bankh, + tex->surface.u.legacy.fmask.slice_tile_max, tex->surface.u.legacy.fmask.tiling_index); + + if (tex->cmask_buffer) + u_log_printf(log, + " CMask: offset=%" PRIu64 ", size=%u, alignment=%u, " + "slice_tile_max=%u\n", + tex->surface.cmask_offset, tex->surface.cmask_size, tex->surface.cmask_alignment, + tex->surface.u.legacy.cmask_slice_tile_max); + + if (tex->surface.htile_offset) + 
u_log_printf(log, + " HTile: offset=%" PRIu64 ", size=%u, " + "alignment=%u, TC_compatible = %u\n", + tex->surface.htile_offset, tex->surface.htile_size, tex->surface.htile_alignment, + tex->tc_compatible_htile); + + if (tex->surface.dcc_offset) { + u_log_printf(log, " DCC: offset=%" PRIu64 ", size=%u, alignment=%u\n", + tex->surface.dcc_offset, tex->surface.dcc_size, tex->surface.dcc_alignment); + for (i = 0; i <= tex->buffer.b.b.last_level; i++) + u_log_printf(log, + " DCCLevel[%i]: enabled=%u, offset=%u, " + "fast_clear_size=%u\n", + i, i < tex->surface.num_dcc_levels, tex->surface.u.legacy.level[i].dcc_offset, + tex->surface.u.legacy.level[i].dcc_fast_clear_size); + } + + for (i = 0; i <= tex->buffer.b.b.last_level; i++) + u_log_printf(log, + " Level[%i]: offset=%" PRIu64 ", slice_size=%" PRIu64 ", " + "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " + "mode=%u, tiling_index = %u\n", + i, tex->surface.u.legacy.level[i].offset, + (uint64_t)tex->surface.u.legacy.level[i].slice_size_dw * 4, + u_minify(tex->buffer.b.b.width0, i), u_minify(tex->buffer.b.b.height0, i), + u_minify(tex->buffer.b.b.depth0, i), tex->surface.u.legacy.level[i].nblk_x, + tex->surface.u.legacy.level[i].nblk_y, tex->surface.u.legacy.level[i].mode, + tex->surface.u.legacy.tiling_index[i]); + + if (tex->surface.has_stencil) { + u_log_printf(log, " StencilLayout: tilesplit=%u\n", + tex->surface.u.legacy.stencil_tile_split); + for (i = 0; i <= tex->buffer.b.b.last_level; i++) { + u_log_printf(log, + " StencilLevel[%i]: offset=%" PRIu64 ", " + "slice_size=%" PRIu64 ", npix_x=%u, " + "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " + "mode=%u, tiling_index = %u\n", + i, tex->surface.u.legacy.stencil_level[i].offset, + (uint64_t)tex->surface.u.legacy.stencil_level[i].slice_size_dw * 4, + u_minify(tex->buffer.b.b.width0, i), u_minify(tex->buffer.b.b.height0, i), + u_minify(tex->buffer.b.b.depth0, i), + tex->surface.u.legacy.stencil_level[i].nblk_x, + tex->surface.u.legacy.stencil_level[i].nblk_y, + tex->surface.u.legacy.stencil_level[i].mode, + tex->surface.u.legacy.stencil_tiling_index[i]); + } + } } /** @@ -1293,611 +1193,569 @@ void si_print_texture_info(struct si_screen *sscreen, * \param alloc_size the size to allocate if plane0 != NULL * \param alignment alignment for the allocation */ -static struct si_texture * -si_texture_create_object(struct pipe_screen *screen, - const struct pipe_resource *base, - const struct radeon_surf *surface, - const struct si_texture *plane0, - struct pb_buffer *imported_buf, - uint64_t offset, - uint64_t alloc_size, - unsigned alignment) +static struct si_texture *si_texture_create_object(struct pipe_screen *screen, + const struct pipe_resource *base, + const struct radeon_surf *surface, + const struct si_texture *plane0, + struct pb_buffer *imported_buf, uint64_t offset, + uint64_t alloc_size, unsigned alignment) { - struct si_texture *tex; - struct si_resource *resource; - struct si_screen *sscreen = (struct si_screen*)screen; - - tex = CALLOC_STRUCT(si_texture); - if (!tex) - goto error; - - resource = &tex->buffer; - resource->b.b = *base; - resource->b.b.next = NULL; - resource->b.vtbl = &si_texture_vtbl; - pipe_reference_init(&resource->b.b.reference, 1); - resource->b.b.screen = screen; - - /* don't include stencil-only formats which we don't support for rendering */ - tex->is_depth = util_format_has_depth(util_format_description(tex->buffer.b.b.format)); - tex->surface = *surface; - tex->tc_compatible_htile = tex->surface.htile_size != 0 && - (tex->surface.flags & - 
RADEON_SURF_TC_COMPATIBLE_HTILE); - - /* TC-compatible HTILE: - * - GFX8 only supports Z32_FLOAT. - * - GFX9 only supports Z32_FLOAT and Z16_UNORM. */ - if (tex->tc_compatible_htile) { - if (sscreen->info.chip_class >= GFX9 && - base->format == PIPE_FORMAT_Z16_UNORM) - tex->db_render_format = base->format; - else { - tex->db_render_format = PIPE_FORMAT_Z32_FLOAT; - tex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT && - base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT; - } - } else { - tex->db_render_format = base->format; - } - - /* Applies to GCN. */ - tex->last_msaa_resolve_target_micro_mode = tex->surface.micro_tile_mode; - - /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers - * between frames, so the only thing that can enable separate DCC - * with DRI2 is multiple slow clears within a frame. - */ - tex->ps_draw_ratio = 0; - - if (sscreen->info.chip_class >= GFX9) { - tex->surface.u.gfx9.surf_offset = offset; - } else { - for (unsigned i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i) - tex->surface.u.legacy.level[i].offset += offset; - } - - if (tex->is_depth) { - if (sscreen->info.chip_class >= GFX9) { - tex->can_sample_z = true; - tex->can_sample_s = true; - - /* Stencil texturing with HTILE doesn't work - * with mipmapping on Navi10-14. */ - if ((sscreen->info.family == CHIP_NAVI10 || - sscreen->info.family == CHIP_NAVI12 || - sscreen->info.family == CHIP_NAVI14) && - base->last_level > 0) - tex->htile_stencil_disabled = true; - } else { - tex->can_sample_z = !tex->surface.u.legacy.depth_adjusted; - tex->can_sample_s = !tex->surface.u.legacy.stencil_adjusted; - } - - tex->db_compatible = surface->flags & RADEON_SURF_ZBUFFER; - } else { - if (tex->surface.cmask_offset) { - tex->cb_color_info |= S_028C70_FAST_CLEAR(1); - tex->cmask_buffer = &tex->buffer; - } - } - - if (plane0) { - /* The buffer is shared with the first plane. */ - resource->bo_size = plane0->buffer.bo_size; - resource->bo_alignment = plane0->buffer.bo_alignment; - resource->flags = plane0->buffer.flags; - resource->domains = plane0->buffer.domains; - resource->vram_usage = plane0->buffer.vram_usage; - resource->gart_usage = plane0->buffer.gart_usage; - - pb_reference(&resource->buf, plane0->buffer.buf); - resource->gpu_address = plane0->buffer.gpu_address; - } else if (!(surface->flags & RADEON_SURF_IMPORTED)) { - /* Create the backing buffer. */ - si_init_resource_fields(sscreen, resource, alloc_size, alignment); - - if (!si_alloc_resource(sscreen, resource)) - goto error; - } else { - resource->buf = imported_buf; - resource->gpu_address = sscreen->ws->buffer_get_virtual_address(resource->buf); - resource->bo_size = imported_buf->size; - resource->bo_alignment = imported_buf->alignment; - resource->domains = sscreen->ws->buffer_get_initial_domain(resource->buf); - if (resource->domains & RADEON_DOMAIN_VRAM) - resource->vram_usage = resource->bo_size; - else if (resource->domains & RADEON_DOMAIN_GTT) - resource->gart_usage = resource->bo_size; - } - - if (tex->cmask_buffer) { - /* Initialize the cmask to 0xCC (= compressed state). 
*/ - si_screen_clear_buffer(sscreen, &tex->cmask_buffer->b.b, - tex->surface.cmask_offset, tex->surface.cmask_size, - 0xCCCCCCCC); - } - if (tex->surface.htile_offset) { - uint32_t clear_value = 0; - - if (sscreen->info.chip_class >= GFX9 || tex->tc_compatible_htile) - clear_value = 0x0000030F; - - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->surface.htile_offset, - tex->surface.htile_size, - clear_value); - } - - /* Initialize DCC only if the texture is not being imported. */ - if (!(surface->flags & RADEON_SURF_IMPORTED) && tex->surface.dcc_offset) { - /* Clear DCC to black for all tiles with DCC enabled. - * - * This fixes corruption in 3DMark Slingshot Extreme, which - * uses uninitialized textures, causing corruption. - */ - if (tex->surface.num_dcc_levels == tex->buffer.b.b.last_level + 1 && - tex->buffer.b.b.nr_samples <= 2) { - /* Simple case - all tiles have DCC enabled. */ - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->surface.dcc_offset, - tex->surface.dcc_size, - DCC_CLEAR_COLOR_0000); - } else if (sscreen->info.chip_class >= GFX9) { - /* Clear to uncompressed. Clearing this to black is complicated. */ - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->surface.dcc_offset, - tex->surface.dcc_size, - DCC_UNCOMPRESSED); - } else { - /* GFX8: Initialize mipmap levels and multisamples separately. */ - if (tex->buffer.b.b.nr_samples >= 2) { - /* Clearing this to black is complicated. */ - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->surface.dcc_offset, - tex->surface.dcc_size, - DCC_UNCOMPRESSED); - } else { - /* Clear the enabled mipmap levels to black. */ - unsigned size = 0; - - for (unsigned i = 0; i < tex->surface.num_dcc_levels; i++) { - if (!tex->surface.u.legacy.level[i].dcc_fast_clear_size) - break; - - size = tex->surface.u.legacy.level[i].dcc_offset + - tex->surface.u.legacy.level[i].dcc_fast_clear_size; - } - - /* Mipmap levels with DCC. */ - if (size) { - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->surface.dcc_offset, size, - DCC_CLEAR_COLOR_0000); - } - /* Mipmap levels without DCC. */ - if (size != tex->surface.dcc_size) { - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->surface.dcc_offset + size, - tex->surface.dcc_size - size, - DCC_UNCOMPRESSED); - } - } - } - - /* Initialize displayable DCC that requires the retile blit. */ - if (tex->surface.dcc_retile_map_offset) { - /* Uninitialized DCC can hang the display hw. - * Clear to white to indicate that. */ - si_screen_clear_buffer(sscreen, &tex->buffer.b.b, - tex->surface.display_dcc_offset, - tex->surface.u.gfx9.display_dcc_size, - DCC_CLEAR_COLOR_1111); - - /* Upload the DCC retile map. - * Use a staging buffer for the upload, because - * the buffer backing the texture is unmappable. - */ - bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16; - unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements; - struct si_resource *buf = - si_aligned_buffer_create(screen, 0, PIPE_USAGE_STREAM, - num_elements * (use_uint16 ? 2 : 4), - sscreen->info.tcc_cache_line_size); - uint32_t *ui = (uint32_t*)sscreen->ws->buffer_map(buf->buf, NULL, - PIPE_TRANSFER_WRITE); - uint16_t *us = (uint16_t*)ui; - - /* Upload the retile map into a staging buffer. */ - if (use_uint16) { - for (unsigned i = 0; i < num_elements; i++) - us[i] = tex->surface.u.gfx9.dcc_retile_map[i]; - } else { - for (unsigned i = 0; i < num_elements; i++) - ui[i] = tex->surface.u.gfx9.dcc_retile_map[i]; - } - - /* Copy the staging buffer to the buffer backing the texture. 
*/ - struct si_context *sctx = (struct si_context*)sscreen->aux_context; - - assert(tex->surface.dcc_retile_map_offset <= UINT_MAX); - simple_mtx_lock(&sscreen->aux_context_lock); - si_sdma_copy_buffer(sctx, &tex->buffer.b.b, &buf->b.b, - tex->surface.dcc_retile_map_offset, - 0, buf->b.b.width0); - sscreen->aux_context->flush(sscreen->aux_context, NULL, 0); - simple_mtx_unlock(&sscreen->aux_context_lock); - - si_resource_reference(&buf, NULL); - } - } - - /* Initialize the CMASK base register value. */ - tex->cmask_base_address_reg = - (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8; - - if (sscreen->debug_flags & DBG(VM)) { - fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Texture %ix%ix%i, %i levels, %i samples, %s\n", - tex->buffer.gpu_address, - tex->buffer.gpu_address + tex->buffer.buf->size, - base->width0, base->height0, util_num_layers(base, 0), base->last_level+1, - base->nr_samples ? base->nr_samples : 1, util_format_short_name(base->format)); - } - - if (sscreen->debug_flags & DBG(TEX)) { - puts("Texture:"); - struct u_log_context log; - u_log_context_init(&log); - si_print_texture_info(sscreen, tex, &log); - u_log_new_page_print(&log, stdout); - fflush(stdout); - u_log_context_destroy(&log); - } - - return tex; + struct si_texture *tex; + struct si_resource *resource; + struct si_screen *sscreen = (struct si_screen *)screen; + + tex = CALLOC_STRUCT(si_texture); + if (!tex) + goto error; + + resource = &tex->buffer; + resource->b.b = *base; + resource->b.b.next = NULL; + resource->b.vtbl = &si_texture_vtbl; + pipe_reference_init(&resource->b.b.reference, 1); + resource->b.b.screen = screen; + + /* don't include stencil-only formats which we don't support for rendering */ + tex->is_depth = util_format_has_depth(util_format_description(tex->buffer.b.b.format)); + tex->surface = *surface; + tex->tc_compatible_htile = + tex->surface.htile_size != 0 && (tex->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE); + + /* TC-compatible HTILE: + * - GFX8 only supports Z32_FLOAT. + * - GFX9 only supports Z32_FLOAT and Z16_UNORM. */ + if (tex->tc_compatible_htile) { + if (sscreen->info.chip_class >= GFX9 && base->format == PIPE_FORMAT_Z16_UNORM) + tex->db_render_format = base->format; + else { + tex->db_render_format = PIPE_FORMAT_Z32_FLOAT; + tex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT && + base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT; + } + } else { + tex->db_render_format = base->format; + } + + /* Applies to GCN. */ + tex->last_msaa_resolve_target_micro_mode = tex->surface.micro_tile_mode; + + /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers + * between frames, so the only thing that can enable separate DCC + * with DRI2 is multiple slow clears within a frame. + */ + tex->ps_draw_ratio = 0; + + if (sscreen->info.chip_class >= GFX9) { + tex->surface.u.gfx9.surf_offset = offset; + } else { + for (unsigned i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i) + tex->surface.u.legacy.level[i].offset += offset; + } + + if (tex->is_depth) { + if (sscreen->info.chip_class >= GFX9) { + tex->can_sample_z = true; + tex->can_sample_s = true; + + /* Stencil texturing with HTILE doesn't work + * with mipmapping on Navi10-14. 
*/ + if ((sscreen->info.family == CHIP_NAVI10 || sscreen->info.family == CHIP_NAVI12 || + sscreen->info.family == CHIP_NAVI14) && + base->last_level > 0) + tex->htile_stencil_disabled = true; + } else { + tex->can_sample_z = !tex->surface.u.legacy.depth_adjusted; + tex->can_sample_s = !tex->surface.u.legacy.stencil_adjusted; + } + + tex->db_compatible = surface->flags & RADEON_SURF_ZBUFFER; + } else { + if (tex->surface.cmask_offset) { + tex->cb_color_info |= S_028C70_FAST_CLEAR(1); + tex->cmask_buffer = &tex->buffer; + } + } + + if (plane0) { + /* The buffer is shared with the first plane. */ + resource->bo_size = plane0->buffer.bo_size; + resource->bo_alignment = plane0->buffer.bo_alignment; + resource->flags = plane0->buffer.flags; + resource->domains = plane0->buffer.domains; + resource->vram_usage = plane0->buffer.vram_usage; + resource->gart_usage = plane0->buffer.gart_usage; + + pb_reference(&resource->buf, plane0->buffer.buf); + resource->gpu_address = plane0->buffer.gpu_address; + } else if (!(surface->flags & RADEON_SURF_IMPORTED)) { + /* Create the backing buffer. */ + si_init_resource_fields(sscreen, resource, alloc_size, alignment); + + if (!si_alloc_resource(sscreen, resource)) + goto error; + } else { + resource->buf = imported_buf; + resource->gpu_address = sscreen->ws->buffer_get_virtual_address(resource->buf); + resource->bo_size = imported_buf->size; + resource->bo_alignment = imported_buf->alignment; + resource->domains = sscreen->ws->buffer_get_initial_domain(resource->buf); + if (resource->domains & RADEON_DOMAIN_VRAM) + resource->vram_usage = resource->bo_size; + else if (resource->domains & RADEON_DOMAIN_GTT) + resource->gart_usage = resource->bo_size; + } + + if (tex->cmask_buffer) { + /* Initialize the cmask to 0xCC (= compressed state). */ + si_screen_clear_buffer(sscreen, &tex->cmask_buffer->b.b, tex->surface.cmask_offset, + tex->surface.cmask_size, 0xCCCCCCCC); + } + if (tex->surface.htile_offset) { + uint32_t clear_value = 0; + + if (sscreen->info.chip_class >= GFX9 || tex->tc_compatible_htile) + clear_value = 0x0000030F; + + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.htile_offset, + tex->surface.htile_size, clear_value); + } + + /* Initialize DCC only if the texture is not being imported. */ + if (!(surface->flags & RADEON_SURF_IMPORTED) && tex->surface.dcc_offset) { + /* Clear DCC to black for all tiles with DCC enabled. + * + * This fixes corruption in 3DMark Slingshot Extreme, which + * uses uninitialized textures, causing corruption. + */ + if (tex->surface.num_dcc_levels == tex->buffer.b.b.last_level + 1 && + tex->buffer.b.b.nr_samples <= 2) { + /* Simple case - all tiles have DCC enabled. */ + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset, + tex->surface.dcc_size, DCC_CLEAR_COLOR_0000); + } else if (sscreen->info.chip_class >= GFX9) { + /* Clear to uncompressed. Clearing this to black is complicated. */ + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset, + tex->surface.dcc_size, DCC_UNCOMPRESSED); + } else { + /* GFX8: Initialize mipmap levels and multisamples separately. */ + if (tex->buffer.b.b.nr_samples >= 2) { + /* Clearing this to black is complicated. */ + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset, + tex->surface.dcc_size, DCC_UNCOMPRESSED); + } else { + /* Clear the enabled mipmap levels to black. 
*/ + unsigned size = 0; + + for (unsigned i = 0; i < tex->surface.num_dcc_levels; i++) { + if (!tex->surface.u.legacy.level[i].dcc_fast_clear_size) + break; + + size = tex->surface.u.legacy.level[i].dcc_offset + + tex->surface.u.legacy.level[i].dcc_fast_clear_size; + } + + /* Mipmap levels with DCC. */ + if (size) { + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset, size, + DCC_CLEAR_COLOR_0000); + } + /* Mipmap levels without DCC. */ + if (size != tex->surface.dcc_size) { + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.dcc_offset + size, + tex->surface.dcc_size - size, DCC_UNCOMPRESSED); + } + } + } + + /* Initialize displayable DCC that requires the retile blit. */ + if (tex->surface.dcc_retile_map_offset) { + /* Uninitialized DCC can hang the display hw. + * Clear to white to indicate that. */ + si_screen_clear_buffer(sscreen, &tex->buffer.b.b, tex->surface.display_dcc_offset, + tex->surface.u.gfx9.display_dcc_size, DCC_CLEAR_COLOR_1111); + + /* Upload the DCC retile map. + * Use a staging buffer for the upload, because + * the buffer backing the texture is unmappable. + */ + bool use_uint16 = tex->surface.u.gfx9.dcc_retile_use_uint16; + unsigned num_elements = tex->surface.u.gfx9.dcc_retile_num_elements; + struct si_resource *buf = si_aligned_buffer_create(screen, 0, PIPE_USAGE_STREAM, + num_elements * (use_uint16 ? 2 : 4), + sscreen->info.tcc_cache_line_size); + uint32_t *ui = (uint32_t *)sscreen->ws->buffer_map(buf->buf, NULL, PIPE_TRANSFER_WRITE); + uint16_t *us = (uint16_t *)ui; + + /* Upload the retile map into a staging buffer. */ + if (use_uint16) { + for (unsigned i = 0; i < num_elements; i++) + us[i] = tex->surface.u.gfx9.dcc_retile_map[i]; + } else { + for (unsigned i = 0; i < num_elements; i++) + ui[i] = tex->surface.u.gfx9.dcc_retile_map[i]; + } + + /* Copy the staging buffer to the buffer backing the texture. */ + struct si_context *sctx = (struct si_context *)sscreen->aux_context; + + assert(tex->surface.dcc_retile_map_offset <= UINT_MAX); + simple_mtx_lock(&sscreen->aux_context_lock); + si_sdma_copy_buffer(sctx, &tex->buffer.b.b, &buf->b.b, tex->surface.dcc_retile_map_offset, + 0, buf->b.b.width0); + sscreen->aux_context->flush(sscreen->aux_context, NULL, 0); + simple_mtx_unlock(&sscreen->aux_context_lock); + + si_resource_reference(&buf, NULL); + } + } + + /* Initialize the CMASK base register value. */ + tex->cmask_base_address_reg = (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8; + + if (sscreen->debug_flags & DBG(VM)) { + fprintf(stderr, + "VM start=0x%" PRIX64 " end=0x%" PRIX64 + " | Texture %ix%ix%i, %i levels, %i samples, %s\n", + tex->buffer.gpu_address, tex->buffer.gpu_address + tex->buffer.buf->size, + base->width0, base->height0, util_num_layers(base, 0), base->last_level + 1, + base->nr_samples ? 
base->nr_samples : 1, util_format_short_name(base->format)); + } + + if (sscreen->debug_flags & DBG(TEX)) { + puts("Texture:"); + struct u_log_context log; + u_log_context_init(&log); + si_print_texture_info(sscreen, tex, &log); + u_log_new_page_print(&log, stdout); + fflush(stdout); + u_log_context_destroy(&log); + } + + return tex; error: - FREE(tex); - if (sscreen->info.chip_class >= GFX9) - free(surface->u.gfx9.dcc_retile_map); - return NULL; + FREE(tex); + if (sscreen->info.chip_class >= GFX9) + free(surface->u.gfx9.dcc_retile_map); + return NULL; } -static enum radeon_surf_mode -si_choose_tiling(struct si_screen *sscreen, - const struct pipe_resource *templ, bool tc_compatible_htile) +static enum radeon_surf_mode si_choose_tiling(struct si_screen *sscreen, + const struct pipe_resource *templ, + bool tc_compatible_htile) { - const struct util_format_description *desc = util_format_description(templ->format); - bool force_tiling = templ->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING; - bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) && - !(templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH); - - /* MSAA resources must be 2D tiled. */ - if (templ->nr_samples > 1) - return RADEON_SURF_MODE_2D; - - /* Transfer resources should be linear. */ - if (templ->flags & SI_RESOURCE_FLAG_TRANSFER) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - /* Avoid Z/S decompress blits by forcing TC-compatible HTILE on GFX8, - * which requires 2D tiling. - */ - if (sscreen->info.chip_class == GFX8 && tc_compatible_htile) - return RADEON_SURF_MODE_2D; - - /* Handle common candidates for the linear mode. - * Compressed textures and DB surfaces must always be tiled. - */ - if (!force_tiling && - !is_depth_stencil && - !util_format_is_compressed(templ->format)) { - if (sscreen->debug_flags & DBG(NO_TILING)) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - /* Tiling doesn't work with the 422 (SUBSAMPLED) formats. */ - if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - /* Cursors are linear on AMD GCN. - * (XXX double-check, maybe also use RADEON_SURF_SCANOUT) */ - if (templ->bind & PIPE_BIND_CURSOR) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - if (templ->bind & PIPE_BIND_LINEAR) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - /* Textures with a very small height are recommended to be linear. */ - if (templ->target == PIPE_TEXTURE_1D || - templ->target == PIPE_TEXTURE_1D_ARRAY || - /* Only very thin and long 2D textures should benefit from - * linear_aligned. */ - (templ->width0 > 8 && templ->height0 <= 2)) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - - /* Textures likely to be mapped often. */ - if (templ->usage == PIPE_USAGE_STAGING || - templ->usage == PIPE_USAGE_STREAM) - return RADEON_SURF_MODE_LINEAR_ALIGNED; - } - - /* Make small textures 1D tiled. */ - if (templ->width0 <= 16 || templ->height0 <= 16 || - (sscreen->debug_flags & DBG(NO_2D_TILING))) - return RADEON_SURF_MODE_1D; - - /* The allocator will switch to 1D if needed. */ - return RADEON_SURF_MODE_2D; + const struct util_format_description *desc = util_format_description(templ->format); + bool force_tiling = templ->flags & SI_RESOURCE_FLAG_FORCE_MSAA_TILING; + bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) && + !(templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH); + + /* MSAA resources must be 2D tiled. */ + if (templ->nr_samples > 1) + return RADEON_SURF_MODE_2D; + + /* Transfer resources should be linear. 
*/ + if (templ->flags & SI_RESOURCE_FLAG_TRANSFER) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Avoid Z/S decompress blits by forcing TC-compatible HTILE on GFX8, + * which requires 2D tiling. + */ + if (sscreen->info.chip_class == GFX8 && tc_compatible_htile) + return RADEON_SURF_MODE_2D; + + /* Handle common candidates for the linear mode. + * Compressed textures and DB surfaces must always be tiled. + */ + if (!force_tiling && !is_depth_stencil && !util_format_is_compressed(templ->format)) { + if (sscreen->debug_flags & DBG(NO_TILING)) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Tiling doesn't work with the 422 (SUBSAMPLED) formats. */ + if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Cursors are linear on AMD GCN. + * (XXX double-check, maybe also use RADEON_SURF_SCANOUT) */ + if (templ->bind & PIPE_BIND_CURSOR) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + if (templ->bind & PIPE_BIND_LINEAR) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Textures with a very small height are recommended to be linear. */ + if (templ->target == PIPE_TEXTURE_1D || templ->target == PIPE_TEXTURE_1D_ARRAY || + /* Only very thin and long 2D textures should benefit from + * linear_aligned. */ + (templ->width0 > 8 && templ->height0 <= 2)) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Textures likely to be mapped often. */ + if (templ->usage == PIPE_USAGE_STAGING || templ->usage == PIPE_USAGE_STREAM) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + } + + /* Make small textures 1D tiled. */ + if (templ->width0 <= 16 || templ->height0 <= 16 || (sscreen->debug_flags & DBG(NO_2D_TILING))) + return RADEON_SURF_MODE_1D; + + /* The allocator will switch to 1D if needed. */ + return RADEON_SURF_MODE_2D; } struct pipe_resource *si_texture_create(struct pipe_screen *screen, - const struct pipe_resource *templ) + const struct pipe_resource *templ) { - struct si_screen *sscreen = (struct si_screen*)screen; - bool is_zs = util_format_is_depth_or_stencil(templ->format); - - if (templ->nr_samples >= 2) { - /* This is hackish (overwriting the const pipe_resource template), - * but should be harmless and state trackers can also see - * the overriden number of samples in the created pipe_resource. - */ - if (is_zs && sscreen->eqaa_force_z_samples) { - ((struct pipe_resource*)templ)->nr_samples = - ((struct pipe_resource*)templ)->nr_storage_samples = - sscreen->eqaa_force_z_samples; - } else if (!is_zs && sscreen->eqaa_force_color_samples) { - ((struct pipe_resource*)templ)->nr_samples = - sscreen->eqaa_force_coverage_samples; - ((struct pipe_resource*)templ)->nr_storage_samples = - sscreen->eqaa_force_color_samples; - } - } - - bool is_flushed_depth = templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH || - templ->flags & SI_RESOURCE_FLAG_TRANSFER; - bool tc_compatible_htile = - sscreen->info.chip_class >= GFX8 && - /* There are issues with TC-compatible HTILE on Tonga (and - * Iceland is the same design), and documented bug workarounds - * don't help. 
For example, this fails: - * piglit/bin/tex-miplevel-selection 'texture()' 2DShadow -auto - */ - sscreen->info.family != CHIP_TONGA && - sscreen->info.family != CHIP_ICELAND && - (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) && - !(sscreen->debug_flags & DBG(NO_HYPERZ)) && - !is_flushed_depth && - templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */ - is_zs; - enum radeon_surf_mode tile_mode = si_choose_tiling(sscreen, templ, - tc_compatible_htile); - - /* This allocates textures with multiple planes like NV12 in 1 buffer. */ - enum { SI_TEXTURE_MAX_PLANES = 3 }; - struct radeon_surf surface[SI_TEXTURE_MAX_PLANES] = {}; - struct pipe_resource plane_templ[SI_TEXTURE_MAX_PLANES]; - uint64_t plane_offset[SI_TEXTURE_MAX_PLANES] = {}; - uint64_t total_size = 0; - unsigned max_alignment = 0; - unsigned num_planes = util_format_get_num_planes(templ->format); - assert(num_planes <= SI_TEXTURE_MAX_PLANES); - - /* Compute texture or plane layouts and offsets. */ - for (unsigned i = 0; i < num_planes; i++) { - plane_templ[i] = *templ; - plane_templ[i].format = util_format_get_plane_format(templ->format, i); - plane_templ[i].width0 = util_format_get_plane_width(templ->format, i, templ->width0); - plane_templ[i].height0 = util_format_get_plane_height(templ->format, i, templ->height0); - - /* Multi-plane allocations need PIPE_BIND_SHARED, because we can't - * reallocate the storage to add PIPE_BIND_SHARED, because it's - * shared by 3 pipe_resources. - */ - if (num_planes > 1) - plane_templ[i].bind |= PIPE_BIND_SHARED; - - if (si_init_surface(sscreen, &surface[i], &plane_templ[i], - tile_mode, 0, false, - plane_templ[i].bind & PIPE_BIND_SCANOUT, - is_flushed_depth, tc_compatible_htile)) - return NULL; - - plane_offset[i] = align64(total_size, surface[i].surf_alignment); - total_size = plane_offset[i] + surface[i].total_size; - max_alignment = MAX2(max_alignment, surface[i].surf_alignment); - } - - struct si_texture *plane0 = NULL, *last_plane = NULL; - - for (unsigned i = 0; i < num_planes; i++) { - struct si_texture *tex = - si_texture_create_object(screen, &plane_templ[i], &surface[i], - plane0, NULL, plane_offset[i], - total_size, max_alignment); - if (!tex) { - si_texture_reference(&plane0, NULL); - return NULL; - } - - tex->plane_index = i; - tex->num_planes = num_planes; - - if (!plane0) { - plane0 = last_plane = tex; - } else { - last_plane->buffer.b.b.next = &tex->buffer.b.b; - last_plane = tex; - } - } - - return (struct pipe_resource *)plane0; + struct si_screen *sscreen = (struct si_screen *)screen; + bool is_zs = util_format_is_depth_or_stencil(templ->format); + + if (templ->nr_samples >= 2) { + /* This is hackish (overwriting the const pipe_resource template), + * but should be harmless and state trackers can also see + * the overriden number of samples in the created pipe_resource. 
+ */ + if (is_zs && sscreen->eqaa_force_z_samples) { + ((struct pipe_resource *)templ)->nr_samples = + ((struct pipe_resource *)templ)->nr_storage_samples = sscreen->eqaa_force_z_samples; + } else if (!is_zs && sscreen->eqaa_force_color_samples) { + ((struct pipe_resource *)templ)->nr_samples = sscreen->eqaa_force_coverage_samples; + ((struct pipe_resource *)templ)->nr_storage_samples = sscreen->eqaa_force_color_samples; + } + } + + bool is_flushed_depth = + templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH || templ->flags & SI_RESOURCE_FLAG_TRANSFER; + bool tc_compatible_htile = + sscreen->info.chip_class >= GFX8 && + /* There are issues with TC-compatible HTILE on Tonga (and + * Iceland is the same design), and documented bug workarounds + * don't help. For example, this fails: + * piglit/bin/tex-miplevel-selection 'texture()' 2DShadow -auto + */ + sscreen->info.family != CHIP_TONGA && sscreen->info.family != CHIP_ICELAND && + (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) && + !(sscreen->debug_flags & DBG(NO_HYPERZ)) && !is_flushed_depth && + templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */ + is_zs; + enum radeon_surf_mode tile_mode = si_choose_tiling(sscreen, templ, tc_compatible_htile); + + /* This allocates textures with multiple planes like NV12 in 1 buffer. */ + enum + { + SI_TEXTURE_MAX_PLANES = 3 + }; + struct radeon_surf surface[SI_TEXTURE_MAX_PLANES] = {}; + struct pipe_resource plane_templ[SI_TEXTURE_MAX_PLANES]; + uint64_t plane_offset[SI_TEXTURE_MAX_PLANES] = {}; + uint64_t total_size = 0; + unsigned max_alignment = 0; + unsigned num_planes = util_format_get_num_planes(templ->format); + assert(num_planes <= SI_TEXTURE_MAX_PLANES); + + /* Compute texture or plane layouts and offsets. */ + for (unsigned i = 0; i < num_planes; i++) { + plane_templ[i] = *templ; + plane_templ[i].format = util_format_get_plane_format(templ->format, i); + plane_templ[i].width0 = util_format_get_plane_width(templ->format, i, templ->width0); + plane_templ[i].height0 = util_format_get_plane_height(templ->format, i, templ->height0); + + /* Multi-plane allocations need PIPE_BIND_SHARED, because we can't + * reallocate the storage to add PIPE_BIND_SHARED, because it's + * shared by 3 pipe_resources. 
+ */ + if (num_planes > 1) + plane_templ[i].bind |= PIPE_BIND_SHARED; + + if (si_init_surface(sscreen, &surface[i], &plane_templ[i], tile_mode, 0, false, + plane_templ[i].bind & PIPE_BIND_SCANOUT, is_flushed_depth, + tc_compatible_htile)) + return NULL; + + plane_offset[i] = align64(total_size, surface[i].surf_alignment); + total_size = plane_offset[i] + surface[i].total_size; + max_alignment = MAX2(max_alignment, surface[i].surf_alignment); + } + + struct si_texture *plane0 = NULL, *last_plane = NULL; + + for (unsigned i = 0; i < num_planes; i++) { + struct si_texture *tex = + si_texture_create_object(screen, &plane_templ[i], &surface[i], plane0, NULL, + plane_offset[i], total_size, max_alignment); + if (!tex) { + si_texture_reference(&plane0, NULL); + return NULL; + } + + tex->plane_index = i; + tex->num_planes = num_planes; + + if (!plane0) { + plane0 = last_plane = tex; + } else { + last_plane->buffer.b.b.next = &tex->buffer.b.b; + last_plane = tex; + } + } + + return (struct pipe_resource *)plane0; } static struct pipe_resource *si_texture_from_winsys_buffer(struct si_screen *sscreen, - const struct pipe_resource *templ, - struct pb_buffer *buf, - unsigned stride, - unsigned offset, - unsigned usage, - bool dedicated) + const struct pipe_resource *templ, + struct pb_buffer *buf, unsigned stride, + unsigned offset, unsigned usage, + bool dedicated) { - enum radeon_surf_mode array_mode; - struct radeon_surf surface = {}; - struct radeon_bo_metadata metadata = {}; - struct si_texture *tex; - bool is_scanout; - int r; - - /* Ignore metadata for non-zero planes. */ - if (offset != 0) - dedicated = false; - - if (dedicated) { - sscreen->ws->buffer_get_metadata(buf, &metadata); - si_get_display_metadata(sscreen, &surface, &metadata, - &array_mode, &is_scanout); - } else { - /** - * The bo metadata is unset for un-dedicated images. So we fall - * back to linear. See answer to question 5 of the - * VK_KHX_external_memory spec for some details. - * - * It is possible that this case isn't going to work if the - * surface pitch isn't correctly aligned by default. - * - * In order to support it correctly we require multi-image - * metadata to be syncrhonized between radv and radeonsi. The - * semantics of associating multiple image metadata to a memory - * object on the vulkan export side are not concretely defined - * either. - * - * All the use cases we are aware of at the moment for memory - * objects use dedicated allocations. So lets keep the initial - * implementation simple. - * - * A possible alternative is to attempt to reconstruct the - * tiling information when the TexParameter TEXTURE_TILING_EXT - * is set. - */ - array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; - is_scanout = false; - } - - r = si_init_surface(sscreen, &surface, templ, - array_mode, stride, true, is_scanout, - false, false); - if (r) - return NULL; - - tex = si_texture_create_object(&sscreen->b, templ, &surface, NULL, buf, - offset, 0, 0); - if (!tex) - return NULL; - - tex->buffer.b.is_shared = true; - tex->buffer.external_usage = usage; - tex->num_planes = 1; - - if (!si_read_tex_bo_metadata(sscreen, tex, offset, &metadata)) { - si_texture_reference(&tex, NULL); - return NULL; - } - - /* Displayable DCC requires an explicit flush. */ - if (dedicated && offset == 0 && - !(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && - si_has_displayable_dcc(tex)) { - /* TODO: do we need to decompress DCC? */ - if (si_texture_discard_dcc(sscreen, tex)) { - /* Update BO metadata after disabling DCC. 
*/ - si_set_tex_bo_metadata(sscreen, tex); - } - } - - assert(tex->surface.tile_swizzle == 0); - return &tex->buffer.b.b; + enum radeon_surf_mode array_mode; + struct radeon_surf surface = {}; + struct radeon_bo_metadata metadata = {}; + struct si_texture *tex; + bool is_scanout; + int r; + + /* Ignore metadata for non-zero planes. */ + if (offset != 0) + dedicated = false; + + if (dedicated) { + sscreen->ws->buffer_get_metadata(buf, &metadata); + si_get_display_metadata(sscreen, &surface, &metadata, &array_mode, &is_scanout); + } else { + /** + * The bo metadata is unset for un-dedicated images. So we fall + * back to linear. See answer to question 5 of the + * VK_KHX_external_memory spec for some details. + * + * It is possible that this case isn't going to work if the + * surface pitch isn't correctly aligned by default. + * + * In order to support it correctly we require multi-image + * metadata to be syncrhonized between radv and radeonsi. The + * semantics of associating multiple image metadata to a memory + * object on the vulkan export side are not concretely defined + * either. + * + * All the use cases we are aware of at the moment for memory + * objects use dedicated allocations. So lets keep the initial + * implementation simple. + * + * A possible alternative is to attempt to reconstruct the + * tiling information when the TexParameter TEXTURE_TILING_EXT + * is set. + */ + array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + is_scanout = false; + } + + r = + si_init_surface(sscreen, &surface, templ, array_mode, stride, true, is_scanout, false, false); + if (r) + return NULL; + + tex = si_texture_create_object(&sscreen->b, templ, &surface, NULL, buf, offset, 0, 0); + if (!tex) + return NULL; + + tex->buffer.b.is_shared = true; + tex->buffer.external_usage = usage; + tex->num_planes = 1; + + if (!si_read_tex_bo_metadata(sscreen, tex, offset, &metadata)) { + si_texture_reference(&tex, NULL); + return NULL; + } + + /* Displayable DCC requires an explicit flush. */ + if (dedicated && offset == 0 && !(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && + si_has_displayable_dcc(tex)) { + /* TODO: do we need to decompress DCC? */ + if (si_texture_discard_dcc(sscreen, tex)) { + /* Update BO metadata after disabling DCC. 
*/ + si_set_tex_bo_metadata(sscreen, tex); + } + } + + assert(tex->surface.tile_swizzle == 0); + return &tex->buffer.b.b; } static struct pipe_resource *si_texture_from_handle(struct pipe_screen *screen, - const struct pipe_resource *templ, - struct winsys_handle *whandle, - unsigned usage) + const struct pipe_resource *templ, + struct winsys_handle *whandle, unsigned usage) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct pb_buffer *buf = NULL; - - /* Support only 2D textures without mipmaps */ - if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT && - templ->target != PIPE_TEXTURE_2D_ARRAY) || - templ->last_level != 0) - return NULL; - - buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, - sscreen->info.max_alignment); - if (!buf) - return NULL; - - return si_texture_from_winsys_buffer(sscreen, templ, buf, - whandle->stride, whandle->offset, - usage, true); + struct si_screen *sscreen = (struct si_screen *)screen; + struct pb_buffer *buf = NULL; + + /* Support only 2D textures without mipmaps */ + if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT && + templ->target != PIPE_TEXTURE_2D_ARRAY) || + templ->last_level != 0) + return NULL; + + buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, sscreen->info.max_alignment); + if (!buf) + return NULL; + + return si_texture_from_winsys_buffer(sscreen, templ, buf, whandle->stride, whandle->offset, + usage, true); } -bool si_init_flushed_depth_texture(struct pipe_context *ctx, - struct pipe_resource *texture) +bool si_init_flushed_depth_texture(struct pipe_context *ctx, struct pipe_resource *texture) { - struct si_texture *tex = (struct si_texture*)texture; - struct pipe_resource resource; - enum pipe_format pipe_format = texture->format; - - assert(!tex->flushed_depth_texture); - - if (!tex->can_sample_z && tex->can_sample_s) { - switch (pipe_format) { - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - /* Save memory by not allocating the S plane. */ - pipe_format = PIPE_FORMAT_Z32_FLOAT; - break; - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - /* Save memory bandwidth by not copying the - * stencil part during flush. - * - * This potentially increases memory bandwidth - * if an application uses both Z and S texturing - * simultaneously (a flushed Z24S8 texture - * would be stored compactly), but how often - * does that really happen? - */ - pipe_format = PIPE_FORMAT_Z24X8_UNORM; - break; - default:; - } - } else if (!tex->can_sample_s && tex->can_sample_z) { - assert(util_format_has_stencil(util_format_description(pipe_format))); - - /* DB->CB copies to an 8bpp surface don't work. 
*/ - pipe_format = PIPE_FORMAT_X24S8_UINT; - } - - memset(&resource, 0, sizeof(resource)); - resource.target = texture->target; - resource.format = pipe_format; - resource.width0 = texture->width0; - resource.height0 = texture->height0; - resource.depth0 = texture->depth0; - resource.array_size = texture->array_size; - resource.last_level = texture->last_level; - resource.nr_samples = texture->nr_samples; - resource.usage = PIPE_USAGE_DEFAULT; - resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL; - resource.flags = texture->flags | SI_RESOURCE_FLAG_FLUSHED_DEPTH; - - tex->flushed_depth_texture = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource); - if (!tex->flushed_depth_texture) { - PRINT_ERR("failed to create temporary texture to hold flushed depth\n"); - return false; - } - return true; + struct si_texture *tex = (struct si_texture *)texture; + struct pipe_resource resource; + enum pipe_format pipe_format = texture->format; + + assert(!tex->flushed_depth_texture); + + if (!tex->can_sample_z && tex->can_sample_s) { + switch (pipe_format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + /* Save memory by not allocating the S plane. */ + pipe_format = PIPE_FORMAT_Z32_FLOAT; + break; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + /* Save memory bandwidth by not copying the + * stencil part during flush. + * + * This potentially increases memory bandwidth + * if an application uses both Z and S texturing + * simultaneously (a flushed Z24S8 texture + * would be stored compactly), but how often + * does that really happen? + */ + pipe_format = PIPE_FORMAT_Z24X8_UNORM; + break; + default:; + } + } else if (!tex->can_sample_s && tex->can_sample_z) { + assert(util_format_has_stencil(util_format_description(pipe_format))); + + /* DB->CB copies to an 8bpp surface don't work. */ + pipe_format = PIPE_FORMAT_X24S8_UINT; + } + + memset(&resource, 0, sizeof(resource)); + resource.target = texture->target; + resource.format = pipe_format; + resource.width0 = texture->width0; + resource.height0 = texture->height0; + resource.depth0 = texture->depth0; + resource.array_size = texture->array_size; + resource.last_level = texture->last_level; + resource.nr_samples = texture->nr_samples; + resource.usage = PIPE_USAGE_DEFAULT; + resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL; + resource.flags = texture->flags | SI_RESOURCE_FLAG_FLUSHED_DEPTH; + + tex->flushed_depth_texture = + (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource); + if (!tex->flushed_depth_texture) { + PRINT_ERR("failed to create temporary texture to hold flushed depth\n"); + return false; + } + return true; } /** @@ -1905,836 +1763,759 @@ bool si_init_flushed_depth_texture(struct pipe_context *ctx, * which is supposed to hold a subregion of the texture "orig" at the given * mipmap level. */ -static void si_init_temp_resource_from_box(struct pipe_resource *res, - struct pipe_resource *orig, - const struct pipe_box *box, - unsigned level, unsigned flags) +static void si_init_temp_resource_from_box(struct pipe_resource *res, struct pipe_resource *orig, + const struct pipe_box *box, unsigned level, + unsigned flags) { - memset(res, 0, sizeof(*res)); - res->format = orig->format; - res->width0 = box->width; - res->height0 = box->height; - res->depth0 = 1; - res->array_size = 1; - res->usage = flags & SI_RESOURCE_FLAG_TRANSFER ? 
PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT; - res->flags = flags; - - if (flags & SI_RESOURCE_FLAG_TRANSFER && - util_format_is_compressed(orig->format)) { - /* Transfer resources are allocated with linear tiling, which is - * not supported for compressed formats. - */ - unsigned blocksize = - util_format_get_blocksize(orig->format); - - if (blocksize == 8) { - res->format = PIPE_FORMAT_R16G16B16A16_UINT; - } else { - assert(blocksize == 16); - res->format = PIPE_FORMAT_R32G32B32A32_UINT; - } - - res->width0 = util_format_get_nblocksx(orig->format, box->width); - res->height0 = util_format_get_nblocksy(orig->format, box->height); - } - - /* We must set the correct texture target and dimensions for a 3D box. */ - if (box->depth > 1 && util_max_layer(orig, level) > 0) { - res->target = PIPE_TEXTURE_2D_ARRAY; - res->array_size = box->depth; - } else { - res->target = PIPE_TEXTURE_2D; - } + memset(res, 0, sizeof(*res)); + res->format = orig->format; + res->width0 = box->width; + res->height0 = box->height; + res->depth0 = 1; + res->array_size = 1; + res->usage = flags & SI_RESOURCE_FLAG_TRANSFER ? PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT; + res->flags = flags; + + if (flags & SI_RESOURCE_FLAG_TRANSFER && util_format_is_compressed(orig->format)) { + /* Transfer resources are allocated with linear tiling, which is + * not supported for compressed formats. + */ + unsigned blocksize = util_format_get_blocksize(orig->format); + + if (blocksize == 8) { + res->format = PIPE_FORMAT_R16G16B16A16_UINT; + } else { + assert(blocksize == 16); + res->format = PIPE_FORMAT_R32G32B32A32_UINT; + } + + res->width0 = util_format_get_nblocksx(orig->format, box->width); + res->height0 = util_format_get_nblocksy(orig->format, box->height); + } + + /* We must set the correct texture target and dimensions for a 3D box. */ + if (box->depth > 1 && util_max_layer(orig, level) > 0) { + res->target = PIPE_TEXTURE_2D_ARRAY; + res->array_size = box->depth; + } else { + res->target = PIPE_TEXTURE_2D; + } } -static bool si_can_invalidate_texture(struct si_screen *sscreen, - struct si_texture *tex, - unsigned transfer_usage, - const struct pipe_box *box) +static bool si_can_invalidate_texture(struct si_screen *sscreen, struct si_texture *tex, + unsigned transfer_usage, const struct pipe_box *box) { - return !tex->buffer.b.is_shared && - !(tex->surface.flags & RADEON_SURF_IMPORTED) && - !(transfer_usage & PIPE_TRANSFER_READ) && - tex->buffer.b.b.last_level == 0 && - util_texrange_covers_whole_level(&tex->buffer.b.b, 0, - box->x, box->y, box->z, - box->width, box->height, - box->depth); + return !tex->buffer.b.is_shared && !(tex->surface.flags & RADEON_SURF_IMPORTED) && + !(transfer_usage & PIPE_TRANSFER_READ) && tex->buffer.b.b.last_level == 0 && + util_texrange_covers_whole_level(&tex->buffer.b.b, 0, box->x, box->y, box->z, box->width, + box->height, box->depth); } -static void si_texture_invalidate_storage(struct si_context *sctx, - struct si_texture *tex) +static void si_texture_invalidate_storage(struct si_context *sctx, struct si_texture *tex) { - struct si_screen *sscreen = sctx->screen; + struct si_screen *sscreen = sctx->screen; - /* There is no point in discarding depth and tiled buffers. */ - assert(!tex->is_depth); - assert(tex->surface.is_linear); + /* There is no point in discarding depth and tiled buffers. */ + assert(!tex->is_depth); + assert(tex->surface.is_linear); - /* Reallocate the buffer in the same pipe_resource. 
*/ - si_alloc_resource(sscreen, &tex->buffer); + /* Reallocate the buffer in the same pipe_resource. */ + si_alloc_resource(sscreen, &tex->buffer); - /* Initialize the CMASK base address (needed even without CMASK). */ - tex->cmask_base_address_reg = - (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8; + /* Initialize the CMASK base address (needed even without CMASK). */ + tex->cmask_base_address_reg = (tex->buffer.gpu_address + tex->surface.cmask_offset) >> 8; - p_atomic_inc(&sscreen->dirty_tex_counter); + p_atomic_inc(&sscreen->dirty_tex_counter); - sctx->num_alloc_tex_transfer_bytes += tex->surface.total_size; + sctx->num_alloc_tex_transfer_bytes += tex->surface.total_size; } -static void *si_texture_transfer_map(struct pipe_context *ctx, - struct pipe_resource *texture, - unsigned level, - unsigned usage, - const struct pipe_box *box, - struct pipe_transfer **ptransfer) +static void *si_texture_transfer_map(struct pipe_context *ctx, struct pipe_resource *texture, + unsigned level, unsigned usage, const struct pipe_box *box, + struct pipe_transfer **ptransfer) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_texture *tex = (struct si_texture*)texture; - struct si_transfer *trans; - struct si_resource *buf; - unsigned offset = 0; - char *map; - bool use_staging_texture = false; - - assert(!(texture->flags & SI_RESOURCE_FLAG_TRANSFER)); - assert(box->width && box->height && box->depth); - - if (tex->is_depth) { - /* Depth textures use staging unconditionally. */ - use_staging_texture = true; - } else { - /* Degrade the tile mode if we get too many transfers on APUs. - * On dGPUs, the staging texture is always faster. - * Only count uploads that are at least 4x4 pixels large. - */ - if (!sctx->screen->info.has_dedicated_vram && - level == 0 && - box->width >= 4 && box->height >= 4 && - p_atomic_inc_return(&tex->num_level0_transfers) == 10) { - bool can_invalidate = - si_can_invalidate_texture(sctx->screen, tex, - usage, box); - - si_reallocate_texture_inplace(sctx, tex, - PIPE_BIND_LINEAR, - can_invalidate); - } - - /* Tiled textures need to be converted into a linear texture for CPU - * access. The staging texture is always linear and is placed in GART. - * - * Reading from VRAM or GTT WC is slow, always use the staging - * texture in this case. - * - * Use the staging texture for uploads if the underlying BO - * is busy. - */ - if (!tex->surface.is_linear) - use_staging_texture = true; - else if (usage & PIPE_TRANSFER_READ) - use_staging_texture = - tex->buffer.domains & RADEON_DOMAIN_VRAM || - tex->buffer.flags & RADEON_FLAG_GTT_WC; - /* Write & linear only: */ - else if (si_rings_is_buffer_referenced(sctx, tex->buffer.buf, - RADEON_USAGE_READWRITE) || - !sctx->ws->buffer_wait(tex->buffer.buf, 0, - RADEON_USAGE_READWRITE)) { - /* It's busy. */ - if (si_can_invalidate_texture(sctx->screen, tex, - usage, box)) - si_texture_invalidate_storage(sctx, tex); - else - use_staging_texture = true; - } - } - - trans = CALLOC_STRUCT(si_transfer); - if (!trans) - return NULL; - pipe_resource_reference(&trans->b.b.resource, texture); - trans->b.b.level = level; - trans->b.b.usage = usage; - trans->b.b.box = *box; - - if (use_staging_texture) { - struct pipe_resource resource; - struct si_texture *staging; - - si_init_temp_resource_from_box(&resource, texture, box, level, - SI_RESOURCE_FLAG_TRANSFER); - resource.usage = (usage & PIPE_TRANSFER_READ) ? 
- PIPE_USAGE_STAGING : PIPE_USAGE_STREAM; - - /* Since depth-stencil textures don't support linear tiling, - * blit from ZS to color and vice versa. u_blitter will do - * the packing for these formats. - */ - if (tex->is_depth) - resource.format = util_blitter_get_color_format_for_zs(resource.format); - - /* Create the temporary texture. */ - staging = (struct si_texture*)ctx->screen->resource_create(ctx->screen, &resource); - if (!staging) { - PRINT_ERR("failed to create temporary texture to hold untiled copy\n"); - goto fail_trans; - } - trans->staging = &staging->buffer; - - /* Just get the strides. */ - si_texture_get_offset(sctx->screen, staging, 0, NULL, - &trans->b.b.stride, - &trans->b.b.layer_stride); - - if (usage & PIPE_TRANSFER_READ) - si_copy_to_staging_texture(ctx, trans); - else - usage |= PIPE_TRANSFER_UNSYNCHRONIZED; - - buf = trans->staging; - } else { - /* the resource is mapped directly */ - offset = si_texture_get_offset(sctx->screen, tex, level, box, - &trans->b.b.stride, - &trans->b.b.layer_stride); - buf = &tex->buffer; - } - - /* Always unmap texture CPU mappings on 32-bit architectures, so that - * we don't run out of the CPU address space. - */ - if (sizeof(void*) == 4) - usage |= RADEON_TRANSFER_TEMPORARY; - - if (!(map = si_buffer_map_sync_with_rings(sctx, buf, usage))) - goto fail_trans; - - *ptransfer = &trans->b.b; - return map + offset; + struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *tex = (struct si_texture *)texture; + struct si_transfer *trans; + struct si_resource *buf; + unsigned offset = 0; + char *map; + bool use_staging_texture = false; + + assert(!(texture->flags & SI_RESOURCE_FLAG_TRANSFER)); + assert(box->width && box->height && box->depth); + + if (tex->is_depth) { + /* Depth textures use staging unconditionally. */ + use_staging_texture = true; + } else { + /* Degrade the tile mode if we get too many transfers on APUs. + * On dGPUs, the staging texture is always faster. + * Only count uploads that are at least 4x4 pixels large. + */ + if (!sctx->screen->info.has_dedicated_vram && level == 0 && box->width >= 4 && + box->height >= 4 && p_atomic_inc_return(&tex->num_level0_transfers) == 10) { + bool can_invalidate = si_can_invalidate_texture(sctx->screen, tex, usage, box); + + si_reallocate_texture_inplace(sctx, tex, PIPE_BIND_LINEAR, can_invalidate); + } + + /* Tiled textures need to be converted into a linear texture for CPU + * access. The staging texture is always linear and is placed in GART. + * + * Reading from VRAM or GTT WC is slow, always use the staging + * texture in this case. + * + * Use the staging texture for uploads if the underlying BO + * is busy. + */ + if (!tex->surface.is_linear) + use_staging_texture = true; + else if (usage & PIPE_TRANSFER_READ) + use_staging_texture = + tex->buffer.domains & RADEON_DOMAIN_VRAM || tex->buffer.flags & RADEON_FLAG_GTT_WC; + /* Write & linear only: */ + else if (si_rings_is_buffer_referenced(sctx, tex->buffer.buf, RADEON_USAGE_READWRITE) || + !sctx->ws->buffer_wait(tex->buffer.buf, 0, RADEON_USAGE_READWRITE)) { + /* It's busy. 
*/ + if (si_can_invalidate_texture(sctx->screen, tex, usage, box)) + si_texture_invalidate_storage(sctx, tex); + else + use_staging_texture = true; + } + } + + trans = CALLOC_STRUCT(si_transfer); + if (!trans) + return NULL; + pipe_resource_reference(&trans->b.b.resource, texture); + trans->b.b.level = level; + trans->b.b.usage = usage; + trans->b.b.box = *box; + + if (use_staging_texture) { + struct pipe_resource resource; + struct si_texture *staging; + + si_init_temp_resource_from_box(&resource, texture, box, level, SI_RESOURCE_FLAG_TRANSFER); + resource.usage = (usage & PIPE_TRANSFER_READ) ? PIPE_USAGE_STAGING : PIPE_USAGE_STREAM; + + /* Since depth-stencil textures don't support linear tiling, + * blit from ZS to color and vice versa. u_blitter will do + * the packing for these formats. + */ + if (tex->is_depth) + resource.format = util_blitter_get_color_format_for_zs(resource.format); + + /* Create the temporary texture. */ + staging = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource); + if (!staging) { + PRINT_ERR("failed to create temporary texture to hold untiled copy\n"); + goto fail_trans; + } + trans->staging = &staging->buffer; + + /* Just get the strides. */ + si_texture_get_offset(sctx->screen, staging, 0, NULL, &trans->b.b.stride, + &trans->b.b.layer_stride); + + if (usage & PIPE_TRANSFER_READ) + si_copy_to_staging_texture(ctx, trans); + else + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + + buf = trans->staging; + } else { + /* the resource is mapped directly */ + offset = si_texture_get_offset(sctx->screen, tex, level, box, &trans->b.b.stride, + &trans->b.b.layer_stride); + buf = &tex->buffer; + } + + /* Always unmap texture CPU mappings on 32-bit architectures, so that + * we don't run out of the CPU address space. + */ + if (sizeof(void *) == 4) + usage |= RADEON_TRANSFER_TEMPORARY; + + if (!(map = si_buffer_map_sync_with_rings(sctx, buf, usage))) + goto fail_trans; + + *ptransfer = &trans->b.b; + return map + offset; fail_trans: - si_resource_reference(&trans->staging, NULL); - pipe_resource_reference(&trans->b.b.resource, NULL); - FREE(trans); - return NULL; + si_resource_reference(&trans->staging, NULL); + pipe_resource_reference(&trans->b.b.resource, NULL); + FREE(trans); + return NULL; } -static void si_texture_transfer_unmap(struct pipe_context *ctx, - struct pipe_transfer* transfer) +static void si_texture_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer *transfer) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_transfer *stransfer = (struct si_transfer*)transfer; - struct pipe_resource *texture = transfer->resource; - struct si_texture *tex = (struct si_texture*)texture; - - /* Always unmap texture CPU mappings on 32-bit architectures, so that - * we don't run out of the CPU address space. - */ - if (sizeof(void*) == 4) { - struct si_resource *buf = - stransfer->staging ? stransfer->staging : &tex->buffer; - - sctx->ws->buffer_unmap(buf->buf); - } - - if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging) - si_copy_from_staging_texture(ctx, stransfer); - - if (stransfer->staging) { - sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size; - si_resource_reference(&stransfer->staging, NULL); - } - - /* Heuristic for {upload, draw, upload, draw, ..}: - * - * Flush the gfx IB if we've allocated too much texture storage. 
- * - * The idea is that we don't want to build IBs that use too much - * memory and put pressure on the kernel memory manager and we also - * want to make temporary and invalidated buffers go idle ASAP to - * decrease the total memory usage or make them reusable. The memory - * usage will be slightly higher than given here because of the buffer - * cache in the winsys. - * - * The result is that the kernel memory manager is never a bottleneck. - */ - if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) { - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - sctx->num_alloc_tex_transfer_bytes = 0; - } - - pipe_resource_reference(&transfer->resource, NULL); - FREE(transfer); + struct si_context *sctx = (struct si_context *)ctx; + struct si_transfer *stransfer = (struct si_transfer *)transfer; + struct pipe_resource *texture = transfer->resource; + struct si_texture *tex = (struct si_texture *)texture; + + /* Always unmap texture CPU mappings on 32-bit architectures, so that + * we don't run out of the CPU address space. + */ + if (sizeof(void *) == 4) { + struct si_resource *buf = stransfer->staging ? stransfer->staging : &tex->buffer; + + sctx->ws->buffer_unmap(buf->buf); + } + + if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging) + si_copy_from_staging_texture(ctx, stransfer); + + if (stransfer->staging) { + sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size; + si_resource_reference(&stransfer->staging, NULL); + } + + /* Heuristic for {upload, draw, upload, draw, ..}: + * + * Flush the gfx IB if we've allocated too much texture storage. + * + * The idea is that we don't want to build IBs that use too much + * memory and put pressure on the kernel memory manager and we also + * want to make temporary and invalidated buffers go idle ASAP to + * decrease the total memory usage or make them reusable. The memory + * usage will be slightly higher than given here because of the buffer + * cache in the winsys. + * + * The result is that the kernel memory manager is never a bottleneck. + */ + if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) { + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + sctx->num_alloc_tex_transfer_bytes = 0; + } + + pipe_resource_reference(&transfer->resource, NULL); + FREE(transfer); } -static const struct u_resource_vtbl si_texture_vtbl = -{ - NULL, /* get_handle */ - si_texture_destroy, /* resource_destroy */ - si_texture_transfer_map, /* transfer_map */ - u_default_transfer_flush_region, /* transfer_flush_region */ - si_texture_transfer_unmap, /* transfer_unmap */ +static const struct u_resource_vtbl si_texture_vtbl = { + NULL, /* get_handle */ + si_texture_destroy, /* resource_destroy */ + si_texture_transfer_map, /* transfer_map */ + u_default_transfer_flush_region, /* transfer_flush_region */ + si_texture_transfer_unmap, /* transfer_unmap */ }; /* Return if it's allowed to reinterpret one format as another with DCC enabled. */ -bool vi_dcc_formats_compatible(struct si_screen *sscreen, - enum pipe_format format1, - enum pipe_format format2) +bool vi_dcc_formats_compatible(struct si_screen *sscreen, enum pipe_format format1, + enum pipe_format format2) { - const struct util_format_description *desc1, *desc2; - - /* No format change - exit early. */ - if (format1 == format2) - return true; - - format1 = si_simplify_cb_format(format1); - format2 = si_simplify_cb_format(format2); - - /* Check again after format adjustments. 
*/ - if (format1 == format2) - return true; - - desc1 = util_format_description(format1); - desc2 = util_format_description(format2); - - if (desc1->layout != UTIL_FORMAT_LAYOUT_PLAIN || - desc2->layout != UTIL_FORMAT_LAYOUT_PLAIN) - return false; - - /* Float and non-float are totally incompatible. */ - if ((desc1->channel[0].type == UTIL_FORMAT_TYPE_FLOAT) != - (desc2->channel[0].type == UTIL_FORMAT_TYPE_FLOAT)) - return false; - - /* Channel sizes must match across DCC formats. - * Comparing just the first 2 channels should be enough. - */ - if (desc1->channel[0].size != desc2->channel[0].size || - (desc1->nr_channels >= 2 && - desc1->channel[1].size != desc2->channel[1].size)) - return false; - - /* Everything below is not needed if the driver never uses the DCC - * clear code with the value of 1. - */ - - /* If the clear values are all 1 or all 0, this constraint can be - * ignored. */ - if (vi_alpha_is_on_msb(sscreen, format1) != vi_alpha_is_on_msb(sscreen, format2)) - return false; - - /* Channel types must match if the clear value of 1 is used. - * The type categories are only float, signed, unsigned. - * NORM and INT are always compatible. - */ - if (desc1->channel[0].type != desc2->channel[0].type || - (desc1->nr_channels >= 2 && - desc1->channel[1].type != desc2->channel[1].type)) - return false; - - return true; + const struct util_format_description *desc1, *desc2; + + /* No format change - exit early. */ + if (format1 == format2) + return true; + + format1 = si_simplify_cb_format(format1); + format2 = si_simplify_cb_format(format2); + + /* Check again after format adjustments. */ + if (format1 == format2) + return true; + + desc1 = util_format_description(format1); + desc2 = util_format_description(format2); + + if (desc1->layout != UTIL_FORMAT_LAYOUT_PLAIN || desc2->layout != UTIL_FORMAT_LAYOUT_PLAIN) + return false; + + /* Float and non-float are totally incompatible. */ + if ((desc1->channel[0].type == UTIL_FORMAT_TYPE_FLOAT) != + (desc2->channel[0].type == UTIL_FORMAT_TYPE_FLOAT)) + return false; + + /* Channel sizes must match across DCC formats. + * Comparing just the first 2 channels should be enough. + */ + if (desc1->channel[0].size != desc2->channel[0].size || + (desc1->nr_channels >= 2 && desc1->channel[1].size != desc2->channel[1].size)) + return false; + + /* Everything below is not needed if the driver never uses the DCC + * clear code with the value of 1. + */ + + /* If the clear values are all 1 or all 0, this constraint can be + * ignored. */ + if (vi_alpha_is_on_msb(sscreen, format1) != vi_alpha_is_on_msb(sscreen, format2)) + return false; + + /* Channel types must match if the clear value of 1 is used. + * The type categories are only float, signed, unsigned. + * NORM and INT are always compatible. 
+ */ + if (desc1->channel[0].type != desc2->channel[0].type || + (desc1->nr_channels >= 2 && desc1->channel[1].type != desc2->channel[1].type)) + return false; + + return true; } -bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex, - unsigned level, - enum pipe_format view_format) +bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex, unsigned level, + enum pipe_format view_format) { - struct si_texture *stex = (struct si_texture *)tex; + struct si_texture *stex = (struct si_texture *)tex; - return vi_dcc_enabled(stex, level) && - !vi_dcc_formats_compatible((struct si_screen*)tex->screen, - tex->format, view_format); + return vi_dcc_enabled(stex, level) && + !vi_dcc_formats_compatible((struct si_screen *)tex->screen, tex->format, view_format); } /* This can't be merged with the above function, because * vi_dcc_formats_compatible should be called only when DCC is enabled. */ -void vi_disable_dcc_if_incompatible_format(struct si_context *sctx, - struct pipe_resource *tex, - unsigned level, - enum pipe_format view_format) +void vi_disable_dcc_if_incompatible_format(struct si_context *sctx, struct pipe_resource *tex, + unsigned level, enum pipe_format view_format) { - struct si_texture *stex = (struct si_texture *)tex; + struct si_texture *stex = (struct si_texture *)tex; - if (vi_dcc_formats_are_incompatible(tex, level, view_format)) - if (!si_texture_disable_dcc(sctx, stex)) - si_decompress_dcc(sctx, stex); + if (vi_dcc_formats_are_incompatible(tex, level, view_format)) + if (!si_texture_disable_dcc(sctx, stex)) + si_decompress_dcc(sctx, stex); } struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe, - struct pipe_resource *texture, - const struct pipe_surface *templ, - unsigned width0, unsigned height0, - unsigned width, unsigned height) + struct pipe_resource *texture, + const struct pipe_surface *templ, unsigned width0, + unsigned height0, unsigned width, unsigned height) { - struct si_surface *surface = CALLOC_STRUCT(si_surface); - - if (!surface) - return NULL; - - assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level)); - assert(templ->u.tex.last_layer <= util_max_layer(texture, templ->u.tex.level)); - - pipe_reference_init(&surface->base.reference, 1); - pipe_resource_reference(&surface->base.texture, texture); - surface->base.context = pipe; - surface->base.format = templ->format; - surface->base.width = width; - surface->base.height = height; - surface->base.u = templ->u; - - surface->width0 = width0; - surface->height0 = height0; - - surface->dcc_incompatible = - texture->target != PIPE_BUFFER && - vi_dcc_formats_are_incompatible(texture, templ->u.tex.level, - templ->format); - return &surface->base; + struct si_surface *surface = CALLOC_STRUCT(si_surface); + + if (!surface) + return NULL; + + assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level)); + assert(templ->u.tex.last_layer <= util_max_layer(texture, templ->u.tex.level)); + + pipe_reference_init(&surface->base.reference, 1); + pipe_resource_reference(&surface->base.texture, texture); + surface->base.context = pipe; + surface->base.format = templ->format; + surface->base.width = width; + surface->base.height = height; + surface->base.u = templ->u; + + surface->width0 = width0; + surface->height0 = height0; + + surface->dcc_incompatible = + texture->target != PIPE_BUFFER && + vi_dcc_formats_are_incompatible(texture, templ->u.tex.level, templ->format); + return &surface->base; } -static struct pipe_surface *si_create_surface(struct 
pipe_context *pipe, - struct pipe_resource *tex, - const struct pipe_surface *templ) +static struct pipe_surface *si_create_surface(struct pipe_context *pipe, struct pipe_resource *tex, + const struct pipe_surface *templ) { - unsigned level = templ->u.tex.level; - unsigned width = u_minify(tex->width0, level); - unsigned height = u_minify(tex->height0, level); - unsigned width0 = tex->width0; - unsigned height0 = tex->height0; - - if (tex->target != PIPE_BUFFER && templ->format != tex->format) { - const struct util_format_description *tex_desc - = util_format_description(tex->format); - const struct util_format_description *templ_desc - = util_format_description(templ->format); - - assert(tex_desc->block.bits == templ_desc->block.bits); - - /* Adjust size of surface if and only if the block width or - * height is changed. */ - if (tex_desc->block.width != templ_desc->block.width || - tex_desc->block.height != templ_desc->block.height) { - unsigned nblks_x = util_format_get_nblocksx(tex->format, width); - unsigned nblks_y = util_format_get_nblocksy(tex->format, height); - - width = nblks_x * templ_desc->block.width; - height = nblks_y * templ_desc->block.height; - - width0 = util_format_get_nblocksx(tex->format, width0); - height0 = util_format_get_nblocksy(tex->format, height0); - } - } - - return si_create_surface_custom(pipe, tex, templ, - width0, height0, - width, height); + unsigned level = templ->u.tex.level; + unsigned width = u_minify(tex->width0, level); + unsigned height = u_minify(tex->height0, level); + unsigned width0 = tex->width0; + unsigned height0 = tex->height0; + + if (tex->target != PIPE_BUFFER && templ->format != tex->format) { + const struct util_format_description *tex_desc = util_format_description(tex->format); + const struct util_format_description *templ_desc = util_format_description(templ->format); + + assert(tex_desc->block.bits == templ_desc->block.bits); + + /* Adjust size of surface if and only if the block width or + * height is changed. 
*/ + if (tex_desc->block.width != templ_desc->block.width || + tex_desc->block.height != templ_desc->block.height) { + unsigned nblks_x = util_format_get_nblocksx(tex->format, width); + unsigned nblks_y = util_format_get_nblocksy(tex->format, height); + + width = nblks_x * templ_desc->block.width; + height = nblks_y * templ_desc->block.height; + + width0 = util_format_get_nblocksx(tex->format, width0); + height0 = util_format_get_nblocksy(tex->format, height0); + } + } + + return si_create_surface_custom(pipe, tex, templ, width0, height0, width, height); } -static void si_surface_destroy(struct pipe_context *pipe, - struct pipe_surface *surface) +static void si_surface_destroy(struct pipe_context *pipe, struct pipe_surface *surface) { - pipe_resource_reference(&surface->texture, NULL); - FREE(surface); + pipe_resource_reference(&surface->texture, NULL); + FREE(surface); } unsigned si_translate_colorswap(enum pipe_format format, bool do_endian_swap) { - const struct util_format_description *desc = util_format_description(format); - -#define HAS_SWIZZLE(chan,swz) (desc->swizzle[chan] == PIPE_SWIZZLE_##swz) - - if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ - return V_028C70_SWAP_STD; - - if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) - return ~0U; - - switch (desc->nr_channels) { - case 1: - if (HAS_SWIZZLE(0,X)) - return V_028C70_SWAP_STD; /* X___ */ - else if (HAS_SWIZZLE(3,X)) - return V_028C70_SWAP_ALT_REV; /* ___X */ - break; - case 2: - if ((HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,Y)) || - (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,NONE)) || - (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,Y))) - return V_028C70_SWAP_STD; /* XY__ */ - else if ((HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,X)) || - (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,NONE)) || - (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,X))) - /* YX__ */ - return (do_endian_swap ? V_028C70_SWAP_STD : V_028C70_SWAP_STD_REV); - else if (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(3,Y)) - return V_028C70_SWAP_ALT; /* X__Y */ - else if (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(3,X)) - return V_028C70_SWAP_ALT_REV; /* Y__X */ - break; - case 3: - if (HAS_SWIZZLE(0,X)) - return (do_endian_swap ? V_028C70_SWAP_STD_REV : V_028C70_SWAP_STD); - else if (HAS_SWIZZLE(0,Z)) - return V_028C70_SWAP_STD_REV; /* ZYX */ - break; - case 4: - /* check the middle channels, the 1st and 4th channel can be NONE */ - if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z)) { - return V_028C70_SWAP_STD; /* XYZW */ - } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y)) { - return V_028C70_SWAP_STD_REV; /* WZYX */ - } else if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X)) { - return V_028C70_SWAP_ALT; /* ZYXW */ - } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,W)) { - /* YZWX */ - if (desc->is_array) - return V_028C70_SWAP_ALT_REV; - else - return (do_endian_swap ? 
V_028C70_SWAP_ALT : V_028C70_SWAP_ALT_REV); - } - break; - } - return ~0U; + const struct util_format_description *desc = util_format_description(format); + +#define HAS_SWIZZLE(chan, swz) (desc->swizzle[chan] == PIPE_SWIZZLE_##swz) + + if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ + return V_028C70_SWAP_STD; + + if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) + return ~0U; + + switch (desc->nr_channels) { + case 1: + if (HAS_SWIZZLE(0, X)) + return V_028C70_SWAP_STD; /* X___ */ + else if (HAS_SWIZZLE(3, X)) + return V_028C70_SWAP_ALT_REV; /* ___X */ + break; + case 2: + if ((HAS_SWIZZLE(0, X) && HAS_SWIZZLE(1, Y)) || (HAS_SWIZZLE(0, X) && HAS_SWIZZLE(1, NONE)) || + (HAS_SWIZZLE(0, NONE) && HAS_SWIZZLE(1, Y))) + return V_028C70_SWAP_STD; /* XY__ */ + else if ((HAS_SWIZZLE(0, Y) && HAS_SWIZZLE(1, X)) || + (HAS_SWIZZLE(0, Y) && HAS_SWIZZLE(1, NONE)) || + (HAS_SWIZZLE(0, NONE) && HAS_SWIZZLE(1, X))) + /* YX__ */ + return (do_endian_swap ? V_028C70_SWAP_STD : V_028C70_SWAP_STD_REV); + else if (HAS_SWIZZLE(0, X) && HAS_SWIZZLE(3, Y)) + return V_028C70_SWAP_ALT; /* X__Y */ + else if (HAS_SWIZZLE(0, Y) && HAS_SWIZZLE(3, X)) + return V_028C70_SWAP_ALT_REV; /* Y__X */ + break; + case 3: + if (HAS_SWIZZLE(0, X)) + return (do_endian_swap ? V_028C70_SWAP_STD_REV : V_028C70_SWAP_STD); + else if (HAS_SWIZZLE(0, Z)) + return V_028C70_SWAP_STD_REV; /* ZYX */ + break; + case 4: + /* check the middle channels, the 1st and 4th channel can be NONE */ + if (HAS_SWIZZLE(1, Y) && HAS_SWIZZLE(2, Z)) { + return V_028C70_SWAP_STD; /* XYZW */ + } else if (HAS_SWIZZLE(1, Z) && HAS_SWIZZLE(2, Y)) { + return V_028C70_SWAP_STD_REV; /* WZYX */ + } else if (HAS_SWIZZLE(1, Y) && HAS_SWIZZLE(2, X)) { + return V_028C70_SWAP_ALT; /* ZYXW */ + } else if (HAS_SWIZZLE(1, Z) && HAS_SWIZZLE(2, W)) { + /* YZWX */ + if (desc->is_array) + return V_028C70_SWAP_ALT_REV; + else + return (do_endian_swap ? V_028C70_SWAP_ALT : V_028C70_SWAP_ALT_REV); + } + break; + } + return ~0U; } /* PIPELINE_STAT-BASED DCC ENABLEMENT FOR DISPLAYABLE SURFACES */ -static void vi_dcc_clean_up_context_slot(struct si_context *sctx, - int slot) +static void vi_dcc_clean_up_context_slot(struct si_context *sctx, int slot) { - int i; + int i; - if (sctx->dcc_stats[slot].query_active) - vi_separate_dcc_stop_query(sctx, - sctx->dcc_stats[slot].tex); + if (sctx->dcc_stats[slot].query_active) + vi_separate_dcc_stop_query(sctx, sctx->dcc_stats[slot].tex); - for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats[slot].ps_stats); i++) - if (sctx->dcc_stats[slot].ps_stats[i]) { - sctx->b.destroy_query(&sctx->b, - sctx->dcc_stats[slot].ps_stats[i]); - sctx->dcc_stats[slot].ps_stats[i] = NULL; - } + for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats[slot].ps_stats); i++) + if (sctx->dcc_stats[slot].ps_stats[i]) { + sctx->b.destroy_query(&sctx->b, sctx->dcc_stats[slot].ps_stats[i]); + sctx->dcc_stats[slot].ps_stats[i] = NULL; + } - si_texture_reference(&sctx->dcc_stats[slot].tex, NULL); + si_texture_reference(&sctx->dcc_stats[slot].tex, NULL); } /** * Return the per-context slot where DCC statistics queries for the texture live. */ -static unsigned vi_get_context_dcc_stats_index(struct si_context *sctx, - struct si_texture *tex) +static unsigned vi_get_context_dcc_stats_index(struct si_context *sctx, struct si_texture *tex) { - int i, empty_slot = -1; - - /* Remove zombie textures (textures kept alive by this array only). 
*/ - for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) - if (sctx->dcc_stats[i].tex && - sctx->dcc_stats[i].tex->buffer.b.b.reference.count == 1) - vi_dcc_clean_up_context_slot(sctx, i); - - /* Find the texture. */ - for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) { - /* Return if found. */ - if (sctx->dcc_stats[i].tex == tex) { - sctx->dcc_stats[i].last_use_timestamp = os_time_get(); - return i; - } - - /* Record the first seen empty slot. */ - if (empty_slot == -1 && !sctx->dcc_stats[i].tex) - empty_slot = i; - } - - /* Not found. Remove the oldest member to make space in the array. */ - if (empty_slot == -1) { - int oldest_slot = 0; - - /* Find the oldest slot. */ - for (i = 1; i < ARRAY_SIZE(sctx->dcc_stats); i++) - if (sctx->dcc_stats[oldest_slot].last_use_timestamp > - sctx->dcc_stats[i].last_use_timestamp) - oldest_slot = i; - - /* Clean up the oldest slot. */ - vi_dcc_clean_up_context_slot(sctx, oldest_slot); - empty_slot = oldest_slot; - } - - /* Add the texture to the new slot. */ - si_texture_reference(&sctx->dcc_stats[empty_slot].tex, tex); - sctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get(); - return empty_slot; + int i, empty_slot = -1; + + /* Remove zombie textures (textures kept alive by this array only). */ + for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) + if (sctx->dcc_stats[i].tex && sctx->dcc_stats[i].tex->buffer.b.b.reference.count == 1) + vi_dcc_clean_up_context_slot(sctx, i); + + /* Find the texture. */ + for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) { + /* Return if found. */ + if (sctx->dcc_stats[i].tex == tex) { + sctx->dcc_stats[i].last_use_timestamp = os_time_get(); + return i; + } + + /* Record the first seen empty slot. */ + if (empty_slot == -1 && !sctx->dcc_stats[i].tex) + empty_slot = i; + } + + /* Not found. Remove the oldest member to make space in the array. */ + if (empty_slot == -1) { + int oldest_slot = 0; + + /* Find the oldest slot. */ + for (i = 1; i < ARRAY_SIZE(sctx->dcc_stats); i++) + if (sctx->dcc_stats[oldest_slot].last_use_timestamp > + sctx->dcc_stats[i].last_use_timestamp) + oldest_slot = i; + + /* Clean up the oldest slot. */ + vi_dcc_clean_up_context_slot(sctx, oldest_slot); + empty_slot = oldest_slot; + } + + /* Add the texture to the new slot. */ + si_texture_reference(&sctx->dcc_stats[empty_slot].tex, tex); + sctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get(); + return empty_slot; } -static struct pipe_query * -vi_create_resuming_pipestats_query(struct si_context *sctx) +static struct pipe_query *vi_create_resuming_pipestats_query(struct si_context *sctx) { - struct si_query_hw *query = (struct si_query_hw*) - sctx->b.create_query(&sctx->b, PIPE_QUERY_PIPELINE_STATISTICS, 0); + struct si_query_hw *query = + (struct si_query_hw *)sctx->b.create_query(&sctx->b, PIPE_QUERY_PIPELINE_STATISTICS, 0); - query->flags |= SI_QUERY_HW_FLAG_BEGIN_RESUMES; - return (struct pipe_query*)query; + query->flags |= SI_QUERY_HW_FLAG_BEGIN_RESUMES; + return (struct pipe_query *)query; } /** * Called when binding a color buffer. 
*/ -void vi_separate_dcc_start_query(struct si_context *sctx, - struct si_texture *tex) +void vi_separate_dcc_start_query(struct si_context *sctx, struct si_texture *tex) { - unsigned i = vi_get_context_dcc_stats_index(sctx, tex); + unsigned i = vi_get_context_dcc_stats_index(sctx, tex); - assert(!sctx->dcc_stats[i].query_active); + assert(!sctx->dcc_stats[i].query_active); - if (!sctx->dcc_stats[i].ps_stats[0]) - sctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(sctx); + if (!sctx->dcc_stats[i].ps_stats[0]) + sctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(sctx); - /* begin or resume the query */ - sctx->b.begin_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]); - sctx->dcc_stats[i].query_active = true; + /* begin or resume the query */ + sctx->b.begin_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]); + sctx->dcc_stats[i].query_active = true; } /** * Called when unbinding a color buffer. */ -void vi_separate_dcc_stop_query(struct si_context *sctx, - struct si_texture *tex) +void vi_separate_dcc_stop_query(struct si_context *sctx, struct si_texture *tex) { - unsigned i = vi_get_context_dcc_stats_index(sctx, tex); + unsigned i = vi_get_context_dcc_stats_index(sctx, tex); - assert(sctx->dcc_stats[i].query_active); - assert(sctx->dcc_stats[i].ps_stats[0]); + assert(sctx->dcc_stats[i].query_active); + assert(sctx->dcc_stats[i].ps_stats[0]); - /* pause or end the query */ - sctx->b.end_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]); - sctx->dcc_stats[i].query_active = false; + /* pause or end the query */ + sctx->b.end_query(&sctx->b, sctx->dcc_stats[i].ps_stats[0]); + sctx->dcc_stats[i].query_active = false; } static bool vi_should_enable_separate_dcc(struct si_texture *tex) { - /* The minimum number of fullscreen draws per frame that is required - * to enable DCC. */ - return tex->ps_draw_ratio + tex->num_slow_clears >= 5; + /* The minimum number of fullscreen draws per frame that is required + * to enable DCC. */ + return tex->ps_draw_ratio + tex->num_slow_clears >= 5; } /* Called by fast clear. */ -void vi_separate_dcc_try_enable(struct si_context *sctx, - struct si_texture *tex) +void vi_separate_dcc_try_enable(struct si_context *sctx, struct si_texture *tex) { - /* The intent is to use this with shared displayable back buffers, - * but it's not strictly limited only to them. - */ - if (!tex->buffer.b.is_shared || - !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) || - tex->buffer.b.b.target != PIPE_TEXTURE_2D || - tex->buffer.b.b.last_level > 0 || - !tex->surface.dcc_size || - sctx->screen->debug_flags & DBG(NO_DCC) || - sctx->screen->debug_flags & DBG(NO_DCC_FB)) - return; - - assert(sctx->chip_class >= GFX8); - - if (tex->surface.dcc_offset) - return; /* already enabled */ - - /* Enable the DCC stat gathering. */ - if (!tex->dcc_gather_statistics) { - tex->dcc_gather_statistics = true; - vi_separate_dcc_start_query(sctx, tex); - } - - if (!vi_should_enable_separate_dcc(tex)) - return; /* stats show that DCC decompression is too expensive */ - - assert(tex->surface.num_dcc_levels); - assert(!tex->dcc_separate_buffer); - - si_texture_discard_cmask(sctx->screen, tex); - - /* Get a DCC buffer. 
*/ - if (tex->last_dcc_separate_buffer) { - assert(tex->dcc_gather_statistics); - assert(!tex->dcc_separate_buffer); - tex->dcc_separate_buffer = tex->last_dcc_separate_buffer; - tex->last_dcc_separate_buffer = NULL; - } else { - tex->dcc_separate_buffer = - si_aligned_buffer_create(sctx->b.screen, - SI_RESOURCE_FLAG_UNMAPPABLE, - PIPE_USAGE_DEFAULT, - tex->surface.dcc_size, - tex->surface.dcc_alignment); - if (!tex->dcc_separate_buffer) - return; - } - - /* dcc_offset is the absolute GPUVM address. */ - tex->surface.dcc_offset = tex->dcc_separate_buffer->gpu_address; - - /* no need to flag anything since this is called by fast clear that - * flags framebuffer state - */ + /* The intent is to use this with shared displayable back buffers, + * but it's not strictly limited only to them. + */ + if (!tex->buffer.b.is_shared || + !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) || + tex->buffer.b.b.target != PIPE_TEXTURE_2D || tex->buffer.b.b.last_level > 0 || + !tex->surface.dcc_size || sctx->screen->debug_flags & DBG(NO_DCC) || + sctx->screen->debug_flags & DBG(NO_DCC_FB)) + return; + + assert(sctx->chip_class >= GFX8); + + if (tex->surface.dcc_offset) + return; /* already enabled */ + + /* Enable the DCC stat gathering. */ + if (!tex->dcc_gather_statistics) { + tex->dcc_gather_statistics = true; + vi_separate_dcc_start_query(sctx, tex); + } + + if (!vi_should_enable_separate_dcc(tex)) + return; /* stats show that DCC decompression is too expensive */ + + assert(tex->surface.num_dcc_levels); + assert(!tex->dcc_separate_buffer); + + si_texture_discard_cmask(sctx->screen, tex); + + /* Get a DCC buffer. */ + if (tex->last_dcc_separate_buffer) { + assert(tex->dcc_gather_statistics); + assert(!tex->dcc_separate_buffer); + tex->dcc_separate_buffer = tex->last_dcc_separate_buffer; + tex->last_dcc_separate_buffer = NULL; + } else { + tex->dcc_separate_buffer = + si_aligned_buffer_create(sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, + tex->surface.dcc_size, tex->surface.dcc_alignment); + if (!tex->dcc_separate_buffer) + return; + } + + /* dcc_offset is the absolute GPUVM address. */ + tex->surface.dcc_offset = tex->dcc_separate_buffer->gpu_address; + + /* no need to flag anything since this is called by fast clear that + * flags framebuffer state + */ } /** * Called by pipe_context::flush_resource, the place where DCC decompression * takes place. */ -void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, - struct si_texture *tex) +void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, struct si_texture *tex) { - struct si_context *sctx = (struct si_context*)ctx; - struct pipe_query *tmp; - unsigned i = vi_get_context_dcc_stats_index(sctx, tex); - bool query_active = sctx->dcc_stats[i].query_active; - bool disable = false; - - if (sctx->dcc_stats[i].ps_stats[2]) { - union pipe_query_result result; - - /* Read the results. */ - struct pipe_query *query = sctx->dcc_stats[i].ps_stats[2]; - ctx->get_query_result(ctx, query, - true, &result); - si_query_buffer_reset(sctx, &((struct si_query_hw*)query)->buffer); - - /* Compute the approximate number of fullscreen draws. 
*/ - tex->ps_draw_ratio = - result.pipeline_statistics.ps_invocations / - (tex->buffer.b.b.width0 * tex->buffer.b.b.height0); - sctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio; - - disable = tex->dcc_separate_buffer && - !vi_should_enable_separate_dcc(tex); - } - - tex->num_slow_clears = 0; - - /* stop the statistics query for ps_stats[0] */ - if (query_active) - vi_separate_dcc_stop_query(sctx, tex); - - /* Move the queries in the queue by one. */ - tmp = sctx->dcc_stats[i].ps_stats[2]; - sctx->dcc_stats[i].ps_stats[2] = sctx->dcc_stats[i].ps_stats[1]; - sctx->dcc_stats[i].ps_stats[1] = sctx->dcc_stats[i].ps_stats[0]; - sctx->dcc_stats[i].ps_stats[0] = tmp; - - /* create and start a new query as ps_stats[0] */ - if (query_active) - vi_separate_dcc_start_query(sctx, tex); - - if (disable) { - assert(!tex->last_dcc_separate_buffer); - tex->last_dcc_separate_buffer = tex->dcc_separate_buffer; - tex->dcc_separate_buffer = NULL; - tex->surface.dcc_offset = 0; - /* no need to flag anything since this is called after - * decompression that re-sets framebuffer state - */ - } + struct si_context *sctx = (struct si_context *)ctx; + struct pipe_query *tmp; + unsigned i = vi_get_context_dcc_stats_index(sctx, tex); + bool query_active = sctx->dcc_stats[i].query_active; + bool disable = false; + + if (sctx->dcc_stats[i].ps_stats[2]) { + union pipe_query_result result; + + /* Read the results. */ + struct pipe_query *query = sctx->dcc_stats[i].ps_stats[2]; + ctx->get_query_result(ctx, query, true, &result); + si_query_buffer_reset(sctx, &((struct si_query_hw *)query)->buffer); + + /* Compute the approximate number of fullscreen draws. */ + tex->ps_draw_ratio = result.pipeline_statistics.ps_invocations / + (tex->buffer.b.b.width0 * tex->buffer.b.b.height0); + sctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio; + + disable = tex->dcc_separate_buffer && !vi_should_enable_separate_dcc(tex); + } + + tex->num_slow_clears = 0; + + /* stop the statistics query for ps_stats[0] */ + if (query_active) + vi_separate_dcc_stop_query(sctx, tex); + + /* Move the queries in the queue by one. 
*/ + tmp = sctx->dcc_stats[i].ps_stats[2]; + sctx->dcc_stats[i].ps_stats[2] = sctx->dcc_stats[i].ps_stats[1]; + sctx->dcc_stats[i].ps_stats[1] = sctx->dcc_stats[i].ps_stats[0]; + sctx->dcc_stats[i].ps_stats[0] = tmp; + + /* create and start a new query as ps_stats[0] */ + if (query_active) + vi_separate_dcc_start_query(sctx, tex); + + if (disable) { + assert(!tex->last_dcc_separate_buffer); + tex->last_dcc_separate_buffer = tex->dcc_separate_buffer; + tex->dcc_separate_buffer = NULL; + tex->surface.dcc_offset = 0; + /* no need to flag anything since this is called after + * decompression that re-sets framebuffer state + */ + } } static struct pipe_memory_object * -si_memobj_from_handle(struct pipe_screen *screen, - struct winsys_handle *whandle, - bool dedicated) +si_memobj_from_handle(struct pipe_screen *screen, struct winsys_handle *whandle, bool dedicated) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_memory_object *memobj = CALLOC_STRUCT(si_memory_object); - struct pb_buffer *buf = NULL; - - if (!memobj) - return NULL; + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_memory_object *memobj = CALLOC_STRUCT(si_memory_object); + struct pb_buffer *buf = NULL; - buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, - sscreen->info.max_alignment); - if (!buf) { - free(memobj); - return NULL; - } + if (!memobj) + return NULL; - memobj->b.dedicated = dedicated; - memobj->buf = buf; - memobj->stride = whandle->stride; + buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, sscreen->info.max_alignment); + if (!buf) { + free(memobj); + return NULL; + } - return (struct pipe_memory_object *)memobj; + memobj->b.dedicated = dedicated; + memobj->buf = buf; + memobj->stride = whandle->stride; + return (struct pipe_memory_object *)memobj; } -static void -si_memobj_destroy(struct pipe_screen *screen, - struct pipe_memory_object *_memobj) +static void si_memobj_destroy(struct pipe_screen *screen, struct pipe_memory_object *_memobj) { - struct si_memory_object *memobj = (struct si_memory_object *)_memobj; + struct si_memory_object *memobj = (struct si_memory_object *)_memobj; - pb_reference(&memobj->buf, NULL); - free(memobj); + pb_reference(&memobj->buf, NULL); + free(memobj); } -static struct pipe_resource * -si_texture_from_memobj(struct pipe_screen *screen, - const struct pipe_resource *templ, - struct pipe_memory_object *_memobj, - uint64_t offset) +static struct pipe_resource *si_texture_from_memobj(struct pipe_screen *screen, + const struct pipe_resource *templ, + struct pipe_memory_object *_memobj, + uint64_t offset) { - struct si_screen *sscreen = (struct si_screen*)screen; - struct si_memory_object *memobj = (struct si_memory_object *)_memobj; - struct pipe_resource *tex = - si_texture_from_winsys_buffer(sscreen, templ, memobj->buf, - memobj->stride, offset, - PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE | - PIPE_HANDLE_USAGE_SHADER_WRITE, - memobj->b.dedicated); - if (!tex) - return NULL; - - /* si_texture_from_winsys_buffer doesn't increment refcount of - * memobj->buf, so increment it here. 
-	 */
-	struct pb_buffer *buf = NULL;
-	pb_reference(&buf, memobj->buf);
-	return tex;
+   struct si_screen *sscreen = (struct si_screen *)screen;
+   struct si_memory_object *memobj = (struct si_memory_object *)_memobj;
+   struct pipe_resource *tex = si_texture_from_winsys_buffer(
+      sscreen, templ, memobj->buf, memobj->stride, offset,
+      PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE | PIPE_HANDLE_USAGE_SHADER_WRITE, memobj->b.dedicated);
+   if (!tex)
+      return NULL;
+
+   /* si_texture_from_winsys_buffer doesn't increment refcount of
+    * memobj->buf, so increment it here.
+    */
+   struct pb_buffer *buf = NULL;
+   pb_reference(&buf, memobj->buf);
+   return tex;
 }
 
-static bool si_check_resource_capability(struct pipe_screen *screen,
-					 struct pipe_resource *resource,
-					 unsigned bind)
+static bool si_check_resource_capability(struct pipe_screen *screen, struct pipe_resource *resource,
+                                         unsigned bind)
 {
-	struct si_texture *tex = (struct si_texture*)resource;
+   struct si_texture *tex = (struct si_texture *)resource;
 
-	/* Buffers only support the linear flag. */
-	if (resource->target == PIPE_BUFFER)
-		return (bind & ~PIPE_BIND_LINEAR) == 0;
+   /* Buffers only support the linear flag. */
+   if (resource->target == PIPE_BUFFER)
+      return (bind & ~PIPE_BIND_LINEAR) == 0;
 
-	if (bind & PIPE_BIND_LINEAR && !tex->surface.is_linear)
-		return false;
+   if (bind & PIPE_BIND_LINEAR && !tex->surface.is_linear)
+      return false;
 
-	if (bind & PIPE_BIND_SCANOUT && !tex->surface.is_displayable)
-		return false;
+   if (bind & PIPE_BIND_SCANOUT && !tex->surface.is_displayable)
+      return false;
 
-	/* TODO: PIPE_BIND_CURSOR - do we care? */
-	return true;
+   /* TODO: PIPE_BIND_CURSOR - do we care? */
+   return true;
 }
 
 void si_init_screen_texture_functions(struct si_screen *sscreen)
 {
-	sscreen->b.resource_from_handle = si_texture_from_handle;
-	sscreen->b.resource_get_handle = si_texture_get_handle;
-	sscreen->b.resource_get_param = si_resource_get_param;
-	sscreen->b.resource_get_info = si_texture_get_info;
-	sscreen->b.resource_from_memobj = si_texture_from_memobj;
-	sscreen->b.memobj_create_from_handle = si_memobj_from_handle;
-	sscreen->b.memobj_destroy = si_memobj_destroy;
-	sscreen->b.check_resource_capability = si_check_resource_capability;
+   sscreen->b.resource_from_handle = si_texture_from_handle;
+   sscreen->b.resource_get_handle = si_texture_get_handle;
+   sscreen->b.resource_get_param = si_resource_get_param;
+   sscreen->b.resource_get_info = si_texture_get_info;
+   sscreen->b.resource_from_memobj = si_texture_from_memobj;
+   sscreen->b.memobj_create_from_handle = si_memobj_from_handle;
+   sscreen->b.memobj_destroy = si_memobj_destroy;
+   sscreen->b.check_resource_capability = si_check_resource_capability;
 }
 
 void si_init_context_texture_functions(struct si_context *sctx)
 {
-	sctx->b.create_surface = si_create_surface;
-	sctx->b.surface_destroy = si_surface_destroy;
+   sctx->b.create_surface = si_create_surface;
+   sctx->b.surface_destroy = si_surface_destroy;
 }
diff --git a/src/gallium/drivers/radeonsi/si_uvd.c b/src/gallium/drivers/radeonsi/si_uvd.c
index 5511c2d7ad2..0f38cce0f96 100644
--- a/src/gallium/drivers/radeonsi/si_uvd.c
+++ b/src/gallium/drivers/radeonsi/si_uvd.c
@@ -25,79 +25,77 @@
 *
 **************************************************************************/
 
-#include "si_pipe.h"
-#include "radeon/radeon_video.h"
 #include "radeon/radeon_uvd.h"
+#include "radeon/radeon_uvd_enc.h"
 #include "radeon/radeon_vce.h"
 #include "radeon/radeon_vcn_dec.h"
 #include "radeon/radeon_vcn_enc.h"
-#include "radeon/radeon_uvd_enc.h"
+#include "radeon/radeon_video.h"
+#include "si_pipe.h"
 #include "util/u_video.h"
 
 /**
  * creates an video buffer with an UVD compatible memory layout
  */
 struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
-						 const struct pipe_video_buffer *tmpl)
+                                                 const struct pipe_video_buffer *tmpl)
 {
-	struct pipe_video_buffer vidbuf = *tmpl;
-	/* TODO: get tiling working */
-	vidbuf.bind |= PIPE_BIND_LINEAR;
+   struct pipe_video_buffer vidbuf = *tmpl;
+   /* TODO: get tiling working */
+   vidbuf.bind |= PIPE_BIND_LINEAR;
 
-	return vl_video_buffer_create_as_resource(pipe, &vidbuf);
+   return vl_video_buffer_create_as_resource(pipe, &vidbuf);
 }
 
 /* set the decoding target buffer offsets */
-static struct pb_buffer* si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
+static struct pb_buffer *si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
 {
-	struct si_screen *sscreen = (struct si_screen*)buf->base.context->screen;
-	struct si_texture *luma = (struct si_texture *)buf->resources[0];
-	struct si_texture *chroma = (struct si_texture *)buf->resources[1];
-	enum ruvd_surface_type type = (sscreen->info.chip_class >= GFX9) ?
-					RUVD_SURFACE_TYPE_GFX9 :
-					RUVD_SURFACE_TYPE_LEGACY;
+   struct si_screen *sscreen = (struct si_screen *)buf->base.context->screen;
+   struct si_texture *luma = (struct si_texture *)buf->resources[0];
+   struct si_texture *chroma = (struct si_texture *)buf->resources[1];
+   enum ruvd_surface_type type =
+      (sscreen->info.chip_class >= GFX9) ? RUVD_SURFACE_TYPE_GFX9 : RUVD_SURFACE_TYPE_LEGACY;
 
-	msg->body.decode.dt_field_mode = buf->base.interlaced;
+   msg->body.decode.dt_field_mode = buf->base.interlaced;
 
-	si_uvd_set_dt_surfaces(msg, &luma->surface, (chroma) ? &chroma->surface : NULL, type);
+   si_uvd_set_dt_surfaces(msg, &luma->surface, (chroma) ? &chroma->surface : NULL, type);
 
-	return luma->buffer.buf;
+   return luma->buffer.buf;
 }
 
 /* get the radeon resources for VCE */
-static void si_vce_get_buffer(struct pipe_resource *resource,
-			      struct pb_buffer **handle,
-			      struct radeon_surf **surface)
+static void si_vce_get_buffer(struct pipe_resource *resource, struct pb_buffer **handle,
+                              struct radeon_surf **surface)
 {
-	struct si_texture *res = (struct si_texture *)resource;
+   struct si_texture *res = (struct si_texture *)resource;
 
-	if (handle)
-		*handle = res->buffer.buf;
+   if (handle)
+      *handle = res->buffer.buf;
 
-	if (surface)
-		*surface = &res->surface;
+   if (surface)
+      *surface = &res->surface;
 }
 
 /**
  * creates an UVD compatible decoder
  */
 struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context,
-					       const struct pipe_video_codec *templ)
+                                               const struct pipe_video_codec *templ)
 {
-	struct si_context *ctx = (struct si_context *)context;
-	bool vcn = ctx->family >= CHIP_RAVEN;
+   struct si_context *ctx = (struct si_context *)context;
+   bool vcn = ctx->family >= CHIP_RAVEN;
 
-	if (templ->entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
-		if (vcn) {
-			return radeon_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
-		} else {
-			if (u_reduce_video_profile(templ->profile) == PIPE_VIDEO_FORMAT_HEVC)
-				return radeon_uvd_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
-			else
-				return si_vce_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
-		}
-	}
+   if (templ->entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
+      if (vcn) {
+         return radeon_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
+      } else {
+         if (u_reduce_video_profile(templ->profile) == PIPE_VIDEO_FORMAT_HEVC)
+            return radeon_uvd_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
+         else
+            return si_vce_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
+      }
+   }
 
-	return (vcn) ? radeon_create_decoder(context, templ) :
-		si_common_uvd_create_decoder(context, templ, si_uvd_set_dtb);
+   return (vcn) ? radeon_create_decoder(context, templ)
+                : si_common_uvd_create_decoder(context, templ, si_uvd_set_dtb);
 }