X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_state_shaders.c;h=18015bbec485f74bfffdd202d240d112ce64c808;hb=312e04689a9d8d4f9c319e69c61220e10653cfcd;hp=832e59828949c0da0279d2c95a2c5ffd3fba5722;hpb=896885025f37484cc693f480ee797e83e3a6f9ee;p=mesa.git

diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 832e5982894..18015bbec48 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -27,7 +27,6 @@
 
 #include "compiler/nir/nir_serialize.h"
 #include "nir/tgsi_to_nir.h"
-#include "tgsi/tgsi_parse.h"
 #include "util/hash_table.h"
 #include "util/crc32.h"
 #include "util/u_async_debug.h"
@@ -42,24 +41,23 @@
 /* SHADER_CACHE */
 
 /**
- * Return the IR binary in a buffer. For TGSI the first 4 bytes contain its
- * size as integer.
+ * Return the IR key for the shader cache.
  */
-void *si_get_ir_binary(struct si_shader_selector *sel, bool ngg, bool es)
+void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
+			 unsigned char ir_sha1_cache_key[20])
 {
-	struct blob blob;
+	struct blob blob = {};
 	unsigned ir_size;
 	void *ir_binary;
 
-	if (sel->tokens) {
-		ir_binary = sel->tokens;
-		ir_size = tgsi_num_tokens(sel->tokens) *
-					  sizeof(struct tgsi_token);
+	if (sel->nir_binary) {
+		ir_binary = sel->nir_binary;
+		ir_size = sel->nir_size;
 	} else {
 		assert(sel->nir);
 
 		blob_init(&blob);
-		nir_serialize(&blob, sel->nir);
+		nir_serialize(&blob, sel->nir, true);
 		ir_binary = blob.data;
 		ir_size = blob.size;
 	}
@@ -78,20 +76,18 @@ void *si_get_ir_binary(struct si_shader_selector *sel, bool ngg, bool es)
 	if (sel->force_correct_derivs_after_kill)
 		shader_variant_flags |= 1 << 3;
 
-	unsigned size = 4 + 4 + ir_size + sizeof(sel->so);
-	char *result = (char*)MALLOC(size);
-	if (!result)
-		return NULL;
-
-	((uint32_t*)result)[0] = size;
-	((uint32_t*)result)[1] = shader_variant_flags;
-	memcpy(result + 8, ir_binary, ir_size);
-	memcpy(result + 8 + ir_size, &sel->so, sizeof(sel->so));
-
-	if (sel->nir)
+	struct mesa_sha1 ctx;
+	_mesa_sha1_init(&ctx);
+	_mesa_sha1_update(&ctx, &shader_variant_flags, 4);
+	_mesa_sha1_update(&ctx, ir_binary, ir_size);
+	if (sel->type == PIPE_SHADER_VERTEX ||
+	    sel->type == PIPE_SHADER_TESS_EVAL ||
+	    sel->type == PIPE_SHADER_GEOMETRY)
+		_mesa_sha1_update(&ctx, &sel->so, sizeof(sel->so));
+	_mesa_sha1_final(&ctx, ir_sha1_cache_key);
+
+	if (ir_binary == blob.data)
 		blob_finish(&blob);
-
-	return result;
 }
 
 /** Copy "data" to "ptr" and return the next dword following copied data. */
@@ -208,10 +204,9 @@ static bool si_load_shader_binary(struct si_shader *shader, void *binary)
 /**
  * Insert a shader into the cache. It's assumed the shader is not in the cache.
  * Use si_shader_cache_load_shader before calling this.
- *
- * Returns false on failure, in which case the ir_binary should be freed.
  */
-bool si_shader_cache_insert_shader(struct si_screen *sscreen, void *ir_binary,
+void si_shader_cache_insert_shader(struct si_screen *sscreen,
+				   unsigned char ir_sha1_cache_key[20],
 				   struct si_shader *shader,
 				   bool insert_into_disk_cache)
 {
@@ -219,42 +214,41 @@ bool si_shader_cache_insert_shader(struct si_screen *sscreen, void *ir_binary,
 	struct hash_entry *entry;
 	uint8_t key[CACHE_KEY_SIZE];
 
-	entry = _mesa_hash_table_search(sscreen->shader_cache, ir_binary);
+	entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);
 	if (entry)
-		return false; /* already added */
+		return; /* already added */
 
 	hw_binary = si_get_shader_binary(shader);
 	if (!hw_binary)
-		return false;
+		return;
 
-	if (_mesa_hash_table_insert(sscreen->shader_cache, ir_binary,
+	if (_mesa_hash_table_insert(sscreen->shader_cache,
+				    mem_dup(ir_sha1_cache_key, 20),
 				    hw_binary) == NULL) {
 		FREE(hw_binary);
-		return false;
+		return;
 	}
 
 	if (sscreen->disk_shader_cache && insert_into_disk_cache) {
-		disk_cache_compute_key(sscreen->disk_shader_cache, ir_binary,
-				       *((uint32_t *)ir_binary), key);
+		disk_cache_compute_key(sscreen->disk_shader_cache,
+				       ir_sha1_cache_key, 20, key);
 		disk_cache_put(sscreen->disk_shader_cache, key, hw_binary,
 			       *((uint32_t *) hw_binary), NULL);
 	}
-
-	return true;
 }
 
-bool si_shader_cache_load_shader(struct si_screen *sscreen, void *ir_binary,
+bool si_shader_cache_load_shader(struct si_screen *sscreen,
+				 unsigned char ir_sha1_cache_key[20],
 				 struct si_shader *shader)
 {
 	struct hash_entry *entry =
-		_mesa_hash_table_search(sscreen->shader_cache, ir_binary);
+		_mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);
 	if (!entry) {
 		if (sscreen->disk_shader_cache) {
 			unsigned char sha1[CACHE_KEY_SIZE];
-			size_t tg_size = *((uint32_t *) ir_binary);
 
 			disk_cache_compute_key(sscreen->disk_shader_cache,
-					       ir_binary, tg_size, sha1);
+					       ir_sha1_cache_key, 20, sha1);
 
 			size_t binary_size;
 			uint8_t *buffer =
@@ -285,16 +279,13 @@ bool si_shader_cache_load_shader(struct si_screen *sscreen, void *ir_binary,
 			}
 			free(buffer);
 
-			if (!si_shader_cache_insert_shader(sscreen, ir_binary,
-							   shader, false))
-				FREE(ir_binary);
+			si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key,
+						      shader, false);
 		} else {
 			return false;
 		}
 	} else {
-		if (si_load_shader_binary(shader, entry->data))
-			FREE(ir_binary);
-		else
+		if (!si_load_shader_binary(shader, entry->data))
 			return false;
 	}
 	p_atomic_inc(&sscreen->num_shader_cache_hits);
@@ -303,20 +294,14 @@ bool si_shader_cache_load_shader(struct si_screen *sscreen, void *ir_binary,
 
 static uint32_t si_shader_cache_key_hash(const void *key)
 {
-	/* The first dword is the key size. */
-	return util_hash_crc32(key, *(uint32_t*)key);
+	/* Take the first dword of SHA1. */
+	return *(uint32_t*)key;
 }
 
 static bool si_shader_cache_key_equals(const void *a, const void *b)
 {
-	uint32_t *keya = (uint32_t*)a;
-	uint32_t *keyb = (uint32_t*)b;
-
-	/* The first dword is the key size. */
-	if (*keya != *keyb)
-		return false;
-
-	return memcmp(keya, keyb, *keya) == 0;
+	/* Compare SHA1s. */
+	return memcmp(a, b, 20) == 0;
 }
 
 static void si_destroy_shader_cache_entry(struct hash_entry *entry)
@@ -327,7 +312,7 @@ static void si_destroy_shader_cache_entry(struct hash_entry *entry)
 
 bool si_init_shader_cache(struct si_screen *sscreen)
 {
-	(void) mtx_init(&sscreen->shader_cache_mutex, mtx_plain);
+	(void) simple_mtx_init(&sscreen->shader_cache_mutex, mtx_plain);
 	sscreen->shader_cache =
 		_mesa_hash_table_create(NULL,
 					si_shader_cache_key_hash,
@@ -341,7 +326,7 @@ void si_destroy_shader_cache(struct si_screen *sscreen)
 	if (sscreen->shader_cache)
 		_mesa_hash_table_destroy(sscreen->shader_cache,
 					 si_destroy_shader_cache_entry);
-	mtx_destroy(&sscreen->shader_cache_mutex);
+	simple_mtx_destroy(&sscreen->shader_cache_mutex);
 }
 
 /* SHADER STATES */
@@ -1212,7 +1197,8 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
 
 	shader->ctx_reg.ngg.vgt_primitiveid_en =
 		S_028A84_PRIMITIVEID_EN(es_enable_prim_id) |
-		S_028A84_NGG_DISABLE_PROVOK_REUSE(es_enable_prim_id);
+		S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.mono.u.vs_export_prim_id ||
+						  gs_sel->info.writes_primid);
 
 	if (gs_type == PIPE_SHADER_GEOMETRY) {
 		shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4;
@@ -1250,7 +1236,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
 
 	shader->ge_cntl =
 		S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
-		S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) |
+		S_03096C_VERT_GRP_SIZE(256) | /* 256 = disable vertex grouping */
 		S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
 
 	/* Bug workaround for a possible hang with non-tessellation cases.
@@ -1326,9 +1312,6 @@ static void si_emit_shader_vs(struct si_context *sctx)
 					   SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
 					   shader->vgt_vertex_reuse_block_cntl);
 
-	if (initial_cdw != sctx->gfx_cs->current.cdw)
-		sctx->context_roll = true;
-
 	/* Required programming for tessellation. (legacy pipeline only) */
 	if (sctx->chip_class == GFX10 &&
 	    shader->selector->type == PIPE_SHADER_TESS_EVAL) {
@@ -1345,6 +1328,9 @@ static void si_emit_shader_vs(struct si_context *sctx)
 					       shader->pa_cl_vs_out_cntl,
 					       SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
 	}
+
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll = true;
 }
 
 /**
@@ -2105,6 +2091,9 @@ static void si_build_shader_variant(struct si_shader *shader,
 		compiler = shader->compiler_ctx_state.compiler;
 	}
 
+	if (!compiler->passes)
+		si_init_compiler(sscreen, compiler);
+
 	if (unlikely(!si_shader_create(sscreen, compiler, shader, debug))) {
 		PRINT_ERR("Failed to build shader variant (type=%u)\n",
 			  sel->type);
@@ -2159,7 +2148,7 @@ static bool si_check_missing_main_part(struct si_screen *sscreen,
 		main_part->key.as_ngg = key->as_ngg;
 		main_part->is_monolithic = false;
 
-		if (si_compile_tgsi_shader(sscreen, compiler_state->compiler,
+		if (si_compile_shader(sscreen, compiler_state->compiler,
 					   main_part, &compiler_state->debug) != 0) {
 			FREE(main_part);
 			return false;
@@ -2221,14 +2210,14 @@ current_not_ready:
 	if (thread_index < 0)
 		util_queue_fence_wait(&sel->ready);
 
-	mtx_lock(&sel->mutex);
+	simple_mtx_lock(&sel->mutex);
 
 	/* Find the shader variant. */
 	for (iter = sel->first_variant; iter; iter = iter->next_variant) {
 		/* Don't check the "current" shader. We checked it above. */
 		if (current != iter &&
 		    memcmp(&iter->key, key, sizeof(*key)) == 0) {
-			mtx_unlock(&sel->mutex);
+			simple_mtx_unlock(&sel->mutex);
 
 			if (unlikely(!util_queue_fence_is_signalled(&iter->ready))) {
 				/* If it's an optimized shader and its compilation has
@@ -2257,7 +2246,7 @@ current_not_ready:
 	/* Build a new shader. */
 	shader = CALLOC_STRUCT(si_shader);
 	if (!shader) {
-		mtx_unlock(&sel->mutex);
+		simple_mtx_unlock(&sel->mutex);
 		return -ENOMEM;
 	}
 
@@ -2314,11 +2303,11 @@ current_not_ready:
 				assert(0);
 			}
 
-			mtx_lock(&previous_stage_sel->mutex);
+			simple_mtx_lock(&previous_stage_sel->mutex);
 			ok = si_check_missing_main_part(sscreen,
 							previous_stage_sel,
 							compiler_state, &shader1_key);
-			mtx_unlock(&previous_stage_sel->mutex);
+			simple_mtx_unlock(&previous_stage_sel->mutex);
 		}
 
 		if (ok) {
@@ -2328,7 +2317,7 @@ current_not_ready:
 
 		if (!ok) {
 			FREE(shader);
-			mtx_unlock(&sel->mutex);
+			simple_mtx_unlock(&sel->mutex);
 			return -ENOMEM; /* skip the draw call */
 		}
 	}
@@ -2373,7 +2362,7 @@ current_not_ready:
 
 		/* Use the default (unoptimized) shader for now. */
 		memset(&key->opt, 0, sizeof(key->opt));
-		mtx_unlock(&sel->mutex);
+		simple_mtx_unlock(&sel->mutex);
 
 		if (sscreen->options.sync_compile)
 			util_queue_fence_wait(&shader->ready);
@@ -2394,7 +2383,7 @@ current_not_ready:
 		sel->last_variant = shader;
 	}
 
-	mtx_unlock(&sel->mutex);
+	simple_mtx_unlock(&sel->mutex);
 
 	assert(!shader->is_optimized);
 	si_build_shader_variant(shader, thread_index, false);
@@ -2472,8 +2461,25 @@ static void si_init_shader_selector_async(void *job, int thread_index)
 	assert(thread_index < ARRAY_SIZE(sscreen->compiler));
 	compiler = &sscreen->compiler[thread_index];
 
-	if (sel->nir)
-		si_lower_nir(sel);
+	if (!compiler->passes)
+		si_init_compiler(sscreen, compiler);
+
+	/* Serialize NIR to save memory. Monolithic shader variants
+	 * have to deserialize NIR before compilation.
+	 */
+	if (sel->nir) {
+		struct blob blob;
+                size_t size;
+
+		blob_init(&blob);
+		/* true = remove optional debugging data to increase
+		 * the likelihood of getting more shader cache hits.
+		 * It also drops variable names, so we'll save more memory.
+		 */
+		nir_serialize(&blob, sel->nir, true);
+		blob_finish_get_buffer(&blob, &sel->nir_binary, &size);
+		sel->nir_size = size;
+	}
 
 	/* Compile the main shader part for use with a prolog and/or epilog.
 	 * If this fails, the driver will try to compile a monolithic shader
@@ -2481,7 +2487,7 @@ static void si_init_shader_selector_async(void *job, int thread_index)
 	 */
 	if (!sscreen->use_monolithic_shaders) {
 		struct si_shader *shader = CALLOC_STRUCT(si_shader);
-		void *ir_binary = NULL;
+		unsigned char ir_sha1_cache_key[20];
 
 		if (!shader) {
 			fprintf(stderr, "radeonsi: can't allocate a main shader part\n");
@@ -2505,36 +2511,32 @@ static void si_init_shader_selector_async(void *job, int thread_index)
 		     sel->type == PIPE_SHADER_GEOMETRY))
 			shader->key.as_ngg = 1;
 
-		if (sel->tokens || sel->nir) {
-			ir_binary = si_get_ir_binary(sel, shader->key.as_ngg,
-						     shader->key.as_es);
+		if (sel->nir) {
+			si_get_ir_cache_key(sel, shader->key.as_ngg,
+					    shader->key.as_es, ir_sha1_cache_key);
 		}
 
 		/* Try to load the shader from the shader cache. */
-		mtx_lock(&sscreen->shader_cache_mutex);
+		simple_mtx_lock(&sscreen->shader_cache_mutex);
 
-		if (ir_binary &&
-		    si_shader_cache_load_shader(sscreen, ir_binary, shader)) {
-			mtx_unlock(&sscreen->shader_cache_mutex);
+		if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) {
+			simple_mtx_unlock(&sscreen->shader_cache_mutex);
 			si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
 		} else {
-			mtx_unlock(&sscreen->shader_cache_mutex);
+			simple_mtx_unlock(&sscreen->shader_cache_mutex);
 
 			/* Compile the shader if it hasn't been loaded from the cache. */
-			if (si_compile_tgsi_shader(sscreen, compiler, shader,
+			if (si_compile_shader(sscreen, compiler, shader,
 						   debug) != 0) {
 				FREE(shader);
-				FREE(ir_binary);
 				fprintf(stderr, "radeonsi: can't compile a main shader part\n");
 				return;
 			}
 
-			if (ir_binary) {
-				mtx_lock(&sscreen->shader_cache_mutex);
-				if (!si_shader_cache_insert_shader(sscreen, ir_binary, shader, true))
-					FREE(ir_binary);
-				mtx_unlock(&sscreen->shader_cache_mutex);
-			}
+			simple_mtx_lock(&sscreen->shader_cache_mutex);
+			si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key,
+						      shader, true);
+			simple_mtx_unlock(&sscreen->shader_cache_mutex);
 		}
 
 		*si_get_main_shader_part(sel, &shader->key) = shader;
@@ -2595,6 +2597,12 @@ static void si_init_shader_selector_async(void *job, int thread_index)
 
 		si_shader_vs(sscreen, sel->gs_copy_shader, sel);
 	}
+
+	/* Free NIR. We only keep serialized NIR after this point. */
+	if (sel->nir) {
+		ralloc_free(sel->nir);
+		sel->nir = NULL;
+	}
 }
 
 void si_schedule_initial_compile(struct si_context *sctx, unsigned processor,
@@ -2633,12 +2641,13 @@ void si_get_active_slot_masks(const struct tgsi_shader_info *info,
 			      uint32_t *const_and_shader_buffers,
 			      uint64_t *samplers_and_images)
 {
-	unsigned start, num_shaderbufs, num_constbufs, num_images, num_samplers;
+	unsigned start, num_shaderbufs, num_constbufs, num_images, num_msaa_images, num_samplers;
 
 	num_shaderbufs = util_last_bit(info->shader_buffers_declared);
 	num_constbufs = util_last_bit(info->const_buffers_declared);
 	/* two 8-byte images share one 16-byte slot */
 	num_images = align(util_last_bit(info->images_declared), 2);
+	num_msaa_images = align(util_last_bit(info->msaa_images_declared), 2);
 	num_samplers = util_last_bit(info->samplers_declared);
 
 	/* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */
@@ -2646,7 +2655,18 @@ void si_get_active_slot_masks(const struct tgsi_shader_info *info,
 	*const_and_shader_buffers =
 		u_bit_consecutive(start, num_shaderbufs + num_constbufs);
 
-	/* The layout is: image[last] ... image[0], sampler[0] ... sampler[last] */
+	/* The layout is:
+	 *   - fmask[last] ... fmask[0]     go to [15-last .. 15]
+	 *   - image[last] ... image[0]     go to [31-last .. 31]
+	 *   - sampler[0] ... sampler[last] go to [32 .. 32+last*2]
+	 *
+	 * FMASKs for images are placed separately, because MSAA images are rare,
+	 * and so we can benefit from a better cache hit rate if we keep image
+	 * descriptors together.
+	 */
+	if (num_msaa_images)
+		num_images = SI_NUM_IMAGES + num_msaa_images; /* add FMASK descriptors */
+
 	start = si_get_image_slot(num_images - 1) / 2;
 	*samplers_and_images =
 		u_bit_consecutive64(start, num_images / 2 + num_samplers);
@@ -2670,45 +2690,17 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 
 	sel->so = state->stream_output;
 
-	if (state->type == PIPE_SHADER_IR_TGSI &&
-	    !sscreen->options.enable_nir) {
-		sel->tokens = tgsi_dup_tokens(state->tokens);
-		if (!sel->tokens) {
-			FREE(sel);
-			return NULL;
-		}
-
-		tgsi_scan_shader(state->tokens, &sel->info);
-		tgsi_scan_tess_ctrl(state->tokens, &sel->info, &sel->tcs_info);
-
-		/* Fixup for TGSI: Set which opcode uses which (i,j) pair. */
-		if (sel->info.uses_persp_opcode_interp_centroid)
-			sel->info.uses_persp_centroid = true;
-
-		if (sel->info.uses_linear_opcode_interp_centroid)
-			sel->info.uses_linear_centroid = true;
-
-		if (sel->info.uses_persp_opcode_interp_offset ||
-		    sel->info.uses_persp_opcode_interp_sample)
-			sel->info.uses_persp_center = true;
-
-		if (sel->info.uses_linear_opcode_interp_offset ||
-		    sel->info.uses_linear_opcode_interp_sample)
-			sel->info.uses_linear_center = true;
+	if (state->type == PIPE_SHADER_IR_TGSI) {
+		sel->nir = tgsi_to_nir(state->tokens, ctx->screen);
 	} else {
-		if (state->type == PIPE_SHADER_IR_TGSI) {
-			sel->nir = tgsi_to_nir(state->tokens, ctx->screen);
-		} else {
-			assert(state->type == PIPE_SHADER_IR_NIR);
-			sel->nir = state->ir.nir;
-		}
-
-		si_nir_lower_ps_inputs(sel->nir);
-		si_nir_opts(sel->nir);
-		si_nir_scan_shader(sel->nir, &sel->info);
-		si_nir_scan_tess_ctrl(sel->nir, &sel->tcs_info);
+		assert(state->type == PIPE_SHADER_IR_NIR);
+		sel->nir = state->ir.nir;
 	}
 
+	si_nir_scan_shader(sel->nir, &sel->info);
+	si_nir_scan_tess_ctrl(sel->nir, &sel->tcs_info);
+	si_nir_adjust_driver_locations(sel->nir);
+
 	sel->type = sel->info.processor;
 	p_atomic_inc(&sscreen->num_shaders_created);
 	si_get_active_slot_masks(&sel->info,
@@ -2722,6 +2714,10 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 			(sel->so.output[i].stream * 4);
 	}
 
+	sel->num_vs_inputs = sel->type == PIPE_SHADER_VERTEX &&
+			     !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD] ?
+				     sel->info.num_inputs : 0;
+
 	/* The prolog is a no-op if there are no inputs. */
 	sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX &&
 			       sel->info.num_inputs &&
@@ -2770,9 +2766,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 
 		/* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tesselation. */
 		sel->tess_turns_off_ngg =
-			(sscreen->info.family == CHIP_NAVI10 ||
-			 sscreen->info.family == CHIP_NAVI12 ||
-			 sscreen->info.family == CHIP_NAVI14) &&
+			sscreen->info.chip_class == GFX10 &&
 			sel->gs_num_invocations * sel->gs_max_out_vertices > 256;
 		break;
 
@@ -2933,7 +2927,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 	if (sel->info.properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE])
 		sel->db_shader_control |= S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(1);
 
-	(void) mtx_init(&sel->mutex, mtx_plain);
+	(void) simple_mtx_init(&sel->mutex, mtx_plain);
 
 	si_schedule_initial_compile(sctx, sel->info.processor, &sel->ready,
 				    &sel->compiler_ctx_state, sel,
@@ -3060,7 +3054,7 @@ bool si_update_ngg(struct si_context *sctx)
 			sctx->flags |= SI_CONTEXT_VGT_FLUSH;
 
 		sctx->ngg = new_ngg;
-		sctx->last_rast_prim = -1; /* reset this so that it gets updated */
+		sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
 		return true;
 	}
 	return false;
@@ -3083,7 +3077,7 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 	sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL;
 
 	si_update_common_shader_state(sctx);
-	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
+	sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
 
 	ngg_changed = si_update_ngg(sctx);
 	if (ngg_changed || enable_changed)
@@ -3137,7 +3131,7 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
 	si_update_tess_uses_prim_id(sctx);
 
 	si_update_common_shader_state(sctx);
-	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
+	sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
 
 	bool ngg_changed = si_update_ngg(sctx);
 	if (ngg_changed || enable_changed)
@@ -3281,9 +3275,9 @@ void si_destroy_shader_selector(struct si_context *sctx,
 		si_delete_shader(sctx, sel->gs_copy_shader);
 
 	util_queue_fence_destroy(&sel->ready);
-	mtx_destroy(&sel->mutex);
-	free(sel->tokens);
+	simple_mtx_destroy(&sel->mutex);
 	ralloc_free(sel->nir);
+	free(sel->nir_binary);
 	free(sel);
 }
 
@@ -3564,18 +3558,18 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx)
 
 static void si_shader_lock(struct si_shader *shader)
 {
-	mtx_lock(&shader->selector->mutex);
+	simple_mtx_lock(&shader->selector->mutex);
 	if (shader->previous_stage_sel) {
 		assert(shader->previous_stage_sel != shader->selector);
-		mtx_lock(&shader->previous_stage_sel->mutex);
+		simple_mtx_lock(&shader->previous_stage_sel->mutex);
 	}
 }
 
 static void si_shader_unlock(struct si_shader *shader)
 {
 	if (shader->previous_stage_sel)
-		mtx_unlock(&shader->previous_stage_sel->mutex);
-	mtx_unlock(&shader->selector->mutex);
+		simple_mtx_unlock(&shader->previous_stage_sel->mutex);
+	simple_mtx_unlock(&shader->selector->mutex);
 }
 
 /**
@@ -3850,9 +3844,9 @@ static struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen,
 	}
 
 	if (key.u.ngg) {
-		stages |= S_028B54_PRIMGEN_EN(1);
-		if (key.u.streamout)
-			stages |= S_028B54_NGG_WAVE_ID_EN(1);
+		stages |= S_028B54_PRIMGEN_EN(1) |
+			  S_028B54_NGG_WAVE_ID_EN(key.u.streamout) |
+			  S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough);
 	} else if (key.u.gs)
 		stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
 
@@ -3892,6 +3886,9 @@ bool si_update_shaders(struct si_context *sctx)
 		old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0;
 	int r;
 
+	if (!sctx->compiler.passes)
+		si_init_compiler(sctx->screen, &sctx->compiler);
+
 	compiler_state.compiler = &sctx->compiler;
 	compiler_state.debug = sctx->debug;
 	compiler_state.is_debug_context = sctx->is_debug;
@@ -4002,6 +3999,10 @@ bool si_update_shaders(struct si_context *sctx)
 		}
 	}
 
+	/* This must be done after the shader variant is selected. */
+	if (sctx->ngg)
+		key.u.ngg_passthrough = gfx10_is_ngg_passthrough(si_get_vs(sctx)->current);
+
 	si_update_vgt_shader_config(sctx, key);
 
 	if (old_clip_disable != si_get_vs_state(sctx)->key.opt.clip_disable)