X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fsi_cmd_buffer.c;h=bee868d3b3cbd95b1f5cddb4607d6b61d6382f98;hb=f18fc34c4d56d6e7d511002b39a257e18d8b3af3;hp=379d8d5fcd44edb7cb92bad38edf2ae83f2c6e9e;hpb=53b50be35cd11dfa1209de63e997256404e51468;p=mesa.git

diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index 379d8d5fcd4..bee868d3b3c 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -32,7 +32,6 @@
 #include "radv_cs.h"
 #include "sid.h"
 #include "radv_util.h"
-#include "main/macros.h"
 
 static void
 si_write_harvested_raster_configs(struct radv_physical_device *physical_device,
@@ -79,7 +78,7 @@ si_write_harvested_raster_configs(struct radv_physical_device *physical_device,
 }
 
 void
-si_emit_compute(struct radv_physical_device *physical_device,
+si_emit_compute(struct radv_device *device,
                 struct radeon_cmdbuf *cs)
 {
 	radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
@@ -93,7 +92,7 @@ si_emit_compute(struct radv_physical_device *physical_device,
 	radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
 	radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
 
-	if (physical_device->rad_info.chip_class >= GFX7) {
+	if (device->physical_device->rad_info.chip_class >= GFX7) {
 		/* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
 		radeon_set_sh_reg_seq(cs,
 				      R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2);
@@ -101,22 +100,46 @@ si_emit_compute(struct radv_physical_device *physical_device,
 			    S_00B858_SH1_CU_EN(0xffff));
 		radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) |
 			    S_00B858_SH1_CU_EN(0xffff));
+
+		if (device->border_color_data.bo) {
+			uint64_t bc_va = radv_buffer_get_va(device->border_color_data.bo);
+
+			radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2);
+			radeon_emit(cs, bc_va >> 8);
+			radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40));
+		}
 	}
 
-	if (physical_device->rad_info.chip_class >= GFX10)
+	if (device->physical_device->rad_info.chip_class >= GFX9) {
+		radeon_set_uconfig_reg(cs, R_0301EC_CP_COHER_START_DELAY,
+				       device->physical_device->rad_info.chip_class >= GFX10 ? 0x20 : 0);
+	}
+
+	if (device->physical_device->rad_info.chip_class >= GFX10) {
+		radeon_set_sh_reg(cs, R_00B890_COMPUTE_USER_ACCUM_0, 0);
+		radeon_set_sh_reg(cs, R_00B894_COMPUTE_USER_ACCUM_1, 0);
+		radeon_set_sh_reg(cs, R_00B898_COMPUTE_USER_ACCUM_2, 0);
+		radeon_set_sh_reg(cs, R_00B89C_COMPUTE_USER_ACCUM_3, 0);
 		radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0);
+		radeon_set_sh_reg(cs, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
+	}
 
 	/* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
 	 * and is now per pipe, so it should be handled in the
 	 * kernel if we want to use something other than the default value,
 	 * which is now 0x22f.
 	 */
-	if (physical_device->rad_info.chip_class <= GFX6) {
+	if (device->physical_device->rad_info.chip_class <= GFX6) {
 		/* XXX: This should be:
 		 * (number of compute units) * 4 * (waves per simd) - 1 */
 
 		radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID,
 		                  0x190 /* Default value */);
+
+		if (device->border_color_data.bo) {
+			uint64_t bc_va = radv_buffer_get_va(device->border_color_data.bo);
+			radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8);
+		}
 	}
 }
 
@@ -156,15 +179,17 @@ si_set_raster_config(struct radv_physical_device *physical_device,
 }
 
 void
-si_emit_graphics(struct radv_physical_device *physical_device,
+si_emit_graphics(struct radv_device *device,
 		 struct radeon_cmdbuf *cs)
 {
+	struct radv_physical_device *physical_device = device->physical_device;
+
 	bool has_clear_state = physical_device->rad_info.has_clear_state;
 	int i;
 
 	radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
-	radeon_emit(cs, CONTEXT_CONTROL_LOAD_ENABLE(1));
-	radeon_emit(cs, CONTEXT_CONTROL_SHADOW_ENABLE(1));
+	radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
+	radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
 
 	if (has_clear_state) {
 		radeon_emit(cs, PKT3(PKT3_CLEAR_STATE, 0, 0));
@@ -292,55 +317,70 @@ si_emit_graphics(struct radv_physical_device *physical_device,
 		}
 
 		/* Compute LATE_ALLOC_VS.LIMIT. */
-		unsigned num_cu_per_sh = physical_device->rad_info.num_good_cu_per_sh;
-		unsigned late_alloc_limit; /* The limit is per SH. */
-
-		if (physical_device->rad_info.family == CHIP_KABINI) {
-			late_alloc_limit = 0; /* Potential hang on Kabini. */
-		} else if (num_cu_per_sh <= 4) {
-			/* Too few available compute units per SH. Disallowing
-			 * VS to run on one CU could hurt us more than late VS
-			 * allocation would help.
-			 *
-			 * 2 is the highest safe number that allows us to keep
-			 * all CUs enabled.
-			 */
-			late_alloc_limit = 2;
-		} else {
-			/* This is a good initial value, allowing 1 late_alloc
-			 * wave per SIMD on num_cu - 2.
-			 */
-			late_alloc_limit = (num_cu_per_sh - 2) * 4;
-		}
-
-		unsigned late_alloc_limit_gs = late_alloc_limit;
+		unsigned num_cu_per_sh = physical_device->rad_info.min_good_cu_per_sa;
+		unsigned late_alloc_wave64 = 0; /* The limit is per SA. */
+		unsigned late_alloc_wave64_gs = 0;
 		unsigned cu_mask_vs = 0xffff;
 		unsigned cu_mask_gs = 0xffff;
 
-		if (late_alloc_limit > 2) {
-			if (physical_device->rad_info.chip_class >= GFX10) {
+		if (physical_device->rad_info.chip_class >= GFX10) {
+			/* For Wave32, the hw will launch twice the number of late
+			 * alloc waves, so 1 == 2x wave32.
+			 */
+			if (!physical_device->rad_info.use_late_alloc) {
+				late_alloc_wave64 = 0;
+			} else if (num_cu_per_sh <= 6) {
+				late_alloc_wave64 = num_cu_per_sh - 2;
+			} else {
+				late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
+
 				/* CU2 & CU3 disabled because of the dual CU design */
 				cu_mask_vs = 0xfff3;
 				cu_mask_gs = 0xfff3; /* NGG only */
+			}
+
+			late_alloc_wave64_gs = late_alloc_wave64;
+
+			/* Don't use late alloc for NGG on Navi14 due to a hw
+			 * bug. If NGG is never used, enable all CUs.
+			 */
+			if (!physical_device->use_ngg ||
+			    physical_device->rad_info.family == CHIP_NAVI14) {
+				late_alloc_wave64_gs = 0;
+				cu_mask_gs = 0xffff;
+			}
+
+			/* Limit LATE_ALLOC_GS to prevent a hang (hw bug). */
+			if (physical_device->rad_info.chip_class == GFX10)
+				late_alloc_wave64_gs = MIN2(late_alloc_wave64_gs, 64);
+		} else {
+			if (!physical_device->rad_info.use_late_alloc) {
+				late_alloc_wave64 = 0;
+			} else if (num_cu_per_sh <= 4) {
+				/* Too few available compute units per SA.
+				 * Disallowing VS to run on one CU could hurt
+				 * us more than late VS allocation would help.
+				 *
+				 * 2 is the highest safe number that allows us
+				 * to keep all CUs enabled.
+				 */
+				late_alloc_wave64 = 2;
 			} else {
-				cu_mask_vs = 0xfffe; /* 1 CU disabled */
+				/* This is a good initial value, allowing 1
+				 * late_alloc wave per SIMD on num_cu - 2.
+				 */
+				late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
 			}
-		}
 
-		/* Don't use late alloc for NGG on Navi14 due to a hw bug.
-		 * If NGG is never used, enable all CUs.
-		 */
-		if (!physical_device->use_ngg ||
-		    physical_device->rad_info.family == CHIP_NAVI14) {
-			late_alloc_limit_gs = 0;
-			cu_mask_gs = 0xffff;
+			if (late_alloc_wave64 > 2)
+				cu_mask_vs = 0xfffe; /* 1 CU disabled */
 		}
 
 		radeon_set_sh_reg_idx(physical_device, cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
 				      3, S_00B118_CU_EN(cu_mask_vs) |
 				      S_00B118_WAVE_LIMIT(0x3F));
 		radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS,
-				  S_00B11C_LIMIT(late_alloc_limit));
+				  S_00B11C_LIMIT(late_alloc_wave64));
 
 		radeon_set_sh_reg_idx(physical_device, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
 				      3, S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F));
@@ -348,7 +388,7 @@ si_emit_graphics(struct radv_physical_device *physical_device,
 		if (physical_device->rad_info.chip_class >= GFX10) {
 			radeon_set_sh_reg_idx(physical_device, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
 					      3, S_00B204_CU_EN(0xffff) |
-					      S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_limit_gs));
+					      S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64_gs));
 		}
 
 		radeon_set_sh_reg_idx(physical_device, cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
@@ -368,34 +408,36 @@ si_emit_graphics(struct radv_physical_device *physical_device,
 		radeon_set_context_reg(cs, R_028C50_PA_SC_NGG_MODE_CNTL,
 				       S_028C50_MAX_DEALLOCS_IN_WAVE(512));
 		radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
-		radeon_set_context_reg(cs, R_02807C_DB_RMI_L2_CACHE_CONTROL,
-				       S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
-				       S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
-				       S_02807C_HTILE_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
-				       S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
-				       S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) |
-				       S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) |
-				       S_02807C_HTILE_RD_POLICY(V_02807C_CACHE_NOA_RD));
-
-		radeon_set_context_reg(cs, R_028410_CB_RMI_GL2_CACHE_CONTROL,
-				       S_028410_CMASK_WR_POLICY(V_028410_CACHE_STREAM_WR) |
-				       S_028410_FMASK_WR_POLICY(V_028410_CACHE_STREAM_WR) |
-				       S_028410_DCC_WR_POLICY(V_028410_CACHE_STREAM_WR) |
-				       S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) |
-				       S_028410_CMASK_RD_POLICY(V_028410_CACHE_NOA_RD) |
-				       S_028410_FMASK_RD_POLICY(V_028410_CACHE_NOA_RD) |
-				       S_028410_DCC_RD_POLICY(V_028410_CACHE_NOA_RD) |
-				       S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD));
 		radeon_set_context_reg(cs, R_028428_CB_COVERAGE_OUT_CONTROL, 0);
 
+		radeon_set_sh_reg(cs, R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0, 0);
+		radeon_set_sh_reg(cs, R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1, 0);
+		radeon_set_sh_reg(cs, R_00B0D0_SPI_SHADER_USER_ACCUM_PS_2, 0);
+		radeon_set_sh_reg(cs, R_00B0D4_SPI_SHADER_USER_ACCUM_PS_3, 0);
+		radeon_set_sh_reg(cs, R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0, 0);
+		radeon_set_sh_reg(cs, R_00B1CC_SPI_SHADER_USER_ACCUM_VS_1, 0);
+		radeon_set_sh_reg(cs, R_00B1D0_SPI_SHADER_USER_ACCUM_VS_2, 0);
+		radeon_set_sh_reg(cs, R_00B1D4_SPI_SHADER_USER_ACCUM_VS_3, 0);
+		radeon_set_sh_reg(cs, R_00B2C8_SPI_SHADER_USER_ACCUM_ESGS_0, 0);
+		radeon_set_sh_reg(cs, R_00B2CC_SPI_SHADER_USER_ACCUM_ESGS_1, 0);
+		radeon_set_sh_reg(cs, R_00B2D0_SPI_SHADER_USER_ACCUM_ESGS_2, 0);
+		radeon_set_sh_reg(cs, R_00B2D4_SPI_SHADER_USER_ACCUM_ESGS_3, 0);
+		radeon_set_sh_reg(cs, R_00B4C8_SPI_SHADER_USER_ACCUM_LSHS_0, 0);
+		radeon_set_sh_reg(cs, R_00B4CC_SPI_SHADER_USER_ACCUM_LSHS_1, 0);
+		radeon_set_sh_reg(cs, R_00B4D0_SPI_SHADER_USER_ACCUM_LSHS_2, 0);
+		radeon_set_sh_reg(cs, R_00B4D4_SPI_SHADER_USER_ACCUM_LSHS_3, 0);
+
 		radeon_set_sh_reg(cs, R_00B0C0_SPI_SHADER_REQ_CTRL_PS,
 				  S_00B0C0_SOFT_GROUPING_EN(1) |
 				  S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1));
 		radeon_set_sh_reg(cs, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);
 
-		if (physical_device->rad_info.family == CHIP_NAVI10 ||
-		    physical_device->rad_info.family == CHIP_NAVI12 ||
-		    physical_device->rad_info.family == CHIP_NAVI14) {
+		if (physical_device->rad_info.chip_class >= GFX10_3) {
+			radeon_set_context_reg(cs, R_028750_SX_PS_DOWNCONVERT_CONTROL, 0xff);
+			radeon_set_context_reg(cs, 0x28848, 1 << 9); /* This fixes sample shading. */
+		}
+
+		if (physical_device->rad_info.chip_class == GFX10) {
 			/* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */
 			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 			radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
@@ -403,11 +445,18 @@ si_emit_graphics(struct radv_physical_device *physical_device,
 
 		/* TODO: For culling, replace 128 with 256. */
 		radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC,
-				       S_030980_OVERSUB_EN(1) |
+				       S_030980_OVERSUB_EN(physical_device->rad_info.use_late_alloc) |
 				       S_030980_NUM_PC_LINES(128 * physical_device->rad_info.max_se - 1));
 	}
 
-	if (physical_device->rad_info.chip_class >= GFX8) {
+	if (physical_device->rad_info.chip_class >= GFX9) {
+		radeon_set_context_reg(cs, R_028B50_VGT_TESS_DISTRIBUTION,
+				       S_028B50_ACCUM_ISOLINE(40) |
+				       S_028B50_ACCUM_TRI(30) |
+				       S_028B50_ACCUM_QUAD(24) |
+				       S_028B50_DONUT_SPLIT(24) |
+				       S_028B50_TRAP_SPLIT(6));
+	} else if (physical_device->rad_info.chip_class >= GFX8) {
 		uint32_t vgt_tess_distribution;
 
 		vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(32) |
@@ -426,6 +475,16 @@ si_emit_graphics(struct radv_physical_device *physical_device,
 		radeon_set_context_reg(cs, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);
 	}
 
+	if (device->border_color_data.bo) {
+		uint64_t border_color_va = radv_buffer_get_va(device->border_color_data.bo);
+
+		radeon_set_context_reg(cs, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
+		if (physical_device->rad_info.chip_class >= GFX7) {
+			radeon_set_context_reg(cs, R_028084_TA_BC_BASE_ADDR_HI,
+					       S_028084_ADDRESS(border_color_va >> 40));
+		}
+	}
+
 	if (physical_device->rad_info.chip_class >= GFX9) {
 		radeon_set_context_reg(cs, R_028C48_PA_SC_BINNER_CNTL_1,
 				       S_028C48_MAX_ALLOC_COUNT(physical_device->rad_info.pbb_max_alloc_count - 1) |
@@ -440,7 +499,7 @@ si_emit_graphics(struct radv_physical_device *physical_device,
 	radeon_emit(cs, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp));
 	radeon_set_context_reg_seq(cs, R_028A04_PA_SU_POINT_MINMAX, 1);
 	radeon_emit(cs, S_028A04_MIN_SIZE(radv_pack_float_12p4(0)) |
-		    S_028A04_MAX_SIZE(radv_pack_float_12p4(8192/2)));
+		    S_028A04_MAX_SIZE(radv_pack_float_12p4(8191.875/2)));
 
 	if (!has_clear_state) {
 		radeon_set_context_reg(cs, R_028004_DB_COUNT_CONTROL,
@@ -462,7 +521,27 @@ si_emit_graphics(struct radv_physical_device *physical_device,
 				       small_prim_filter_cntl);
 	}
 
-	si_emit_compute(physical_device, cs);
+	radeon_set_context_reg(cs, R_0286D4_SPI_INTERP_CONTROL_0,
+	                       S_0286D4_FLAT_SHADE_ENA(1) |
+	                       S_0286D4_PNT_SPRITE_ENA(1) |
+	                       S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
+	                       S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
+	                       S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
+	                       S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) |
+	                       S_0286D4_PNT_SPRITE_TOP_1(0)); /* vulkan is top to bottom - 1.0 at bottom */
+
+	radeon_set_context_reg(cs, R_028BE4_PA_SU_VTX_CNTL,
+	                       S_028BE4_PIX_CENTER(1) |
+	                       S_028BE4_ROUND_MODE(V_028BE4_X_ROUND_TO_EVEN) |
+	                       S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH));
+
+	radeon_set_context_reg(cs, R_028818_PA_CL_VTE_CNTL,
+			       S_028818_VTX_W0_FMT(1) |
+			       S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
+			       S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
+			       S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1));
+
+	si_emit_compute(device, cs);
 }
 
 void
@@ -472,13 +551,13 @@ cik_create_gfx_config(struct radv_device *device)
 	if (!cs)
 		return;
 
-	si_emit_graphics(device->physical_device, cs);
+	si_emit_graphics(device, cs);
 
 	while (cs->cdw & 7) {
 		if (device->physical_device->rad_info.gfx_ib_pad_with_type2)
-			radeon_emit(cs, 0x80000000);
+			radeon_emit(cs, PKT2_NOP_PAD);
 		else
-			radeon_emit(cs, 0xffff1000);
+			radeon_emit(cs, PKT3_NOP_PAD);
 	}
 
 	device->gfx_init = device->ws->buffer_create(device->ws,
@@ -486,7 +565,8 @@ cik_create_gfx_config(struct radv_device *device)
 						     RADEON_DOMAIN_GTT,
 						     RADEON_FLAG_CPU_ACCESS|
 						     RADEON_FLAG_NO_INTERPROCESS_SHARING |
-						     RADEON_FLAG_READ_ONLY,
+						     RADEON_FLAG_READ_ONLY |
+						     RADEON_FLAG_GTT_WC,
 						     RADV_BO_PRIORITY_CS);
 	if (!device->gfx_init)
 		goto fail;
@@ -565,10 +645,10 @@ static VkRect2D si_scissor_from_viewport(const VkViewport *viewport)
 
 	get_viewport_xform(viewport, scale, translate);
 
-	rect.offset.x = translate[0] - fabs(scale[0]);
-	rect.offset.y = translate[1] - fabs(scale[1]);
-	rect.extent.width = ceilf(translate[0] + fabs(scale[0])) - rect.offset.x;
-	rect.extent.height = ceilf(translate[1] + fabs(scale[1])) - rect.offset.y;
+	rect.offset.x = translate[0] - fabsf(scale[0]);
+	rect.offset.y = translate[1] - fabsf(scale[1]);
+	rect.extent.width = ceilf(translate[0] + fabsf(scale[0])) - rect.offset.x;
+	rect.extent.height = ceilf(translate[1] + fabsf(scale[1])) - rect.offset.y;
 
 	return rect;
 }
@@ -645,11 +725,30 @@ radv_prims_for_vertices(struct radv_prim_vertex_count *info, unsigned num)
 	return 1 + ((num - info->min) / info->incr);
 }
 
+static const struct radv_prim_vertex_count prim_size_table[] = { /* Looked up by V_008958_DI_PT_* topology in si_get_ia_multi_vgt_param(). */
+	[V_008958_DI_PT_NONE] = {0, 0}, /* NOTE(review): pairs appear to be {min, incr} as consumed by radv_prims_for_vertices() -- confirm field order. */
+	[V_008958_DI_PT_POINTLIST] = {1, 1},
+	[V_008958_DI_PT_LINELIST] = {2, 2},
+	[V_008958_DI_PT_LINESTRIP] = {2, 1},
+	[V_008958_DI_PT_TRILIST] = {3, 3},
+	[V_008958_DI_PT_TRIFAN] = {3, 1},
+	[V_008958_DI_PT_TRISTRIP] = {3, 1},
+	[V_008958_DI_PT_LINELIST_ADJ] = {4, 4},
+	[V_008958_DI_PT_LINESTRIP_ADJ] = {4, 1},
+	[V_008958_DI_PT_TRILIST_ADJ] = {6, 6},
+	[V_008958_DI_PT_TRISTRIP_ADJ] = {6, 2},
+	[V_008958_DI_PT_RECTLIST] = {3, 3},
+	[V_008958_DI_PT_LINELOOP] = {2, 1},
+	[V_008958_DI_PT_POLYGON] = {3, 1},
+	[V_008958_DI_PT_2D_TRI_STRIP] = {0, 0},
+}; /* No explicit V_008958_DI_PT_PATCH entry: si_get_ia_multi_vgt_param() sets min/incr itself for PATCH when the pipeline has tessellation. */
+
 uint32_t
 si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
 			  bool instanced_draw, bool indirect_draw,
 			  bool count_from_stream_output,
-			  uint32_t draw_vertex_count)
+			  uint32_t draw_vertex_count,
+			  unsigned topology)
 {
 	enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
 	enum radeon_family family = cmd_buffer->device->physical_device->rad_info.family;
@@ -662,10 +761,18 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
 	bool partial_vs_wave = false;
 	bool partial_es_wave = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.partial_es_wave;
 	bool multi_instances_smaller_than_primgroup;
+	struct radv_prim_vertex_count prim_vertex_count = prim_size_table[topology];
+
+	if (radv_pipeline_has_tess(cmd_buffer->state.pipeline)) {
+		if (topology == V_008958_DI_PT_PATCH) {
+			prim_vertex_count.min = cmd_buffer->state.pipeline->graphics.tess_patch_control_points;
+			prim_vertex_count.incr = 1;
+		}
+	}
 
 	multi_instances_smaller_than_primgroup = indirect_draw;
 	if (!multi_instances_smaller_than_primgroup && instanced_draw) {
-		uint32_t num_prims = radv_prims_for_vertices(&cmd_buffer->state.pipeline->graphics.prim_vertex_count, draw_vertex_count);
+		uint32_t num_prims = radv_prims_for_vertices(&prim_vertex_count, draw_vertex_count);
 		if (num_prims < cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.primgroup_size)
 			multi_instances_smaller_than_primgroup = true;
 	}
@@ -674,7 +781,19 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
 	partial_vs_wave = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.partial_vs_wave;
 
 	if (chip_class >= GFX7) {
-		wd_switch_on_eop = cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.wd_switch_on_eop;
+		/* WD_SWITCH_ON_EOP has no effect on GPUs with fewer than
+		 * 4 shader engines. Set it to 1 to pass the assertion below.
+		 * The other cases are hardware requirements. */
+		if (cmd_buffer->device->physical_device->rad_info.max_se < 4 ||
+		    topology == V_008958_DI_PT_POLYGON ||
+		    topology == V_008958_DI_PT_LINELOOP ||
+		    topology == V_008958_DI_PT_TRIFAN ||
+		    topology == V_008958_DI_PT_TRISTRIP_ADJ ||
+		    (cmd_buffer->state.pipeline->graphics.prim_restart_enable &&
+		     (cmd_buffer->device->physical_device->rad_info.family < CHIP_POLARIS10 ||
+		      (topology != V_008958_DI_PT_POINTLIST &&
+		       topology != V_008958_DI_PT_LINESTRIP))))
+			wd_switch_on_eop = true;
 
 		/* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.
 		 * We don't know that for indirect drawing, so treat it as
@@ -731,7 +850,7 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
 		if (family == CHIP_HAWAII && ia_switch_on_eoi) {
 			bool set_vgt_flush = indirect_draw;
 			if (!set_vgt_flush && instanced_draw) {
-				uint32_t num_prims = radv_prims_for_vertices(&cmd_buffer->state.pipeline->graphics.prim_vertex_count, draw_vertex_count);
+				uint32_t num_prims = radv_prims_for_vertices(&prim_vertex_count, draw_vertex_count);
 				if (num_prims <= 1)
 					set_vgt_flush = true;
 			}
@@ -740,6 +859,17 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
 		}
 	}
 
+	/* Workaround for a VGT hang when strip primitive types are used with
+	 * primitive restart.
+	 */
+	if (cmd_buffer->state.pipeline->graphics.prim_restart_enable &&
+	    (topology == V_008958_DI_PT_LINESTRIP ||
+	     topology == V_008958_DI_PT_TRISTRIP ||
+	     topology == V_008958_DI_PT_LINESTRIP_ADJ ||
+	     topology == V_008958_DI_PT_TRISTRIP_ADJ)) {
+		partial_vs_wave = true;
+	}
+
 	return cmd_buffer->state.pipeline->graphics.ia_multi_vgt_param.base |
 		S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) |
 		S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |