From d814c21b1bea0396c735d65b363a8f2c6324c7d8 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Nicolai=20H=C3=A4hnle?= Date: Mon, 1 Apr 2019 15:44:39 +0200 Subject: [PATCH] radeonsi: overhaul the vertex fetch fixup mechanism MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The overall goal is to support unaligned loads from vertex buffers natively on SI. In the unaligned case, we fall back to the general case implementation in ac_build_opencoded_load_format. Since this function is fully general, we will also use it going forward for cases requiring fully manual format conversions of dwords anyway. This requires a different encoding of the fix_fetch array, which will now contain the entire format information if a fixup is required. Having to check the alignment of vertex buffers is awkward. To keep the impact on the fast path minimal, the si_context will keep track of which vertex buffers are (not) at least dword-aligned, while the si_vertex_elements will note which vertex buffers have some (at most dword) alignment requirement. Vertex buffers should be dword-aligned most of the time, which allows a fast early-out in almost all cases. Add the radeonsi_vs_fetch_always_opencode configuration variable for testing purposes. Note that it can only be used reliably on LLVM >= 9, because support for byte and short loads is required. v2: - add a missing check to si_bind_vertex_elements Reviewed-by: Marek Olšák --- .../drivers/radeonsi/si_debug_options.h | 1 + src/gallium/drivers/radeonsi/si_get.c | 2 +- src/gallium/drivers/radeonsi/si_pipe.h | 1 + src/gallium/drivers/radeonsi/si_shader.c | 249 ++++++------------ src/gallium/drivers/radeonsi/si_shader.h | 46 ++-- src/gallium/drivers/radeonsi/si_state.c | 237 ++++++++++------- src/gallium/drivers/radeonsi/si_state.h | 19 ++ .../drivers/radeonsi/si_state_shaders.c | 26 +- 8 files changed, 301 insertions(+), 280 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h b/src/gallium/drivers/radeonsi/si_debug_options.h index 019256ca1d1..0bde7910fc6 100644 --- a/src/gallium/drivers/radeonsi/si_debug_options.h +++ b/src/gallium/drivers/radeonsi/si_debug_options.h @@ -2,5 +2,6 @@ OPT_BOOL(clear_db_cache_before_clear, false, "Clear DB cache before fast depth c OPT_BOOL(enable_nir, false, "Enable NIR") OPT_BOOL(aux_debug, false, "Generate ddebug_dumps for the auxiliary context") OPT_BOOL(sync_compile, false, "Always compile synchronously (will cause stalls)") +OPT_BOOL(vs_fetch_always_opencode, false, "Always open code vertex fetches (less efficient, purely for testing)") #undef OPT_BOOL diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index 700777186d8..eb23d4fe88a 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -197,7 +197,7 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: - return !sscreen->info.has_unaligned_shader_loads; + return HAVE_LLVM < 0x0900 && !sscreen->info.has_unaligned_shader_loads; case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: return sscreen->info.has_sparse_vm_mappings ?
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index d3ddb912245..695827c9dd7 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -939,6 +939,7 @@ struct si_context { bool vertex_buffers_dirty; bool vertex_buffer_pointer_dirty; struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS]; + uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */ /* MSAA config state. */ int ps_iter_samples; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index d783555ca33..f6d882cf583 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -430,21 +430,6 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) } } -/* Bitcast <4 x float> to <2 x double>, extract the component, and convert - * to float. */ -static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx, - LLVMValueRef vec4, - unsigned double_index) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context); - LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4, - LLVMVectorType(f64, 2), ""); - LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0); - LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, ""); - return LLVMBuildFPTrunc(builder, value, ctx->f32, ""); -} - static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index) { @@ -536,17 +521,12 @@ void si_llvm_load_input_vs( return; } - unsigned chan; - unsigned fix_fetch; - unsigned num_fetches; - unsigned fetch_stride; - unsigned num_channels; - + union si_vs_fix_fetch fix_fetch; LLVMValueRef t_list_ptr; LLVMValueRef t_offset; LLVMValueRef t_list; LLVMValueRef vertex_index; - LLVMValueRef input[3]; + LLVMValueRef tmp; /* Load the T list */ t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers); @@ -559,74 +539,84 @@ void si_llvm_load_input_vs( ctx->param_vertex_index0 + input_index); - fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index]; + /* Use the open-coded implementation for all loads of doubles and + * of dword-sized data that needs fixups. We need to insert conversion + * code anyway, and the amd/common code does it for us. + * + * Note: On LLVM <= 8, we can only open-code formats with + * channel size >= 4 bytes. + */ + bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index); + fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits; + if (opencode || + (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) || + (fix_fetch.u.log_size == 2)) { + tmp = ac_build_opencoded_load_format( + &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1, + fix_fetch.u.format, fix_fetch.u.reverse, !opencode, + t_list, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, + false, false, true); + for (unsigned i = 0; i < 4; ++i) + out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->i32, i, false), ""); + return; + } /* Do multiple loads for special formats. 
*/ - switch (fix_fetch) { - case SI_FIX_FETCH_RG_64_FLOAT: - num_fetches = 1; /* 1 2-dword or 4-dword load */ - fetch_stride = 0; - if (util_last_bit(info->input_usage_mask[input_index]) >= 2) - num_channels = 4; /* 2 doubles in 4 dwords */ - else - num_channels = 2; /* 1 double in 2 dwords */ - break; - case SI_FIX_FETCH_RGB_64_FLOAT: - num_fetches = 3; /* 3 2-dword loads */ - fetch_stride = 8; - num_channels = 2; - break; - case SI_FIX_FETCH_RGBA_64_FLOAT: - num_fetches = 2; /* 2 4-dword loads */ - fetch_stride = 16; - num_channels = 4; - break; - case SI_FIX_FETCH_RGB_8: - case SI_FIX_FETCH_RGB_8_INT: - num_fetches = 3; - fetch_stride = 1; - num_channels = 1; - break; - case SI_FIX_FETCH_RGB_16: - case SI_FIX_FETCH_RGB_16_INT: - num_fetches = 3; - fetch_stride = 2; - num_channels = 1; - break; - default: + unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]); + LLVMValueRef fetches[4]; + unsigned num_fetches; + unsigned fetch_stride; + unsigned channels_per_fetch; + + if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) { + num_fetches = MIN2(required_channels, 3); + fetch_stride = 1 << fix_fetch.u.log_size; + channels_per_fetch = 1; + } else { num_fetches = 1; fetch_stride = 0; - num_channels = util_last_bit(info->input_usage_mask[input_index]); + channels_per_fetch = required_channels; } - for (unsigned i = 0; i < num_fetches; i++) { + for (unsigned i = 0; i < num_fetches; ++i) { LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0); - - input[i] = ac_build_buffer_load_format(&ctx->ac, t_list, - vertex_index, voffset, - num_channels, false, true); - input[i] = ac_build_expand_to_vec4(&ctx->ac, input[i], num_channels); + fetches[i] = ac_build_buffer_load_format(&ctx->ac, t_list, vertex_index, voffset, + channels_per_fetch, false, true); } - /* Break up the vec4 into individual components */ - for (chan = 0; chan < 4; chan++) { - LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0); - out[chan] = LLVMBuildExtractElement(ctx->ac.builder, - input[0], llvm_chan, ""); + if (num_fetches == 1 && channels_per_fetch > 1) { + LLVMValueRef fetch = fetches[0]; + for (unsigned i = 0; i < channels_per_fetch; ++i) { + tmp = LLVMConstInt(ctx->i32, i, false); + fetches[i] = LLVMBuildExtractElement( + ctx->ac.builder, fetch, tmp, ""); + } + num_fetches = channels_per_fetch; + channels_per_fetch = 1; } - switch (fix_fetch) { - case SI_FIX_FETCH_A2_SNORM: - case SI_FIX_FETCH_A2_SSCALED: - case SI_FIX_FETCH_A2_SINT: { - /* The hardware returns an unsigned value; convert it to a - * signed one. + for (unsigned i = num_fetches; i < 4; ++i) + fetches[i] = LLVMGetUndef(ctx->f32); + + if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && + required_channels == 4) { + if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT) + fetches[3] = ctx->ac.i32_1; + else + fetches[3] = ctx->ac.f32_1; + } else if (fix_fetch.u.log_size == 3 && + (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM || + fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED || + fix_fetch.u.format == AC_FETCH_FORMAT_SINT) && + required_channels == 4) { + /* For 2_10_10_10, the hardware returns an unsigned value; + * convert it to a signed one. */ - LLVMValueRef tmp = out[3]; + LLVMValueRef tmp = fetches[3]; LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0); /* First, recover the sign-extended signed integer value. 
*/ - if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) + if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, ""); else tmp = ac_to_integer(&ctx->ac, tmp); @@ -638,110 +628,26 @@ void si_llvm_load_input_vs( * exponent. */ tmp = LLVMBuildShl(ctx->ac.builder, tmp, - fix_fetch == SI_FIX_FETCH_A2_SNORM ? + fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(ctx->i32, 7, 0) : c30, ""); tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, ""); /* Convert back to the right type. */ - if (fix_fetch == SI_FIX_FETCH_A2_SNORM) { + if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) { LLVMValueRef clamp; LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0); tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, ""); clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, ""); tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, ""); - } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) { + } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) { tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, ""); } - out[3] = tmp; - break; + fetches[3] = tmp; } - case SI_FIX_FETCH_RGBA_32_UNORM: - case SI_FIX_FETCH_RGBX_32_UNORM: - for (chan = 0; chan < 4; chan++) { - out[chan] = ac_to_integer(&ctx->ac, out[chan]); - out[chan] = LLVMBuildUIToFP(ctx->ac.builder, - out[chan], ctx->f32, ""); - out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan], - LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), ""); - } - /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */ - if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM) - out[3] = LLVMConstReal(ctx->f32, 1); - break; - case SI_FIX_FETCH_RGBA_32_SNORM: - case SI_FIX_FETCH_RGBX_32_SNORM: - case SI_FIX_FETCH_RGBA_32_FIXED: - case SI_FIX_FETCH_RGBX_32_FIXED: { - double scale; - if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED) - scale = 1.0 / 0x10000; - else - scale = 1.0 / INT_MAX; - for (chan = 0; chan < 4; chan++) { - out[chan] = ac_to_integer(&ctx->ac, out[chan]); - out[chan] = LLVMBuildSIToFP(ctx->ac.builder, - out[chan], ctx->f32, ""); - out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan], - LLVMConstReal(ctx->f32, scale), ""); - } - /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. 
*/ - if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM || - fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED) - out[3] = LLVMConstReal(ctx->f32, 1); - break; - } - case SI_FIX_FETCH_RGBA_32_USCALED: - for (chan = 0; chan < 4; chan++) { - out[chan] = ac_to_integer(&ctx->ac, out[chan]); - out[chan] = LLVMBuildUIToFP(ctx->ac.builder, - out[chan], ctx->f32, ""); - } - break; - case SI_FIX_FETCH_RGBA_32_SSCALED: - for (chan = 0; chan < 4; chan++) { - out[chan] = ac_to_integer(&ctx->ac, out[chan]); - out[chan] = LLVMBuildSIToFP(ctx->ac.builder, - out[chan], ctx->f32, ""); - } - break; - case SI_FIX_FETCH_RG_64_FLOAT: - for (chan = 0; chan < 2; chan++) - out[chan] = extract_double_to_float(ctx, input[0], chan); - - out[2] = LLVMConstReal(ctx->f32, 0); - out[3] = LLVMConstReal(ctx->f32, 1); - break; - case SI_FIX_FETCH_RGB_64_FLOAT: - for (chan = 0; chan < 3; chan++) - out[chan] = extract_double_to_float(ctx, input[chan], 0); - - out[3] = LLVMConstReal(ctx->f32, 1); - break; - case SI_FIX_FETCH_RGBA_64_FLOAT: - for (chan = 0; chan < 4; chan++) { - out[chan] = extract_double_to_float(ctx, input[chan / 2], - chan % 2); - } - break; - case SI_FIX_FETCH_RGB_8: - case SI_FIX_FETCH_RGB_8_INT: - case SI_FIX_FETCH_RGB_16: - case SI_FIX_FETCH_RGB_16_INT: - for (chan = 0; chan < 3; chan++) { - out[chan] = LLVMBuildExtractElement(ctx->ac.builder, - input[chan], - ctx->i32_0, ""); - } - if (fix_fetch == SI_FIX_FETCH_RGB_8 || - fix_fetch == SI_FIX_FETCH_RGB_16) { - out[3] = LLVMConstReal(ctx->f32, 1); - } else { - out[3] = ac_to_float(&ctx->ac, ctx->i32_1); - } - break; - } + for (unsigned i = 0; i < 4; ++i) + out[i] = ac_to_float(&ctx->ac, fetches[i]); } static void declare_input_vs( @@ -5777,9 +5683,18 @@ static void si_dump_shader_key_vs(const struct si_shader_key *key, fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix); + fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode); fprintf(f, " mono.vs.fix_fetch = {"); - for (int i = 0; i < SI_MAX_ATTRIBS; i++) - fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]); + for (int i = 0; i < SI_MAX_ATTRIBS; i++) { + union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i]; + if (i) + fprintf(f, ", "); + if (!fix.bits) + fprintf(f, "0"); + else + fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, + fix.u.num_channels_m1, fix.u.format); + } fprintf(f, "}\n"); } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 71ce27b2f55..82c521efcb7 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -273,27 +273,24 @@ enum { SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9, }; -/* For VS shader key fix_fetch. */ -enum { - SI_FIX_FETCH_NONE = 0, - SI_FIX_FETCH_A2_SNORM, - SI_FIX_FETCH_A2_SSCALED, - SI_FIX_FETCH_A2_SINT, - SI_FIX_FETCH_RGBA_32_UNORM, - SI_FIX_FETCH_RGBX_32_UNORM, - SI_FIX_FETCH_RGBA_32_SNORM, - SI_FIX_FETCH_RGBX_32_SNORM, - SI_FIX_FETCH_RGBA_32_USCALED, - SI_FIX_FETCH_RGBA_32_SSCALED, - SI_FIX_FETCH_RGBA_32_FIXED, - SI_FIX_FETCH_RGBX_32_FIXED, - SI_FIX_FETCH_RG_64_FLOAT, - SI_FIX_FETCH_RGB_64_FLOAT, - SI_FIX_FETCH_RGBA_64_FLOAT, - SI_FIX_FETCH_RGB_8, /* A = 1.0 */ - SI_FIX_FETCH_RGB_8_INT, /* A = 1 */ - SI_FIX_FETCH_RGB_16, - SI_FIX_FETCH_RGB_16_INT, +/** + * For VS shader keys, describe any fixups required for vertex fetch. + * + * \ref log_size, \ref format, and the number of channels are interpreted as + * by \ref ac_build_opencoded_load_format. 
+ * + * Note: all bits 0 (size = 1 byte, num channels = 1, format = float) is an + * impossible format and indicates that no fixup is needed (just use + * buffer_load_format_xyzw). + */ +union si_vs_fix_fetch { + struct { + uint8_t log_size : 2; /* 1, 2, 4 or 8 bytes per channel */ + uint8_t num_channels_m1 : 2; /* number of channels minus 1 */ + uint8_t format : 3; /* AC_FETCH_FORMAT_xxx */ + uint8_t reverse : 1; /* reverse XYZ channels */ + } u; + uint8_t bits; }; struct si_shader; @@ -524,8 +521,11 @@ struct si_shader_key { /* Flags for monolithic compilation only. */ struct { - /* One byte for every input: SI_FIX_FETCH_* enums. */ - uint8_t vs_fix_fetch[SI_MAX_ATTRIBS]; + /* Whether fetch should be opencoded according to vs_fix_fetch. + * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw + * with minimal fixups is used. */ + uint16_t vs_fetch_opencode; + union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS]; union { uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */ diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 876a993b158..55965bc86a1 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -4459,10 +4459,8 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, for (i = 0; i < count; ++i) { const struct util_format_description *desc; const struct util_format_channel_description *channel; - unsigned data_format, num_format; int first_non_void; unsigned vbo_index = elements[i].vertex_buffer_index; - unsigned char swizzle[4]; if (vbo_index >= SI_NUM_VERTEX_BUFFERS) { FREE(v); @@ -4489,105 +4487,137 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, desc = util_format_description(elements[i].src_format); first_non_void = util_format_get_first_non_void_channel(elements[i].src_format); - data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); - num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL; - memcpy(swizzle, desc->swizzle, sizeof(swizzle)); v->format_size[i] = desc->block.bits / 8; v->src_offset[i] = elements[i].src_offset; v->vertex_buffer_index[i] = vbo_index; - /* The hardware always treats the 2-bit alpha channel as - * unsigned, so a shader workaround is needed. The affected - * chips are VI and older except Stoney (GFX8.1). - */ - if (data_format == V_008F0C_BUF_DATA_FORMAT_2_10_10_10 && - sscreen->info.chip_class <= VI && - sscreen->info.family != CHIP_STONEY) { - if (num_format == V_008F0C_BUF_NUM_FORMAT_SNORM) { - v->fix_fetch[i] = SI_FIX_FETCH_A2_SNORM; - } else if (num_format == V_008F0C_BUF_NUM_FORMAT_SSCALED) { - v->fix_fetch[i] = SI_FIX_FETCH_A2_SSCALED; - } else if (num_format == V_008F0C_BUF_NUM_FORMAT_SINT) { - /* This isn't actually used in OpenGL.
*/ - v->fix_fetch[i] = SI_FIX_FETCH_A2_SINT; - } - } else if (channel && channel->type == UTIL_FORMAT_TYPE_FIXED) { - if (desc->swizzle[3] == PIPE_SWIZZLE_1) - v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_FIXED; - else - v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_FIXED; - } else if (channel && channel->size == 32 && !channel->pure_integer) { - if (channel->type == UTIL_FORMAT_TYPE_SIGNED) { - if (channel->normalized) { - if (desc->swizzle[3] == PIPE_SWIZZLE_1) - v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_SNORM; - else - v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_SNORM; - } else { - v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_SSCALED; - } - } else if (channel->type == UTIL_FORMAT_TYPE_UNSIGNED) { - if (channel->normalized) { - if (desc->swizzle[3] == PIPE_SWIZZLE_1) - v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_UNORM; - else - v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_UNORM; - } else { - v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_USCALED; - } - } - } else if (channel && channel->size == 64 && - channel->type == UTIL_FORMAT_TYPE_FLOAT) { - switch (desc->nr_channels) { - case 1: - case 2: - v->fix_fetch[i] = SI_FIX_FETCH_RG_64_FLOAT; - swizzle[0] = PIPE_SWIZZLE_X; - swizzle[1] = PIPE_SWIZZLE_Y; - swizzle[2] = desc->nr_channels == 2 ? PIPE_SWIZZLE_Z : PIPE_SWIZZLE_0; - swizzle[3] = desc->nr_channels == 2 ? PIPE_SWIZZLE_W : PIPE_SWIZZLE_0; - break; - case 3: - v->fix_fetch[i] = SI_FIX_FETCH_RGB_64_FLOAT; - swizzle[0] = PIPE_SWIZZLE_X; /* 3 loads */ - swizzle[1] = PIPE_SWIZZLE_Y; - swizzle[2] = PIPE_SWIZZLE_0; - swizzle[3] = PIPE_SWIZZLE_0; - break; - case 4: - v->fix_fetch[i] = SI_FIX_FETCH_RGBA_64_FLOAT; - swizzle[0] = PIPE_SWIZZLE_X; /* 2 loads */ - swizzle[1] = PIPE_SWIZZLE_Y; - swizzle[2] = PIPE_SWIZZLE_Z; - swizzle[3] = PIPE_SWIZZLE_W; - break; - default: - assert(0); - } - } else if (channel && desc->nr_channels == 3) { - assert(desc->swizzle[0] == PIPE_SWIZZLE_X); + bool always_fix = false; + union si_vs_fix_fetch fix_fetch; + unsigned log_hw_load_size; /* the load element size as seen by the hardware */ - if (channel->size == 8) { + fix_fetch.bits = 0; + log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3); + + if (channel) { + switch (channel->type) { + case UTIL_FORMAT_TYPE_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break; + case UTIL_FORMAT_TYPE_FIXED: fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; break; + case UTIL_FORMAT_TYPE_SIGNED: { if (channel->pure_integer) - v->fix_fetch[i] = SI_FIX_FETCH_RGB_8_INT; + fix_fetch.u.format = AC_FETCH_FORMAT_SINT; + else if (channel->normalized) + fix_fetch.u.format = AC_FETCH_FORMAT_SNORM; else - v->fix_fetch[i] = SI_FIX_FETCH_RGB_8; - } else if (channel->size == 16) { + fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED; + break; + } + case UTIL_FORMAT_TYPE_UNSIGNED: { if (channel->pure_integer) - v->fix_fetch[i] = SI_FIX_FETCH_RGB_16_INT; + fix_fetch.u.format = AC_FETCH_FORMAT_UINT; + else if (channel->normalized) + fix_fetch.u.format = AC_FETCH_FORMAT_UNORM; else - v->fix_fetch[i] = SI_FIX_FETCH_RGB_16; + fix_fetch.u.format = AC_FETCH_FORMAT_USCALED; + break; + } + default: unreachable("bad format type"); + } + } else { + switch (elements[i].src_format) { + case PIPE_FORMAT_R11G11B10_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break; + default: unreachable("bad other format"); } } - v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(swizzle[0])) | - S_008F0C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | - S_008F0C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | - S_008F0C_DST_SEL_W(si_map_swizzle(swizzle[3])) | - S_008F0C_NUM_FORMAT(num_format) | - 
S_008F0C_DATA_FORMAT(data_format); + if (desc->channel[0].size == 10) { + fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */ + log_hw_load_size = 2; + + /* The hardware always treats the 2-bit alpha channel as + * unsigned, so a shader workaround is needed. The affected + * chips are VI and older except Stoney (GFX8.1). + */ + always_fix = sscreen->info.chip_class <= VI && + sscreen->info.family != CHIP_STONEY && + channel->type == UTIL_FORMAT_TYPE_SIGNED; + } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) { + fix_fetch.u.log_size = 3; /* special encoding */ + fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; + log_hw_load_size = 2; + } else { + fix_fetch.u.log_size = util_logbase2(channel->size) - 3; + fix_fetch.u.num_channels_m1 = desc->nr_channels - 1; + + /* Always fix up: + * - doubles (multiple loads + truncate to float) + * - 32-bit requiring a conversion + */ + always_fix = + (fix_fetch.u.log_size == 3) || + (fix_fetch.u.log_size == 2 && + fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT && + fix_fetch.u.format != AC_FETCH_FORMAT_UINT && + fix_fetch.u.format != AC_FETCH_FORMAT_SINT); + + /* Also fixup 8_8_8 and 16_16_16. */ + if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) { + always_fix = true; + log_hw_load_size = fix_fetch.u.log_size; + } + } + + if (desc->swizzle[0] != PIPE_SWIZZLE_X) { + assert(desc->swizzle[0] == PIPE_SWIZZLE_Z && + (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0)); + fix_fetch.u.reverse = 1; + } + + /* Force the workaround for unaligned access here already if the + * offset relative to the vertex buffer base is unaligned. + * + * There is a theoretical case in which this is too conservative: + * if the vertex buffer's offset is also unaligned in just the + * right way, we end up with an aligned address after all. + * However, this case should be extremely rare in practice (it + * won't happen in well-behaved applications), and taking it + * into account would complicate the fast path (where everything + * is nicely aligned). 
+ */ + bool check_alignment = log_hw_load_size >= 1 && sscreen->info.chip_class == SI; + bool opencode = sscreen->options.vs_fetch_always_opencode; + + if (check_alignment && + (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0) + opencode = true; + + if (always_fix || check_alignment || opencode) + v->fix_fetch[i] = fix_fetch.bits; + + if (opencode) + v->fix_fetch_opencode |= 1 << i; + if (opencode || always_fix) + v->fix_fetch_always |= 1 << i; + + if (check_alignment && !opencode) { + assert(log_hw_load_size == 1 || log_hw_load_size == 2); + + v->fix_fetch_unaligned |= 1 << i; + v->hw_load_is_dword |= (log_hw_load_size - 1) << i; + v->vb_alignment_check_mask |= 1 << vbo_index; + } + + v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | + S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | + S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | + S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); + + unsigned data_format, num_format; + data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); + num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); + v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) | + S_008F0C_DATA_FORMAT(data_format); } if (v->instance_divisor_is_fetched) { @@ -4621,7 +4651,17 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) (!old || old->count != v->count || old->uses_instance_divisors != v->uses_instance_divisors || - v->uses_instance_divisors || /* we don't check which divisors changed */ + /* we don't check which divisors changed */ + v->uses_instance_divisors || + (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & sctx->vertex_buffer_unaligned || + ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) && + memcmp(old->vertex_buffer_index, v->vertex_buffer_index, + sizeof(v->vertex_buffer_index[0]) * v->count)) || + /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are + * functions of fix_fetch and the src_offset alignment. + * If they change and fix_fetch doesn't, it must be due to different + * src_offset alignment, which is reflected in fix_fetch_opencode. 
*/ + old->fix_fetch_opencode != v->fix_fetch_opencode || memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count))) sctx->do_update_shaders = true; @@ -4653,6 +4693,8 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, { struct si_context *sctx = (struct si_context *)ctx; struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot; + uint32_t orig_unaligned = sctx->vertex_buffer_unaligned; + uint32_t unaligned = orig_unaligned; int i; assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer)); @@ -4666,6 +4708,11 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, pipe_resource_reference(&dsti->buffer.resource, buf); dsti->buffer_offset = src->buffer_offset; dsti->stride = src->stride; + if (dsti->buffer_offset & 3 || dsti->stride & 3) + unaligned |= 1 << (start_slot + i); + else + unaligned &= ~(1 << (start_slot + i)); + si_context_add_resource_size(sctx, buf); if (buf) si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER; @@ -4674,8 +4721,22 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, for (i = 0; i < count; i++) { pipe_resource_reference(&dst[i].buffer.resource, NULL); } + unaligned &= ~u_bit_consecutive(start_slot, count); } sctx->vertex_buffers_dirty = true; + sctx->vertex_buffer_unaligned = unaligned; + + /* Check whether alignment may have changed in a way that requires + * shader changes. This check is conservative: a vertex buffer can only + * trigger a shader change if the misalignment amount changes (e.g. + * from byte-aligned to short-aligned), but we only keep track of + * whether buffers are at least dword-aligned, since that should always + * be the case in well-behaved applications anyway. + */ + if (sctx->vertex_elements && + (sctx->vertex_elements->vb_alignment_check_mask & + (unaligned | orig_unaligned) & u_bit_consecutive(start_slot, count))) + sctx->do_update_shaders = true; } /* diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 6df24f9648a..e4b1cf79132 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -139,6 +139,25 @@ struct si_vertex_elements uint8_t format_size[SI_MAX_ATTRIBS]; uint8_t vertex_buffer_index[SI_MAX_ATTRIBS]; + /* Bitmask of elements that always need a fixup to be applied. */ + uint16_t fix_fetch_always; + + /* Bitmask of elements whose fetch should always be opencoded. */ + uint16_t fix_fetch_opencode; + + /* Bitmask of elements which need to be opencoded if the vertex buffer + * is unaligned. */ + uint16_t fix_fetch_unaligned; + + /* For elements in fix_fetch_unaligned: whether the effective + * element load size as seen by the hardware is a dword (as opposed + * to a short). 
+ */ + uint16_t hw_load_is_dword; + + /* Bitmask of vertex buffers requiring alignment check */ + uint16_t vb_alignment_check_mask; + uint8_t count; bool uses_instance_divisors; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index a9c5c7b9e4d..51a3af92d0c 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1390,7 +1390,31 @@ static void si_shader_selector_key_vs(struct si_context *sctx, key->opt.prefer_mono = 1; unsigned count = MIN2(vs->info.num_inputs, elts->count); - memcpy(key->mono.vs_fix_fetch, elts->fix_fetch, count); + unsigned count_mask = (1 << count) - 1; + unsigned fix = elts->fix_fetch_always & count_mask; + unsigned opencode = elts->fix_fetch_opencode & count_mask; + + if (sctx->vertex_buffer_unaligned & elts->vb_alignment_check_mask) { + uint32_t mask = elts->fix_fetch_unaligned & count_mask; + while (mask) { + unsigned i = u_bit_scan(&mask); + unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1); + unsigned vbidx = elts->vertex_buffer_index[i]; + struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbidx]; + unsigned align_mask = (1 << log_hw_load_size) - 1; + if (vb->buffer_offset & align_mask || + vb->stride & align_mask) { + fix |= 1 << i; + opencode |= 1 << i; + } + } + } + + while (fix) { + unsigned i = u_bit_scan(&fix); + key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i]; + } + key->mono.vs_fetch_opencode = opencode; } static void si_shader_selector_key_hw_vs(struct si_context *sctx, -- 2.30.2
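
For reference, a minimal self-contained sketch of the si_vs_fix_fetch encoding introduced by this patch. The union layout and field names are taken from si_shader.h above; the AC_FETCH_FORMAT_* numbering is an assumption for illustration (the real values live in amd/common next to ac_build_opencoded_load_format), and the demo program is not part of the driver.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the AC_FETCH_FORMAT_* values used by
 * ac_build_opencoded_load_format; the real definitions live in amd/common
 * and their numbering may differ. FLOAT is assumed to be 0 so that an
 * all-zero fix_fetch ("1-byte float, 1 channel") is an impossible format,
 * which the patch uses to mean "no fixup needed". */
enum {
   AC_FETCH_FORMAT_FLOAT = 0,
   AC_FETCH_FORMAT_FIXED,
   AC_FETCH_FORMAT_UNORM,
   AC_FETCH_FORMAT_SNORM,
   AC_FETCH_FORMAT_USCALED,
   AC_FETCH_FORMAT_SSCALED,
   AC_FETCH_FORMAT_UINT,
   AC_FETCH_FORMAT_SINT,
};

/* Same layout as the union this patch adds to si_shader.h. */
union si_vs_fix_fetch {
   struct {
      uint8_t log_size : 2;        /* bytes per channel: 1 << log_size */
      uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
      uint8_t format : 3;          /* AC_FETCH_FORMAT_xxx */
      uint8_t reverse : 1;         /* reverse XYZ channels */
   } u;
   uint8_t bits;
};

int main(void)
{
   /* Example: a 3 x 16-bit SNORM attribute (e.g. R16G16B16_SNORM). */
   union si_vs_fix_fetch fix = {{0}};
   fix.u.log_size = 1;        /* 2 bytes per channel */
   fix.u.num_channels_m1 = 2; /* 3 channels */
   fix.u.format = AC_FETCH_FORMAT_SNORM;

   printf("encoded fix_fetch byte: 0x%02x\n", fix.bits);

   /* bits == 0 is the "no fixup" marker: just use buffer_load_format_xyzw. */
   union si_vs_fix_fetch none = {.bits = 0};
   printf("needs fixup: %s\n", none.bits ? "yes" : "no");
   return 0;
}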
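Also for reference, a sketch of the alignment fast path described in the commit message and implemented in si_shader_selector_key_vs. The mask and field names (vb_alignment_check_mask, fix_fetch_unaligned, hw_load_is_dword, vertex_buffer_unaligned) mirror the patch; the struct shapes, the helper itself, and __builtin_ctz (standing in for u_bit_scan) are simplifying assumptions, not the driver's actual types.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the driver state. */
struct vb_state {
   uint32_t buffer_offset;
   uint32_t stride;
};

struct velems_state {
   unsigned count;
   uint16_t fix_fetch_always;        /* elements that always need a fixup */
   uint16_t fix_fetch_opencode;      /* elements that must always be open-coded */
   uint16_t fix_fetch_unaligned;     /* elements to open-code when unaligned */
   uint16_t hw_load_is_dword;        /* per element: hw load is dword vs. short */
   uint16_t vb_alignment_check_mask; /* vertex buffers that need an alignment check */
   uint8_t vertex_buffer_index[16];
};

/* vertex_buffer_unaligned has bit i set when vertex buffer i has an offset or
 * stride that is not dword-aligned (the patch maintains this bitmask in
 * si_set_vertex_buffers). */
static void compute_fetch_masks(const struct velems_state *elts,
                                const struct vb_state *vb,
                                uint16_t vertex_buffer_unaligned,
                                uint16_t *fix, uint16_t *opencode)
{
   uint16_t count_mask = (uint16_t)((1u << elts->count) - 1);

   *fix = elts->fix_fetch_always & count_mask;
   *opencode = elts->fix_fetch_opencode & count_mask;

   /* Fast early-out: if every buffer this element state cares about is at
    * least dword-aligned, no per-element alignment check is needed. */
   if (!(vertex_buffer_unaligned & elts->vb_alignment_check_mask))
      return;

   uint16_t mask = elts->fix_fetch_unaligned & count_mask;
   while (mask) {
      unsigned i = (unsigned)__builtin_ctz(mask); /* like u_bit_scan in the patch */
      mask &= mask - 1;

      /* The hardware load is a short or a dword; anything less aligned than
       * that falls back to the open-coded (unaligned-capable) path. */
      unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1);
      unsigned align_mask = (1u << log_hw_load_size) - 1;
      const struct vb_state *b = &vb[elts->vertex_buffer_index[i]];

      if ((b->buffer_offset & align_mask) || (b->stride & align_mask)) {
         *fix |= (uint16_t)(1u << i);
         *opencode |= (uint16_t)(1u << i);
      }
   }
}

int main(void)
{
   /* One element reading dwords from vertex buffer 0, which is misaligned. */
   struct velems_state elts = {
      .count = 1,
      .fix_fetch_unaligned = 0x1,
      .hw_load_is_dword = 0x1,
      .vb_alignment_check_mask = 0x1,
      .vertex_buffer_index = {0},
   };
   struct vb_state vb[1] = {{.buffer_offset = 2, .stride = 14}};
   uint16_t fix, opencode;

   compute_fetch_masks(&elts, vb, /*vertex_buffer_unaligned=*/0x1, &fix, &opencode);
   printf("fix mask 0x%x, opencode mask 0x%x\n", fix, opencode);
   return 0;
}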