OPT_BOOL(enable_nir, false, "Enable NIR")
OPT_BOOL(aux_debug, false, "Generate ddebug_dumps for the auxiliary context")
OPT_BOOL(sync_compile, false, "Always compile synchronously (will cause stalls)")
+OPT_BOOL(vs_fetch_always_opencode, false, "Always open-code vertex fetches (less efficient, purely for testing)")
#undef OPT_BOOL
case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
- return !sscreen->info.has_unaligned_shader_loads;
+ return HAVE_LLVM < 0x0900 && !sscreen->info.has_unaligned_shader_loads;
case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
return sscreen->info.has_sparse_vm_mappings ?
bool vertex_buffers_dirty;
bool vertex_buffer_pointer_dirty;
struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS];
+ uint16_t vertex_buffer_unaligned; /* bitmask of buffers that are not dword-aligned */
/* MSAA config state. */
int ps_iter_samples;
}
}
-/* Bitcast <4 x float> to <2 x double>, extract the component, and convert
- * to float. */
-static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
- LLVMValueRef vec4,
- unsigned double_index)
-{
- LLVMBuilderRef builder = ctx->ac.builder;
- LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context);
- LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
- LLVMVectorType(f64, 2), "");
- LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
- LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
- return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
-}
-
static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
LLVMValueRef i32, unsigned index)
{
return;
}
- unsigned chan;
- unsigned fix_fetch;
- unsigned num_fetches;
- unsigned fetch_stride;
- unsigned num_channels;
-
+ union si_vs_fix_fetch fix_fetch;
LLVMValueRef t_list_ptr;
LLVMValueRef t_offset;
LLVMValueRef t_list;
LLVMValueRef vertex_index;
- LLVMValueRef input[3];
+ LLVMValueRef tmp;
/* Load the T list */
t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);
ctx->param_vertex_index0 +
input_index);
- fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];
+ /* Use the open-coded implementation for all loads of doubles and
+ * of dword-sized data that needs fixups. We need to insert conversion
+ * code anyway, and the amd/common code does it for us.
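+ * For example, 64-bit float attributes (log_size == 3, format FLOAT) and
+ * all dword-sized fixups (log_size == 2) are handled here.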
+ *
+ * Note: On LLVM <= 8, we can only open-code formats with
+ * channel size >= 4 bytes.
+ */
+ bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
+ fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
+ if (opencode ||
+ (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
+ (fix_fetch.u.log_size == 2)) {
+ tmp = ac_build_opencoded_load_format(
+ &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1,
+ fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
+ t_list, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0,
+ false, false, true);
+ for (unsigned i = 0; i < 4; ++i)
+ out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->i32, i, false), "");
+ return;
+ }
/* Do multiple loads for special formats. */
- switch (fix_fetch) {
- case SI_FIX_FETCH_RG_64_FLOAT:
- num_fetches = 1; /* 1 2-dword or 4-dword load */
- fetch_stride = 0;
- if (util_last_bit(info->input_usage_mask[input_index]) >= 2)
- num_channels = 4; /* 2 doubles in 4 dwords */
- else
- num_channels = 2; /* 1 double in 2 dwords */
- break;
- case SI_FIX_FETCH_RGB_64_FLOAT:
- num_fetches = 3; /* 3 2-dword loads */
- fetch_stride = 8;
- num_channels = 2;
- break;
- case SI_FIX_FETCH_RGBA_64_FLOAT:
- num_fetches = 2; /* 2 4-dword loads */
- fetch_stride = 16;
- num_channels = 4;
- break;
- case SI_FIX_FETCH_RGB_8:
- case SI_FIX_FETCH_RGB_8_INT:
- num_fetches = 3;
- fetch_stride = 1;
- num_channels = 1;
- break;
- case SI_FIX_FETCH_RGB_16:
- case SI_FIX_FETCH_RGB_16_INT:
- num_fetches = 3;
- fetch_stride = 2;
- num_channels = 1;
- break;
- default:
+ unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
+ LLVMValueRef fetches[4];
+ unsigned num_fetches;
+ unsigned fetch_stride;
+ unsigned channels_per_fetch;
+
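+ /* 8_8_8 and 16_16_16 have no direct hardware buffer format, so fetch them
+ * as up to 3 single-channel loads; everything else needs only one fetch. */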
+ if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
+ num_fetches = MIN2(required_channels, 3);
+ fetch_stride = 1 << fix_fetch.u.log_size;
+ channels_per_fetch = 1;
+ } else {
num_fetches = 1;
fetch_stride = 0;
- num_channels = util_last_bit(info->input_usage_mask[input_index]);
+ channels_per_fetch = required_channels;
}
- for (unsigned i = 0; i < num_fetches; i++) {
+ for (unsigned i = 0; i < num_fetches; ++i) {
LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
-
- input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
- vertex_index, voffset,
- num_channels, false, true);
- input[i] = ac_build_expand_to_vec4(&ctx->ac, input[i], num_channels);
+ fetches[i] = ac_build_buffer_load_format(&ctx->ac, t_list, vertex_index, voffset,
+ channels_per_fetch, false, true);
}
- /* Break up the vec4 into individual components */
- for (chan = 0; chan < 4; chan++) {
- LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
- out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
- input[0], llvm_chan, "");
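+ /* If a single fetch returned a vector, split it into per-channel scalars
+ * so the fixup code below can treat all cases uniformly. */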
+ if (num_fetches == 1 && channels_per_fetch > 1) {
+ LLVMValueRef fetch = fetches[0];
+ for (unsigned i = 0; i < channels_per_fetch; ++i) {
+ tmp = LLVMConstInt(ctx->i32, i, false);
+ fetches[i] = LLVMBuildExtractElement(
+ ctx->ac.builder, fetch, tmp, "");
+ }
+ num_fetches = channels_per_fetch;
+ channels_per_fetch = 1;
}
- switch (fix_fetch) {
- case SI_FIX_FETCH_A2_SNORM:
- case SI_FIX_FETCH_A2_SSCALED:
- case SI_FIX_FETCH_A2_SINT: {
- /* The hardware returns an unsigned value; convert it to a
- * signed one.
+ for (unsigned i = num_fetches; i < 4; ++i)
+ fetches[i] = LLVMGetUndef(ctx->f32);
+
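+ /* 3-channel 8/16-bit formats have no alpha in memory; supply the default
+ * alpha (integer 1 for UINT/SINT, 1.0 otherwise). */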
+ if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 &&
+ required_channels == 4) {
+ if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
+ fetches[3] = ctx->ac.i32_1;
+ else
+ fetches[3] = ctx->ac.f32_1;
+ } else if (fix_fetch.u.log_size == 3 &&
+ (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
+ fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
+ fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
+ required_channels == 4) {
+ /* For 2_10_10_10, the hardware returns an unsigned value;
+ * convert it to a signed one.
*/
- LLVMValueRef tmp = out[3];
+ LLVMValueRef tmp = fetches[3];
LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
/* First, recover the sign-extended signed integer value. */
- if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
+ if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, "");
else
tmp = ac_to_integer(&ctx->ac, tmp);
* exponent.
*/
tmp = LLVMBuildShl(ctx->ac.builder, tmp,
- fix_fetch == SI_FIX_FETCH_A2_SNORM ?
+ fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ?
LLVMConstInt(ctx->i32, 7, 0) : c30, "");
tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
/* Convert back to the right type. */
- if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
+ if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
LLVMValueRef clamp;
LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
- } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
+ } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
}
- out[3] = tmp;
- break;
+ fetches[3] = tmp;
}
- case SI_FIX_FETCH_RGBA_32_UNORM:
- case SI_FIX_FETCH_RGBX_32_UNORM:
- for (chan = 0; chan < 4; chan++) {
- out[chan] = ac_to_integer(&ctx->ac, out[chan]);
- out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
- out[chan], ctx->f32, "");
- out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
- LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
- }
- /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
- if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
- out[3] = LLVMConstReal(ctx->f32, 1);
- break;
- case SI_FIX_FETCH_RGBA_32_SNORM:
- case SI_FIX_FETCH_RGBX_32_SNORM:
- case SI_FIX_FETCH_RGBA_32_FIXED:
- case SI_FIX_FETCH_RGBX_32_FIXED: {
- double scale;
- if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
- scale = 1.0 / 0x10000;
- else
- scale = 1.0 / INT_MAX;
- for (chan = 0; chan < 4; chan++) {
- out[chan] = ac_to_integer(&ctx->ac, out[chan]);
- out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
- out[chan], ctx->f32, "");
- out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
- LLVMConstReal(ctx->f32, scale), "");
- }
- /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
- if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
- fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
- out[3] = LLVMConstReal(ctx->f32, 1);
- break;
- }
- case SI_FIX_FETCH_RGBA_32_USCALED:
- for (chan = 0; chan < 4; chan++) {
- out[chan] = ac_to_integer(&ctx->ac, out[chan]);
- out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
- out[chan], ctx->f32, "");
- }
- break;
- case SI_FIX_FETCH_RGBA_32_SSCALED:
- for (chan = 0; chan < 4; chan++) {
- out[chan] = ac_to_integer(&ctx->ac, out[chan]);
- out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
- out[chan], ctx->f32, "");
- }
- break;
- case SI_FIX_FETCH_RG_64_FLOAT:
- for (chan = 0; chan < 2; chan++)
- out[chan] = extract_double_to_float(ctx, input[0], chan);
-
- out[2] = LLVMConstReal(ctx->f32, 0);
- out[3] = LLVMConstReal(ctx->f32, 1);
- break;
- case SI_FIX_FETCH_RGB_64_FLOAT:
- for (chan = 0; chan < 3; chan++)
- out[chan] = extract_double_to_float(ctx, input[chan], 0);
-
- out[3] = LLVMConstReal(ctx->f32, 1);
- break;
- case SI_FIX_FETCH_RGBA_64_FLOAT:
- for (chan = 0; chan < 4; chan++) {
- out[chan] = extract_double_to_float(ctx, input[chan / 2],
- chan % 2);
- }
- break;
- case SI_FIX_FETCH_RGB_8:
- case SI_FIX_FETCH_RGB_8_INT:
- case SI_FIX_FETCH_RGB_16:
- case SI_FIX_FETCH_RGB_16_INT:
- for (chan = 0; chan < 3; chan++) {
- out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
- input[chan],
- ctx->i32_0, "");
- }
- if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
- fix_fetch == SI_FIX_FETCH_RGB_16) {
- out[3] = LLVMConstReal(ctx->f32, 1);
- } else {
- out[3] = ac_to_float(&ctx->ac, ctx->i32_1);
- }
- break;
- }
+ for (unsigned i = 0; i < 4; ++i)
+ out[i] = ac_to_float(&ctx->ac, fetches[i]);
}
static void declare_input_vs(
fprintf(f, " %s.ls_vgpr_fix = %u\n",
prefix, prolog->ls_vgpr_fix);
+ fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
fprintf(f, " mono.vs.fix_fetch = {");
- for (int i = 0; i < SI_MAX_ATTRIBS; i++)
- fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
+ for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
+ union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
+ if (i)
+ fprintf(f, ", ");
+ if (!fix.bits)
+ fprintf(f, "0");
+ else
+ fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size,
+ fix.u.num_channels_m1, fix.u.format);
+ }
fprintf(f, "}\n");
}
SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
};
-/* For VS shader key fix_fetch. */
-enum {
- SI_FIX_FETCH_NONE = 0,
- SI_FIX_FETCH_A2_SNORM,
- SI_FIX_FETCH_A2_SSCALED,
- SI_FIX_FETCH_A2_SINT,
- SI_FIX_FETCH_RGBA_32_UNORM,
- SI_FIX_FETCH_RGBX_32_UNORM,
- SI_FIX_FETCH_RGBA_32_SNORM,
- SI_FIX_FETCH_RGBX_32_SNORM,
- SI_FIX_FETCH_RGBA_32_USCALED,
- SI_FIX_FETCH_RGBA_32_SSCALED,
- SI_FIX_FETCH_RGBA_32_FIXED,
- SI_FIX_FETCH_RGBX_32_FIXED,
- SI_FIX_FETCH_RG_64_FLOAT,
- SI_FIX_FETCH_RGB_64_FLOAT,
- SI_FIX_FETCH_RGBA_64_FLOAT,
- SI_FIX_FETCH_RGB_8, /* A = 1.0 */
- SI_FIX_FETCH_RGB_8_INT, /* A = 1 */
- SI_FIX_FETCH_RGB_16,
- SI_FIX_FETCH_RGB_16_INT,
+/**
+ * For VS shader keys, describe any fixups required for vertex fetch.
+ *
+ * \ref log_size, \ref format, and the number of channels are interpreted as
+ * in \ref ac_build_opencoded_load_format.
+ *
+ * Note: all bits 0 (size = 1 byte, num channels = 1, format = float) is an
+ * impossible format and indicates that no fixup is needed (just use
+ * buffer_load_format_xyzw).
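+ *
+ * Example (illustrative): PIPE_FORMAT_R16G16B16_SNORM would be encoded as
+ * log_size = 1, num_channels_m1 = 2, format = AC_FETCH_FORMAT_SNORM,
+ * reverse = 0.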
+ */
+union si_vs_fix_fetch {
+ struct {
+ uint8_t log_size : 2; /* 1, 2, 4 or 8 bytes per channel */
+ uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
+ uint8_t format : 3; /* AC_FETCH_FORMAT_xxx */
+ uint8_t reverse : 1; /* reverse XYZ channels */
+ } u;
+ uint8_t bits;
};
struct si_shader;
/* Flags for monolithic compilation only. */
struct {
- /* One byte for every input: SI_FIX_FETCH_* enums. */
- uint8_t vs_fix_fetch[SI_MAX_ATTRIBS];
+ /* Whether fetch should be opencoded according to vs_fix_fetch.
+ * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
+ * with minimal fixups is used. */
+ uint16_t vs_fetch_opencode;
+ union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
union {
uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */
for (i = 0; i < count; ++i) {
const struct util_format_description *desc;
const struct util_format_channel_description *channel;
- unsigned data_format, num_format;
int first_non_void;
unsigned vbo_index = elements[i].vertex_buffer_index;
- unsigned char swizzle[4];
if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
FREE(v);
desc = util_format_description(elements[i].src_format);
first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
- data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
- num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL;
- memcpy(swizzle, desc->swizzle, sizeof(swizzle));
v->format_size[i] = desc->block.bits / 8;
v->src_offset[i] = elements[i].src_offset;
v->vertex_buffer_index[i] = vbo_index;
- /* The hardware always treats the 2-bit alpha channel as
- * unsigned, so a shader workaround is needed. The affected
- * chips are VI and older except Stoney (GFX8.1).
- */
- if (data_format == V_008F0C_BUF_DATA_FORMAT_2_10_10_10 &&
- sscreen->info.chip_class <= VI &&
- sscreen->info.family != CHIP_STONEY) {
- if (num_format == V_008F0C_BUF_NUM_FORMAT_SNORM) {
- v->fix_fetch[i] = SI_FIX_FETCH_A2_SNORM;
- } else if (num_format == V_008F0C_BUF_NUM_FORMAT_SSCALED) {
- v->fix_fetch[i] = SI_FIX_FETCH_A2_SSCALED;
- } else if (num_format == V_008F0C_BUF_NUM_FORMAT_SINT) {
- /* This isn't actually used in OpenGL. */
- v->fix_fetch[i] = SI_FIX_FETCH_A2_SINT;
- }
- } else if (channel && channel->type == UTIL_FORMAT_TYPE_FIXED) {
- if (desc->swizzle[3] == PIPE_SWIZZLE_1)
- v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_FIXED;
- else
- v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_FIXED;
- } else if (channel && channel->size == 32 && !channel->pure_integer) {
- if (channel->type == UTIL_FORMAT_TYPE_SIGNED) {
- if (channel->normalized) {
- if (desc->swizzle[3] == PIPE_SWIZZLE_1)
- v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_SNORM;
- else
- v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_SNORM;
- } else {
- v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_SSCALED;
- }
- } else if (channel->type == UTIL_FORMAT_TYPE_UNSIGNED) {
- if (channel->normalized) {
- if (desc->swizzle[3] == PIPE_SWIZZLE_1)
- v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_UNORM;
- else
- v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_UNORM;
- } else {
- v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_USCALED;
- }
- }
- } else if (channel && channel->size == 64 &&
- channel->type == UTIL_FORMAT_TYPE_FLOAT) {
- switch (desc->nr_channels) {
- case 1:
- case 2:
- v->fix_fetch[i] = SI_FIX_FETCH_RG_64_FLOAT;
- swizzle[0] = PIPE_SWIZZLE_X;
- swizzle[1] = PIPE_SWIZZLE_Y;
- swizzle[2] = desc->nr_channels == 2 ? PIPE_SWIZZLE_Z : PIPE_SWIZZLE_0;
- swizzle[3] = desc->nr_channels == 2 ? PIPE_SWIZZLE_W : PIPE_SWIZZLE_0;
- break;
- case 3:
- v->fix_fetch[i] = SI_FIX_FETCH_RGB_64_FLOAT;
- swizzle[0] = PIPE_SWIZZLE_X; /* 3 loads */
- swizzle[1] = PIPE_SWIZZLE_Y;
- swizzle[2] = PIPE_SWIZZLE_0;
- swizzle[3] = PIPE_SWIZZLE_0;
- break;
- case 4:
- v->fix_fetch[i] = SI_FIX_FETCH_RGBA_64_FLOAT;
- swizzle[0] = PIPE_SWIZZLE_X; /* 2 loads */
- swizzle[1] = PIPE_SWIZZLE_Y;
- swizzle[2] = PIPE_SWIZZLE_Z;
- swizzle[3] = PIPE_SWIZZLE_W;
- break;
- default:
- assert(0);
- }
- } else if (channel && desc->nr_channels == 3) {
- assert(desc->swizzle[0] == PIPE_SWIZZLE_X);
+ bool always_fix = false;
+ union si_vs_fix_fetch fix_fetch;
+ unsigned log_hw_load_size; /* the load element size as seen by the hardware */
- if (channel->size == 8) {
+ fix_fetch.bits = 0;
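+ /* For example, R8G8 loads as a short and R8G8B8A8 or R16G16 as a dword;
+ * anything larger counts as a dword load for alignment purposes. */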
+ log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3);
+
+ if (channel) {
+ switch (channel->type) {
+ case UTIL_FORMAT_TYPE_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break;
+ case UTIL_FORMAT_TYPE_FIXED: fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; break;
+ case UTIL_FORMAT_TYPE_SIGNED: {
if (channel->pure_integer)
- v->fix_fetch[i] = SI_FIX_FETCH_RGB_8_INT;
+ fix_fetch.u.format = AC_FETCH_FORMAT_SINT;
+ else if (channel->normalized)
+ fix_fetch.u.format = AC_FETCH_FORMAT_SNORM;
else
- v->fix_fetch[i] = SI_FIX_FETCH_RGB_8;
- } else if (channel->size == 16) {
+ fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED;
+ break;
+ }
+ case UTIL_FORMAT_TYPE_UNSIGNED: {
if (channel->pure_integer)
- v->fix_fetch[i] = SI_FIX_FETCH_RGB_16_INT;
+ fix_fetch.u.format = AC_FETCH_FORMAT_UINT;
+ else if (channel->normalized)
+ fix_fetch.u.format = AC_FETCH_FORMAT_UNORM;
else
- v->fix_fetch[i] = SI_FIX_FETCH_RGB_16;
+ fix_fetch.u.format = AC_FETCH_FORMAT_USCALED;
+ break;
+ }
+ default: unreachable("bad format type");
+ }
+ } else {
+ switch (elements[i].src_format) {
+ case PIPE_FORMAT_R11G11B10_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break;
+ default: unreachable("bad other format");
}
}
- v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
- S_008F0C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
- S_008F0C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
- S_008F0C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
- S_008F0C_NUM_FORMAT(num_format) |
- S_008F0C_DATA_FORMAT(data_format);
+ if (desc->channel[0].size == 10) {
+ fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */
+ log_hw_load_size = 2;
+
+ /* The hardware always treats the 2-bit alpha channel as
+ * unsigned, so a shader workaround is needed. The affected
+ * chips are VI and older except Stoney (GFX8.1).
+ */
+ always_fix = sscreen->info.chip_class <= VI &&
+ sscreen->info.family != CHIP_STONEY &&
+ channel->type == UTIL_FORMAT_TYPE_SIGNED;
+ } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) {
+ fix_fetch.u.log_size = 3; /* special encoding */
+ fix_fetch.u.format = AC_FETCH_FORMAT_FIXED;
+ log_hw_load_size = 2;
+ } else {
+ fix_fetch.u.log_size = util_logbase2(channel->size) - 3;
+ fix_fetch.u.num_channels_m1 = desc->nr_channels - 1;
+
+ /* Always fix up:
+ * - doubles (multiple loads + truncate to float)
+ * - 32-bit formats requiring a conversion
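+ *   (e.g. 32_UNORM, 32_SNORM, 32_FIXED, 32_USCALED and 32_SSCALED)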
+ */
+ always_fix =
+ (fix_fetch.u.log_size == 3) ||
+ (fix_fetch.u.log_size == 2 &&
+ fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT &&
+ fix_fetch.u.format != AC_FETCH_FORMAT_UINT &&
+ fix_fetch.u.format != AC_FETCH_FORMAT_SINT);
+
+ /* Also fixup 8_8_8 and 16_16_16. */
+ if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) {
+ always_fix = true;
+ log_hw_load_size = fix_fetch.u.log_size;
+ }
+ }
+
+ if (desc->swizzle[0] != PIPE_SWIZZLE_X) {
+ assert(desc->swizzle[0] == PIPE_SWIZZLE_Z &&
+ (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0));
+ fix_fetch.u.reverse = 1;
+ }
+
+ /* Force the workaround for unaligned access right away if the element's
+ * offset relative to the vertex buffer base is unaligned.
+ *
+ * There is a theoretical case in which this is too conservative:
+ * if the vertex buffer's offset is also unaligned in just the
+ * right way, we end up with an aligned address after all.
+ * However, this case should be extremely rare in practice (it
+ * won't happen in well-behaved applications), and taking it
+ * into account would complicate the fast path (where everything
+ * is nicely aligned).
+ */
+ bool check_alignment = log_hw_load_size >= 1 && sscreen->info.chip_class == SI;
+ bool opencode = sscreen->options.vs_fetch_always_opencode;
+
+ if (check_alignment &&
+ (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0)
+ opencode = true;
+
+ if (always_fix || check_alignment || opencode)
+ v->fix_fetch[i] = fix_fetch.bits;
+
+ if (opencode)
+ v->fix_fetch_opencode |= 1 << i;
+ if (opencode || always_fix)
+ v->fix_fetch_always |= 1 << i;
+
+ if (check_alignment && !opencode) {
+ assert(log_hw_load_size == 1 || log_hw_load_size == 2);
+
+ v->fix_fetch_unaligned |= 1 << i;
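+ /* log_hw_load_size is 1 (short) or 2 (dword) here, so one bit suffices. */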
+ v->hw_load_is_dword |= (log_hw_load_size - 1) << i;
+ v->vb_alignment_check_mask |= 1 << vbo_index;
+ }
+
+ v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
+ S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
+ S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
+ S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
+
+ unsigned data_format, num_format;
+ data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
+ num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
+ v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) |
+ S_008F0C_DATA_FORMAT(data_format);
}
if (v->instance_divisor_is_fetched) {
(!old ||
old->count != v->count ||
old->uses_instance_divisors != v->uses_instance_divisors ||
- v->uses_instance_divisors || /* we don't check which divisors changed */
+ /* we don't check which divisors changed */
+ v->uses_instance_divisors ||
+ (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & sctx->vertex_buffer_unaligned ||
+ ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) &&
+ memcmp(old->vertex_buffer_index, v->vertex_buffer_index,
+ sizeof(v->vertex_buffer_index[0]) * v->count)) ||
+ /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are
+ * functions of fix_fetch and the src_offset alignment.
+ * If they change and fix_fetch doesn't, it must be due to different
+ * src_offset alignment, which is reflected in fix_fetch_opencode. */
+ old->fix_fetch_opencode != v->fix_fetch_opencode ||
memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
sctx->do_update_shaders = true;
{
struct si_context *sctx = (struct si_context *)ctx;
struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot;
+ uint32_t orig_unaligned = sctx->vertex_buffer_unaligned;
+ uint32_t unaligned = orig_unaligned;
int i;
assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer));
pipe_resource_reference(&dsti->buffer.resource, buf);
dsti->buffer_offset = src->buffer_offset;
dsti->stride = src->stride;
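+ /* Track which vertex buffers are not dword-aligned in offset or stride;
+ * this feeds the vb_alignment_check_mask shader-change check below. */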
+ if (dsti->buffer_offset & 3 || dsti->stride & 3)
+ unaligned |= 1 << (start_slot + i);
+ else
+ unaligned &= ~(1 << (start_slot + i));
+
si_context_add_resource_size(sctx, buf);
if (buf)
si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
for (i = 0; i < count; i++) {
pipe_resource_reference(&dst[i].buffer.resource, NULL);
}
+ unaligned &= ~u_bit_consecutive(start_slot, count);
}
sctx->vertex_buffers_dirty = true;
+ sctx->vertex_buffer_unaligned = unaligned;
+
+ /* Check whether alignment may have changed in a way that requires
+ * shader changes. This check is conservative: a vertex buffer can only
+ * trigger a shader change if the misalignment amount changes (e.g.
+ * from byte-aligned to short-aligned), but we only keep track of
+ * whether buffers are at least dword-aligned, since that should always
+ * be the case in well-behaved applications anyway.
+ */
+ if (sctx->vertex_elements &&
+ (sctx->vertex_elements->vb_alignment_check_mask &
+ (unaligned | orig_unaligned) & u_bit_consecutive(start_slot, count)))
+ sctx->do_update_shaders = true;
}
/*
uint8_t format_size[SI_MAX_ATTRIBS];
uint8_t vertex_buffer_index[SI_MAX_ATTRIBS];
+ /* Bitmask of elements that always need a fixup to be applied. */
+ uint16_t fix_fetch_always;
+
+ /* Bitmask of elements whose fetch should always be opencoded. */
+ uint16_t fix_fetch_opencode;
+
+ /* Bitmask of elements which need to be opencoded if the vertex buffer
+ * is unaligned. */
+ uint16_t fix_fetch_unaligned;
+
+ /* For elements in fix_fetch_unaligned: whether the effective
+ * element load size as seen by the hardware is a dword (as opposed
+ * to a short).
+ */
+ uint16_t hw_load_is_dword;
+
+ /* Bitmask of vertex buffers requiring alignment check */
+ uint16_t vb_alignment_check_mask;
+
uint8_t count;
bool uses_instance_divisors;
key->opt.prefer_mono = 1;
unsigned count = MIN2(vs->info.num_inputs, elts->count);
- memcpy(key->mono.vs_fix_fetch, elts->fix_fetch, count);
+ unsigned count_mask = (1 << count) - 1;
+ unsigned fix = elts->fix_fetch_always & count_mask;
+ unsigned opencode = elts->fix_fetch_opencode & count_mask;
+
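+ /* Re-check the currently bound vertex buffers: elements whose buffer
+ * offset or stride is not aligned to the hardware load size must use
+ * the fixed-up, open-coded fetch path. */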
+ if (sctx->vertex_buffer_unaligned & elts->vb_alignment_check_mask) {
+ uint32_t mask = elts->fix_fetch_unaligned & count_mask;
+ while (mask) {
+ unsigned i = u_bit_scan(&mask);
+ unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1);
+ unsigned vbidx = elts->vertex_buffer_index[i];
+ struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbidx];
+ unsigned align_mask = (1 << log_hw_load_size) - 1;
+ if (vb->buffer_offset & align_mask ||
+ vb->stride & align_mask) {
+ fix |= 1 << i;
+ opencode |= 1 << i;
+ }
+ }
+ }
+
+ while (fix) {
+ unsigned i = u_bit_scan(&fix);
+ key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i];
+ }
+ key->mono.vs_fetch_opencode = opencode;
}
static void si_shader_selector_key_hw_vs(struct si_context *sctx,