From d814c21b1bea0396c735d65b363a8f2c6324c7d8 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Nicolai=20H=C3=A4hnle?= Date: Mon, 1 Apr 2019 15:44:39 +0200 Subject: [PATCH] radeonsi: overhaul the vertex fetch fixup mechanism MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The overall goal is to support unaligned loads from vertex buffers natively on SI. In the unaligned case, we fall back to the general case implementation in ac_build_opencoded_load_format. Since this function is fully general, we will also use it going forward for cases requiring fully manual format conversions of dwords anyway. This requires a different encoding of the fix_fetch array, which will now contain the entire format information if a fixup is required. Having to check the alignment of vertex buffers is awkward. To keep the impact on the fast path minimal, the si_context will keep track of which vertex buffers are (not) at least dword-aligned, while the si_vertex_elements will note which vertex buffers have some (at most dword) alignment requirement. Vertex buffers should be dword-aligned most of the time, which allows a fast early-out in almost all cases. Add the radeonsi_vs_fetch_always_opencode configuration variable for testing purposes. Note that it can only be used reliably on LLVM >= 9, because support for byte and short loads is required. v2: - add a missing check to si_bind_vertex_elements Reviewed-by: Marek Olšák --- .../drivers/radeonsi/si_debug_options.h | 1 + src/gallium/drivers/radeonsi/si_get.c | 2 +- src/gallium/drivers/radeonsi/si_pipe.h | 1 + src/gallium/drivers/radeonsi/si_shader.c | 249 ++++++------------ src/gallium/drivers/radeonsi/si_shader.h | 46 ++-- src/gallium/drivers/radeonsi/si_state.c | 237 ++++++++++------- src/gallium/drivers/radeonsi/si_state.h | 19 ++ .../drivers/radeonsi/si_state_shaders.c | 26 +- 8 files changed, 301 insertions(+), 280 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h b/src/gallium/drivers/radeonsi/si_debug_options.h index 019256ca1d1..0bde7910fc6 100644 --- a/src/gallium/drivers/radeonsi/si_debug_options.h +++ b/src/gallium/drivers/radeonsi/si_debug_options.h @@ -2,5 +2,6 @@ OPT_BOOL(clear_db_cache_before_clear, false, "Clear DB cache before fast depth c OPT_BOOL(enable_nir, false, "Enable NIR") OPT_BOOL(aux_debug, false, "Generate ddebug_dumps for the auxiliary context") OPT_BOOL(sync_compile, false, "Always compile synchronously (will cause stalls)") +OPT_BOOL(vs_fetch_always_opencode, false, "Always open code vertex fetches (less efficient, purely for testing)") #undef OPT_BOOL diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index 700777186d8..eb23d4fe88a 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -197,7 +197,7 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: - return !sscreen->info.has_unaligned_shader_loads; + return HAVE_LLVM < 0x0900 && !sscreen->info.has_unaligned_shader_loads; case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: return sscreen->info.has_sparse_vm_mappings ?
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index d3ddb912245..695827c9dd7 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -939,6 +939,7 @@ struct si_context { bool vertex_buffers_dirty; bool vertex_buffer_pointer_dirty; struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS]; + uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */ /* MSAA config state. */ int ps_iter_samples; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index d783555ca33..f6d882cf583 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -430,21 +430,6 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) } } -/* Bitcast <4 x float> to <2 x double>, extract the component, and convert - * to float. */ -static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx, - LLVMValueRef vec4, - unsigned double_index) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context); - LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4, - LLVMVectorType(f64, 2), ""); - LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0); - LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, ""); - return LLVMBuildFPTrunc(builder, value, ctx->f32, ""); -} - static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index) { @@ -536,17 +521,12 @@ void si_llvm_load_input_vs( return; } - unsigned chan; - unsigned fix_fetch; - unsigned num_fetches; - unsigned fetch_stride; - unsigned num_channels; - + union si_vs_fix_fetch fix_fetch; LLVMValueRef t_list_ptr; LLVMValueRef t_offset; LLVMValueRef t_list; LLVMValueRef vertex_index; - LLVMValueRef input[3]; + LLVMValueRef tmp; /* Load the T list */ t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers); @@ -559,74 +539,84 @@ void si_llvm_load_input_vs( ctx->param_vertex_index0 + input_index); - fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index]; + /* Use the open-coded implementation for all loads of doubles and + * of dword-sized data that needs fixups. We need to insert conversion + * code anyway, and the amd/common code does it for us. + * + * Note: On LLVM <= 8, we can only open-code formats with + * channel size >= 4 bytes. + */ + bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index); + fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits; + if (opencode || + (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) || + (fix_fetch.u.log_size == 2)) { + tmp = ac_build_opencoded_load_format( + &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1, + fix_fetch.u.format, fix_fetch.u.reverse, !opencode, + t_list, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, + false, false, true); + for (unsigned i = 0; i < 4; ++i) + out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->i32, i, false), ""); + return; + } /* Do multiple loads for special formats. 
*/ - switch (fix_fetch) { - case SI_FIX_FETCH_RG_64_FLOAT: - num_fetches = 1; /* 1 2-dword or 4-dword load */ - fetch_stride = 0; - if (util_last_bit(info->input_usage_mask[input_index]) >= 2) - num_channels = 4; /* 2 doubles in 4 dwords */ - else - num_channels = 2; /* 1 double in 2 dwords */ - break; - case SI_FIX_FETCH_RGB_64_FLOAT: - num_fetches = 3; /* 3 2-dword loads */ - fetch_stride = 8; - num_channels = 2; - break; - case SI_FIX_FETCH_RGBA_64_FLOAT: - num_fetches = 2; /* 2 4-dword loads */ - fetch_stride = 16; - num_channels = 4; - break; - case SI_FIX_FETCH_RGB_8: - case SI_FIX_FETCH_RGB_8_INT: - num_fetches = 3; - fetch_stride = 1; - num_channels = 1; - break; - case SI_FIX_FETCH_RGB_16: - case SI_FIX_FETCH_RGB_16_INT: - num_fetches = 3; - fetch_stride = 2; - num_channels = 1; - break; - default: + unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]); + LLVMValueRef fetches[4]; + unsigned num_fetches; + unsigned fetch_stride; + unsigned channels_per_fetch; + + if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) { + num_fetches = MIN2(required_channels, 3); + fetch_stride = 1 << fix_fetch.u.log_size; + channels_per_fetch = 1; + } else { num_fetches = 1; fetch_stride = 0; - num_channels = util_last_bit(info->input_usage_mask[input_index]); + channels_per_fetch = required_channels; } - for (unsigned i = 0; i < num_fetches; i++) { + for (unsigned i = 0; i < num_fetches; ++i) { LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0); - - input[i] = ac_build_buffer_load_format(&ctx->ac, t_list, - vertex_index, voffset, - num_channels, false, true); - input[i] = ac_build_expand_to_vec4(&ctx->ac, input[i], num_channels); + fetches[i] = ac_build_buffer_load_format(&ctx->ac, t_list, vertex_index, voffset, + channels_per_fetch, false, true); } - /* Break up the vec4 into individual components */ - for (chan = 0; chan < 4; chan++) { - LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0); - out[chan] = LLVMBuildExtractElement(ctx->ac.builder, - input[0], llvm_chan, ""); + if (num_fetches == 1 && channels_per_fetch > 1) { + LLVMValueRef fetch = fetches[0]; + for (unsigned i = 0; i < channels_per_fetch; ++i) { + tmp = LLVMConstInt(ctx->i32, i, false); + fetches[i] = LLVMBuildExtractElement( + ctx->ac.builder, fetch, tmp, ""); + } + num_fetches = channels_per_fetch; + channels_per_fetch = 1; } - switch (fix_fetch) { - case SI_FIX_FETCH_A2_SNORM: - case SI_FIX_FETCH_A2_SSCALED: - case SI_FIX_FETCH_A2_SINT: { - /* The hardware returns an unsigned value; convert it to a - * signed one. + for (unsigned i = num_fetches; i < 4; ++i) + fetches[i] = LLVMGetUndef(ctx->f32); + + if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && + required_channels == 4) { + if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT) + fetches[3] = ctx->ac.i32_1; + else + fetches[3] = ctx->ac.f32_1; + } else if (fix_fetch.u.log_size == 3 && + (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM || + fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED || + fix_fetch.u.format == AC_FETCH_FORMAT_SINT) && + required_channels == 4) { + /* For 2_10_10_10, the hardware returns an unsigned value; + * convert it to a signed one. */ - LLVMValueRef tmp = out[3]; + LLVMValueRef tmp = fetches[3]; LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0); /* First, recover the sign-extended signed integer value. 
*/ - if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) + if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, ""); else tmp = ac_to_integer(&ctx->ac, tmp); @@ -638,110 +628,26 @@ void si_llvm_load_input_vs( * exponent. */ tmp = LLVMBuildShl(ctx->ac.builder, tmp, - fix_fetch == SI_FIX_FETCH_A2_SNORM ? + fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(ctx->i32, 7, 0) : c30, ""); tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, ""); /* Convert back to the right type. */ - if (fix_fetch == SI_FIX_FETCH_A2_SNORM) { + if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) { LLVMValueRef clamp; LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0); tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, ""); clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, ""); tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, ""); - } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) { + } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) { tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, ""); } - out[3] = tmp; - break; + fetches[3] = tmp; } - case SI_FIX_FETCH_RGBA_32_UNORM: - case SI_FIX_FETCH_RGBX_32_UNORM: - for (chan = 0; chan < 4; chan++) { - out[chan] = ac_to_integer(&ctx->ac, out[chan]); - out[chan] = LLVMBuildUIToFP(ctx->ac.builder, - out[chan], ctx->f32, ""); - out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan], - LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), ""); - } - /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */ - if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM) - out[3] = LLVMConstReal(ctx->f32, 1); - break; - case SI_FIX_FETCH_RGBA_32_SNORM: - case SI_FIX_FETCH_RGBX_32_SNORM: - case SI_FIX_FETCH_RGBA_32_FIXED: - case SI_FIX_FETCH_RGBX_32_FIXED: { - double scale; - if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED) - scale = 1.0 / 0x10000; - else - scale = 1.0 / INT_MAX; - for (chan = 0; chan < 4; chan++) { - out[chan] = ac_to_integer(&ctx->ac, out[chan]); - out[chan] = LLVMBuildSIToFP(ctx->ac.builder, - out[chan], ctx->f32, ""); - out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan], - LLVMConstReal(ctx->f32, scale), ""); - } - /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. 
*/ - if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM || - fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED) - out[3] = LLVMConstReal(ctx->f32, 1); - break; - } - case SI_FIX_FETCH_RGBA_32_USCALED: - for (chan = 0; chan < 4; chan++) { - out[chan] = ac_to_integer(&ctx->ac, out[chan]); - out[chan] = LLVMBuildUIToFP(ctx->ac.builder, - out[chan], ctx->f32, ""); - } - break; - case SI_FIX_FETCH_RGBA_32_SSCALED: - for (chan = 0; chan < 4; chan++) { - out[chan] = ac_to_integer(&ctx->ac, out[chan]); - out[chan] = LLVMBuildSIToFP(ctx->ac.builder, - out[chan], ctx->f32, ""); - } - break; - case SI_FIX_FETCH_RG_64_FLOAT: - for (chan = 0; chan < 2; chan++) - out[chan] = extract_double_to_float(ctx, input[0], chan); - - out[2] = LLVMConstReal(ctx->f32, 0); - out[3] = LLVMConstReal(ctx->f32, 1); - break; - case SI_FIX_FETCH_RGB_64_FLOAT: - for (chan = 0; chan < 3; chan++) - out[chan] = extract_double_to_float(ctx, input[chan], 0); - - out[3] = LLVMConstReal(ctx->f32, 1); - break; - case SI_FIX_FETCH_RGBA_64_FLOAT: - for (chan = 0; chan < 4; chan++) { - out[chan] = extract_double_to_float(ctx, input[chan / 2], - chan % 2); - } - break; - case SI_FIX_FETCH_RGB_8: - case SI_FIX_FETCH_RGB_8_INT: - case SI_FIX_FETCH_RGB_16: - case SI_FIX_FETCH_RGB_16_INT: - for (chan = 0; chan < 3; chan++) { - out[chan] = LLVMBuildExtractElement(ctx->ac.builder, - input[chan], - ctx->i32_0, ""); - } - if (fix_fetch == SI_FIX_FETCH_RGB_8 || - fix_fetch == SI_FIX_FETCH_RGB_16) { - out[3] = LLVMConstReal(ctx->f32, 1); - } else { - out[3] = ac_to_float(&ctx->ac, ctx->i32_1); - } - break; - } + for (unsigned i = 0; i < 4; ++i) + out[i] = ac_to_float(&ctx->ac, fetches[i]); } static void declare_input_vs( @@ -5777,9 +5683,18 @@ static void si_dump_shader_key_vs(const struct si_shader_key *key, fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix); + fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode); fprintf(f, " mono.vs.fix_fetch = {"); - for (int i = 0; i < SI_MAX_ATTRIBS; i++) - fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]); + for (int i = 0; i < SI_MAX_ATTRIBS; i++) { + union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i]; + if (i) + fprintf(f, ", "); + if (!fix.bits) + fprintf(f, "0"); + else + fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, + fix.u.num_channels_m1, fix.u.format); + } fprintf(f, "}\n"); } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 71ce27b2f55..82c521efcb7 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -273,27 +273,24 @@ enum { SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9, }; -/* For VS shader key fix_fetch. */ -enum { - SI_FIX_FETCH_NONE = 0, - SI_FIX_FETCH_A2_SNORM, - SI_FIX_FETCH_A2_SSCALED, - SI_FIX_FETCH_A2_SINT, - SI_FIX_FETCH_RGBA_32_UNORM, - SI_FIX_FETCH_RGBX_32_UNORM, - SI_FIX_FETCH_RGBA_32_SNORM, - SI_FIX_FETCH_RGBX_32_SNORM, - SI_FIX_FETCH_RGBA_32_USCALED, - SI_FIX_FETCH_RGBA_32_SSCALED, - SI_FIX_FETCH_RGBA_32_FIXED, - SI_FIX_FETCH_RGBX_32_FIXED, - SI_FIX_FETCH_RG_64_FLOAT, - SI_FIX_FETCH_RGB_64_FLOAT, - SI_FIX_FETCH_RGBA_64_FLOAT, - SI_FIX_FETCH_RGB_8, /* A = 1.0 */ - SI_FIX_FETCH_RGB_8_INT, /* A = 1 */ - SI_FIX_FETCH_RGB_16, - SI_FIX_FETCH_RGB_16_INT, +/** + * For VS shader keys, describe any fixups required for vertex fetch. + * + * \ref log_size, \ref format, and the number of channels are interpreted as + * by \ref ac_build_opencoded_load_format. 
+ * + * Note: all bits 0 (size = 1 byte, num channels = 1, format = float) is an + * impossible format and indicates that no fixup is needed (just use + * buffer_load_format_xyzw). + */ +union si_vs_fix_fetch { + struct { + uint8_t log_size : 2; /* 1, 2, 4 or 8 bytes per channel */ + uint8_t num_channels_m1 : 2; /* number of channels minus 1 */ + uint8_t format : 3; /* AC_FETCH_FORMAT_xxx */ + uint8_t reverse : 1; /* reverse XYZ channels */ + } u; + uint8_t bits; }; struct si_shader; @@ -524,8 +521,11 @@ struct si_shader_key { /* Flags for monolithic compilation only. */ struct { - /* One byte for every input: SI_FIX_FETCH_* enums. */ - uint8_t vs_fix_fetch[SI_MAX_ATTRIBS]; + /* Whether fetch should be opencoded according to vs_fix_fetch. + * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw + * with minimal fixups is used. */ + uint16_t vs_fetch_opencode; + union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS]; union { uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */ diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 876a993b158..55965bc86a1 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -4459,10 +4459,8 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, for (i = 0; i < count; ++i) { const struct util_format_description *desc; const struct util_format_channel_description *channel; - unsigned data_format, num_format; int first_non_void; unsigned vbo_index = elements[i].vertex_buffer_index; - unsigned char swizzle[4]; if (vbo_index >= SI_NUM_VERTEX_BUFFERS) { FREE(v); @@ -4489,105 +4487,137 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, desc = util_format_description(elements[i].src_format); first_non_void = util_format_get_first_non_void_channel(elements[i].src_format); - data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); - num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL; - memcpy(swizzle, desc->swizzle, sizeof(swizzle)); v->format_size[i] = desc->block.bits / 8; v->src_offset[i] = elements[i].src_offset; v->vertex_buffer_index[i] = vbo_index; - /* The hardware always treats the 2-bit alpha channel as - * unsigned, so a shader workaround is needed. The affected - * chips are VI and older except Stoney (GFX8.1). - */ - if (data_format == V_008F0C_BUF_DATA_FORMAT_2_10_10_10 && - sscreen->info.chip_class <= VI && - sscreen->info.family != CHIP_STONEY) { - if (num_format == V_008F0C_BUF_NUM_FORMAT_SNORM) { - v->fix_fetch[i] = SI_FIX_FETCH_A2_SNORM; - } else if (num_format == V_008F0C_BUF_NUM_FORMAT_SSCALED) { - v->fix_fetch[i] = SI_FIX_FETCH_A2_SSCALED; - } else if (num_format == V_008F0C_BUF_NUM_FORMAT_SINT) { - /* This isn't actually used in OpenGL.
*/ - v->fix_fetch[i] = SI_FIX_FETCH_A2_SINT; - } - } else if (channel && channel->type == UTIL_FORMAT_TYPE_FIXED) { - if (desc->swizzle[3] == PIPE_SWIZZLE_1) - v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_FIXED; - else - v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_FIXED; - } else if (channel && channel->size == 32 && !channel->pure_integer) { - if (channel->type == UTIL_FORMAT_TYPE_SIGNED) { - if (channel->normalized) { - if (desc->swizzle[3] == PIPE_SWIZZLE_1) - v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_SNORM; - else - v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_SNORM; - } else { - v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_SSCALED; - } - } else if (channel->type == UTIL_FORMAT_TYPE_UNSIGNED) { - if (channel->normalized) { - if (desc->swizzle[3] == PIPE_SWIZZLE_1) - v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_UNORM; - else - v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_UNORM; - } else { - v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_USCALED; - } - } - } else if (channel && channel->size == 64 && - channel->type == UTIL_FORMAT_TYPE_FLOAT) { - switch (desc->nr_channels) { - case 1: - case 2: - v->fix_fetch[i] = SI_FIX_FETCH_RG_64_FLOAT; - swizzle[0] = PIPE_SWIZZLE_X; - swizzle[1] = PIPE_SWIZZLE_Y; - swizzle[2] = desc->nr_channels == 2 ? PIPE_SWIZZLE_Z : PIPE_SWIZZLE_0; - swizzle[3] = desc->nr_channels == 2 ? PIPE_SWIZZLE_W : PIPE_SWIZZLE_0; - break; - case 3: - v->fix_fetch[i] = SI_FIX_FETCH_RGB_64_FLOAT; - swizzle[0] = PIPE_SWIZZLE_X; /* 3 loads */ - swizzle[1] = PIPE_SWIZZLE_Y; - swizzle[2] = PIPE_SWIZZLE_0; - swizzle[3] = PIPE_SWIZZLE_0; - break; - case 4: - v->fix_fetch[i] = SI_FIX_FETCH_RGBA_64_FLOAT; - swizzle[0] = PIPE_SWIZZLE_X; /* 2 loads */ - swizzle[1] = PIPE_SWIZZLE_Y; - swizzle[2] = PIPE_SWIZZLE_Z; - swizzle[3] = PIPE_SWIZZLE_W; - break; - default: - assert(0); - } - } else if (channel && desc->nr_channels == 3) { - assert(desc->swizzle[0] == PIPE_SWIZZLE_X); + bool always_fix = false; + union si_vs_fix_fetch fix_fetch; + unsigned log_hw_load_size; /* the load element size as seen by the hardware */ - if (channel->size == 8) { + fix_fetch.bits = 0; + log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3); + + if (channel) { + switch (channel->type) { + case UTIL_FORMAT_TYPE_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break; + case UTIL_FORMAT_TYPE_FIXED: fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; break; + case UTIL_FORMAT_TYPE_SIGNED: { if (channel->pure_integer) - v->fix_fetch[i] = SI_FIX_FETCH_RGB_8_INT; + fix_fetch.u.format = AC_FETCH_FORMAT_SINT; + else if (channel->normalized) + fix_fetch.u.format = AC_FETCH_FORMAT_SNORM; else - v->fix_fetch[i] = SI_FIX_FETCH_RGB_8; - } else if (channel->size == 16) { + fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED; + break; + } + case UTIL_FORMAT_TYPE_UNSIGNED: { if (channel->pure_integer) - v->fix_fetch[i] = SI_FIX_FETCH_RGB_16_INT; + fix_fetch.u.format = AC_FETCH_FORMAT_UINT; + else if (channel->normalized) + fix_fetch.u.format = AC_FETCH_FORMAT_UNORM; else - v->fix_fetch[i] = SI_FIX_FETCH_RGB_16; + fix_fetch.u.format = AC_FETCH_FORMAT_USCALED; + break; + } + default: unreachable("bad format type"); + } + } else { + switch (elements[i].src_format) { + case PIPE_FORMAT_R11G11B10_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break; + default: unreachable("bad other format"); } } - v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(swizzle[0])) | - S_008F0C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | - S_008F0C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | - S_008F0C_DST_SEL_W(si_map_swizzle(swizzle[3])) | - S_008F0C_NUM_FORMAT(num_format) | - 
S_008F0C_DATA_FORMAT(data_format); + if (desc->channel[0].size == 10) { + fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */ + log_hw_load_size = 2; + + /* The hardware always treats the 2-bit alpha channel as + * unsigned, so a shader workaround is needed. The affected + * chips are VI and older except Stoney (GFX8.1). + */ + always_fix = sscreen->info.chip_class <= VI && + sscreen->info.family != CHIP_STONEY && + channel->type == UTIL_FORMAT_TYPE_SIGNED; + } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) { + fix_fetch.u.log_size = 3; /* special encoding */ + fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; + log_hw_load_size = 2; + } else { + fix_fetch.u.log_size = util_logbase2(channel->size) - 3; + fix_fetch.u.num_channels_m1 = desc->nr_channels - 1; + + /* Always fix up: + * - doubles (multiple loads + truncate to float) + * - 32-bit requiring a conversion + */ + always_fix = + (fix_fetch.u.log_size == 3) || + (fix_fetch.u.log_size == 2 && + fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT && + fix_fetch.u.format != AC_FETCH_FORMAT_UINT && + fix_fetch.u.format != AC_FETCH_FORMAT_SINT); + + /* Also fixup 8_8_8 and 16_16_16. */ + if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) { + always_fix = true; + log_hw_load_size = fix_fetch.u.log_size; + } + } + + if (desc->swizzle[0] != PIPE_SWIZZLE_X) { + assert(desc->swizzle[0] == PIPE_SWIZZLE_Z && + (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0)); + fix_fetch.u.reverse = 1; + } + + /* Force the workaround for unaligned access here already if the + * offset relative to the vertex buffer base is unaligned. + * + * There is a theoretical case in which this is too conservative: + * if the vertex buffer's offset is also unaligned in just the + * right way, we end up with an aligned address after all. + * However, this case should be extremely rare in practice (it + * won't happen in well-behaved applications), and taking it + * into account would complicate the fast path (where everything + * is nicely aligned). 
+ */ + bool check_alignment = log_hw_load_size >= 1 && sscreen->info.chip_class == SI; + bool opencode = sscreen->options.vs_fetch_always_opencode; + + if (check_alignment && + (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0) + opencode = true; + + if (always_fix || check_alignment || opencode) + v->fix_fetch[i] = fix_fetch.bits; + + if (opencode) + v->fix_fetch_opencode |= 1 << i; + if (opencode || always_fix) + v->fix_fetch_always |= 1 << i; + + if (check_alignment && !opencode) { + assert(log_hw_load_size == 1 || log_hw_load_size == 2); + + v->fix_fetch_unaligned |= 1 << i; + v->hw_load_is_dword |= (log_hw_load_size - 1) << i; + v->vb_alignment_check_mask |= 1 << vbo_index; + } + + v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | + S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | + S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | + S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])); + + unsigned data_format, num_format; + data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); + num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); + v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) | + S_008F0C_DATA_FORMAT(data_format); } if (v->instance_divisor_is_fetched) { @@ -4621,7 +4651,17 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) (!old || old->count != v->count || old->uses_instance_divisors != v->uses_instance_divisors || - v->uses_instance_divisors || /* we don't check which divisors changed */ + /* we don't check which divisors changed */ + v->uses_instance_divisors || + (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & sctx->vertex_buffer_unaligned || + ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) && + memcmp(old->vertex_buffer_index, v->vertex_buffer_index, + sizeof(v->vertex_buffer_index[0]) * v->count)) || + /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are + * functions of fix_fetch and the src_offset alignment. + * If they change and fix_fetch doesn't, it must be due to different + * src_offset alignment, which is reflected in fix_fetch_opencode. 
*/ + old->fix_fetch_opencode != v->fix_fetch_opencode || memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count))) sctx->do_update_shaders = true; @@ -4653,6 +4693,8 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, { struct si_context *sctx = (struct si_context *)ctx; struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot; + uint32_t orig_unaligned = sctx->vertex_buffer_unaligned; + uint32_t unaligned = orig_unaligned; int i; assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer)); @@ -4666,6 +4708,11 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, pipe_resource_reference(&dsti->buffer.resource, buf); dsti->buffer_offset = src->buffer_offset; dsti->stride = src->stride; + if (dsti->buffer_offset & 3 || dsti->stride & 3) + unaligned |= 1 << (start_slot + i); + else + unaligned &= ~(1 << (start_slot + i)); + si_context_add_resource_size(sctx, buf); if (buf) si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER; @@ -4674,8 +4721,22 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, for (i = 0; i < count; i++) { pipe_resource_reference(&dst[i].buffer.resource, NULL); } + unaligned &= ~u_bit_consecutive(start_slot, count); } sctx->vertex_buffers_dirty = true; + sctx->vertex_buffer_unaligned = unaligned; + + /* Check whether alignment may have changed in a way that requires + * shader changes. This check is conservative: a vertex buffer can only + * trigger a shader change if the misalignment amount changes (e.g. + * from byte-aligned to short-aligned), but we only keep track of + * whether buffers are at least dword-aligned, since that should always + * be the case in well-behaved applications anyway. + */ + if (sctx->vertex_elements && + (sctx->vertex_elements->vb_alignment_check_mask & + (unaligned | orig_unaligned) & u_bit_consecutive(start_slot, count))) + sctx->do_update_shaders = true; } /* diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 6df24f9648a..e4b1cf79132 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -139,6 +139,25 @@ struct si_vertex_elements uint8_t format_size[SI_MAX_ATTRIBS]; uint8_t vertex_buffer_index[SI_MAX_ATTRIBS]; + /* Bitmask of elements that always need a fixup to be applied. */ + uint16_t fix_fetch_always; + + /* Bitmask of elements whose fetch should always be opencoded. */ + uint16_t fix_fetch_opencode; + + /* Bitmask of elements which need to be opencoded if the vertex buffer + * is unaligned. */ + uint16_t fix_fetch_unaligned; + + /* For elements in fix_fetch_unaligned: whether the effective + * element load size as seen by the hardware is a dword (as opposed + * to a short). 
+ */ + uint16_t hw_load_is_dword; + + /* Bitmask of vertex buffers requiring alignment check */ + uint16_t vb_alignment_check_mask; + uint8_t count; bool uses_instance_divisors; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index a9c5c7b9e4d..51a3af92d0c 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1390,7 +1390,31 @@ static void si_shader_selector_key_vs(struct si_context *sctx, key->opt.prefer_mono = 1; unsigned count = MIN2(vs->info.num_inputs, elts->count); - memcpy(key->mono.vs_fix_fetch, elts->fix_fetch, count); + unsigned count_mask = (1 << count) - 1; + unsigned fix = elts->fix_fetch_always & count_mask; + unsigned opencode = elts->fix_fetch_opencode & count_mask; + + if (sctx->vertex_buffer_unaligned & elts->vb_alignment_check_mask) { + uint32_t mask = elts->fix_fetch_unaligned & count_mask; + while (mask) { + unsigned i = u_bit_scan(&mask); + unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1); + unsigned vbidx = elts->vertex_buffer_index[i]; + struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbidx]; + unsigned align_mask = (1 << log_hw_load_size) - 1; + if (vb->buffer_offset & align_mask || + vb->stride & align_mask) { + fix |= 1 << i; + opencode |= 1 << i; + } + } + } + + while (fix) { + unsigned i = u_bit_scan(&fix); + key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i]; + } + key->mono.vs_fetch_opencode = opencode; } static void si_shader_selector_key_hw_vs(struct si_context *sctx, -- 2.30.2
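
For reference, a minimal self-contained sketch of the si_vs_fix_fetch encoding introduced by this patch. The union layout and field names are taken from si_shader.h above; the AC_FETCH_FORMAT_* numbering is an assumption for illustration (the real values live in amd/common next to ac_build_opencoded_load_format), and the demo program is not part of the driver.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the AC_FETCH_FORMAT_* values used by
 * ac_build_opencoded_load_format; the real definitions live in amd/common
 * and their numbering may differ. FLOAT is assumed to be 0 so that an
 * all-zero fix_fetch ("1-byte float, 1 channel") is an impossible format,
 * which the patch uses to mean "no fixup needed". */
enum {
   AC_FETCH_FORMAT_FLOAT = 0,
   AC_FETCH_FORMAT_FIXED,
   AC_FETCH_FORMAT_UNORM,
   AC_FETCH_FORMAT_SNORM,
   AC_FETCH_FORMAT_USCALED,
   AC_FETCH_FORMAT_SSCALED,
   AC_FETCH_FORMAT_UINT,
   AC_FETCH_FORMAT_SINT,
};

/* Same layout as the union this patch adds to si_shader.h. */
union si_vs_fix_fetch {
   struct {
      uint8_t log_size : 2;        /* bytes per channel: 1 << log_size */
      uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
      uint8_t format : 3;          /* AC_FETCH_FORMAT_xxx */
      uint8_t reverse : 1;         /* reverse XYZ channels */
   } u;
   uint8_t bits;
};

int main(void)
{
   /* Example: a 3 x 16-bit SNORM attribute (e.g. R16G16B16_SNORM). */
   union si_vs_fix_fetch fix = {{0}};
   fix.u.log_size = 1;        /* 2 bytes per channel */
   fix.u.num_channels_m1 = 2; /* 3 channels */
   fix.u.format = AC_FETCH_FORMAT_SNORM;

   printf("encoded fix_fetch byte: 0x%02x\n", fix.bits);

   /* bits == 0 is the "no fixup" marker: just use buffer_load_format_xyzw. */
   union si_vs_fix_fetch none = {.bits = 0};
   printf("needs fixup: %s\n", none.bits ? "yes" : "no");
   return 0;
}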
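Also for reference, a sketch of the alignment fast path described in the commit message and implemented in si_shader_selector_key_vs. The mask and field names (vb_alignment_check_mask, fix_fetch_unaligned, hw_load_is_dword, vertex_buffer_unaligned) mirror the patch; the struct shapes, the helper itself, and __builtin_ctz (standing in for u_bit_scan) are simplifying assumptions, not the driver's actual types.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the driver state. */
struct vb_state {
   uint32_t buffer_offset;
   uint32_t stride;
};

struct velems_state {
   unsigned count;
   uint16_t fix_fetch_always;        /* elements that always need a fixup */
   uint16_t fix_fetch_opencode;      /* elements that must always be open-coded */
   uint16_t fix_fetch_unaligned;     /* elements to open-code when unaligned */
   uint16_t hw_load_is_dword;        /* per element: hw load is dword vs. short */
   uint16_t vb_alignment_check_mask; /* vertex buffers that need an alignment check */
   uint8_t vertex_buffer_index[16];
};

/* vertex_buffer_unaligned has bit i set when vertex buffer i has an offset or
 * stride that is not dword-aligned (the patch maintains this bitmask in
 * si_set_vertex_buffers). */
static void compute_fetch_masks(const struct velems_state *elts,
                                const struct vb_state *vb,
                                uint16_t vertex_buffer_unaligned,
                                uint16_t *fix, uint16_t *opencode)
{
   uint16_t count_mask = (uint16_t)((1u << elts->count) - 1);

   *fix = elts->fix_fetch_always & count_mask;
   *opencode = elts->fix_fetch_opencode & count_mask;

   /* Fast early-out: if every buffer this element state cares about is at
    * least dword-aligned, no per-element alignment check is needed. */
   if (!(vertex_buffer_unaligned & elts->vb_alignment_check_mask))
      return;

   uint16_t mask = elts->fix_fetch_unaligned & count_mask;
   while (mask) {
      unsigned i = (unsigned)__builtin_ctz(mask); /* like u_bit_scan in the patch */
      mask &= mask - 1;

      /* The hardware load is a short or a dword; anything less aligned than
       * that falls back to the open-coded (unaligned-capable) path. */
      unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1);
      unsigned align_mask = (1u << log_hw_load_size) - 1;
      const struct vb_state *b = &vb[elts->vertex_buffer_index[i]];

      if ((b->buffer_offset & align_mask) || (b->stride & align_mask)) {
         *fix |= (uint16_t)(1u << i);
         *opencode |= (uint16_t)(1u << i);
      }
   }
}

int main(void)
{
   /* One element reading dwords from vertex buffer 0, which is misaligned. */
   struct velems_state elts = {
      .count = 1,
      .fix_fetch_unaligned = 0x1,
      .hw_load_is_dword = 0x1,
      .vb_alignment_check_mask = 0x1,
      .vertex_buffer_index = {0},
   };
   struct vb_state vb[1] = {{.buffer_offset = 2, .stride = 14}};
   uint16_t fix, opencode;

   compute_fetch_masks(&elts, vb, /*vertex_buffer_unaligned=*/0x1, &fix, &opencode);
   printf("fix mask 0x%x, opencode mask 0x%x\n", fix, opencode);
   return 0;
}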