From 03a0d39366db367b26aea29b04b032c6f1f7cd84 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 3 Jan 2020 17:13:42 +0000 Subject: [PATCH] aco: use MUBUF in some situations instead of splitting vertex fetches MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Fixes most of the regressions from splitting vertex fetches in an earlier commit. pipeline-db (Vega): Totals from affected shaders: SGPRS: 0 -> 0 (0.00 %) VGPRS: 0 -> 0 (0.00 %) Spilled SGPRs: 0 -> 0 (0.00 %) Spilled VGPRs: 0 -> 0 (0.00 %) Private memory VGPRs: 0 -> 0 (0.00 %) Scratch size: 0 -> 0 (0.00 %) dwords per thread Code Size: 0 -> 0 (0.00 %) bytes LDS: 0 -> 0 (0.00 %) blocks Max Waves: 0 -> 0 (0.00 %) pipeline-db (Navi): Totals from affected shaders: SGPRS: 562696 -> 558344 (-0.77 %) VGPRS: 395596 -> 393752 (-0.47 %) Spilled SGPRs: 0 -> 0 (0.00 %) Spilled VGPRs: 0 -> 0 (0.00 %) Private memory VGPRs: 0 -> 0 (0.00 %) Scratch size: 0 -> 0 (0.00 %) dwords per thread Code Size: 11600912 -> 11311804 (-2.49 %) bytes LDS: 0 -> 0 (0.00 %) blocks Max Waves: 101839 -> 102372 (0.52 %) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Tested-by: Marge Bot Part-of: --- .../compiler/aco_instruction_selection.cpp | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 16a93ce4c16..af18e9a4bdd 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -3228,7 +3228,16 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) while (channel_start < num_channels) { unsigned fetch_size = num_channels - channel_start; unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size; - unsigned fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_size); + + /* use MUBUF when possible to avoid possible alignment issues */ + /* 
TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */ + bool use_mubuf = (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || + nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || + nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) && + vtx_info->chan_byte_size == 4; + unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID; + if (!use_mubuf) + fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_size); Temp fetch_index = index; if (attrib_stride != 0 && fetch_offset > attrib_stride) { @@ -3245,16 +3254,16 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) aco_opcode opcode; switch (fetch_size) { case 1: - opcode = aco_opcode::tbuffer_load_format_x; + opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x; break; case 2: - opcode = aco_opcode::tbuffer_load_format_xy; + opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy; break; case 3: - opcode = aco_opcode::tbuffer_load_format_xyz; + opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz; break; case 4: - opcode = aco_opcode::tbuffer_load_format_xyzw; + opcode = use_mubuf ? 
aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw; break; default: unreachable("Unimplemented load_input vector size"); @@ -3269,11 +3278,17 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) fetch_dst = bld.tmp(RegType::vgpr, fetch_size); } - Instruction *mtbuf = bld.mtbuf(opcode, - Definition(fetch_dst), fetch_index, list, soffset, - fetch_dfmt, nfmt, fetch_offset, - false, true).instr; - static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true; + if (use_mubuf) { + Instruction *mubuf = bld.mubuf(opcode, + Definition(fetch_dst), fetch_index, list, soffset, + fetch_offset, false, true).instr; + static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true; + } else { + Instruction *mtbuf = bld.mtbuf(opcode, + Definition(fetch_dst), fetch_index, list, soffset, + fetch_dfmt, nfmt, fetch_offset, false, true).instr; + static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true; + } emit_split_vector(ctx, fetch_dst, fetch_dst.size()); -- 2.30.2