From 8847370424b12fc83a069eb80cfde76b348a7aec Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 20 Dec 2018 09:43:43 -0800 Subject: [PATCH] v3d: Use the core tex lowering. Even without any clever optimization on the unpack operations, this gives us a useful value for the channels read field, which we can use to avoid ldtmu instructions to the no-op register. instructions in affected programs: 890712 -> 881974 (-0.98%) --- src/broadcom/compiler/v3d33_tex.c | 61 ++------------------------- src/broadcom/compiler/v3d40_tex.c | 68 ++----------------------------- src/broadcom/compiler/vir.c | 4 ++ 3 files changed, 10 insertions(+), 123 deletions(-) diff --git a/src/broadcom/compiler/v3d33_tex.c b/src/broadcom/compiler/v3d33_tex.c index 9af9285e07b..7e9cd27d31b 100644 --- a/src/broadcom/compiler/v3d33_tex.c +++ b/src/broadcom/compiler/v3d33_tex.c @@ -126,19 +126,12 @@ v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) } } - bool return_16 = (c->key->tex[unit].return_size == 16 || - p0_unpacked.shadow); - /* Limit the number of channels returned to both how many the NIR * instruction writes and how many the instruction could produce. */ - uint32_t instr_return_channels = nir_tex_instr_dest_size(instr); - if (return_16) - instr_return_channels = (instr_return_channels + 1) / 2; - + assert(instr->dest.is_ssa); p1_unpacked.return_words_of_texture_data = - (1 << MIN2(instr_return_channels, - c->key->tex[unit].return_channels)) - 1; + nir_ssa_def_components_read(&instr->dest.ssa); uint32_t p0_packed; V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_pack(NULL, @@ -193,56 +186,8 @@ v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) vir_emit_thrsw(c); - struct qreg return_values[4]; for (int i = 0; i < 4; i++) { - /* Swizzling .zw of an RG texture should give undefined - * results, not crash the compiler. - */ if (p1_unpacked.return_words_of_texture_data & (1 << i)) - return_values[i] = vir_LDTMU(c); - else - return_values[i] = c->undef; - } - - for (int i = 0; i < nir_tex_instr_dest_size(instr); i++) { - struct qreg chan; - - if (return_16) { - STATIC_ASSERT(PIPE_SWIZZLE_X == 0); - chan = return_values[i / 2]; - - if (nir_alu_type_get_base_type(instr->dest_type) == - nir_type_float) { - enum v3d_qpu_input_unpack unpack; - if (i & 1) - unpack = V3D_QPU_UNPACK_H; - else - unpack = V3D_QPU_UNPACK_L; - - chan = vir_FMOV(c, chan); - vir_set_unpack(c->defs[chan.index], 0, unpack); - } else { - /* If we're unpacking the low field, shift it - * up to the top first. - */ - if ((i & 1) == 0) { - chan = vir_SHL(c, chan, - vir_uniform_ui(c, 16)); - } - - /* Do proper sign extension to a 32-bit int. */ - if (nir_alu_type_get_base_type(instr->dest_type) == - nir_type_int) { - chan = vir_ASR(c, chan, - vir_uniform_ui(c, 16)); - } else { - chan = vir_SHR(c, chan, - vir_uniform_ui(c, 16)); - } - } - } else { - chan = vir_MOV(c, return_values[i]); - } - ntq_store_dest(c, &instr->dest, i, chan); + ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c)); } } diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d40_tex.c index c41cd33505a..7cac6d5ca41 100644 --- a/src/broadcom/compiler/v3d40_tex.c +++ b/src/broadcom/compiler/v3d40_tex.c @@ -166,18 +166,10 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) /* Limit the number of channels returned to both how many the NIR * instruction writes and how many the instruction could produce. - * - * XXX perf: Can we also limit to the number of channels that are - * actually read by the users of this NIR dest, so that we don't need - * to emit unused LDTMUs? */ - uint32_t instr_return_channels = nir_tex_instr_dest_size(instr); - if (!p1_unpacked.output_type_32_bit) - instr_return_channels = (instr_return_channels + 1) / 2; - + assert(instr->dest.is_ssa); p0_unpacked.return_words_of_texture_data = - (1 << MIN2(instr_return_channels, - c->key->tex[unit].return_channels)) - 1; + nir_ssa_def_components_read(&instr->dest.ssa); /* Word enables can't ask for more channels than the output type could * provide (2 for f16, 4 for 32-bit). @@ -232,62 +224,8 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) while (tmu_writes > 16 / c->threads) c->threads /= 2; - struct qreg return_values[4]; for (int i = 0; i < 4; i++) { - /* Swizzling .zw of an RG texture should give undefined - * results, not crash the compiler. - */ if (p0_unpacked.return_words_of_texture_data & (1 << i)) - return_values[i] = vir_LDTMU(c); - else - return_values[i] = c->undef; - } - - for (int i = 0; i < nir_tex_instr_dest_size(instr); i++) { - struct qreg chan; - - if (!p1_unpacked.output_type_32_bit) { - STATIC_ASSERT(PIPE_SWIZZLE_X == 0); - chan = return_values[i / 2]; - - /* XXX perf: We should move this unpacking into NIR. - * That would give us exposure of these types to NIR - * optimization, so that (for example) a repacking of - * half-float samples to the half-float render target - * could be eliminated. - */ - if (nir_alu_type_get_base_type(instr->dest_type) == - nir_type_float) { - enum v3d_qpu_input_unpack unpack; - if (i & 1) - unpack = V3D_QPU_UNPACK_H; - else - unpack = V3D_QPU_UNPACK_L; - - chan = vir_FMOV(c, chan); - vir_set_unpack(c->defs[chan.index], 0, unpack); - } else { - /* If we're unpacking the low field, shift it - * up to the top first. - */ - if ((i & 1) == 0) { - chan = vir_SHL(c, chan, - vir_uniform_ui(c, 16)); - } - - /* Do proper sign extension to a 32-bit int. */ - if (nir_alu_type_get_base_type(instr->dest_type) == - nir_type_int) { - chan = vir_ASR(c, chan, - vir_uniform_ui(c, 16)); - } else { - chan = vir_SHR(c, chan, - vir_uniform_ui(c, 16)); - } - } - } else { - chan = vir_MOV(c, return_values[i]); - } - ntq_store_dest(c, &instr->dest, i, chan); + ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c)); } } diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 6eb346ce9fd..01e18ffd074 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -630,6 +630,10 @@ v3d_lower_nir(struct v3d_compile *c) tex_options.saturate_t |= 1 << i; if (c->key->tex[i].clamp_r) tex_options.saturate_r |= 1 << i; + if (c->key->tex[i].return_size == 16) { + tex_options.lower_tex_packing[i] = + nir_lower_tex_packing_16; + } } NIR_PASS_V(c->s, nir_lower_tex, &tex_options); -- 2.30.2