From b440c28b78fc5e6f319f988f21a7470254b00f06 Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Thu, 23 Jul 2020 14:32:13 -0700
Subject: [PATCH] nir: Shrink store intrinsic num_components to the size used
 by the writemask.

This cuts a bunch of vector setup for undef components in the i965 vec4
backend.  Noticed while looking into codegen regressions in nir-to-tgsi.

brw results:
total instructions in shared programs: 3893221 -> 3881461 (-0.30%)
total cycles in shared programs: 113792154 -> 113810288 (0.02%)

Reviewed-by: Rhys Perry
Part-of:
---
 src/compiler/nir/nir_opt_shrink_vectors.c | 37 +++++++++++++++++++----
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/compiler/nir/nir_opt_shrink_vectors.c b/src/compiler/nir/nir_opt_shrink_vectors.c
index 8b01f9f4817..0790138a749 100644
--- a/src/compiler/nir/nir_opt_shrink_vectors.c
+++ b/src/compiler/nir/nir_opt_shrink_vectors.c
@@ -110,7 +110,7 @@ opt_shrink_vectors_alu(nir_builder *b, nir_alu_instr *instr)
 }
 
 static bool
-opt_shrink_vectors_intrinsic(nir_intrinsic_instr *instr)
+opt_shrink_vectors_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
 {
    switch (instr->intrinsic) {
    case nir_intrinsic_load_uniform:
@@ -125,18 +125,43 @@ opt_shrink_vectors_intrinsic(nir_intrinsic_instr *instr)
    case nir_intrinsic_load_global:
    case nir_intrinsic_load_kernel_input:
    case nir_intrinsic_load_scratch:
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_per_vertex_output:
+   case nir_intrinsic_store_ssbo:
+   case nir_intrinsic_store_shared:
+   case nir_intrinsic_store_global:
+   case nir_intrinsic_store_scratch:
       break;
    default:
       return false;
    }
 
-   assert(nir_intrinsic_infos[instr->intrinsic].has_dest);
    /* Must be a vectorized intrinsic that we can resize. */
    assert(instr->num_components != 0);
 
-   if (shrink_dest_to_read_mask(&instr->dest.ssa)) {
-      instr->num_components = instr->dest.ssa.num_components;
-      return true;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest) {
+      /* loads: Trim the dest to the used channels */
+
+      if (shrink_dest_to_read_mask(&instr->dest.ssa)) {
+         instr->num_components = instr->dest.ssa.num_components;
+         return true;
+      }
+   } else {
+      /* Stores: trim the num_components stored according to the write
+       * mask.
+       */
+      unsigned write_mask = nir_intrinsic_write_mask(instr);
+      unsigned last_bit = util_last_bit(write_mask);
+      if (last_bit < instr->num_components && instr->src[0].is_ssa) {
+         nir_ssa_def *def = nir_channels(b, instr->src[0].ssa,
+                                         BITSET_MASK(last_bit));
+         nir_instr_rewrite_src(&instr->instr,
+                               &instr->src[0],
+                               nir_src_for_ssa(def));
+         instr->num_components = last_bit;
+
+         return true;
+      }
    }
 
    return false;
@@ -164,7 +189,7 @@ opt_shrink_vectors_instr(nir_builder *b, nir_instr *instr)
       return opt_shrink_vectors_alu(b, nir_instr_as_alu(instr));
 
    case nir_instr_type_intrinsic:
-      return opt_shrink_vectors_intrinsic(nir_instr_as_intrinsic(instr));
+      return opt_shrink_vectors_intrinsic(b, nir_instr_as_intrinsic(instr));
 
    case nir_instr_type_load_const:
       return opt_shrink_vectors_load_const(nir_instr_as_load_const(instr));
-- 
2.30.2
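
Illustrative aside (not part of the patch): a rough, self-contained C sketch
of the mask-to-width arithmetic the new store path relies on. last_bit() and
low_mask() below are stand-ins written for this note; they only mimic what
util_last_bit() and BITSET_MASK() are assumed to do in Mesa's util headers,
and main() is a made-up demo rather than anything from the pass.

#include <stdio.h>

/* Index of the highest set bit plus one, 0 for an empty mask
 * (assumed equivalent to util_last_bit()). */
static unsigned last_bit(unsigned x)
{
   unsigned n = 0;
   while (x) {
      n++;
      x >>= 1;
   }
   return n;
}

/* Mask with the low n bits set, for 0 < n < 32
 * (assumed equivalent to BITSET_MASK(n) in that range). */
static unsigned low_mask(unsigned n)
{
   return (1u << n) - 1;
}

int main(void)
{
   /* A vec4 store whose writemask only touches .xy (0b0011): the source can
    * be shrunk to last_bit(mask) = 2 components, keeping the channels
    * selected by low_mask(2) = 0x3, mirroring what the patch does with
    * nir_channels(). */
   unsigned num_components = 4;
   unsigned write_mask = 0x3;
   unsigned last = last_bit(write_mask);

   if (last < num_components)
      printf("shrink store: %u -> %u components (keep mask 0x%x)\n",
             num_components, last, low_mask(last));
   return 0;
}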