From 29348631fe7bf732a38856ea842cfc7aa2263468 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand
Date: Wed, 9 Sep 2015 17:18:55 -0700
Subject: [PATCH] nir/lower_vec_to_movs: Coalesce into destinations of fdot instructions

Now that we have a replicating fdot instruction, we can actually coalesce
into the destinations of vec4 instructions. We couldn't really do this
before because, if the destination had to end up in .z, we couldn't
reswizzle the instruction. With a replicated destination, the result ends
up in all channels so we can just set the writemask and we're done.

Shader-db results for vec4 programs on Haswell:

   total instructions in shared programs: 1747753 -> 1746280 (-0.08%)
   instructions in affected programs: 143274 -> 141801 (-1.03%)
   helped: 667
   HURT: 0

It turns out that dot-products matter...

Reviewed-by: Eduardo Lima Mitev
---
 src/glsl/nir/nir_lower_vec_to_movs.c | 49 ++++++++++++++++++++--------
 1 file changed, 36 insertions(+), 13 deletions(-)

diff --git a/src/glsl/nir/nir_lower_vec_to_movs.c b/src/glsl/nir/nir_lower_vec_to_movs.c
index 9ff86ea7543..2cb0457f9ba 100644
--- a/src/glsl/nir/nir_lower_vec_to_movs.c
+++ b/src/glsl/nir/nir_lower_vec_to_movs.c
@@ -79,6 +79,14 @@ insert_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
    return mov->dest.write_mask;
 }
 
+static bool
+has_replicated_dest(nir_alu_instr *alu)
+{
+   return alu->op == nir_op_fdot_replicated2 ||
+          alu->op == nir_op_fdot_replicated3 ||
+          alu->op == nir_op_fdot_replicated4;
+}
+
 /* Attempts to coalesce the "move" from the given source of the vec to the
  * destination of the instruction generating the value. If, for whatever
  * reason, we cannot coalesce the mmove, it does nothing and returns 0. We
@@ -116,19 +124,28 @@ try_coalesce(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
    nir_alu_instr *src_alu =
       nir_instr_as_alu(vec->src[start_idx].src.ssa->parent_instr);
 
-   /* We only care about being able to re-swizzle the instruction if it is
-    * something that we can reswizzle. It must be per-component.
-    */
-   if (nir_op_infos[src_alu->op].output_size != 0)
-      return 0;
-
-   /* If we are going to reswizzle the instruction, we can't have any
-    * non-per-component sources either.
-    */
-   for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
-      if (nir_op_infos[src_alu->op].input_sizes[j] != 0)
+   if (has_replicated_dest(src_alu)) {
+      /* The fdot instruction is special: It replicates its result to all
+       * components. This means that we can always rewrite its destination
+       * and we don't need to swizzle anything.
+       */
+   } else {
+      /* We only care about being able to re-swizzle the instruction if it is
+       * something that we can reswizzle. It must be per-component. The one
+       * exception to this is the fdotN instructions which implicitly splat
+       * their result out to all channels.
+       */
+      if (nir_op_infos[src_alu->op].output_size != 0)
          return 0;
 
+      /* If we are going to reswizzle the instruction, we can't have any
+       * non-per-component sources either.
+       */
+      for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+         if (nir_op_infos[src_alu->op].input_sizes[j] != 0)
+            return 0;
+   }
+
    /* Stash off all of the ALU instruction's swizzles. */
    uint8_t swizzles[4][4];
    for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
@@ -148,8 +165,14 @@ try_coalesce(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
        * instruction so we can re-swizzle that component to match.
        */
       write_mask |= 1 << i;
-      for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
-         src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]];
+      if (has_replicated_dest(src_alu)) {
+         /* Since the destination is a single replicated value, we don't need
+          * to do any reswizzling
+          */
+      } else {
+         for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+            src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]];
+      }
 
       /* Clear the no longer needed vec source */
       nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, NIR_SRC_INIT);
-- 
2.30.2