X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fglsl%2Fnir%2Fnir_lower_vec_to_movs.c;h=2cb0457f9ba6ee8a8be3caeaa56c00c29309c613;hb=aecbc93f2d1ff9de4e03a2b216e86dcb9a4ce414;hp=602853ea665a89817ce4c41ce261f2de5d25cb25;hpb=dd4d9a4e62ecf44011cfa2e020d08029299dd7e0;p=mesa.git

diff --git a/src/glsl/nir/nir_lower_vec_to_movs.c b/src/glsl/nir/nir_lower_vec_to_movs.c
index 602853ea665..2cb0457f9ba 100644
--- a/src/glsl/nir/nir_lower_vec_to_movs.c
+++ b/src/glsl/nir/nir_lower_vec_to_movs.c
@@ -53,29 +53,25 @@ src_matches_dest_reg(nir_dest *dest, nir_src *src)
  * which ones have been processed.
  */
 static unsigned
-insert_mov(nir_alu_instr *vec, unsigned start_channel,
-           unsigned start_src_idx, void *mem_ctx)
+insert_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
 {
-   unsigned src_idx = start_src_idx;
-   assert(src_idx < nir_op_infos[vec->op].num_inputs);
+   assert(start_idx < nir_op_infos[vec->op].num_inputs);
 
-   nir_alu_instr *mov = nir_alu_instr_create(mem_ctx, nir_op_imov);
-   nir_alu_src_copy(&mov->src[0], &vec->src[src_idx], mem_ctx);
-   nir_alu_dest_copy(&mov->dest, &vec->dest, mem_ctx);
+   nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_imov);
+   nir_alu_src_copy(&mov->src[0], &vec->src[start_idx], mov);
+   nir_alu_dest_copy(&mov->dest, &vec->dest, mov);
 
-   mov->dest.write_mask = (1u << start_channel);
-   mov->src[0].swizzle[start_channel] = vec->src[src_idx].swizzle[0];
-   src_idx++;
+   mov->dest.write_mask = (1u << start_idx);
+   mov->src[0].swizzle[start_idx] = vec->src[start_idx].swizzle[0];
 
-   for (unsigned i = start_channel + 1; i < 4; i++) {
+   for (unsigned i = start_idx + 1; i < 4; i++) {
       if (!(vec->dest.write_mask & (1 << i)))
          continue;
 
-      if (nir_srcs_equal(vec->src[src_idx].src, vec->src[start_src_idx].src)) {
+      if (nir_srcs_equal(vec->src[i].src, vec->src[start_idx].src)) {
          mov->dest.write_mask |= (1 << i);
-         mov->src[0].swizzle[i] = vec->src[src_idx].swizzle[0];
+         mov->src[0].swizzle[i] = vec->src[i].swizzle[0];
       }
-      src_idx++;
    }
 
    nir_instr_insert_before(&vec->instr, &mov->instr);
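The rewritten insert_mov takes a single start_idx because, for the vecN opcodes, each source feeds exactly one destination channel, so the channel index and the source index always coincide; the helper then folds every later channel that reads the same value into one MOV by widening the write mask. Below is a minimal standalone sketch of that grouping idea; the names and the plain-array representation are illustrative only (not the NIR API), and it uses GCC/Clang's __builtin_ctz:

#include <stdio.h>

int main(void)
{
   /* One source id per destination channel, e.g. vec4(a.x, a.y, b.x, a.w). */
   const int src_of_chan[4] = { 0, 0, 1, 0 };
   unsigned remaining = 0xf; /* write mask of channels still to emit */

   while (remaining) {
      const int first = __builtin_ctz(remaining); /* lowest pending channel */
      unsigned mask = 0;

      /* Fold every later channel reading the same source into this MOV. */
      for (int i = first; i < 4; i++) {
         if ((remaining & (1u << i)) && src_of_chan[i] == src_of_chan[first])
            mask |= 1u << i;
      }

      printf("mov src%d -> dest.%c%c%c%c\n", src_of_chan[first],
             (mask & 1) ? 'x' : '_', (mask & 2) ? 'y' : '_',
             (mask & 4) ? 'z' : '_', (mask & 8) ? 'w' : '_');

      remaining &= ~mask; /* those channels are done */
   }
   return 0;
}

For the example inputs this emits two MOVs, "src0 -> dest.xy_w" and "src1 -> dest.__z_", rather than one MOV per written channel.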
@@ -84,13 +80,121 @@
 }
 
 static bool
-lower_vec_to_movs_block(nir_block *block, void *mem_ctx)
+has_replicated_dest(nir_alu_instr *alu)
 {
+   return alu->op == nir_op_fdot_replicated2 ||
+          alu->op == nir_op_fdot_replicated3 ||
+          alu->op == nir_op_fdot_replicated4;
+}
+
+/* Attempts to coalesce the "move" from the given source of the vec to the
+ * destination of the instruction generating the value.  If, for whatever
+ * reason, we cannot coalesce the mov, it does nothing and returns 0.  We
+ * can then call insert_mov as normal.
+ */
+static unsigned
+try_coalesce(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
+{
+   assert(start_idx < nir_op_infos[vec->op].num_inputs);
+
+   /* We only even try if the source is SSA. */
+   if (!vec->src[start_idx].src.is_ssa)
+      return 0;
+
+   assert(vec->src[start_idx].src.ssa);
+
+   /* If we are going to do a reswizzle, then the vecN operation must be the
+    * only use of the source value.  We also can't have any source modifiers.
+    */
+   nir_foreach_use(vec->src[start_idx].src.ssa, src) {
+      if (src->parent_instr != &vec->instr)
+         return 0;
+
+      nir_alu_src *alu_src = exec_node_data(nir_alu_src, src, src);
+      if (alu_src->abs || alu_src->negate)
+         return 0;
+   }
+
+   if (!list_empty(&vec->src[start_idx].src.ssa->if_uses))
+      return 0;
+
+   if (vec->src[start_idx].src.ssa->parent_instr->type != nir_instr_type_alu)
+      return 0;
+
+   nir_alu_instr *src_alu =
+      nir_instr_as_alu(vec->src[start_idx].src.ssa->parent_instr);
+
+   if (has_replicated_dest(src_alu)) {
+      /* The fdot instruction is special: it replicates its result to all
+       * components.  This means that we can always rewrite its destination
+       * and we don't need to swizzle anything.
+       */
+   } else {
+      /* We can only reswizzle an instruction that acts per-component; the
+       * fdotN instructions, which implicitly splat their result out to all
+       * channels, are handled above.
+       */
+      if (nir_op_infos[src_alu->op].output_size != 0)
+         return 0;
+
+      /* If we are going to reswizzle the instruction, we can't have any
+       * non-per-component sources either.
+       */
+      for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+         if (nir_op_infos[src_alu->op].input_sizes[j] != 0)
+            return 0;
+   }
+
+   /* Stash off all of the ALU instruction's swizzles. */
+   uint8_t swizzles[4][4];
+   for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+      for (unsigned i = 0; i < 4; i++)
+         swizzles[j][i] = src_alu->src[j].swizzle[i];
+
+   unsigned write_mask = 0;
+   for (unsigned i = start_idx; i < 4; i++) {
+      if (!(vec->dest.write_mask & (1 << i)))
+         continue;
+
+      if (!vec->src[i].src.is_ssa ||
+          vec->src[i].src.ssa != &src_alu->dest.dest.ssa)
+         continue;
+
+      /* At this point, the given vec source matches up with the ALU
+       * instruction, so we can reswizzle that component to match.
+       */
+      write_mask |= 1 << i;
+      if (has_replicated_dest(src_alu)) {
+         /* Since the destination is a single replicated value, we don't
+          * need to do any reswizzling.
+          */
+      } else {
+         for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+            src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]];
+      }
+
+      /* Clear the vec source now that it is no longer needed. */
+      nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, NIR_SRC_INIT);
+   }
+
+   nir_instr_rewrite_dest(&src_alu->instr, &src_alu->dest.dest, vec->dest.dest);
+   src_alu->dest.write_mask = write_mask;
+
+   return write_mask;
+}
+
+static bool
+lower_vec_to_movs_block(nir_block *block, void *void_impl)
+{
+   nir_function_impl *impl = void_impl;
+   nir_shader *shader = impl->overload->function->shader;
+
    nir_foreach_instr_safe(block, instr) {
       if (instr->type != nir_instr_type_alu)
          continue;
 
-      nir_alu_instr *vec = (nir_alu_instr *)instr;
+      nir_alu_instr *vec = nir_instr_as_alu(instr);
 
       switch (vec->op) {
       case nir_op_vec2:
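The new try_coalesce is the heart of this patch: when the vec is the only user of an SSA value produced by a per-component ALU instruction, the MOV can be skipped entirely by rewriting that instruction to write the vec's destination directly. The subtle step is the swizzle composition near the end: if the producer now writes channel i, each of its sources must select, in lane i, whatever component it used to select in the lane the vec's swizzle was reading. A self-contained sketch of just that composition, with plain arrays standing in for nir_alu_src swizzles (illustrative, not NIR code):

#include <stdio.h>

int main(void)
{
   /* The producer's source swizzle before the rewrite, e.g. reading .yxwz. */
   unsigned char swz[4] = { 1, 0, 3, 2 };

   /* Stash the old swizzle first, as try_coalesce does, because the rewrite
    * overwrites entries it may still need to read. */
   unsigned char saved[4];
   for (int i = 0; i < 4; i++)
      saved[i] = swz[i];

   /* Suppose the vec's channel 2 selected component 0 of the producer: the
    * producer must now read, in lane 2, what it used to read in lane 0. */
   unsigned vec_chan = 2, vec_swz = 0;
   swz[vec_chan] = saved[vec_swz];

   printf("lane %u now reads component %u\n", vec_chan, swz[vec_chan]);
   return 0;
}

This mirrors the line src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]] above, and the stash explains why the swizzles[4][4] copy is taken before any lane is rewritten.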
@@ -101,8 +205,16 @@ lower_vec_to_movs_block(nir_block *block, void *mem_ctx)
          continue; /* The loop */
       }
 
-      /* Since we insert multiple MOVs, we have to be non-SSA. */
-      assert(!vec->dest.dest.is_ssa);
+      if (vec->dest.dest.is_ssa) {
+         /* Since we insert multiple MOVs, we need a register destination. */
+         nir_register *reg = nir_local_reg_create(impl);
+         reg->num_components = vec->dest.dest.ssa.num_components;
+
+         nir_ssa_def_rewrite_uses(&vec->dest.dest.ssa, nir_src_for_reg(reg));
+
+         nir_instr_rewrite_dest(&vec->instr, &vec->dest.dest,
+                                nir_dest_for_reg(reg));
+      }
 
       unsigned finished_write_mask = 0;
 
@@ -110,26 +222,26 @@ lower_vec_to_movs_block(nir_block *block, void *mem_ctx)
        * destination reg, in case other values we're populating in the dest
        * might overwrite them.
        */
-      for (unsigned i = 0, src_idx = 0; i < 4; i++) {
+      for (unsigned i = 0; i < 4; i++) {
         if (!(vec->dest.write_mask & (1 << i)))
            continue;
 
-         if (src_matches_dest_reg(&vec->dest.dest, &vec->src[src_idx].src)) {
-            finished_write_mask |= insert_mov(vec, i, src_idx, mem_ctx);
+         if (src_matches_dest_reg(&vec->dest.dest, &vec->src[i].src)) {
+            finished_write_mask |= insert_mov(vec, i, shader);
             break;
          }
-         src_idx++;
       }
 
       /* Now, emit MOVs for all the other src channels. */
-      for (unsigned i = 0, src_idx = 0; i < 4; i++) {
+      for (unsigned i = 0; i < 4; i++) {
         if (!(vec->dest.write_mask & (1 << i)))
            continue;
 
         if (!(finished_write_mask & (1 << i)))
-            finished_write_mask |= insert_mov(vec, i, src_idx, mem_ctx);
+            finished_write_mask |= try_coalesce(vec, i, shader);
 
-         src_idx++;
+         if (!(finished_write_mask & (1 << i)))
+            finished_write_mask |= insert_mov(vec, i, shader);
       }
 
       nir_instr_remove(&vec->instr);
@@ -142,7 +254,7 @@
 static void
 nir_lower_vec_to_movs_impl(nir_function_impl *impl)
 {
-   nir_foreach_block(impl, lower_vec_to_movs_block, ralloc_parent(impl));
+   nir_foreach_block(impl, lower_vec_to_movs_block, impl);
 }
 
 void
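One subtlety in lower_vec_to_movs_block deserves a note: once the vec's SSA destination has been turned into a register, a source channel may read the very register being written, so the first loop emits the one MOV whose source aliases the destination before the second loop fills in the remaining channels. A small sketch of why that order matters, with plain floats standing in for register channels (illustrative only):

#include <stdio.h>

int main(void)
{
   /* Lowering r0.xy = vec2(r1.x, r0.x): channel y reads the destination
    * register, so its MOV must come first. */
   float r0_x = 5.0f, r0_y = 0.0f, r1_x = 9.0f;

   r0_y = r0_x; /* mov r0.y, r0.x -- aliasing source, emitted first */
   r0_x = r1_x; /* mov r0.x, r1.x -- safe now; nothing still reads r0.x */

   printf("r0 = (%g, %g)\n", r0_x, r0_y); /* (9, 5), as the vec2 intended */
   return 0;
}

Had the MOVs been emitted in plain channel order, r0.x would have been overwritten with 9 before r0.y read it, and both channels would have ended up 9.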