* which ones have been processed.
*/
static unsigned
-insert_mov(nir_alu_instr *vec, unsigned start_channel,
- unsigned start_src_idx, void *mem_ctx)
+insert_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
{
- unsigned src_idx = start_src_idx;
- assert(src_idx < nir_op_infos[vec->op].num_inputs);
+ assert(start_idx < nir_op_infos[vec->op].num_inputs);
- nir_alu_instr *mov = nir_alu_instr_create(mem_ctx, nir_op_imov);
- nir_alu_src_copy(&mov->src[0], &vec->src[src_idx], mem_ctx);
- nir_alu_dest_copy(&mov->dest, &vec->dest, mem_ctx);
+ nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_imov);
+ nir_alu_src_copy(&mov->src[0], &vec->src[start_idx], mov);
+ nir_alu_dest_copy(&mov->dest, &vec->dest, mov);
- mov->dest.write_mask = (1u << start_channel);
- mov->src[0].swizzle[start_channel] = vec->src[src_idx].swizzle[0];
- src_idx++;
+ mov->dest.write_mask = (1u << start_idx);
+ mov->src[0].swizzle[start_idx] = vec->src[start_idx].swizzle[0];
- for (unsigned i = start_channel + 1; i < 4; i++) {
+ for (unsigned i = start_idx + 1; i < 4; i++) {
if (!(vec->dest.write_mask & (1 << i)))
continue;
- if (nir_srcs_equal(vec->src[src_idx].src, vec->src[start_src_idx].src)) {
+ if (nir_srcs_equal(vec->src[i].src, vec->src[start_idx].src)) {
mov->dest.write_mask |= (1 << i);
- mov->src[0].swizzle[i] = vec->src[src_idx].swizzle[0];
+ mov->src[0].swizzle[i] = vec->src[i].swizzle[0];
}
- src_idx++;
}
nir_instr_insert_before(&vec->instr, &mov->instr);
}
static bool
-lower_vec_to_movs_block(nir_block *block, void *mem_ctx)
+has_replicated_dest(nir_alu_instr *alu)
{
+ return alu->op == nir_op_fdot_replicated2 ||
+ alu->op == nir_op_fdot_replicated3 ||
+ alu->op == nir_op_fdot_replicated4;
+}
+
+/* Attempts to coalesce the "move" from the given source of the vec to the
+ * destination of the instruction generating the value. If, for whatever
+ * reason, we cannot coalesce the move, it does nothing and returns 0. We
+ * can then call insert_mov as normal.
+ */
+static unsigned
+try_coalesce(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
+{
+ assert(start_idx < nir_op_infos[vec->op].num_inputs);
+
+ /* We will only even try if the source is SSA */
+ if (!vec->src[start_idx].src.is_ssa)
+ return 0;
+
+ assert(vec->src[start_idx].src.ssa);
+
+ /* If we are going to do a reswizzle, then the vecN operation must be the
+ * only use of the source value. We also can't have any source modifiers.
+ */
+ nir_foreach_use(vec->src[start_idx].src.ssa, src) {
+ if (src->parent_instr != &vec->instr)
+ return 0;
+
+ nir_alu_src *alu_src = exec_node_data(nir_alu_src, src, src);
+ if (alu_src->abs || alu_src->negate)
+ return 0;
+ }
+
+ if (!list_empty(&vec->src[start_idx].src.ssa->if_uses))
+ return 0;
+
+ if (vec->src[start_idx].src.ssa->parent_instr->type != nir_instr_type_alu)
+ return 0;
+
+ nir_alu_instr *src_alu =
+ nir_instr_as_alu(vec->src[start_idx].src.ssa->parent_instr);
+
+ if (has_replicated_dest(src_alu)) {
+ /* The fdot instruction is special: It replicates its result to all
+ * components. This means that we can always rewrite its destination
+ * and we don't need to swizzle anything.
+ */
+ } else {
+ /* We only care about being able to re-swizzle the instruction if it is
+ * something that we can reswizzle. It must be per-component. The one
+ * exception to this is the fdotN instructions which implicitly splat
+ * their result out to all channels.
+ */
+ if (nir_op_infos[src_alu->op].output_size != 0)
+ return 0;
+
+ /* If we are going to reswizzle the instruction, we can't have any
+ * non-per-component sources either.
+ */
+ for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+ if (nir_op_infos[src_alu->op].input_sizes[j] != 0)
+ return 0;
+ }
+
+ /* Stash off all of the ALU instruction's swizzles. */
+ uint8_t swizzles[4][4];
+ for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+ for (unsigned i = 0; i < 4; i++)
+ swizzles[j][i] = src_alu->src[j].swizzle[i];
+
+ unsigned write_mask = 0;
+ for (unsigned i = start_idx; i < 4; i++) {
+ if (!(vec->dest.write_mask & (1 << i)))
+ continue;
+
+ if (!vec->src[i].src.is_ssa ||
+ vec->src[i].src.ssa != &src_alu->dest.dest.ssa)
+ continue;
+
+ /* At this point, the given vec source matches up with the ALU
+ * instruction so we can re-swizzle that component to match.
+ */
+ write_mask |= 1 << i;
+ if (has_replicated_dest(src_alu)) {
+ /* Since the destination is a single replicated value, we don't need
+ * to do any reswizzling
+ */
+ } else {
+ for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+ src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]];
+ }
+
+ /* Clear the no longer needed vec source */
+ nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, NIR_SRC_INIT);
+ }
+
+ nir_instr_rewrite_dest(&src_alu->instr, &src_alu->dest.dest, vec->dest.dest);
+ src_alu->dest.write_mask = write_mask;
+
+ return write_mask;
+}
+
+static bool
+lower_vec_to_movs_block(nir_block *block, void *void_impl)
+{
+ nir_function_impl *impl = void_impl;
+ nir_shader *shader = impl->overload->function->shader;
+
nir_foreach_instr_safe(block, instr) {
if (instr->type != nir_instr_type_alu)
continue;
- nir_alu_instr *vec = (nir_alu_instr *)instr;
+ nir_alu_instr *vec = nir_instr_as_alu(instr);
switch (vec->op) {
case nir_op_vec2:
continue; /* The loop */
}
- /* Since we insert multiple MOVs, we have to be non-SSA. */
- assert(!vec->dest.dest.is_ssa);
+ if (vec->dest.dest.is_ssa) {
+ /* Since we insert multiple MOVs, we need a register destination; convert the SSA def to a local register. */
+ nir_register *reg = nir_local_reg_create(impl);
+ reg->num_components = vec->dest.dest.ssa.num_components;
+
+ nir_ssa_def_rewrite_uses(&vec->dest.dest.ssa, nir_src_for_reg(reg));
+
+ nir_instr_rewrite_dest(&vec->instr, &vec->dest.dest,
+ nir_dest_for_reg(reg));
+ }
unsigned finished_write_mask = 0;
* destination reg, in case other values we're populating in the dest
* might overwrite them.
*/
- for (unsigned i = 0, src_idx = 0; i < 4; i++) {
+ for (unsigned i = 0; i < 4; i++) {
if (!(vec->dest.write_mask & (1 << i)))
continue;
- if (src_matches_dest_reg(&vec->dest.dest, &vec->src[src_idx].src)) {
- finished_write_mask |= insert_mov(vec, i, src_idx, mem_ctx);
+ if (src_matches_dest_reg(&vec->dest.dest, &vec->src[i].src)) {
+ finished_write_mask |= insert_mov(vec, i, shader);
break;
}
- src_idx++;
}
/* Now, emit MOVs for all the other src channels. */
- for (unsigned i = 0, src_idx = 0; i < 4; i++) {
+ for (unsigned i = 0; i < 4; i++) {
if (!(vec->dest.write_mask & (1 << i)))
continue;
if (!(finished_write_mask & (1 << i)))
- finished_write_mask |= insert_mov(vec, i, src_idx, mem_ctx);
+ finished_write_mask |= try_coalesce(vec, i, shader);
- src_idx++;
+ if (!(finished_write_mask & (1 << i)))
+ finished_write_mask |= insert_mov(vec, i, shader);
}
nir_instr_remove(&vec->instr);
static void
nir_lower_vec_to_movs_impl(nir_function_impl *impl)
{
- nir_foreach_block(impl, lower_vec_to_movs_block, ralloc_parent(impl));
+ nir_foreach_block(impl, lower_vec_to_movs_block, impl);
}
void