struct ir3_shader_variant *so;
};
-/* is it a type preserving mov, with ok flags? */
-static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
+/* is it a type preserving mov, with ok flags?
+ *
+ * @instr: the mov to consider removing
+ * @dst_instr: the instruction consuming the mov (instr)
+ *
+ * TODO maybe drop allow_flags since this is only false when dst_instr
+ * is NULL (ie. outputs)
+ */
+static bool is_eligible_mov(struct ir3_instruction *instr,
+ struct ir3_instruction *dst_instr, bool allow_flags)
{
if (is_same_type_mov(instr)) {
struct ir3_register *dst = instr->regs[0];
IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
return false;
- /* TODO: remove this hack: */
- if (src_instr->opc == OPC_META_FO)
- return false;
+ /* If src is coming from fanout/split (ie. one component of a
+ * texture fetch, etc) and we have constraints on swizzle of
+ * destination, then skip it.
+ *
+ * We could possibly do a bit better, and copy-propagate if
+ * we can CP all components that are being fanned out.
+ */
+ if (src_instr->opc == OPC_META_SPLIT) {
+ if (!dst_instr)
+ return false;
+ if (dst_instr->opc == OPC_META_COLLECT)
+ return false;
+ if (dst_instr->cp.left || dst_instr->cp.right)
+ return false;
+ }
return true;
}
if ((flags & IR3_REG_IMMED) && (reg->flags & IR3_REG_IMMED))
return false;
}
- /* cannot be const + ABS|NEG: */
- if (flags & (IR3_REG_FABS | IR3_REG_FNEG |
- IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
- return false;
}
break;
case 3:
return false;
}
- if (flags & IR3_REG_CONST) {
- /* cannot be const + ABS|NEG: */
- if (flags & (IR3_REG_FABS | IR3_REG_FNEG |
- IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
- return false;
- }
break;
case 4:
/* seems like blob compiler avoids const as src.. */
if ((instr->opc == OPC_STL) && (n != 2))
return false;
+ if (instr->opc == OPC_STLW && n == 0)
+ return false;
+
+ if (instr->opc == OPC_LDLW && n == 0)
+ return false;
+
/* disallow CP into anything but the SSBO slot argument for
* atomics:
*/
if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
return false;
- /* as with atomics, ldib on a6xx can only have immediate for
- * SSBO slot argument
+ if (instr->opc == OPC_STG && (instr->flags & IR3_INSTR_G) && (n != 2))
+ return false;
+
+ /* as with atomics, ldib and ldc on a6xx can only have immediate
+ * for SSBO slot argument
*/
- if ((instr->opc == OPC_LDIB) && (n != 0))
+ if ((instr->opc == OPC_LDIB || instr->opc == OPC_LDC) && (n != 0))
return false;
}
reg = ir3_reg_clone(ctx->shader, reg);
+ /* Half constant registers seem to handle only 32-bit values
+ * within floating-point opcodes. So convert back to 32-bit values.
+ */
+ if (f_opcode && (new_flags & IR3_REG_HALF))
+ reg->uim_val = fui(_mesa_half_to_float(reg->uim_val));
+
/* in some cases, there are restrictions on (abs)/(neg) plus const..
* so just evaluate those and clear the flags:
*/
swiz = i % 4;
idx = i / 4;
- /* Half constant registers seems to handle only 32-bit values
- * within floating-point opcodes. So convert back to 32-bit values. */
- if (f_opcode && (new_flags & IR3_REG_HALF)) {
- reg->uim_val = fui(_mesa_half_to_float(reg->uim_val));
- }
-
const_state->immediates[idx].val[swiz] = reg->uim_val;
const_state->immediates_count = idx + 1;
const_state->immediate_idx++;
*/
swap(instr->regs[0 + 1], instr->regs[1 + 1]);
+ /* cat3 doesn't encode immediate, but we can lower immediate
+ * to const if that helps:
+ */
+ if (new_flags & IR3_REG_IMMED) {
+ new_flags &= ~IR3_REG_IMMED;
+ new_flags |= IR3_REG_CONST;
+ }
+
bool valid_swap =
/* can we propagate mov if we move 2nd src to first? */
valid_flags(instr, 0, new_flags) &&
* src (which needs to also fixup the address src reference by the
* instruction).
*/
-static void
+static bool
reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
struct ir3_register *reg, unsigned n)
{
struct ir3_instruction *src = ssa(reg);
- if (is_eligible_mov(src, true)) {
+ if (is_eligible_mov(src, instr, true)) {
/* simple case, no immed/const/relativ, only mov's w/ ssa src: */
struct ir3_register *src_reg = src->regs[1];
unsigned new_flags = reg->flags;
unuse(src);
reg->instr->use_count++;
- }
- } else if (is_same_type_mov(src) &&
+ return true;
+ }
+ } else if ((is_same_type_mov(src) || is_const_mov(src)) &&
/* cannot collapse const/immed/etc into meta instrs: */
!is_meta(instr)) {
/* immed/const/etc cases, which require some special handling: */
debug_assert(new_flags & IR3_REG_IMMED);
instr->regs[n + 1] = lower_immed(ctx, src_reg, new_flags, f_opcode);
- return;
+ return true;
}
/* special case for "normal" mad instructions, we can
* src[0] is !CONST and src[1] is CONST:
*/
if ((n == 1) && try_swap_mad_two_srcs(instr, new_flags)) {
- /* we swapped, so now we are dealing with 1st src: */
- n = 0;
+ return true;
} else {
- return;
+ return false;
}
}
*/
if ((src_reg->flags & IR3_REG_RELATIV) &&
conflicts(instr->address, reg->instr->address))
- return;
+ return false;
/* This seems to be a hw bug, or something where the timings
* just somehow don't work out. This restriction may only
if ((opc_cat(instr->opc) == 3) && (n == 2) &&
(src_reg->flags & IR3_REG_RELATIV) &&
(src_reg->array.offset == 0))
- return;
+ return false;
+
+ /* When narrowing constant from 32b to 16b, it seems
+ * to work only for float. So we should do this only with
+ * float opcodes.
+ */
+ if (src->cat1.dst_type == TYPE_F16) {
+ if (instr->opc == OPC_MOV && !type_float(instr->cat1.src_type))
+ return false;
+ if (!ir3_cat2_float(instr->opc) && !ir3_cat3_float(instr->opc))
+ return false;
+ }
src_reg = ir3_reg_clone(instr->block->shader, src_reg);
src_reg->flags = new_flags;
if (src_reg->flags & IR3_REG_RELATIV)
ir3_instr_set_address(instr, reg->instr->address);
- return;
+ return true;
}
if ((src_reg->flags & IR3_REG_RELATIV) &&
instr->regs[n+1] = src_reg;
ir3_instr_set_address(instr, reg->instr->address);
- return;
+ return true;
}
/* NOTE: seems we can only do immed integers, so don't
iim_val = ~iim_val;
/* other than category 1 (mov) we can only encode up to 10 bits: */
- if ((instr->opc == OPC_MOV) ||
- !((iim_val & ~0x3ff) && (-iim_val & ~0x3ff))) {
+ if (valid_flags(instr, n, new_flags) &&
+ ((instr->opc == OPC_MOV) ||
+ !((iim_val & ~0x3ff) && (-iim_val & ~0x3ff)))) {
new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT);
src_reg = ir3_reg_clone(instr->block->shader, src_reg);
src_reg->flags = new_flags;
src_reg->iim_val = iim_val;
instr->regs[n+1] = src_reg;
+
+ return true;
} else if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
bool f_opcode = (ir3_cat2_float(instr->opc) ||
ir3_cat3_float(instr->opc)) ? true : false;
/* See if lowering an immediate to const would help. */
instr->regs[n+1] = lower_immed(ctx, src_reg, new_flags, f_opcode);
+
+ return true;
}
- return;
}
}
+
+ return false;
}
/* Handle special case of eliminating output mov, and similar cases where
static struct ir3_instruction *
eliminate_output_mov(struct ir3_instruction *instr)
{
- if (is_eligible_mov(instr, false)) {
+ if (is_eligible_mov(instr, NULL, false)) {
struct ir3_register *reg = instr->regs[1];
if (!(reg->flags & IR3_REG_ARRAY)) {
struct ir3_instruction *src_instr = ssa(reg);
return;
/* walk down the graph from each src: */
- foreach_src_n(reg, n, instr) {
- struct ir3_instruction *src = ssa(reg);
+ bool progress;
+ do {
+ progress = false;
+ foreach_src_n (reg, n, instr) {
+ struct ir3_instruction *src = ssa(reg);
- if (!src)
- continue;
+ if (!src)
+ continue;
- instr_cp(ctx, src);
+ instr_cp(ctx, src);
- /* TODO non-indirect access we could figure out which register
- * we actually want and allow cp..
- */
- if (reg->flags & IR3_REG_ARRAY)
- continue;
+ /* TODO non-indirect access we could figure out which register
+ * we actually want and allow cp..
+ */
+ if (reg->flags & IR3_REG_ARRAY)
+ continue;
- /* Don't CP absneg into meta instructions, that won't end well: */
- if (is_meta(instr) && (src->opc != OPC_MOV))
- continue;
+ /* Don't CP absneg into meta instructions, that won't end well: */
+ if (is_meta(instr) && (src->opc != OPC_MOV))
+ continue;
- reg_cp(ctx, instr, reg, n);
- }
+ progress |= reg_cp(ctx, instr, reg, n);
+ }
+ } while (progress);
if (instr->regs[0]->flags & IR3_REG_ARRAY) {
struct ir3_instruction *src = ssa(instr->regs[0]);
}
}
- /* Handle converting a sam.s2en (taking samp/tex idx params via
- * register) into a normal sam (encoding immediate samp/tex idx)
- * if they are immediate. This saves some instructions and regs
- * in the common case where we know samp/tex at compile time:
+ /* Handle converting a sam.s2en (taking samp/tex idx params via register)
+ * into a normal sam (encoding immediate samp/tex idx) if they are
+ * immediate. This saves some instructions and regs in the common case
+ * where we know samp/tex at compile time. This needs to be done in the
+ * frontend for bindless tex, though, so don't replicate it here.
*/
if (is_tex(instr) && (instr->flags & IR3_INSTR_S2EN) &&
+ !(instr->flags & IR3_INSTR_B) &&
!(ir3_shader_debug & IR3_DBG_FORCES2EN)) {
- /* The first src will be a fan-in (collect), if both of it's
+ /* The first src will be a collect, if both of its
* two sources are mov from imm, then we can
*/
struct ir3_instruction *samp_tex = ssa(instr->regs[1]);
- debug_assert(samp_tex->opc == OPC_META_FI);
+ debug_assert(samp_tex->opc == OPC_META_COLLECT);
struct ir3_instruction *samp = ssa(samp_tex->regs[1]);
struct ir3_instruction *tex = ssa(samp_tex->regs[2]);
instr->flags &= ~IR3_INSTR_S2EN;
instr->cat5.samp = samp->regs[1]->iim_val;
instr->cat5.tex = tex->regs[1]->iim_val;
- instr->regs[1]->instr = NULL;
+
+ /* shuffle around the regs to remove the first src: */
+ instr->regs_count--;
+ for (unsigned i = 1; i < instr->regs_count; i++) {
+ instr->regs[i] = instr->regs[i + 1];
+ }
}
}
}
* a mov, so we need to do a pass to first count consumers of a
* mov.
*/
- list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
- list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+ foreach_block (block, &ir->block_list) {
+ foreach_instr (instr, &block->instr_list) {
struct ir3_instruction *src;
/* by the way, we don't account for false-dep's, so the CP
*/
debug_assert(instr->deps_count == 0);
- foreach_ssa_src(src, instr) {
+ foreach_ssa_src (src, instr) {
src->use_count++;
}
}
ir3_clear_mark(ir);
- for (unsigned i = 0; i < ir->noutputs; i++) {
- if (ir->outputs[i]) {
- instr_cp(&ctx, ir->outputs[i]);
- ir->outputs[i] = eliminate_output_mov(ir->outputs[i]);
- }
+ struct ir3_instruction *out;
+ foreach_output_n (out, n, ir) {
+ instr_cp(&ctx, out);
+ ir->outputs[n] = eliminate_output_mov(out);
}
- list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+ foreach_block (block, &ir->block_list) {
if (block->condition) {
instr_cp(&ctx, block->condition);
block->condition = eliminate_output_mov(block->condition);