}
}
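+/* MIN(a, a) == MAX(a, a) == a, so we can forward the source directly. */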
+static void
+handle_min_max(struct nv_pass *ctx, struct nv_instruction *nvi)
+{
+ struct nv_value *src0 = nvi->src[0]->value;
+ struct nv_value *src1 = nvi->src[1]->value;
+
+ if (src0 != src1 || (nvi->src[0]->mod | nvi->src[1]->mod))
+ return;
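+ /* Only forward GPR values; an immediate or memory source might not be
+ * permitted in every instruction that consumes the result.
+ */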
+ if (src0->reg.file != NV_FILE_GPR)
+ return;
+ nvc0_pc_replace_value(ctx->pc, nvi->def[0], src0);
+ nvc0_insn_delete(nvi);
+}
+
+/* Check if we can combine a MUL and an ADD into a single MAD/FMA. */
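+/* E.g.: t = MUL(a, b); d = ADD(t, c) ==> d = MAD(a, b, c), if t has no other uses. */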
+static void
+handle_add_mul(struct nv_pass *ctx, struct nv_instruction *nvi)
+{
+ struct nv_value *src0 = nvi->src[0]->value;
+ struct nv_value *src1 = nvi->src[1]->value;
+ struct nv_value *src;
+ int s;
+ uint8_t mod[4];
+
+ if (SRC_IS_MUL(src0) && src0->refc == 1) s = 0;
+ else
+ if (SRC_IS_MUL(src1) && src1->refc == 1) s = 1;
+ else
+ return;
+
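+ /* Both source definitions must live in the same block as the ADD. */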
+ if ((src0->insn && src0->insn->bb != nvi->bb) ||
+ (src1->insn && src1->insn->bb != nvi->bb))
+ return;
+
+ /* Sources may have become immediates through the constant folding above. */
+ if (src0->reg.file != NV_FILE_GPR || src1->reg.file != NV_FILE_GPR)
+ return;
+ src = nvi->src[s]->value;
+
+ mod[0] = nvi->src[0]->mod;
+ mod[1] = nvi->src[1]->mod;
+ mod[2] = src->insn->src[0]->mod;
+ mod[3] = src->insn->src[1]->mod;
+
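+ /* Only negation modifiers can be carried over into the MAD sources. */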
+ if ((mod[0] | mod[1] | mod[2] | mod[3]) & ~NV_MOD_NEG)
+ return;
+
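+ /* Rewrite the ADD into a MAD: the remaining ADD operand becomes the
+ * addend in src[2], and the MUL's operands become src[0] and src[1].
+ */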
+ nvi->opcode = NV_OP_MAD_F32;
+
+ nv_reference(ctx->pc, nvi, s, NULL);
+ nvi->src[2] = nvi->src[!s];
+ nvi->src[!s] = NULL;
+
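+ /* A NEG on the MUL result is pushed into the MUL's first operand;
+ * the XOR makes two negations cancel.
+ */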
+ nv_reference(ctx->pc, nvi, 0, src->insn->src[0]->value);
+ nvi->src[0]->mod = mod[2] ^ mod[s];
+ nv_reference(ctx->pc, nvi, 1, src->insn->src[1]->value);
+ nvi->src[1]->mod = mod[3];
+}
+
static int
-nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b)
+nv_pass_algebraic_opt(struct nv_pass *ctx, struct nv_basic_block *b)
{
struct nv_instruction *nvi, *next;
int j;
for (nvi = b->entry; nvi; nvi = next) {
- struct nv_value *src0, *src1, *src;
- int s;
- uint8_t mod[4];
+ struct nv_value *src0, *src1;
+ uint baseop = NV_BASEOP(nvi->opcode);
next = nvi->next;
src0 = nvc0_pc_find_immediate(nvi->src[0]);
src1 = nvc0_pc_find_immediate(nvi->src[1]);
- if (src0 && src1)
+ if (src0 && src1) {
constant_expression(ctx->pc, nvi, src0, src1);
- else {
+ } else {
if (src0)
constant_operand(ctx->pc, nvi, src0, 0);
else
if (src1)
constant_operand(ctx->pc, nvi, src1, 1);
}
- /* check if we can MUL + ADD -> MAD/FMA */
- if (nvi->opcode != NV_OP_ADD)
- continue;
-
- src0 = nvi->src[0]->value;
- src1 = nvi->src[1]->value;
-
- if (SRC_IS_MUL(src0) && src0->refc == 1)
- src = src0;
- else
- if (SRC_IS_MUL(src1) && src1->refc == 1)
- src = src1;
+ if (baseop == NV_OP_MIN || baseop == NV_OP_MAX)
+ handle_min_max(ctx, nvi);
else
- continue;
-
- /* could have an immediate from above constant_* */
- if (src0->reg.file != NV_FILE_GPR || src1->reg.file != NV_FILE_GPR)
- continue;
- s = (src == src0) ? 0 : 1;
-
- mod[0] = nvi->src[0]->mod;
- mod[1] = nvi->src[1]->mod;
- mod[2] = src->insn->src[0]->mod;
- mod[3] = src->insn->src[0]->mod;
-
- if ((mod[0] | mod[1] | mod[2] | mod[3]) & ~NV_MOD_NEG)
- continue;
-
- nvi->opcode = NV_OP_MAD;
- nv_reference(ctx->pc, nvi, s, NULL);
- nvi->src[2] = nvi->src[!s];
-
- nvi->src[0] = new_ref(ctx->pc, src->insn->src[0]->value);
- nvi->src[1] = new_ref(ctx->pc, src->insn->src[1]->value);
- nvi->src[0]->mod = mod[2] ^ mod[s];
- nvi->src[1]->mod = mod[3];
+ if (nvi->opcode == NV_OP_ADD_F32)
+ handle_add_mul(ctx, nvi);
}
- DESCEND_ARBITRARY(j, nv_pass_lower_arith);
+ DESCEND_ARBITRARY(j, nv_pass_algebraic_opt);
return 0;
}
pass.n = 0;
pass.pc = pc;
+ /* Do CSE so we can just compare values by pointer in subsequent passes. */
+ pc->pass_seq++;
+ ret = nv_pass_cse(&pass, root);
+ if (ret)
+ return ret;
+
/* Do this first, so we don't have to pay attention
* to whether sources are supported memory loads.
*/
pc->pass_seq++;
- ret = nv_pass_lower_arith(&pass, root);
+ ret = nv_pass_algebraic_opt(&pass, root);
if (ret)
return ret;
reldelim->pc = pc;
}
- pc->pass_seq++;
- ret = nv_pass_cse(&pass, root);
- if (ret)
- return ret;
-
+ /* We may run DCE before load-combining, since the latter cleans up
+ * after itself.
+ */
dce.pc = pc;
do {
dce.removed = 0;