From b616164c95cb495ce43f6b61dc805ed911a85e89 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Fri, 23 Jan 2015 17:31:12 -0800 Subject: [PATCH] i965/fs: Emit better b2f of an expression on GEN4 and GEN5 On platforms that do not natively generate 0u and ~0u for Boolean results, b2f expressions that look like f = b2f(expr cmp 0) will generate better code by pretending the expression is f = ir_triop_csel(0.0, 1.0, expr cmp 0) This is because the last instruction of "expr" can generate the condition code for the "cmp 0". This avoids having to do the "-(b & 1)" trick to generate 0u or ~0u for the Boolean result. This means code like mov(16) g16<1>F 1F mul.ge.f0(16) null g6<8,8,1>F g14<8,8,1>F (+f0) sel(16) m6<1>F g16<8,8,1>F 0F will be generated instead of mul(16) g2<1>F g12<8,8,1>F g4<8,8,1>F cmp.ge.f0(16) g2<1>D g4<8,8,1>F 0F and(16) g4<1>D g2<8,8,1>D 1D and(16) m6<1>D -g4<8,8,1>D 0x3f800000UD v2: When the comparison is either == 0.0 or != 0.0, use the knowledge that the true (or false) case already results in zero to allow better code generation by possibly avoiding a load-immediate instruction. v3: Apply the optimization even when neither comparator is zero. Shader-db results: GM45 (0x2A42): total instructions in shared programs: 3551002 -> 3550829 (-0.00%) instructions in affected programs: 33269 -> 33096 (-0.52%) helped: 121 Iron Lake (0x0046): total instructions in shared programs: 4993327 -> 4993146 (-0.00%) instructions in affected programs: 34199 -> 34018 (-0.53%) helped: 129 No change on other platforms. 
Signed-off-by: Ian Romanick Reviewed-by: Tapani Palli --- src/mesa/drivers/dri/i965/brw_fs.h | 2 + src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 101 ++++++++++++++++++- 2 files changed, 99 insertions(+), 4 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 77165297d0a..23e71353992 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -307,6 +307,7 @@ public: const fs_reg &a); void emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst, const fs_reg &src0, const fs_reg &src1); + bool try_emit_b2f_of_comparison(ir_expression *ir); bool try_emit_saturate(ir_expression *ir); bool try_emit_line(ir_expression *ir); bool try_emit_mad(ir_expression *ir); @@ -317,6 +318,7 @@ public: bool opt_saturate_propagation(); bool opt_cmod_propagation(); void emit_bool_to_cond_code(ir_rvalue *condition); + void emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3]); void emit_if_gen6(ir_if *ir); void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg, uint32_t spill_offset, int count); diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 6d56115a443..0d5252ab08e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -475,6 +475,87 @@ fs_visitor::try_emit_mad(ir_expression *ir) return true; } +bool +fs_visitor::try_emit_b2f_of_comparison(ir_expression *ir) +{ + /* On platforms that do not natively generate 0u and ~0u for Boolean + * results, b2f expressions that look like + * + * f = b2f(expr cmp 0) + * + * will generate better code by pretending the expression is + * + * f = ir_triop_csel(0.0, 1.0, expr cmp 0) + * + * This is because the last instruction of "expr" can generate the + * condition code for the "cmp 0". This avoids having to do the "-(b & 1)" + * trick to generate 0u or ~0u for the Boolean result. 
This means code like + * + * mov(16) g16<1>F 1F + * mul.ge.f0(16) null g6<8,8,1>F g14<8,8,1>F + * (+f0) sel(16) m6<1>F g16<8,8,1>F 0F + * + * will be generated instead of + * + * mul(16) g2<1>F g12<8,8,1>F g4<8,8,1>F + * cmp.ge.f0(16) g2<1>D g4<8,8,1>F 0F + * and(16) g4<1>D g2<8,8,1>D 1D + * and(16) m6<1>D -g4<8,8,1>D 0x3f800000UD + * + * When the comparison is != 0.0, using the knowledge that the false case + * already results in zero allows better code generation by possibly + * avoiding a load-immediate instruction. + */ + ir_expression *cmp = ir->operands[0]->as_expression(); + if (cmp == NULL) + return false; + + if (cmp->operation == ir_binop_nequal) { + for (unsigned i = 0; i < 2; i++) { + ir_constant *c = cmp->operands[i]->as_constant(); + if (c == NULL || !c->is_zero()) + continue; + + ir_expression *expr = cmp->operands[i ^ 1]->as_expression(); + if (expr != NULL) { + fs_reg op[2]; + + for (unsigned j = 0; j < 2; j++) { + cmp->operands[j]->accept(this); + op[j] = this->result; + + resolve_ud_negate(&op[j]); + } + + emit_bool_to_cond_code_of_reg(cmp, op); + + /* The comparison is expr != 0, so when the condition is false + * op[i ^ 1] already holds zero. Invert the predicate so SEL picks + * op[i ^ 1] (src0) when false and 1.0f (src1) when true. 
+ */ + this->result = vgrf(ir->type); + op[i ^ 1].type = BRW_REGISTER_TYPE_F; + + fs_inst *inst = emit(SEL(this->result, op[i ^ 1], fs_reg(1.0f))); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->predicate_inverse = true; + return true; + } + } + } + + emit_bool_to_cond_code(cmp); + + fs_reg temp = vgrf(ir->type); + emit(MOV(temp, fs_reg(1.0f))); + + this->result = vgrf(ir->type); + fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f))); + inst->predicate = BRW_PREDICATE_NORMAL; + + return true; +} + static int pack_pixel_offset(float x) { @@ -639,6 +720,11 @@ fs_visitor::visit(ir_expression *ir) inst->predicate = BRW_PREDICATE_NORMAL; return; + case ir_unop_b2f: + if (brw->gen <= 5 && try_emit_b2f_of_comparison(ir)) + return; + break; + case ir_unop_interpolate_at_centroid: case ir_binop_interpolate_at_offset: case ir_binop_interpolate_at_sample: @@ -2508,7 +2594,6 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) } fs_reg op[3]; - fs_inst *inst; assert(expr->get_num_operands() <= 3); for (unsigned int i = 0; i < expr->get_num_operands(); i++) { @@ -2520,6 +2605,14 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) resolve_ud_negate(&op[i]); } + emit_bool_to_cond_code_of_reg(expr, op); +} + +void +fs_visitor::emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3]) +{ + fs_inst *inst; + switch (expr->operation) { case ir_unop_logic_not: inst = emit(AND(reg_null_d, op[0], fs_reg(1))); @@ -2528,7 +2621,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) case ir_binop_logic_xor: if (brw->gen <= 5) { - fs_reg temp = vgrf(ir->type); + fs_reg temp = vgrf(expr->type); emit(XOR(temp, op[0], op[1])); inst = emit(AND(reg_null_d, temp, fs_reg(1))); } else { @@ -2539,7 +2632,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) case ir_binop_logic_or: if (brw->gen <= 5) { - fs_reg temp = vgrf(ir->type); + fs_reg temp = vgrf(expr->type); emit(OR(temp, op[0], op[1])); inst = emit(AND(reg_null_d, temp, fs_reg(1))); } else { @@ 
-2550,7 +2643,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) case ir_binop_logic_and: if (brw->gen <= 5) { - fs_reg temp = vgrf(ir->type); + fs_reg temp = vgrf(expr->type); emit(AND(temp, op[0], op[1])); inst = emit(AND(reg_null_d, temp, fs_reg(1))); } else { -- 2.30.2