From 56879a7ac41b8c7513a97cc02921f76a2ec8407c Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Sun, 23 Feb 2014 16:29:46 -0800 Subject: [PATCH] i965/vec4: Handle ir_triop_lrp on Gen4-5 as well. When the vec4 backend encountered an ir_triop_lrp, it always emitted an actual LRP instruction, which only exists on Gen6+. Gen4-5 used lower_instructions() to decompose ir_triop_lrp at the IR level. Since commit 8d37e9915a3b21 ("glsl: Optimize open-coded lrp into lrp."), we've had an bug where lower_instructions translates ir_triop_lrp into arithmetic, but opt_algebraic reassembles it back into a lrp. To avoid this ordering concern, just handle ir_triop_lrp in the backend. The FS backend already does this, so we may as well do likewise. v2: Add a comment reminding us that we could emit better assembly if we implemented the infrastructure necessary to support using MAC. (Assembly code provided by Eric Anholt). Cc: "10.1" Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=75253 Signed-off-by: Kenneth Graunke Reviewed-by: Matt Turner Acked-by: Eric Anholt --- src/mesa/drivers/dri/i965/brw_vec4.h | 3 ++ .../drivers/dri/i965/brw_vec4_visitor.cpp | 42 +++++++++++++++---- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index 6bd8b80cf2f..fb5c0a6d665 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -506,6 +506,9 @@ public: void emit_minmax(uint32_t condmod, dst_reg dst, src_reg src0, src_reg src1); + void emit_lrp(const dst_reg &dst, + const src_reg &x, const src_reg &y, const src_reg &a); + void emit_block_move(dst_reg *dst, src_reg *src, const struct glsl_type *type, uint32_t predicate); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 95e00640d82..aedab931701 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -1132,6 +1132,40 @@ vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst, } } +void +vec4_visitor::emit_lrp(const dst_reg &dst, + const src_reg &x, const src_reg &y, const src_reg &a) +{ + if (brw->gen >= 6) { + /* Note that the instruction's argument order is reversed from GLSL + * and the IR. + */ + emit(LRP(dst, + fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x))); + } else { + /* Earlier generations don't support three source operations, so we + * need to emit x*(1-a) + y*a. + * + * A better way to do this would be: + * ADD one_minus_a, negate(a), 1.0f + * MUL null, y, a + * MAC dst, x, one_minus_a + * but we would need to support MAC and implicit accumulator. + */ + dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type); + dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type); + dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type); + y_times_a.writemask = dst.writemask; + one_minus_a.writemask = dst.writemask; + x_times_one_minus_a.writemask = dst.writemask; + + emit(MUL(y_times_a, y, a)); + emit(ADD(one_minus_a, negate(a), src_reg(1.0f))); + emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a))); + emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a))); + } +} + static bool is_16bit_constant(ir_rvalue *rvalue) { @@ -1628,13 +1662,7 @@ vec4_visitor::visit(ir_expression *ir) break; case ir_triop_lrp: - op[0] = fix_3src_operand(op[0]); - op[1] = fix_3src_operand(op[1]); - op[2] = fix_3src_operand(op[2]); - /* Note that the instruction's argument order is reversed from GLSL - * and the IR. - */ - emit(LRP(result_dst, op[2], op[1], op[0])); + emit_lrp(result_dst, op[0], op[1], op[2]); break; case ir_triop_csel: -- 2.30.2