From bf686b2785c63116ab4ab7e62eb77be51b97d346 Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Sat, 8 Mar 2014 17:29:33 -0800 Subject: [PATCH] i965/vec4: Optimize unpackUnorm4x8(). Reduces the number of instructions needed to implement unpackUnorm4x8() from 11 -> 4. Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/brw_shader.cpp | 7 +++-- src/mesa/drivers/dri/i965/brw_vec4.h | 1 + .../drivers/dri/i965/brw_vec4_visitor.cpp | 26 ++++++++++++++++++- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index 6a14932629a..6cb2da8641f 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -85,8 +85,11 @@ brw_lower_packing_builtins(struct brw_context *brw, | LOWER_UNPACK_UNORM_2x16 | LOWER_PACK_SNORM_4x8 | LOWER_UNPACK_SNORM_4x8 - | LOWER_PACK_UNORM_4x8 - | LOWER_UNPACK_UNORM_4x8; + | LOWER_PACK_UNORM_4x8; + + if (shader_type == MESA_SHADER_FRAGMENT) { + ops |= LOWER_UNPACK_UNORM_4x8; + } if (brw->gen >= 7) { /* Gen7 introduced the f32to16 and f16to32 instructions, which can be diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index a8e972aed51..1c0717404e8 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -508,6 +508,7 @@ public: void emit_pack_half_2x16(dst_reg dst, src_reg src0); void emit_unpack_half_2x16(dst_reg dst, src_reg src0); + void emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0); uint32_t gather_channel(ir_texture *ir, uint32_t sampler); src_reg emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 5cc2abd6714..45551911008 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -466,6 +466,28 @@ 
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0) emit(F16TO32(dst, tmp_src)); } +void +vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0) +{ + /* Instead of splitting the 32-bit integer, shifting, and ORing it back + * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate + * is not suitable for the shift values, so use the packed vector float bytes + * 0x00, 0x60, 0x70, 0x78 (0.0, 8.0, 16.0, 24.0) and a type-converting MOV. + */ + dst_reg shift(this, glsl_type::uvec4_type); + emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78))); + + dst_reg shifted(this, glsl_type::uvec4_type); + src0.swizzle = BRW_SWIZZLE_XXXX; + emit(SHR(shifted, src0, src_reg(shift))); + + shifted.type = BRW_REGISTER_TYPE_UB; + dst_reg f(this, glsl_type::vec4_type); + emit(MOV(f, src_reg(shifted))); + + emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f))); +} + void vec4_visitor::visit_instructions(const exec_list *list) { @@ -1747,6 +1769,9 @@ vec4_visitor::visit(ir_expression *ir) case ir_unop_unpack_half_2x16: emit_unpack_half_2x16(result_dst, op[0]); break; + case ir_unop_unpack_unorm_4x8: + emit_unpack_unorm_4x8(result_dst, op[0]); + break; case ir_unop_pack_snorm_2x16: case ir_unop_pack_snorm_4x8: case ir_unop_pack_unorm_2x16: @@ -1754,7 +1779,6 @@ vec4_visitor::visit(ir_expression *ir) case ir_unop_unpack_snorm_2x16: case ir_unop_unpack_snorm_4x8: case ir_unop_unpack_unorm_2x16: - case ir_unop_unpack_unorm_4x8: unreachable("not reached: should be handled by lower_packing_builtins"); case ir_unop_unpack_half_2x16_split_x: case ir_unop_unpack_half_2x16_split_y: -- 2.30.2