i965/fs/gen7: Emit code for GLSL 3.00 pack/unpack operations (v4)
author Chad Versace <chad.versace@linux.intel.com>
Wed, 9 Jan 2013 19:46:42 +0000 (11:46 -0800)
committer Chad Versace <chad.versace@linux.intel.com>
Fri, 25 Jan 2013 05:31:06 +0000 (21:31 -0800)
v2: Remove lewd comment. [for idr]
v3: - Optimize away tmp register for packHalf2x16. [for anholt, paul]
    - Improve comments. [for anholt, paul]
    - Reduce near-duplicate code by removing vec4_visitor emit_pack/unpack
      methods. [for chadv]
v4: Factor out UD/W register conversion into helper function. [for anholt]

Reviewed-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> (v2)
Signed-off-by: Chad Versace <chad.versace@linux.intel.com>
src/mesa/drivers/dri/i965/brw_defines.h
src/mesa/drivers/dri/i965/brw_fs.h
src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
src/mesa/drivers/dri/i965/brw_fs_emit.cpp
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

index e2f1e653b126237259010c1683222dece750ce6c..79cc12f0f661246c598a2e553c0a1f9e572ad25b 100644 (file)
@@ -726,6 +726,9 @@ enum opcode {
    FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
    FS_OPCODE_DISCARD_JUMP,
    FS_OPCODE_SET_GLOBAL_OFFSET,
+   FS_OPCODE_PACK_HALF_2x16_SPLIT,
+   FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X,
+   FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y,
 
    VS_OPCODE_URB_WRITE,
    VS_OPCODE_SCRATCH_READ,
index b47b0d066c8d2b30487a4d1bbf4dcfde2fdff372..d332502bde084c230365bb0489553777c02c0894 100644 (file)
@@ -542,6 +542,14 @@ private:
                                    struct brw_reg offset);
    void generate_discard_jump(fs_inst *inst);
 
+   void generate_pack_half_2x16_split(fs_inst *inst,
+                                      struct brw_reg dst,
+                                      struct brw_reg x,
+                                      struct brw_reg y);
+   void generate_unpack_half_2x16_split(fs_inst *inst,
+                                        struct brw_reg dst,
+                                        struct brw_reg src);
+
    void patch_discard_jumps_to_fb_writes();
 
    struct brw_context *brw;
index 58521ee6e6d3c3856e09ad3baedd96371b78ccc1..e19da51904952eaa92f817091d02f46252eaaaf6 100644 (file)
@@ -342,9 +342,21 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
       assert(!"not yet supported");
       break;
 
+   case ir_unop_pack_snorm_2x16:
+   case ir_unop_pack_unorm_2x16:
+   case ir_unop_pack_half_2x16:
+   case ir_unop_unpack_snorm_2x16:
+   case ir_unop_unpack_unorm_2x16:
+   case ir_unop_unpack_half_2x16:
    case ir_quadop_vector:
       assert(!"should have been lowered");
       break;
+
+   case ir_unop_unpack_half_2x16_split_x:
+   case ir_unop_unpack_half_2x16_split_y:
+   case ir_binop_pack_half_2x16_split:
+      assert(!"not reached: expression operates on scalars only");
+      break;
    }
 
    ir->remove();
index 324e6656d1fabc88fa8d4c397a5466c49a02f654..27c5302b9f521ba8c4b5dac479dad7f85f0b73b2 100644 (file)
@@ -922,6 +922,95 @@ fs_generator::generate_set_global_offset(fs_inst *inst,
    brw_pop_insn_state(p);
 }
 
+/**
+ * Return a copy of a UD register retyped as W.  Because a word is half the
+ * width of a dword, each nonzero stride is doubled to compensate; a stride
+ * of zero is kept as is.
+ */
+static struct brw_reg
+ud_reg_to_w(struct brw_reg r)
+{
+   assert(r.type == BRW_REGISTER_TYPE_UD);
+
+   /* Adding one to a BRW_*_STRIDE enum value doubles the real stride that
+    * the field encodes.
+    */
+   struct brw_reg w = r;
+   w.type = BRW_REGISTER_TYPE_W;
+   w.hstride += (r.hstride != 0) ? 1 : 0;
+   w.vstride += (r.vstride != 0) ? 1 : 0;
+
+   return w;
+}
+
+void
+fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
+                                            struct brw_reg dst,
+                                            struct brw_reg x,
+                                            struct brw_reg y)
+{
+   assert(intel->gen >= 7);
+   assert(dst.type == BRW_REGISTER_TYPE_UD);
+   assert(x.type = BRW_REGISTER_TYPE_F);
+   assert(y.type = BRW_REGISTER_TYPE_F);
+
+   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
+    *
+    *   Because this instruction does not have a 16-bit floating-point type,
+    *   the destination data type must be Word (W).
+    *
+    *   The destination must be DWord-aligned and specify a horizontal stride
+    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
+    *   each destination channel and the upper word is not modified.
+    */
+   struct brw_reg dst_w = ud_reg_to_w(dst);
+
+   /* Give each 32-bit channel of dst the form below , where "." means
+    * unchanged.
+    *   0x....hhhh
+    */
+   brw_F32TO16(p, dst_w, y);
+
+   /* Now the form:
+    *   0xhhhh0000
+    */
+   brw_SHL(p, dst, dst, brw_imm_ud(16u));
+
+   /* And, finally the form of packHalf2x16's output:
+    *   0xhhhhllll
+    */
+   brw_F32TO16(p, dst_w, x);
+}
+
+void
+fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
+                                              struct brw_reg dst,
+                                              struct brw_reg src)
+{
+   assert(intel->gen >= 7);
+   assert(dst.type == BRW_REGISTER_TYPE_F);
+   assert(src.type == BRW_REGISTER_TYPE_UD);
+
+   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
+    *
+    *   Because this instruction does not have a 16-bit floating-point type,
+    *   the source data type must be Word (W). The destination type must be
+    *   F (Float).
+    */
+   struct brw_reg src_w = ud_reg_to_w(src);
+
+   /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
+    * For the Y case, we wish to access only the upper word; therefore
+    * a 16-bit subregister offset is needed.
+    */
+   assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
+          inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
+   if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
+      src.subnr += 2;
+
+   brw_F16TO32(p, dst, src_w);
+}
+
 void
 fs_generator::generate_code(exec_list *instructions)
 {
@@ -1082,7 +1171,12 @@ fs_generator::generate_code(exec_list *instructions)
       case BRW_OPCODE_SHL:
         brw_SHL(p, dst, src[0], src[1]);
         break;
-
+      case BRW_OPCODE_F32TO16:
+         brw_F32TO16(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_F16TO32:
+         brw_F16TO32(p, dst, src[0]);
+         break;
       case BRW_OPCODE_CMP:
         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
         break;
@@ -1229,6 +1323,15 @@ fs_generator::generate_code(exec_list *instructions)
          generate_set_global_offset(inst, dst, src[0], src[1]);
          break;
 
+      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+          generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
+          break;
+
+      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
+      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
+         generate_unpack_half_2x16_split(inst, dst, src[0]);
+         break;
+
       default:
         if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
            _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
index ebb37fd31138add35449e4b6a62114bf4bf6936f..2b1332f1acc526420e0fab5317858c6ddf75eac4 100644 (file)
@@ -538,7 +538,20 @@ fs_visitor::visit(ir_expression *ir)
                   BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
                   this->result, op[0], op[1]);
       break;
-
+   case ir_unop_pack_snorm_2x16:
+   case ir_unop_pack_unorm_2x16:
+   case ir_unop_unpack_snorm_2x16:
+   case ir_unop_unpack_unorm_2x16:
+   case ir_unop_unpack_half_2x16:
+   case ir_unop_pack_half_2x16:
+      assert(!"not reached: should be handled by lower_packing_builtins");
+      break;
+   case ir_unop_unpack_half_2x16_split_x:
+      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
+      break;
+   case ir_unop_unpack_half_2x16_split_y:
+      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
+      break;
    case ir_binop_pow:
       emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
       break;
@@ -566,7 +579,9 @@ fs_visitor::visit(ir_expression *ir)
       else
         inst = emit(SHR(this->result, op[0], op[1]));
       break;
-
+   case ir_binop_pack_half_2x16_split:
+      emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
+      break;
    case ir_binop_ubo_load:
       /* This IR node takes a constant uniform block and a constant or
        * variable byte offset within the block and loads a vector from that.