glsl: provide the option of using BFE for unpack builtin lowering
author Ilia Mirkin <imirkin@alum.mit.edu>
Fri, 21 Aug 2015 01:55:52 +0000 (21:55 -0400)
committer Ilia Mirkin <imirkin@alum.mit.edu>
Fri, 28 Aug 2015 22:28:04 +0000 (18:28 -0400)
This greatly improves generated code, especially for the snorm variants,
since it is able to get rid of the lshift/rshift for sext, as well as
replacing each shift + mask with a single op.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Matt Turner <mattst88@gmail.com>
src/glsl/ir_builder.cpp
src/glsl/ir_builder.h
src/glsl/ir_optimization.h
src/glsl/lower_packing_builtins.cpp
src/mesa/state_tracker/st_glsl_to_tgsi.cpp

index cd03859cac084661b2c67e0e1ad9783c74454fed..c9cf1240dfebca5baefb02ac72b90f4e2b065ae1 100644 (file)
@@ -566,6 +566,12 @@ csel(operand a, operand b, operand c)
    return expr(ir_triop_csel, a, b, c);
 }
 
+ir_expression *
+bitfield_extract(operand a, operand b, operand c)
+{ /* ir_triop_bitfield_extract node; callers pass (value, offset, bits). */
+   return expr(ir_triop_bitfield_extract, a, b, c);
+}
+
 ir_expression *
 bitfield_insert(operand a, operand b, operand c, operand d)
 {
index f76453ffcf00166db5e4d4264a4ee11f4a6a4262..b483ebf6269968921626e28ac0508c76df020e91 100644 (file)
@@ -200,6 +200,7 @@ ir_expression *interpolate_at_sample(operand a, operand b);
 ir_expression *fma(operand a, operand b, operand c);
 ir_expression *lrp(operand x, operand y, operand a);
 ir_expression *csel(operand a, operand b, operand c);
+ir_expression *bitfield_extract(operand a, operand b, operand c);
 ir_expression *bitfield_insert(operand a, operand b, operand c, operand d);
 
 ir_swizzle *swizzle(operand a, int swizzle, int components);
index b955874df84e5843b15d0dfc2a80d1db49a0f068..265b2234cb638ccde2bd4c4e83f646c236bd75b3 100644 (file)
@@ -69,6 +69,7 @@ enum lower_packing_builtins_op {
    LOWER_UNPACK_UNORM_4x8               = 0x0800,
 
    LOWER_PACK_USE_BFI                   = 0x1000,
+   LOWER_PACK_USE_BFE                   = 0x2000,
 };
 
 bool do_common_optimization(exec_list *ir, bool linked,
index 1d76ebf935f4fa764ca46da7ef34b5705ef49cd1..c8bf68be829a395c3cde2ceb22e25ff6e3460555 100644 (file)
@@ -119,6 +119,7 @@ public:
          break;
       case LOWER_PACK_UNPACK_NONE:
       case LOWER_PACK_USE_BFI:
+      case LOWER_PACK_USE_BFE:
          assert(!"not reached");
          break;
       }
@@ -305,6 +306,39 @@ private:
       return deref(u2).val;
    }
 
+   /**
+    * \brief Unpack a uint32 into two int16's, each sign-extended to int32.
+    *
+    * With LOWER_PACK_USE_BFE in op_mask each half is one bitfield_extract of
+    * the int-converted word; otherwise unpack_uint_to_uvec2 + lshift/rshift.
+    */
+   ir_rvalue *
+   unpack_uint_to_ivec2(ir_rvalue *uint_rval)
+   {
+      assert(uint_rval->type == glsl_type::uint_type);
+
+      if (!(op_mask & LOWER_PACK_USE_BFE)) {
+         return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
+                              constant(16u)),
+                       constant(16u));
+      }
+
+      ir_variable *i = factory.make_temp(glsl_type::int_type,
+                                         "tmp_unpack_uint_to_ivec2_i");
+      factory.emit(assign(i, u2i(uint_rval)));
+
+      /* ivec2 i2; i2.x = bits 0..15 of i, i2.y = bits 16..31 of i */
+      ir_variable *i2 = factory.make_temp(glsl_type::ivec2_type,
+                                          "tmp_unpack_uint_to_ivec2_i2");
+
+      factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)),
+                          WRITEMASK_X));
+      factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)),
+                          WRITEMASK_Y));
+
+      return deref(i2).val;
+   }
+
    /**
     * \brief Unpack a uint32 into four uint8's.
     *
@@ -329,13 +363,23 @@ private:
       /* u4.x = u & 0xffu; */
       factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
 
-      /* u4.y = (u >> 8u) & 0xffu; */
-      factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
-                                      constant(0xffu)), WRITEMASK_Y));
-
-      /* u4.z = (u >> 16u) & 0xffu; */
-      factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
-                                      constant(0xffu)), WRITEMASK_Z));
+      if (op_mask & LOWER_PACK_USE_BFE) {
+         /* u4.y = bitfield_extract(u, 8, 8); */
+         factory.emit(assign(u4, bitfield_extract(u, constant(8), constant(8)),
+                             WRITEMASK_Y));
+
+         /* u4.z = bitfield_extract(u, 16, 8); */
+         factory.emit(assign(u4, bitfield_extract(u, constant(16), constant(8)),
+                             WRITEMASK_Z));
+      } else {
+         /* u4.y = (u >> 8u) & 0xffu; */
+         factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
+                                         constant(0xffu)), WRITEMASK_Y));
+
+         /* u4.z = (u >> 16u) & 0xffu; */
+         factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
+                                         constant(0xffu)), WRITEMASK_Z));
+      }
 
       /* u4.w = (u >> 24u) */
       factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
@@ -343,6 +387,43 @@ private:
       return deref(u4).val;
    }
 
+   /**
+    * \brief Unpack a uint32 into four int8's, each sign-extended to int32.
+    *
+    * With LOWER_PACK_USE_BFE in op_mask each byte is one bitfield_extract of
+    * the int-converted word; otherwise unpack_uint_to_uvec4 + lshift/rshift.
+    */
+   ir_rvalue *
+   unpack_uint_to_ivec4(ir_rvalue *uint_rval)
+   {
+      assert(uint_rval->type == glsl_type::uint_type);
+
+      if (!(op_mask & LOWER_PACK_USE_BFE)) {
+         return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
+                              constant(24u)),
+                       constant(24u));
+      }
+
+      ir_variable *i = factory.make_temp(glsl_type::int_type,
+                                         "tmp_unpack_uint_to_ivec4_i");
+      factory.emit(assign(i, u2i(uint_rval)));
+
+      /* ivec4 i4; x, y, z, w = bytes 0, 1, 2, 3 of i (low to high) */
+      ir_variable *i4 = factory.make_temp(glsl_type::ivec4_type,
+                                          "tmp_unpack_uint_to_ivec4_i4");
+
+      factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)),
+                          WRITEMASK_X));
+      factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)),
+                          WRITEMASK_Y));
+      factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)),
+                          WRITEMASK_Z));
+      factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)),
+                          WRITEMASK_W));
+
+      return deref(i4).val;
+   }
+
    /**
     * \brief Lower a packSnorm2x16 expression.
     *
@@ -489,9 +570,7 @@ private:
       assert(uint_rval->type == glsl_type::uint_type);
 
       ir_rvalue *result =
-        clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
-                                    constant(16)),
-                             constant(16u))),
+        clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)),
                   constant(32767.0f)),
               constant(-1.0f),
               constant(1.0f));
@@ -548,9 +627,7 @@ private:
       assert(uint_rval->type == glsl_type::uint_type);
 
       ir_rvalue *result =
-        clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
-                                    constant(24u)),
-                             constant(24u))),
+        clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)),
                   constant(127.0f)),
               constant(-1.0f),
               constant(1.0f));
index 7a8c4e1b8fac45396f843037df14f1af4e00ddc1..95a25c12fb46bdcee42403100f64533a20f4fe65 100644 (file)
@@ -6020,7 +6020,8 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
                                LOWER_UNPACK_HALF_2x16;
 
          if (ctx->Extensions.ARB_gpu_shader5)
-            lower_inst |= LOWER_PACK_USE_BFI;
+            lower_inst |= LOWER_PACK_USE_BFI |
+                          LOWER_PACK_USE_BFE;
 
          lower_packing_builtins(ir, lower_inst);
       }