i965/fs: Indent the implementation of 32x32-bit MUL lowering by one level.

author Francisco Jerez <currojerez@riseup.net>

Wed, 5 Aug 2015 13:47:18 +0000 (16:47 +0300)

committer Francisco Jerez <currojerez@riseup.net>

Thu, 6 Aug 2015 11:12:12 +0000 (14:12 +0300)
author Francisco Jerez <currojerez@riseup.net>
Wed, 5 Aug 2015 13:47:18 +0000 (16:47 +0300)
committer Francisco Jerez <currojerez@riseup.net>
Thu, 6 Aug 2015 11:12:12 +0000 (14:12 +0300)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index 0278237101e92490e9247e295e764c7a1db38793..17f0eaba311caaa91402ee54434317bcc93153f9 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3129,155 +3129,159 @@ fs_visitor::lower_integer_multiplication()
  {
     bool progress = false;
  
-   /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
-    * directly, but CHV/BXT cannot.
-    */
-   if (devinfo->gen >= 8 && !devinfo->is_cherryview && !devinfo->is_broxton)
-      return false;
-
     foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
-      if (inst->opcode != BRW_OPCODE_MUL ||
-          inst->dst.is_accumulator() ||
-          (inst->dst.type != BRW_REGISTER_TYPE_D &&
-           inst->dst.type != BRW_REGISTER_TYPE_UD))
-         continue;
-
        const fs_builder ibld(this, block, inst);
  
-      /* The MUL instruction isn't commutative. On Gen <= 6, only the low
-       * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
-       * src1 are used.
-       *
-       * If multiplying by an immediate value that fits in 16-bits, do a
-       * single MUL instruction with that value in the proper location.
-       */
-      if (inst->src[1].file == IMM &&
-          inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
-         if (devinfo->gen < 7) {
-            fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
-                       inst->dst.type);
-            ibld.MOV(imm, inst->src[1]);
-            ibld.MUL(inst->dst, imm, inst->src[0]);
-         } else {
-            ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
-         }
-      } else {
-         /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
-          * do 32-bit integer multiplication in one instruction, but instead
-          * must do a sequence (which actually calculates a 64-bit result):
-          *
-          *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
-          *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
-          *    mov(8)  g2<1>D     acc0<8,8,1>D
-          *
-          * But on Gen > 6, the ability to use second accumulator register
-          * (acc1) for non-float data types was removed, preventing a simple
-          * implementation in SIMD16. A 16-channel result can be calculated by
-          * executing the three instructions twice in SIMD8, once with quarter
-          * control of 1Q for the first eight channels and again with 2Q for
-          * the second eight channels.
-          *
-          * Which accumulator register is implicitly accessed (by AccWrEnable
-          * for instance) is determined by the quarter control. Unfortunately
-          * Ivybridge (and presumably Baytrail) has a hardware bug in which an
-          * implicit accumulator access by an instruction with 2Q will access
-          * acc1 regardless of whether the data type is usable in acc1.
-          *
-          * Specifically, the 2Q mach(8) writes acc1 which does not exist for
-          * integer data types.
-          *
-          * Since we only want the low 32-bits of the result, we can do two
-          * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
-          * adjust the high result and add them (like the mach is doing):
-          *
-          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
-          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
-          *    shl(8)  g9<1>D     g8<8,8,1>D      16D
-          *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
-          *
-          * We avoid the shl instruction by realizing that we only want to add
-          * the low 16-bits of the "high" result to the high 16-bits of the
-          * "low" result and using proper regioning on the add:
-          *
-          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
-          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
-          *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
-          *
-          * Since it does not use the (single) accumulator register, we can
-          * schedule multi-component multiplications much better.
+      if (inst->opcode == BRW_OPCODE_MUL) {
+         if (inst->dst.is_accumulator() ||
+             (inst->dst.type != BRW_REGISTER_TYPE_D &&
+              inst->dst.type != BRW_REGISTER_TYPE_UD))
+            continue;
+
+         /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
+          * operation directly, but CHV/BXT cannot.
            */
+         if (devinfo->gen >= 8 &&
+             !devinfo->is_cherryview && !devinfo->is_broxton)
+            continue;
  
-         if (inst->conditional_mod && inst->dst.is_null()) {
-            inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
-                               inst->dst.type);
-         }
-         fs_reg low = inst->dst;
-         fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
-                     inst->dst.type);
+         if (inst->src[1].file == IMM &&
+             inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
+            /* The MUL instruction isn't commutative. On Gen <= 6, only the low
+             * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
+             * src1 are used.
+             *
+             * If multiplying by an immediate value that fits in 16-bits, do a
+             * single MUL instruction with that value in the proper location.
+             */
+            if (devinfo->gen < 7) {
+               fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
+                          inst->dst.type);
+               ibld.MOV(imm, inst->src[1]);
+               ibld.MUL(inst->dst, imm, inst->src[0]);
+            } else {
+               ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
+            }
+         } else {
+            /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
+             * do 32-bit integer multiplication in one instruction, but instead
+             * must do a sequence (which actually calculates a 64-bit result):
+             *
+             *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
+             *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
+             *    mov(8)  g2<1>D     acc0<8,8,1>D
+             *
+             * But on Gen > 6, the ability to use second accumulator register
+             * (acc1) for non-float data types was removed, preventing a simple
+             * implementation in SIMD16. A 16-channel result can be calculated by
+             * executing the three instructions twice in SIMD8, once with quarter
+             * control of 1Q for the first eight channels and again with 2Q for
+             * the second eight channels.
+             *
+             * Which accumulator register is implicitly accessed (by AccWrEnable
+             * for instance) is determined by the quarter control. Unfortunately
+             * Ivybridge (and presumably Baytrail) has a hardware bug in which an
+             * implicit accumulator access by an instruction with 2Q will access
+             * acc1 regardless of whether the data type is usable in acc1.
+             *
+             * Specifically, the 2Q mach(8) writes acc1 which does not exist for
+             * integer data types.
+             *
+             * Since we only want the low 32-bits of the result, we can do two
+             * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
+             * adjust the high result and add them (like the mach is doing):
+             *
+             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
+             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
+             *    shl(8)  g9<1>D     g8<8,8,1>D      16D
+             *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
+             *
+             * We avoid the shl instruction by realizing that we only want to add
+             * the low 16-bits of the "high" result to the high 16-bits of the
+             * "low" result and using proper regioning on the add:
+             *
+             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
+             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
+             *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
+             *
+             * Since it does not use the (single) accumulator register, we can
+             * schedule multi-component multiplications much better.
+             */
  
-         if (devinfo->gen >= 7) {
-            fs_reg src1_0_w = inst->src[1];
-            fs_reg src1_1_w = inst->src[1];
+            if (inst->conditional_mod && inst->dst.is_null()) {
+               inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
+                                  inst->dst.type);
+            }
+            fs_reg low = inst->dst;
+            fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
+                        inst->dst.type);
  
-            if (inst->src[1].file == IMM) {
-               src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
-               src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
-            } else {
-               src1_0_w.type = BRW_REGISTER_TYPE_UW;
-               if (src1_0_w.stride != 0) {
-                  assert(src1_0_w.stride == 1);
-                  src1_0_w.stride = 2;
+            if (devinfo->gen >= 7) {
+               fs_reg src1_0_w = inst->src[1];
+               fs_reg src1_1_w = inst->src[1];
+
+               if (inst->src[1].file == IMM) {
+                  src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
+                  src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
+               } else {
+                  src1_0_w.type = BRW_REGISTER_TYPE_UW;
+                  if (src1_0_w.stride != 0) {
+                     assert(src1_0_w.stride == 1);
+                     src1_0_w.stride = 2;
+                  }
+
+                  src1_1_w.type = BRW_REGISTER_TYPE_UW;
+                  if (src1_1_w.stride != 0) {
+                     assert(src1_1_w.stride == 1);
+                     src1_1_w.stride = 2;
+                  }
+                  src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
                 }
+               ibld.MUL(low, inst->src[0], src1_0_w);
+               ibld.MUL(high, inst->src[0], src1_1_w);
+            } else {
+               fs_reg src0_0_w = inst->src[0];
+               fs_reg src0_1_w = inst->src[0];
  
-               src1_1_w.type = BRW_REGISTER_TYPE_UW;
-               if (src1_1_w.stride != 0) {
-                  assert(src1_1_w.stride == 1);
-                  src1_1_w.stride = 2;
+               src0_0_w.type = BRW_REGISTER_TYPE_UW;
+               if (src0_0_w.stride != 0) {
+                  assert(src0_0_w.stride == 1);
+                  src0_0_w.stride = 2;
                 }
-               src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
-            }
-            ibld.MUL(low, inst->src[0], src1_0_w);
-            ibld.MUL(high, inst->src[0], src1_1_w);
-         } else {
-            fs_reg src0_0_w = inst->src[0];
-            fs_reg src0_1_w = inst->src[0];
  
-            src0_0_w.type = BRW_REGISTER_TYPE_UW;
-            if (src0_0_w.stride != 0) {
-               assert(src0_0_w.stride == 1);
-               src0_0_w.stride = 2;
-            }
+               src0_1_w.type = BRW_REGISTER_TYPE_UW;
+               if (src0_1_w.stride != 0) {
+                  assert(src0_1_w.stride == 1);
+                  src0_1_w.stride = 2;
+               }
+               src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
  
-            src0_1_w.type = BRW_REGISTER_TYPE_UW;
-            if (src0_1_w.stride != 0) {
-               assert(src0_1_w.stride == 1);
-               src0_1_w.stride = 2;
+               ibld.MUL(low, src0_0_w, inst->src[1]);
+               ibld.MUL(high, src0_1_w, inst->src[1]);
              }
-            src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
  
-            ibld.MUL(low, src0_0_w, inst->src[1]);
-            ibld.MUL(high, src0_1_w, inst->src[1]);
-         }
-
-         fs_reg dst = inst->dst;
-         dst.type = BRW_REGISTER_TYPE_UW;
-         dst.subreg_offset = 2;
-         dst.stride = 2;
+            fs_reg dst = inst->dst;
+            dst.type = BRW_REGISTER_TYPE_UW;
+            dst.subreg_offset = 2;
+            dst.stride = 2;
  
-         high.type = BRW_REGISTER_TYPE_UW;
-         high.stride = 2;
+            high.type = BRW_REGISTER_TYPE_UW;
+            high.stride = 2;
  
-         low.type = BRW_REGISTER_TYPE_UW;
-         low.subreg_offset = 2;
-         low.stride = 2;
+            low.type = BRW_REGISTER_TYPE_UW;
+            low.subreg_offset = 2;
+            low.stride = 2;
  
-         ibld.ADD(dst, low, high);
+            ibld.ADD(dst, low, high);
  
-         if (inst->conditional_mod) {
-            fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
-            set_condmod(inst->conditional_mod,
-                        ibld.MOV(null, inst->dst));
+            if (inst->conditional_mod) {
+               fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
+               set_condmod(inst->conditional_mod,
+                           ibld.MOV(null, inst->dst));
+            }
           }
+      } else {
+         continue;
        }
  
        inst->remove(block);
author	Francisco Jerez <currojerez@riseup.net>
	Wed, 5 Aug 2015 13:47:18 +0000 (16:47 +0300)
committer	Francisco Jerez <currojerez@riseup.net>
	Thu, 6 Aug 2015 11:12:12 +0000 (14:12 +0300)