i965/vec4: extend the DWORD multiply DepCtrl restriction to all gen8 platforms
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4.cpp
index d42bcd989dae6ad688342076bb356f36aa664af7..c461f26f71ed9198732703351a738f9e6deed0b9 100644 (file)
@@ -228,8 +228,8 @@ vec4_instruction::size_read(unsigned arg) const
    case UNIFORM:
       return 4 * type_sz(src[arg].type);
    default:
-      /* XXX - Represent actual execution size and vertical stride. */
-      return 8 * type_sz(src[arg].type);
+      /* XXX - Represent actual vertical stride. */
+      return exec_size * type_sz(src[arg].type);
    }
 }
 
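A quick worked example of the new size_read() formula (standalone arithmetic, not mesa code): with REG_SIZE at 32 bytes, an 8-wide DF source reads exec_size * type_sz = 64 bytes (two GRFs), while the 4-wide halves produced by the SIMD lowering added later in this patch read only 32 bytes, which the old hard-coded 8 * type_sz would have over-reported.

   #include <assert.h>

   int main(void)
   {
      const unsigned REG_SIZE = 32;  /* bytes per GRF on these platforms */
      const unsigned type_sz_df = 8; /* a double-precision channel */
      assert(8 * type_sz_df == 2 * REG_SIZE); /* exec_size 8 -> two GRFs */
      assert(4 * type_sz_df == 1 * REG_SIZE); /* exec_size 4 -> one GRF */
      return 0;
   }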
@@ -253,6 +253,12 @@ vec4_instruction::can_do_writemask(const struct gen_device_info *devinfo)
 {
    switch (opcode) {
    case SHADER_OPCODE_GEN4_SCRATCH_READ:
+   case VEC4_OPCODE_FROM_DOUBLE:
+   case VEC4_OPCODE_TO_DOUBLE:
+   case VEC4_OPCODE_PICK_LOW_32BIT:
+   case VEC4_OPCODE_PICK_HIGH_32BIT:
+   case VEC4_OPCODE_SET_LOW_32BIT:
+   case VEC4_OPCODE_SET_HIGH_32BIT:
    case VS_OPCODE_PULL_CONSTANT_LOAD:
    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
    case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
@@ -387,6 +393,7 @@ vec4_visitor::opt_vector_float()
              inst->src[0].file == IMM &&
              inst->predicate == BRW_PREDICATE_NONE &&
              inst->dst.writemask != WRITEMASK_XYZW &&
+             type_sz(inst->src[0].type) < 8 &&
              (inst->src[0].type == inst->dst.type || inst->src[0].d == 0)) {
 
             vf = brw_float_to_vf(inst->src[0].d);
@@ -505,6 +512,16 @@ vec4_visitor::opt_reduce_swizzle()
       case BRW_OPCODE_DP2:
          swizzle = brw_swizzle_for_size(2);
          break;
+
+      case VEC4_OPCODE_TO_DOUBLE:
+      case VEC4_OPCODE_FROM_DOUBLE:
+      case VEC4_OPCODE_PICK_LOW_32BIT:
+      case VEC4_OPCODE_PICK_HIGH_32BIT:
+      case VEC4_OPCODE_SET_LOW_32BIT:
+      case VEC4_OPCODE_SET_HIGH_32BIT:
+         swizzle = brw_swizzle_for_size(4);
+         break;
+
       default:
          swizzle = brw_swizzle_for_mask(inst->dst.writemask);
          break;
@@ -593,13 +610,20 @@ vec4_visitor::pack_uniform_registers()
          if (inst->src[i].file != UNIFORM)
             continue;
 
+         assert(type_sz(inst->src[i].type) % 4 == 0);
+         unsigned channel_size = type_sz(inst->src[i].type) / 4;
+
          int reg = inst->src[i].nr;
          for (int c = 0; c < 4; c++) {
             if (!(readmask & (1 << c)))
                continue;
 
-            chans_used[reg] = MAX2(chans_used[reg],
-                                   BRW_GET_SWZ(inst->src[i].swizzle, c) + 1);
+            unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1;
+            unsigned used = MAX2(chans_used[reg], channel * channel_size);
+            if (used <= 4)
+               chans_used[reg] = used;
+            else
+               chans_used[reg + 1] = used - 4;
          }
       }
 
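To see how the new accounting can spill into the next vec4 slot, here is a standalone model (hypothetical values; BRW_GET_SWZ semantics as in brw_reg.h): a DF channel is two float-sized channels wide, so reading logical channel Z of a DF uniform marks channels in the following register as used.

   #include <assert.h>

   int main(void)
   {
      unsigned chans_used[2] = { 0, 0 };
      const unsigned channel_size = 8 / 4; /* type_sz(DF) / 4 == 2 */
      const unsigned swz = 2;              /* logical Z */
      unsigned channel = swz + 1;
      unsigned used = channel * channel_size; /* 6 float-sized channels */
      if (used <= 4)
         chans_used[0] = used;
      else
         chans_used[1] = used - 4; /* spills 2 channels into reg + 1 */
      assert(chans_used[1] == 2);
      return 0;
   }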
@@ -636,25 +660,25 @@ vec4_visitor::pack_uniform_registers()
       int dst;
       /* Find the lowest place we can slot this uniform in. */
       for (dst = 0; dst < src; dst++) {
-        if (chans_used[dst] + size <= 4)
-           break;
+         if (chans_used[dst] + size <= 4)
+            break;
       }
 
       if (src == dst) {
-        new_loc[src] = dst;
-        new_chan[src] = 0;
+         new_loc[src] = dst;
+         new_chan[src] = 0;
       } else {
-        new_loc[src] = dst;
-        new_chan[src] = chans_used[dst];
+         new_loc[src] = dst;
+         new_chan[src] = chans_used[dst];
 
-        /* Move the references to the data */
-        for (int j = 0; j < size; j++) {
-           stage_prog_data->param[dst * 4 + new_chan[src] + j] =
-              stage_prog_data->param[src * 4 + j];
-        }
+         /* Move the references to the data */
+         for (int j = 0; j < size; j++) {
+            stage_prog_data->param[dst * 4 + new_chan[src] + j] =
+               stage_prog_data->param[src * 4 + j];
+         }
 
-        chans_used[dst] += size;
-        chans_used[src] = 0;
+         chans_used[dst] += size;
+         chans_used[src] = 0;
       }
 
       new_uniform_count = MAX2(new_uniform_count, dst + 1);
@@ -667,8 +691,8 @@ vec4_visitor::pack_uniform_registers()
       for (int i = 0 ; i < 3; i++) {
          int src = inst->src[i].nr;
 
-        if (inst->src[i].file != UNIFORM)
-           continue;
+         if (inst->src[i].file != UNIFORM)
+            continue;
 
          inst->src[i].nr = new_loc[src];
          inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
@@ -884,11 +908,14 @@ vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
    (reg.type == BRW_REGISTER_TYPE_UD || \
     reg.type == BRW_REGISTER_TYPE_D)
 
-   /* "When source or destination datatype is 64b or operation is integer DWord
+   /* From the Cherryview and Broadwell PRMs:
+    *
+    * "When source or destination datatype is 64b or operation is integer DWord
     * multiply, DepCtrl must not be used."
-    * May apply to future SoCs as well.
+    *
+    * SKL PRMs don't include this restriction though.
     */
-   if (devinfo->is_cherryview) {
+   if (devinfo->gen == 8 || devinfo->is_broxton) {
       if (inst->opcode == BRW_OPCODE_MUL &&
          IS_DWORD(inst->src[0]) &&
          IS_DWORD(inst->src[1]))
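In sketch form, the gate this hunk installs (standalone; field names as in gen_device_info — Broadwell and Cherryview are both gen 8, and Broxton is included explicitly because it shares the restriction despite being gen 9):

   /* Illustrative predicate only; the PRM rule also covers 64-bit
    * datatypes, which this sketch ignores. */
   static bool
   dword_mul_blocks_depctrl(int gen, bool is_broxton,
                            bool src0_is_dword, bool src1_is_dword)
   {
      return (gen == 8 || is_broxton) && src0_is_dword && src1_is_dword;
   }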
@@ -1123,7 +1150,7 @@ vec4_visitor::opt_register_coalesce()
       /* Can't coalesce this GRF if someone else was going to
        * read it later.
        */
-      if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 4) > ip)
+      if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip)
         continue;
 
       /* We need to check interference with the final destination between this
@@ -1174,6 +1201,20 @@ vec4_visitor::opt_register_coalesce()
                   scan_inst->dst.type == scan_inst->src[0].type))
                break;
 
+            /* Only allow coalescing between registers of the same type size.
+             * Otherwise we would need to make the pass aware that channel
+             * sizes are different for single and double precision.
+             */
+            if (type_sz(inst->src[0].type) != type_sz(scan_inst->src[0].type))
+               break;
+
+            /* Check that scan_inst writes the same amount of data as the
+             * instruction, otherwise coalescing would lead to writing a
+             * different (larger or smaller) region of the destination.
+             */
+            if (scan_inst->size_written != inst->size_written)
+               break;
+
             /* If we can't handle the swizzle, bail. */
             if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
                                           inst->src[0].swizzle,
@@ -1181,10 +1222,12 @@ vec4_visitor::opt_register_coalesce()
                break;
             }
 
-            /* This only handles coalescing of a single register starting at
-             * the source offset of the copy instruction.
+            /* This only handles coalescing writes of 8 channels (1 register
+             * for single-precision and 2 registers for double-precision)
+             * starting at the source offset of the copy instruction.
              */
-            if (scan_inst->size_written > REG_SIZE ||
+            if (DIV_ROUND_UP(scan_inst->size_written,
+                             type_sz(scan_inst->dst.type)) > 8 ||
                 scan_inst->dst.offset != inst->src[0].offset)
                break;
 
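Worked numbers for the byte-to-channel guard above (standalone, with DIV_ROUND_UP as in mesa's macros.h): an 8-channel single-precision write is 32 bytes, an 8-channel double-precision write is 64 bytes, and both pass; anything wider than 8 channels bails.

   #include <assert.h>

   #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

   int main(void)
   {
      assert(DIV_ROUND_UP(32, 4) == 8); /* 8 x F  = one GRF  -> allowed */
      assert(DIV_ROUND_UP(64, 8) == 8); /* 8 x DF = two GRFs -> allowed */
      assert(DIV_ROUND_UP(96, 8) > 8);  /* wider than 8 channels -> bail */
      return 0;
   }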
@@ -1420,7 +1463,8 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
               pred_ctrl_align16[inst->predicate]);
    }
 
-   fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
+   fprintf(file, "%s(%d)", brw_instruction_name(devinfo, inst->opcode),
+           inst->exec_size);
    if (inst->saturate)
       fprintf(file, ".sat");
    if (inst->conditional_mod) {
@@ -1517,6 +1561,9 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
          case BRW_REGISTER_TYPE_F:
             fprintf(file, "%fF", inst->src[i].f);
             break;
+         case BRW_REGISTER_TYPE_DF:
+            fprintf(file, "%fDF", inst->src[i].df);
+            break;
          case BRW_REGISTER_TYPE_D:
             fprintf(file, "%dD", inst->src[i].d);
             break;
@@ -1591,6 +1638,9 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
    if (inst->force_writemask_all)
       fprintf(file, " NoMask");
 
+   if (inst->exec_size != 8)
+      fprintf(file, " group%d", inst->group);
+
    fprintf(file, "\n");
 }
 
@@ -1862,31 +1912,39 @@ vec4_visitor::convert_to_hw_regs()
          struct src_reg &src = inst->src[i];
          struct brw_reg reg;
          switch (src.file) {
-         case VGRF:
-            reg = byte_offset(brw_vec8_grf(src.nr, 0), src.offset);
+         case VGRF: {
+            const unsigned type_size = type_sz(src.type);
+            const unsigned width = REG_SIZE / 2 / MAX2(4, type_size);
+            reg = byte_offset(brw_vecn_grf(width, src.nr, 0), src.offset);
             reg.type = src.type;
-            reg.swizzle = src.swizzle;
             reg.abs = src.abs;
             reg.negate = src.negate;
             break;
+         }
 
-         case UNIFORM:
+         case UNIFORM: {
+            const unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(src.type));
             reg = stride(byte_offset(brw_vec4_grf(
                                         prog_data->base.dispatch_grf_start_reg +
                                         src.nr / 2, src.nr % 2 * 4),
                                      src.offset),
-                         0, 4, 1);
+                         0, width, 1);
             reg.type = src.type;
-            reg.swizzle = src.swizzle;
             reg.abs = src.abs;
             reg.negate = src.negate;
 
             /* This should have been moved to pull constants. */
             assert(!src.reladdr);
             break;
+         }
 
-         case ARF:
          case FIXED_GRF:
+            if (type_sz(src.type) == 8) {
+               reg = src.as_brw_reg();
+               break;
+            }
+            /* fallthrough */
+         case ARF:
          case IMM:
             continue;
 
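The width expression above is just REG_SIZE / 2 / MAX2(4, type_sz): 4 channels for 32-bit types (the usual vec4 execution) and 2 channels for DF. A standalone check of the arithmetic (not mesa code):

   #include <assert.h>

   #define MAX2(a, b) ((a) > (b) ? (a) : (b))

   int main(void)
   {
      const unsigned REG_SIZE = 32;
      assert(REG_SIZE / 2 / MAX2(4u, 4u) == 4); /* F/D/UD -> 4-wide */
      assert(REG_SIZE / 2 / MAX2(4u, 8u) == 2); /* DF     -> 2-wide */
      return 0;
   }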
@@ -1900,15 +1958,19 @@ vec4_visitor::convert_to_hw_regs()
             unreachable("not reached");
          }
 
+         apply_logical_swizzle(&reg, inst, i);
          src = reg;
       }
 
       if (inst->is_3src(devinfo)) {
          /* 3-src instructions with scalar sources support arbitrary subnr,
           * but don't actually use swizzles.  Convert swizzle into subnr.
+          * Skip this for double-precision instructions: RepCtrl=1 is not
+          * allowed for them and needs special handling.
           */
          for (int i = 0; i < 3; i++) {
-            if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0) {
+            if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0 &&
+                type_sz(inst->src[i].type) < 8) {
                assert(brw_is_single_value_swizzle(inst->src[i].swizzle));
                inst->src[i].subnr += 4 * BRW_GET_SWZ(inst->src[i].swizzle, 0);
             }
@@ -1951,6 +2013,358 @@ vec4_visitor::convert_to_hw_regs()
    }
 }
 
+/**
+ * Get the closest native SIMD width supported by the hardware for instruction
+ * \p inst.  The instruction will be left untouched by
+ * vec4_visitor::lower_simd_width() if the returned value matches the
+ * instruction's original execution size.
+ */
+static unsigned
+get_lowered_simd_width(const struct gen_device_info *devinfo,
+                       const vec4_instruction *inst)
+{
+   unsigned lowered_width = MIN2(16, inst->exec_size);
+
+   /* We need to split some cases of double-precision instructions that write
+    * 2 registers. We only need to care about this in gen7 because that is the
+    * only hardware that implements fp64 in Align16.
+    */
+   if (devinfo->gen == 7 && inst->size_written > REG_SIZE) {
+      /* Align16 8-wide double-precision SEL does not work well. Verified
+       * empirically.
+       */
+      if (inst->opcode == BRW_OPCODE_SEL && type_sz(inst->dst.type) == 8)
+         lowered_width = MIN2(lowered_width, 4);
+
+      /* HSW PRM, 3D Media GPGPU Engine, Region Alignment Rules for Direct
+       * Register Addressing:
+       *
+       *    "When destination spans two registers, the source MUST span two
+       *     registers."
+       */
+      for (unsigned i = 0; i < 3; i++) {
+         if (inst->src[i].file == BAD_FILE)
+            continue;
+         if (inst->size_read(i) <= REG_SIZE)
+            lowered_width = MIN2(lowered_width, 4);
+      }
+   }
+
+   return lowered_width;
+}
+
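For example (hypothetical instruction), a gen7 Align16 SEL with a DF destination and exec_size 8 writes more than one GRF, so the function clamps to 4 and lower_simd_width() below splits it in two. A sketch of that decision, with MIN2 as in the brw headers:

   #define MIN2(a, b) ((a) < (b) ? (a) : (b))

   static unsigned
   lowered_width_for_gen7_df_sel(void)
   {
      unsigned lowered = MIN2(16u, 8u); /* MIN2(16, exec_size) == 8 */
      /* gen7, DF destination spanning two GRFs, opcode SEL: */
      lowered = MIN2(lowered, 4u);
      return lowered; /* 4 -> split into two 4-wide instructions */
   }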
+static bool
+dst_src_regions_overlap(vec4_instruction *inst)
+{
+   if (inst->size_written == 0)
+      return false;
+
+   unsigned dst_start = inst->dst.offset;
+   unsigned dst_end = dst_start + inst->size_written - 1;
+   for (int i = 0; i < 3; i++) {
+      if (inst->src[i].file == BAD_FILE)
+         continue;
+
+      if (inst->dst.file != inst->src[i].file ||
+          inst->dst.nr != inst->src[i].nr)
+         continue;
+
+      unsigned src_start = inst->src[i].offset;
+      unsigned src_end = src_start + inst->size_read(i) - 1;
+
+      if ((dst_start >= src_start && dst_start <= src_end) ||
+          (dst_end >= src_start && dst_end <= src_end) ||
+          (dst_start <= src_start && dst_end >= src_end)) {
+         return true;
+      }
+   }
+
+   return false;
+}
+
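The three comparisons amount to the usual closed-interval overlap test, which may be easier to audit (standalone sketch, not mesa code):

   #include <assert.h>

   /* [a0, a1] and [b0, b1] overlap iff a0 <= b1 && b0 <= a1. */
   static bool
   ranges_overlap(unsigned a0, unsigned a1, unsigned b0, unsigned b1)
   {
      return a0 <= b1 && b0 <= a1;
   }

   int main(void)
   {
      assert(ranges_overlap(0, 31, 16, 47));  /* share bytes 16..31 */
      assert(!ranges_overlap(0, 31, 32, 63)); /* adjacent, disjoint */
      return 0;
   }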
+bool
+vec4_visitor::lower_simd_width()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      const unsigned lowered_width = get_lowered_simd_width(devinfo, inst);
+      assert(lowered_width <= inst->exec_size);
+      if (lowered_width == inst->exec_size)
+         continue;
+
+      /* We need to deal with source / destination overlaps when splitting.
+       * The hardware supports reading from and writing to the same register
+       * in the same instruction, but we need to be careful that each split
+       * instruction we produce does not corrupt the source of the next.
+       *
+       * The easiest way to handle this is to make the split instructions write
+       * to temporaries if there is a src/dst overlap and then move from the
+       * temporaries to the original destination. We also need to consider
+       * instructions that do partial writes via align1 opcodes, in which case
+       * we need to make sure that we initialize the temporary with the
+       * value of the instruction's dst.
+       */
+      bool needs_temp = dst_src_regions_overlap(inst);
+      for (unsigned n = 0; n < inst->exec_size / lowered_width; n++) {
+         unsigned channel_offset = lowered_width * n;
+
+         unsigned size_written = lowered_width * type_sz(inst->dst.type);
+
+         /* Create the split instruction from the original so that we copy all
+          * relevant instruction fields, then set the width and calculate the
+          * new dst/src regions.
+          */
+         vec4_instruction *linst = new(mem_ctx) vec4_instruction(*inst);
+         linst->exec_size = lowered_width;
+         linst->group = channel_offset;
+         linst->size_written = size_written;
+
+         /* Compute split dst region */
+         dst_reg dst;
+         if (needs_temp) {
+            unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE);
+            dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)),
+                         inst->dst.type);
+            if (inst->is_align1_partial_write()) {
+               vec4_instruction *copy = MOV(dst, src_reg(inst->dst));
+               copy->exec_size = lowered_width;
+               copy->group = channel_offset;
+               copy->size_written = size_written;
+               inst->insert_before(block, copy);
+            }
+         } else {
+            dst = horiz_offset(inst->dst, channel_offset);
+         }
+         linst->dst = dst;
+
+         /* Compute split source regions */
+         for (int i = 0; i < 3; i++) {
+            if (linst->src[i].file == BAD_FILE)
+               continue;
+
+            if (!is_uniform(linst->src[i]))
+               linst->src[i] = horiz_offset(linst->src[i], channel_offset);
+         }
+
+         inst->insert_before(block, linst);
+
+         /* If we used a temporary to store the result of the split
+          * instruction, copy the result to the original destination.
+          */
+         if (needs_temp) {
+            vec4_instruction *mov =
+               MOV(offset(inst->dst, lowered_width, n), src_reg(dst));
+            mov->exec_size = lowered_width;
+            mov->group = channel_offset;
+            mov->size_written = size_written;
+            mov->predicate = inst->predicate;
+            inst->insert_before(block, mov);
+         }
+      }
+
+      inst->remove(block);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
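The shape of the split, modeled standalone (hypothetical values; the real pass also rewrites the dst/src regions and handles the temporary-copy path): an exec_size-8 DF instruction becomes two 4-wide instructions at channel offsets 0 and 4, which together cover the original destination.

   #include <assert.h>

   int main(void)
   {
      const unsigned exec_size = 8, lowered_width = 4, dst_type_sz = 8;
      unsigned total = 0;
      for (unsigned n = 0; n < exec_size / lowered_width; n++) {
         unsigned channel_offset = lowered_width * n;         /* 0, then 4 */
         unsigned size_written = lowered_width * dst_type_sz; /* 32 bytes */
         assert(channel_offset < exec_size);
         total += size_written;
      }
      assert(total == exec_size * dst_type_sz); /* halves cover the dst */
      return 0;
   }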
+static bool
+is_align1_df(vec4_instruction *inst)
+{
+   switch (inst->opcode) {
+   case VEC4_OPCODE_FROM_DOUBLE:
+   case VEC4_OPCODE_TO_DOUBLE:
+   case VEC4_OPCODE_PICK_LOW_32BIT:
+   case VEC4_OPCODE_PICK_HIGH_32BIT:
+   case VEC4_OPCODE_SET_LOW_32BIT:
+   case VEC4_OPCODE_SET_HIGH_32BIT:
+      return true;
+   default:
+      return false;
+   }
+}
+
+static brw_predicate
+scalarize_predicate(brw_predicate predicate, unsigned writemask)
+{
+   if (predicate != BRW_PREDICATE_NORMAL)
+      return predicate;
+
+   switch (writemask) {
+   case WRITEMASK_X:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_X;
+   case WRITEMASK_Y:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+   case WRITEMASK_Z:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+   case WRITEMASK_W:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_W;
+   default:
+      unreachable("invalid writemask");
+   }
+}
+
+bool
+vec4_visitor::scalarize_df()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      /* Skip DF instructions that operate in Align1 mode */
+      if (is_align1_df(inst))
+         continue;
+
+      /* Check if this is a double-precision instruction */
+      bool is_double = type_sz(inst->dst.type) == 8;
+      for (int arg = 0; !is_double && arg < 3; arg++) {
+         is_double = inst->src[arg].file != BAD_FILE &&
+                     type_sz(inst->src[arg].type) == 8;
+      }
+
+      if (!is_double)
+         continue;
+
+      /* Generate scalar instructions for each enabled channel */
+      for (unsigned chan = 0; chan < 4; chan++) {
+         unsigned chan_mask = 1 << chan;
+         if (!(inst->dst.writemask & chan_mask))
+            continue;
+
+         vec4_instruction *scalar_inst = new(mem_ctx) vec4_instruction(*inst);
+
+         for (unsigned i = 0; i < 3; i++) {
+            unsigned swz = BRW_GET_SWZ(inst->src[i].swizzle, chan);
+            scalar_inst->src[i].swizzle = BRW_SWIZZLE4(swz, swz, swz, swz);
+         }
+
+         scalar_inst->dst.writemask = chan_mask;
+
+         if (inst->predicate != BRW_PREDICATE_NONE) {
+            scalar_inst->predicate =
+               scalarize_predicate(inst->predicate, chan_mask);
+         }
+
+         inst->insert_before(block, scalar_inst);
+      }
+
+      inst->remove(block);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
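What the per-channel expansion does, as a standalone model (two-bit swizzle packing as in brw_reg.h; values hypothetical): each enabled destination channel gets its own instruction whose sources use a replicated swizzle.

   #include <assert.h>

   #define GET_SWZ(swz, idx)    (((swz) >> ((idx) * 2)) & 0x3)
   #define SWIZZLE4(a, b, c, d) ((a) | (b) << 2 | (c) << 4 | (d) << 6)

   int main(void)
   {
      unsigned writemask = 0x5;                    /* XZ */
      unsigned src_swizzle = SWIZZLE4(3, 2, 1, 0); /* WZYX */
      for (unsigned chan = 0; chan < 4; chan++) {
         if (!(writemask & (1u << chan)))
            continue;
         unsigned swz = GET_SWZ(src_swizzle, chan);
         /* chan 0 reads W (swz 3); chan 2 reads Y (swz 1) */
         assert((chan == 0 && swz == 3) || (chan == 2 && swz == 1));
         unsigned repl = SWIZZLE4(swz, swz, swz, swz); /* replicated */
         (void)repl;
      }
      return 0;
   }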
+bool
+vec4_visitor::lower_64bit_mad_to_mul_add()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      if (inst->opcode != BRW_OPCODE_MAD)
+         continue;
+
+      if (type_sz(inst->dst.type) != 8)
+         continue;
+
+      dst_reg mul_dst = dst_reg(this, glsl_type::dvec4_type);
+
+      /* Use the copy constructor so we copy all relevant instruction fields
+       * from the original mad into the add and mul instructions.
+       */
+      vec4_instruction *mul = new(mem_ctx) vec4_instruction(*inst);
+      mul->opcode = BRW_OPCODE_MUL;
+      mul->dst = mul_dst;
+      mul->src[0] = inst->src[1];
+      mul->src[1] = inst->src[2];
+      mul->src[2].file = BAD_FILE;
+
+      vec4_instruction *add = new(mem_ctx) vec4_instruction(*inst);
+      add->opcode = BRW_OPCODE_ADD;
+      add->src[0] = src_reg(mul_dst);
+      add->src[1] = inst->src[0];
+      add->src[2].file = BAD_FILE;
+
+      inst->insert_before(block, mul);
+      inst->insert_before(block, add);
+      inst->remove(block);
+
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
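The operand shuffle above encodes the IR's MAD semantics, dst = src1 * src2 + src0. A plain sanity check of the decomposition (not mesa code; with a separate MUL and ADD both sides round the product the same way, so the results match):

   #include <assert.h>

   int main(void)
   {
      double src0 = 1.5, src1 = 2.0, src2 = 3.0;
      double mad = src1 * src2 + src0; /* what the MAD computes */
      double tmp = src1 * src2;        /* the lowered MUL into mul_dst */
      double sum = tmp + src0;         /* the lowered ADD */
      assert(mad == sum);
      return 0;
   }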
+/* The align16 hardware can only do 32-bit swizzle channels, so we need to
+ * translate the logical 64-bit swizzle channels that we use in the Vec4 IR
+ * to 32-bit swizzle channels in hardware registers.
+ *
+ * @inst and @arg identify the original vec4 IR source operand we need to
+ * translate the swizzle for, and @hw_reg is the hardware register where we
+ * will write the hardware swizzle to use.
+ *
+ * This pass assumes that Align16/DF instructions have been fully scalarized
+ * previously, so there is just one 64-bit swizzle channel to deal with for
+ * any given Vec4 IR source.
+ */
+void
+vec4_visitor::apply_logical_swizzle(struct brw_reg *hw_reg,
+                                    vec4_instruction *inst, int arg)
+{
+   src_reg reg = inst->src[arg];
+
+   if (reg.file == BAD_FILE || reg.file == BRW_IMMEDIATE_VALUE)
+      return;
+
+   /* If this is not a 64-bit operand or this is a scalar instruction, we
+    * don't need to do anything about the swizzles.
+    */
+   if (type_sz(reg.type) < 8 || is_align1_df(inst)) {
+      hw_reg->swizzle = reg.swizzle;
+      return;
+   }
+
+   /* Otherwise we should have scalarized the instruction, so take the single
+    * 64-bit logical swizzle channel and translate it to 32-bit.
+    */
+   assert(brw_is_single_value_swizzle(reg.swizzle));
+
+   /* To gain access to Z/W components we need to select the second half
+    * of the register and then use a X/Y swizzle to select Z/W respectively.
+    */
+   unsigned swizzle = BRW_GET_SWZ(reg.swizzle, 0);
+
+   if (swizzle >= 2) {
+      *hw_reg = suboffset(*hw_reg, 2);
+      swizzle -= 2;
+   }
+
+   /* Any 64-bit source with an offset at 16B is intended to address the
+    * second half of a register and needs a vertical stride of 0 so we:
+    *
+    * 1. Don't violate register region restrictions.
+    * 2. Activate the gen7 instruction decompression bug exploit when
+    *    exec_size > 4.
+    */
+   if (hw_reg->subnr % REG_SIZE == 16) {
+      assert(devinfo->gen == 7);
+      hw_reg->vstride = BRW_VERTICAL_STRIDE_0;
+   }
+
+   hw_reg->swizzle = BRW_SWIZZLE4(swizzle * 2, swizzle * 2 + 1,
+                                  swizzle * 2, swizzle * 2 + 1);
+}
+
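A worked example of the translation (standalone; SWIZZLE4 packing as in brw_reg.h): logical channel Z of a DF source lives in the second half of the register, so the pass bumps the suboffset by 2 and selects the 32-bit pair XYXY, while logical Y stays in the first half and becomes ZWZW.

   #include <assert.h>

   #define SWIZZLE4(a, b, c, d) ((a) | (b) << 2 | (c) << 4 | (d) << 6)

   /* Returns the 32-bit hw swizzle; *subnr_bump is 2 when the logical
    * channel addresses the second register half. */
   static unsigned
   hw_swizzle_for_df(unsigned logical, unsigned *subnr_bump)
   {
      *subnr_bump = 0;
      if (logical >= 2) { /* Z/W -> second half, like suboffset(reg, 2) */
         *subnr_bump = 2;
         logical -= 2;
      }
      return SWIZZLE4(logical * 2, logical * 2 + 1,
                      logical * 2, logical * 2 + 1);
   }

   int main(void)
   {
      unsigned bump;
      assert(hw_swizzle_for_df(2, &bump) == SWIZZLE4(0, 1, 0, 1) && bump == 2);
      assert(hw_swizzle_for_df(1, &bump) == SWIZZLE4(2, 3, 2, 3) && bump == 0);
      return 0;
   }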
 bool
 vec4_visitor::run()
 {
@@ -2042,9 +2456,17 @@ vec4_visitor::run()
       OPT(dead_code_eliminate);
    }
 
+   if (OPT(lower_simd_width)) {
+      OPT(opt_copy_propagation);
+      OPT(dead_code_eliminate);
+   }
+
    if (failed)
       return false;
 
+   OPT(lower_64bit_mad_to_mul_add);
+   OPT(scalarize_df);
+
    setup_payload();
 
    if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) {