i965/vec4: extend the DWORD multiply DepCtrl restriction to all gen8 platforms

[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4.cpp
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp

index 42fde07c4ef9548753b6f4fd272f19cd57f75106..c461f26f71ed9198732703351a738f9e6deed0b9 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -610,13 +610,20 @@ vec4_visitor::pack_uniform_registers()
           if (inst->src[i].file != UNIFORM)
              continue;
  
+         assert(type_sz(inst->src[i].type) % 4 == 0);
+         unsigned channel_size = type_sz(inst->src[i].type) / 4;
+
           int reg = inst->src[i].nr;
           for (int c = 0; c < 4; c++) {
              if (!(readmask & (1 << c)))
                 continue;
  
-            chans_used[reg] = MAX2(chans_used[reg],
-                                   BRW_GET_SWZ(inst->src[i].swizzle, c) + 1);
+            unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1;
+            unsigned used = MAX2(chans_used[reg], channel * channel_size);
+            if (used <= 4)
+               chans_used[reg] = used;
+            else
+               chans_used[reg + 1] = used - 4;
           }
        }
  
@@ -653,25 +660,25 @@ vec4_visitor::pack_uniform_registers()
        int dst;
        /* Find the lowest place we can slot this uniform in. */
        for (dst = 0; dst < src; dst++) {
-        if (chans_used[dst] + size <= 4)
-           break;
+         if (chans_used[dst] + size <= 4)
+            break;
        }
  
        if (src == dst) {
-        new_loc[src] = dst;
-        new_chan[src] = 0;
+         new_loc[src] = dst;
+         new_chan[src] = 0;
        } else {
-        new_loc[src] = dst;
-        new_chan[src] = chans_used[dst];
+         new_loc[src] = dst;
+         new_chan[src] = chans_used[dst];
  
-        /* Move the references to the data */
-        for (int j = 0; j < size; j++) {
-           stage_prog_data->param[dst * 4 + new_chan[src] + j] =
-              stage_prog_data->param[src * 4 + j];
-        }
+         /* Move the references to the data */
+         for (int j = 0; j < size; j++) {
+            stage_prog_data->param[dst * 4 + new_chan[src] + j] =
+               stage_prog_data->param[src * 4 + j];
+         }
  
-        chans_used[dst] += size;
-        chans_used[src] = 0;
+         chans_used[dst] += size;
+         chans_used[src] = 0;
        }
  
        new_uniform_count = MAX2(new_uniform_count, dst + 1);
@@ -684,8 +691,8 @@ vec4_visitor::pack_uniform_registers()
        for (int i = 0 ; i < 3; i++) {
           int src = inst->src[i].nr;
  
-        if (inst->src[i].file != UNIFORM)
-           continue;
+         if (inst->src[i].file != UNIFORM)
+            continue;
  
           inst->src[i].nr = new_loc[src];
           inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
@@ -901,11 +908,14 @@ vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
     (reg.type == BRW_REGISTER_TYPE_UD || \
      reg.type == BRW_REGISTER_TYPE_D)
  
-   /* "When source or destination datatype is 64b or operation is integer DWord
+   /* From the Cherryview and Broadwell PRMs:
+    *
+    * "When source or destination datatype is 64b or operation is integer DWord
      * multiply, DepCtrl must not be used."
-    * May apply to future SoCs as well.
+    *
+    * SKL PRMs don't include this restriction, but Broxton is also checked.
      */
-   if (devinfo->is_cherryview) {
+   if (devinfo->gen == 8 || devinfo->is_broxton) {
        if (inst->opcode == BRW_OPCODE_MUL &&
           IS_DWORD(inst->src[0]) &&
           IS_DWORD(inst->src[1]))
@@ -1955,9 +1965,12 @@ vec4_visitor::convert_to_hw_regs()
        if (inst->is_3src(devinfo)) {
           /* 3-src instructions with scalar sources support arbitrary subnr,
            * but don't actually use swizzles.  Convert swizzle into subnr.
+          * Skip this for double-precision instructions: RepCtrl=1 is not
+          * allowed for them and needs special handling.
            */
           for (int i = 0; i < 3; i++) {
-            if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0) {
+            if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0 &&
+                type_sz(inst->src[i].type) < 8) {
                 assert(brw_is_single_value_swizzle(inst->src[i].swizzle));
                 inst->src[i].subnr += 4 * BRW_GET_SWZ(inst->src[i].swizzle, 0);
              }
@@ -2249,6 +2262,49 @@ vec4_visitor::scalarize_df()
     return progress;
  }
  
+bool
+vec4_visitor::lower_64bit_mad_to_mul_add()
+{
+   bool progress = false;
+   /* Replace each MAD whose dst has type_sz() == 8 with a MUL + ADD pair. */
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      if (inst->opcode != BRW_OPCODE_MAD)
+         continue;
+
+      if (type_sz(inst->dst.type) != 8)
+         continue;
+
+      dst_reg mul_dst = dst_reg(this, glsl_type::dvec4_type);
+
+      /* Copy-construct MUL and ADD from the MAD so both inherit all of its
+       * instruction fields; only opcode, sources and the MUL's dst change.
+       */
+      vec4_instruction *mul = new(mem_ctx) vec4_instruction(*inst);
+      mul->opcode = BRW_OPCODE_MUL;
+      mul->dst = mul_dst;
+      mul->src[0] = inst->src[1];
+      mul->src[1] = inst->src[2];
+      mul->src[2].file = BAD_FILE;
+
+      vec4_instruction *add = new(mem_ctx) vec4_instruction(*inst);
+      add->opcode = BRW_OPCODE_ADD;
+      add->src[0] = src_reg(mul_dst);
+      add->src[1] = inst->src[0];
+      add->src[2].file = BAD_FILE;
+
+      inst->insert_before(block, mul);
+      inst->insert_before(block, add);
+      inst->remove(block);
+
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
  /* The align16 hardware can only do 32-bit swizzle channels, so we need to
   * translate the logical 64-bit swizzle channels that we use in the Vec4 IR
   * to 32-bit swizzle channels in hardware registers.
@@ -2408,6 +2464,7 @@ vec4_visitor::run()
     if (failed)
        return false;
  
+   OPT(lower_64bit_mad_to_mul_add);
     OPT(scalarize_df);
  
     setup_payload();