i965/vec4: Compare full register offsets in cmod propagation.

[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index 60907cd20b9723327c17950d09aa2eb9fb2c9a99..c858f449c8f0642548a964dd26099e222640428c 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -172,12 +172,12 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
      * be any component of a vector, and then we load 4 contiguous
      * components starting from that.
      *
-    * We break down the const_offset to a portion added to the variable
-    * offset and a portion done using reg_offset, which means that if you
-    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
-    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
-    * CSE can later notice that those loads are all the same and eliminate
-    * the redundant ones.
+    * We break down the const_offset to a portion added to the variable offset
+    * and a portion done using fs_reg::offset, which means that if you have
+    * GLSL using something like "uniform vec4 a[20]; gl_FragColor = a[i]",
+    * we'll temporarily generate 4 vec4 loads from offset i * 4, and CSE can
+    * later notice that those loads are all the same and eliminate the
+    * redundant ones.
      */
     fs_reg vec4_offset = vgrf(glsl_type::uint_type);
     bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
@@ -191,7 +191,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
     fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
     fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
                              vec4_result, surf_index, vec4_offset);
-   inst->size_written = 4 * bld.dispatch_width() / 8 * REG_SIZE;
+   inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
  
     if (type_sz(dst.type) == 8) {
        shuffle_32bit_load_result_to_64bit_data(
@@ -240,12 +240,6 @@ fs_inst::equals(fs_inst *inst) const
             offset == inst->offset);
  }
  
-bool
-fs_inst::overwrites_reg(const fs_reg &reg) const
-{
-   return reg.in_range(dst, DIV_ROUND_UP(size_written, REG_SIZE));
-}
-
  bool
  fs_inst::is_send_from_grf() const
  {
@@ -353,7 +347,7 @@ fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
        return false;
  
     fs_reg reg = this->src[0];
-   if (reg.file != VGRF || reg.offset / REG_SIZE != 0 || reg.stride == 0)
+   if (reg.file != VGRF || reg.offset != 0 || reg.stride != 1)
        return false;
  
     if (grf_alloc.sizes[reg.nr] * REG_SIZE != this->size_written)
@@ -441,15 +435,6 @@ fs_reg::equals(const fs_reg &r) const
             stride == r.stride);
  }
  
-fs_reg &
-fs_reg::set_smear(unsigned subreg)
-{
-   assert(file != ARF && file != FIXED_GRF && file != IMM);
-   offset = ROUND_DOWN_TO(offset, REG_SIZE) + subreg * type_sz(type);
-   stride = 0;
-   return *this;
-}
-
  bool
  fs_reg::is_contiguous() const
  {
@@ -562,15 +547,14 @@ fs_visitor::get_timestamp(const fs_builder &bld)
  void
  fs_visitor::emit_shader_time_begin()
  {
-   shader_start_time = get_timestamp(bld.annotate("shader time start"));
-
     /* We want only the low 32 bits of the timestamp.  Since it's running
      * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
      * which is plenty of time for our purposes.  It is identical across the
      * EUs, but since it's tracking GPU core speed it will increment at a
      * varying rate as render P-states change.
      */
-   shader_start_time.set_smear(0);
+   shader_start_time = component(
+      get_timestamp(bld.annotate("shader time start")), 0);
  }
  
  void
@@ -581,8 +565,7 @@ fs_visitor::emit_shader_time_end()
     assert(end && ((fs_inst *) end)->eot);
     const fs_builder ibld = bld.annotate("shader time end")
                                .exec_all().at(NULL, end);
-
-   fs_reg shader_end_time = get_timestamp(ibld);
+   const fs_reg timestamp = get_timestamp(ibld);
  
     /* We only use the low 32 bits of the timestamp - see
      * emit_shader_time_begin().
@@ -591,22 +574,21 @@ fs_visitor::emit_shader_time_end()
      * else that might disrupt timing) by setting smear to 2 and checking if
      * that field is != 0.
      */
-   shader_end_time.set_smear(0);
+   const fs_reg shader_end_time = component(timestamp, 0);
  
     /* Check that there weren't any timestamp reset events (assuming these
      * were the only two timestamp reads that happened).
      */
-   fs_reg reset = shader_end_time;
-   reset.set_smear(2);
+   const fs_reg reset = component(timestamp, 2);
     set_condmod(BRW_CONDITIONAL_Z,
                 ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
     ibld.IF(BRW_PREDICATE_NORMAL);
  
     fs_reg start = shader_start_time;
     start.negate = true;
-   fs_reg diff = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-   diff.set_smear(0);
-
+   const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1),
+                                        BRW_REGISTER_TYPE_UD),
+                                 0);
     const fs_builder cbld = ibld.group(1, 0);
     cbld.group(1, 0).ADD(diff, start, shader_end_time);
  
@@ -817,7 +799,7 @@ fs_inst::components_read(unsigned i) const
     }
  }
  
-int
+unsigned
  fs_inst::size_read(int arg) const
  {
     switch (opcode) {
@@ -863,31 +845,7 @@ fs_inst::size_read(int arg) const
     case SHADER_OPCODE_MOV_INDIRECT:
        if (arg == 0) {
           assert(src[2].file == IMM);
-         unsigned region_length = src[2].ud;
-
-         if (src[0].file == UNIFORM) {
-            assert(region_length % 4 == 0);
-            return region_length;
-         } else if (src[0].file == FIXED_GRF) {
-            /* If the start of the region is not register aligned, then
-             * there's some portion of the register that's technically
-             * unread at the beginning.
-             *
-             * However, the register allocator works in terms of whole
-             * registers, and does not use subnr.  It assumes that the
-             * read starts at the beginning of the register, and extends
-             * regs_read() whole registers beyond that.
-             *
-             * To compensate, we extend the region length to include this
-             * unread portion at the beginning.
-             */
-            if (src[0].subnr)
-               region_length += src[0].subnr;
-
-            return region_length;
-         } else {
-            assert(!"Invalid register file");
-         }
+         return src[2].ud;
        }
        break;
  
@@ -1287,9 +1245,9 @@ fs_visitor::emit_sampleid_setup()
                      brw_imm_v(0x44440000));
        abld.AND(*reg, tmp, brw_imm_w(0xf));
     } else {
-      fs_reg t1(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_D);
-      t1.set_smear(0);
-      fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
+      const fs_reg t1 = component(fs_reg(VGRF, alloc.allocate(1),
+                                         BRW_REGISTER_TYPE_D), 0);
+      const fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
  
        /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
         * 8x multisampling, subspan 0 will represent sample N (where N
@@ -2104,7 +2062,7 @@ fs_visitor::assign_constant_locations()
     stage_prog_data->nr_params = num_push_constants;
     stage_prog_data->nr_pull_params = num_pull_constants;
  
-   /* Up until now, the param[] array has been indexed by reg + reg_offset
+   /* Up until now, the param[] array has been indexed by reg + offset
      * of UNIFORM registers.  Move pull constants into pull_param[] and
      * condense param[] to only contain the uniforms we chose to push.
      *
@@ -2179,9 +2137,7 @@ fs_visitor::lower_constant_loads()
           /* Rewrite the instruction to use the temporary VGRF. */
           inst->src[i].file = VGRF;
           inst->src[i].nr = dst.nr;
-         inst->src[i].offset %= 4;
-         inst->src[i].set_smear((pull_index & 3) * 4 /
-                                type_sz(inst->src[i].type));
+         inst->src[i].offset = (pull_index & 3) * 4 + inst->src[i].offset % 4;
  
           brw_mark_surface_used(prog_data, index);
        }
@@ -2676,16 +2632,18 @@ fs_visitor::opt_redundant_discard_jumps()
  
  /**
   * Compute a bitmask with GRF granularity with a bit set for each GRF starting
- * from \p r which overlaps the region starting at \p r and spanning \p n GRF
- * units.
+ * from \p r.offset which overlaps the region starting at \p s.offset and
+ * spanning \p ds bytes.
   */
  static inline unsigned
-mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned n)
+mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
  {
-   const int rel_offset = (reg_offset(s) - reg_offset(r)) / REG_SIZE;
+   const int rel_offset = reg_offset(s) - reg_offset(r);
+   const int shift = rel_offset / REG_SIZE;
+   const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
     assert(reg_space(r) == reg_space(s) &&
-          rel_offset >= 0 && rel_offset < int(8 * sizeof(unsigned)));
-   return ((1 << n) - 1) << rel_offset;
+          shift >= 0 && shift < int(8 * sizeof(unsigned)));
+   return ((1 << n) - 1) << shift;
  }
  
  bool
@@ -2745,9 +2703,8 @@ fs_visitor::compute_to_mrf()
               * would need us to understand coalescing out more than one MOV at
               * a time.
               */
-            if (scan_inst->dst.offset / REG_SIZE < inst->src[0].offset / REG_SIZE ||
-                scan_inst->dst.offset / REG_SIZE + DIV_ROUND_UP(scan_inst->size_written, REG_SIZE) >
-                inst->src[0].offset / REG_SIZE + DIV_ROUND_UP(inst->size_read(0), REG_SIZE))
+            if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
+                                     inst->src[0], inst->size_read(0)))
                 break;
  
             /* SEND instructions can't have MRF as a destination. */
@@ -2765,8 +2722,7 @@ fs_visitor::compute_to_mrf()
  
              /* Clear the bits for any registers this instruction overwrites. */
              regs_left &= ~mask_relative_to(
-               inst->src[0], scan_inst->dst, DIV_ROUND_UP(scan_inst->size_written,
-                                                          REG_SIZE));
+               inst->src[0], scan_inst->dst, scan_inst->size_written);
              if (!regs_left)
                 break;
          }
@@ -2824,18 +2780,17 @@ fs_visitor::compute_to_mrf()
                               inst->src[0], inst->size_read(0))) {
              /* Clear the bits for any registers this instruction overwrites. */
              regs_left &= ~mask_relative_to(
-               inst->src[0], scan_inst->dst, DIV_ROUND_UP(scan_inst->size_written,
-                                                          REG_SIZE));
+               inst->src[0], scan_inst->dst, scan_inst->size_written);
  
-            const unsigned rel_offset = (reg_offset(scan_inst->dst) -
-                                         reg_offset(inst->src[0])) / REG_SIZE;
+            const unsigned rel_offset = reg_offset(scan_inst->dst) -
+                                        reg_offset(inst->src[0]);
  
              if (inst->dst.nr & BRW_MRF_COMPR4) {
                 /* Apply the same address transformation done by the hardware
                  * for COMPR4 MRF writes.
                  */
-               assert(rel_offset < 2);
-               scan_inst->dst.nr = inst->dst.nr + rel_offset * 4;
+               assert(rel_offset < 2 * REG_SIZE);
+               scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
  
                 /* Clear the COMPR4 bit if the generating instruction is not
                  * compressed.
@@ -2847,11 +2802,11 @@ fs_visitor::compute_to_mrf()
                 /* Calculate the MRF number the result of this instruction is
                  * ultimately written to.
                  */
-               scan_inst->dst.nr = inst->dst.nr + rel_offset;
+               scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
              }
  
              scan_inst->dst.file = MRF;
-            scan_inst->dst.offset %= REG_SIZE;
+            scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
              scan_inst->saturate |= inst->saturate;
              if (!regs_left)
                 break;
@@ -3211,10 +3166,6 @@ fs_visitor::insert_gen4_send_dependency_workarounds()
  
     bool progress = false;
  
-   /* Note that we're done with register allocation, so GRF fs_regs always
-    * have a .reg_offset of 0.
-    */
-
     foreach_block_and_inst(block, fs_inst, inst, cfg) {
        if (inst->mlen != 0 && inst->dst.file == VGRF) {
           insert_gen4_pre_send_dependency_workarounds(block, inst);
@@ -3498,62 +3449,27 @@ fs_visitor::lower_integer_multiplication()
                          inst->dst.type);
  
              if (devinfo->gen >= 7) {
-               fs_reg src1_0_w = inst->src[1];
-               fs_reg src1_1_w = inst->src[1];
-
                 if (inst->src[1].file == IMM) {
-                  src1_0_w.ud &= 0xffff;
-                  src1_1_w.ud >>= 16;
+                  ibld.MUL(low, inst->src[0],
+                           brw_imm_uw(inst->src[1].ud & 0xffff));
+                  ibld.MUL(high, inst->src[0],
+                           brw_imm_uw(inst->src[1].ud >> 16));
                 } else {
-                  src1_0_w.type = BRW_REGISTER_TYPE_UW;
-                  if (src1_0_w.stride != 0) {
-                     assert(src1_0_w.stride == 1);
-                     src1_0_w.stride = 2;
-                  }
-
-                  src1_1_w.type = BRW_REGISTER_TYPE_UW;
-                  if (src1_1_w.stride != 0) {
-                     assert(src1_1_w.stride == 1);
-                     src1_1_w.stride = 2;
-                  }
-                  src1_1_w.offset += type_sz(BRW_REGISTER_TYPE_UW);
+                  ibld.MUL(low, inst->src[0],
+                           subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
+                  ibld.MUL(high, inst->src[0],
+                           subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1));
                 }
-               ibld.MUL(low, inst->src[0], src1_0_w);
-               ibld.MUL(high, inst->src[0], src1_1_w);
              } else {
-               fs_reg src0_0_w = inst->src[0];
-               fs_reg src0_1_w = inst->src[0];
-
-               src0_0_w.type = BRW_REGISTER_TYPE_UW;
-               if (src0_0_w.stride != 0) {
-                  assert(src0_0_w.stride == 1);
-                  src0_0_w.stride = 2;
-               }
-
-               src0_1_w.type = BRW_REGISTER_TYPE_UW;
-               if (src0_1_w.stride != 0) {
-                  assert(src0_1_w.stride == 1);
-                  src0_1_w.stride = 2;
-               }
-               src0_1_w.offset += type_sz(BRW_REGISTER_TYPE_UW);
-
-               ibld.MUL(low, src0_0_w, inst->src[1]);
-               ibld.MUL(high, src0_1_w, inst->src[1]);
+               ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0),
+                        inst->src[1]);
+               ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1),
+                        inst->src[1]);
              }
  
-            fs_reg dst = inst->dst;
-            dst.type = BRW_REGISTER_TYPE_UW;
-            dst.offset = ROUND_DOWN_TO(dst.offset, REG_SIZE) + 2;
-            dst.stride = 2;
-
-            high.type = BRW_REGISTER_TYPE_UW;
-            high.stride = 2;
-
-            low.type = BRW_REGISTER_TYPE_UW;
-            low.offset = ROUND_DOWN_TO(low.offset, REG_SIZE) + 2;
-            low.stride = 2;
-
-            ibld.ADD(dst, low, high);
+            ibld.ADD(subscript(inst->dst, BRW_REGISTER_TYPE_UW, 1),
+                     subscript(low, BRW_REGISTER_TYPE_UW, 1),
+                     subscript(high, BRW_REGISTER_TYPE_UW, 0));
  
              if (inst->conditional_mod || orig_dst.file == MRF) {
                 set_condmod(inst->conditional_mod,
@@ -4629,8 +4545,8 @@ get_fpu_lowered_simd_width(const struct gen_device_info *devinfo,
      */
     if (devinfo->gen < 8) {
        for (unsigned i = 0; i < inst->sources; i++) {
-         if (DIV_ROUND_UP(inst->size_written, REG_SIZE) == 2 &&
-             inst->size_read(i) != 0 && DIV_ROUND_UP(inst->size_read(i), REG_SIZE) != 2 &&
+         if (inst->size_written > REG_SIZE &&
+             inst->size_read(i) != 0 && inst->size_read(i) <= REG_SIZE &&
               !is_uniform(inst->src[i]) &&
               !(type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
                 type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1)) {
@@ -5313,10 +5229,6 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
     switch (inst->dst.file) {
     case VGRF:
        fprintf(file, "vgrf%d", inst->dst.nr);
-      if (alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written ||
-          inst->dst.offset % REG_SIZE)
-         fprintf(file, "+%d.%d",
-                 inst->dst.offset / REG_SIZE, inst->dst.offset % REG_SIZE);
        break;
     case FIXED_GRF:
        fprintf(file, "g%d", inst->dst.nr);
@@ -5328,10 +5240,10 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
        fprintf(file, "(null)");
        break;
     case UNIFORM:
-      fprintf(file, "***u%d***", inst->dst.nr + inst->dst.offset / 4);
+      fprintf(file, "***u%d***", inst->dst.nr);
        break;
     case ATTR:
-      fprintf(file, "***attr%d***", inst->dst.nr + inst->dst.offset / REG_SIZE);
+      fprintf(file, "***attr%d***", inst->dst.nr);
        break;
     case ARF:
        switch (inst->dst.nr) {
@@ -5351,12 +5263,19 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
           fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
           break;
        }
-      if (inst->dst.subnr)
-         fprintf(file, "+%d", inst->dst.subnr);
        break;
     case IMM:
        unreachable("not reached");
     }
+
+   if (inst->dst.offset ||
+       (inst->dst.file == VGRF &&
+        alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
+      const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
+      fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
+              inst->dst.offset % reg_size);
+   }
+
     if (inst->dst.stride != 1)
        fprintf(file, "<%u>", inst->dst.stride);
     fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
@@ -5369,10 +5288,6 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
        switch (inst->src[i].file) {
        case VGRF:
           fprintf(file, "vgrf%d", inst->src[i].nr);
-         if (alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i) ||
-             inst->src[i].offset % REG_SIZE != 0)
-            fprintf(file, "+%d.%d", inst->src[i].offset / REG_SIZE,
-                    inst->src[i].offset % REG_SIZE);
           break;
        case FIXED_GRF:
           fprintf(file, "g%d", inst->src[i].nr);
@@ -5381,14 +5296,10 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
           fprintf(file, "***m%d***", inst->src[i].nr);
           break;
        case ATTR:
-         fprintf(file, "attr%d+%d", inst->src[i].nr, inst->src[i].offset / REG_SIZE);
+         fprintf(file, "attr%d", inst->src[i].nr);
           break;
        case UNIFORM:
-         fprintf(file, "u%d", inst->src[i].nr + inst->src[i].offset / 4);
-         if (inst->src[i].offset % 4 != 0) {
-            fprintf(file, "+%d.%d", inst->src[i].offset / 4,
-                    inst->src[i].offset % 4);
-         }
+         fprintf(file, "u%d", inst->src[i].nr);
           break;
        case BAD_FILE:
           fprintf(file, "(null)");
@@ -5439,10 +5350,17 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
              fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
              break;
           }
-         if (inst->src[i].subnr)
-            fprintf(file, "+%d", inst->src[i].subnr);
           break;
        }
+
+      if (inst->src[i].offset ||
+          (inst->src[i].file == VGRF &&
+           alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
+         const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
+         fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
+                 inst->src[i].offset % reg_size);
+      }
+
        if (inst->src[i].abs)
           fprintf(file, "|");