From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 7 Sep 2016 20:38:20 +0000 (-0700)
Subject: i965/fs: Replace fs_inst::regs_written with ::size_written field in bytes.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=69570bbad876bb9da609c3b651aacda28cecc542;p=mesa.git

i965/fs: Replace fs_inst::regs_written with ::size_written field in bytes.

The previous regs_written field can be recovered by rewriting each
rvalue reference of regs_written like 'x = i.regs_written' to 'x =
DIV_ROUND_UP(i.size_written, reg_unit)', and each lvalue reference
like 'i.regs_written = x' to 'i.size_written = x * reg_unit'.

For the same reason as in the previous patches, this doesn't attempt
to be particularly clever about simplifying the result in the interest
of keeping the rather lengthy patch as obvious as possible.  I'll come
back later to clean up any ugliness introduced here.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 802aa9f76f4..0244f593149 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -76,11 +76,10 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    case FIXED_GRF:
    case MRF:
    case ATTR:
-      this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
-                                        REG_SIZE);
+      this->size_written = dst.component_size(exec_size);
       break;
    case BAD_FILE:
-      this->regs_written = 0;
+      this->size_written = 0;
       break;
    case IMM:
    case UNIFORM:
@@ -192,7 +191,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
    fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
    fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
                             vec4_result, surf_index, vec4_offset);
-   inst->regs_written = 4 * bld.dispatch_width() / 8;
+   inst->size_written = 4 * bld.dispatch_width() / 8 * REG_SIZE;
 
    if (type_sz(dst.type) == 8) {
       shuffle_32bit_load_result_to_64bit_data(
@@ -244,7 +243,7 @@ fs_inst::equals(fs_inst *inst) const
 bool
 fs_inst::overwrites_reg(const fs_reg &reg) const
 {
-   return reg.in_range(dst, regs_written);
+   return reg.in_range(dst, DIV_ROUND_UP(size_written, REG_SIZE));
 }
 
 bool
@@ -357,7 +356,7 @@ fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
    if (reg.file != VGRF || reg.offset / REG_SIZE != 0 || reg.stride == 0)
       return false;
 
-   if (grf_alloc.sizes[reg.nr] != this->regs_written)
+   if (grf_alloc.sizes[reg.nr] * REG_SIZE != this->size_written)
       return false;
 
    for (int i = 0; i < this->sources; i++) {
@@ -2548,7 +2547,7 @@ fs_visitor::opt_sampler_eot()
    for (unsigned i = 0; i < FB_WRITE_LOGICAL_NUM_SRCS; i++) {
       if (i == FB_WRITE_LOGICAL_SRC_COLOR0) {
          if (!fb_write->src[i].equals(tex_inst->dst) ||
-             fb_write->regs_read(i) != tex_inst->regs_written)
+             fb_write->regs_read(i) * REG_SIZE != tex_inst->size_written)
          return false;
       } else if (i != FB_WRITE_LOGICAL_SRC_COMPONENTS) {
          if (fb_write->src[i].file != BAD_FILE)
@@ -2564,7 +2563,7 @@ fs_visitor::opt_sampler_eot()
    tex_inst->offset |= fb_write->target << 24;
    tex_inst->eot = true;
    tex_inst->dst = ibld.null_reg_ud();
-   tex_inst->regs_written = 0;
+   tex_inst->size_written = 0;
    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
 
    /* Marking EOT is sufficient, lower_logical_sends() will notice the EOT
@@ -2606,7 +2605,7 @@ fs_visitor::opt_register_renaming()
 
       if (depth == 0 &&
           inst->dst.file == VGRF &&
-          alloc.sizes[inst->dst.nr] == inst->regs_written &&
+          alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
           !inst->is_partial_write()) {
          if (remap[dst] == -1) {
             remap[dst] = dst;
@@ -2730,7 +2729,7 @@ fs_visitor::compute_to_mrf()
       unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
 
       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
-         if (regions_overlap(scan_inst->dst, scan_inst->regs_written * REG_SIZE,
+         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                              inst->src[0], inst->regs_read(0) * REG_SIZE)) {
 	    /* Found the last thing to write our reg we want to turn
 	     * into a compute-to-MRF.
@@ -2749,7 +2748,7 @@ fs_visitor::compute_to_mrf()
              * a time.
              */
             if (scan_inst->dst.offset / REG_SIZE < inst->src[0].offset / REG_SIZE ||
-                scan_inst->dst.offset / REG_SIZE + scan_inst->regs_written >
+                scan_inst->dst.offset / REG_SIZE + DIV_ROUND_UP(scan_inst->size_written, REG_SIZE) >
                 inst->src[0].offset / REG_SIZE + inst->regs_read(0))
                break;
 
@@ -2768,7 +2767,8 @@ fs_visitor::compute_to_mrf()
 
             /* Clear the bits for any registers this instruction overwrites. */
             regs_left &= ~mask_relative_to(
-               inst->src[0], scan_inst->dst, scan_inst->regs_written);
+               inst->src[0], scan_inst->dst, DIV_ROUND_UP(scan_inst->size_written,
+                                                          REG_SIZE));
             if (!regs_left)
                break;
 	 }
@@ -2793,8 +2793,8 @@ fs_visitor::compute_to_mrf()
 	 if (interfered)
 	    break;
 
-         if (regions_overlap(scan_inst->dst, scan_inst->regs_written * REG_SIZE,
-                             inst->dst, inst->regs_written * REG_SIZE)) {
+         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+                             inst->dst, inst->size_written)) {
 	    /* If somebody else writes our MRF here, we can't
 	     * compute-to-MRF before that.
 	     */
@@ -2803,7 +2803,7 @@ fs_visitor::compute_to_mrf()
 
          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
              regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
-                             inst->dst, inst->regs_written * REG_SIZE)) {
+                             inst->dst, inst->size_written)) {
 	    /* Found a SEND instruction, which means that there are
 	     * live values in MRFs from base_mrf to base_mrf +
 	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
@@ -2822,11 +2822,12 @@ fs_visitor::compute_to_mrf()
       regs_left = (1 << regs_read(inst, 0)) - 1;
 
       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
-         if (regions_overlap(scan_inst->dst, scan_inst->regs_written * REG_SIZE,
+         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                              inst->src[0], inst->regs_read(0) * REG_SIZE)) {
             /* Clear the bits for any registers this instruction overwrites. */
             regs_left &= ~mask_relative_to(
-               inst->src[0], scan_inst->dst, scan_inst->regs_written);
+               inst->src[0], scan_inst->dst, DIV_ROUND_UP(scan_inst->size_written,
+                                                          REG_SIZE));
 
             const unsigned rel_offset = (reg_offset(scan_inst->dst) -
                                          reg_offset(inst->src[0])) / REG_SIZE;
@@ -2841,7 +2842,7 @@ fs_visitor::compute_to_mrf()
                /* Clear the COMPR4 bit if the generating instruction is not
                 * compressed.
                 */
-               if (scan_inst->regs_written < 2)
+               if (scan_inst->size_written < 2 * REG_SIZE)
                   scan_inst->dst.nr &= ~BRW_MRF_COMPR4;
 
             } else {
@@ -3024,7 +3025,7 @@ fs_visitor::remove_duplicate_mrf_writes()
       /* Clear out any MRF move records whose sources got overwritten. */
       for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
          if (last_mrf_move[i] &&
-             regions_overlap(inst->dst, inst->regs_written * REG_SIZE,
+             regions_overlap(inst->dst, inst->size_written,
                              last_mrf_move[i]->src[0],
                              last_mrf_move[i]->regs_read(0) * REG_SIZE)) {
             last_mrf_move[i] = NULL;
@@ -4603,7 +4604,7 @@ get_fpu_lowered_simd_width(const struct gen_device_info *devinfo,
     * which is the one that is going to limit the overall execution size of
     * the instruction due to this rule.
     */
-   unsigned reg_count = inst->regs_written;
+   unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
 
    for (unsigned i = 0; i < inst->sources; i++)
       reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i));
@@ -4630,13 +4631,14 @@ get_fpu_lowered_simd_width(const struct gen_device_info *devinfo,
     */
    if (devinfo->gen < 8) {
       for (unsigned i = 0; i < inst->sources; i++) {
-         if (inst->regs_written == 2 &&
+         if (DIV_ROUND_UP(inst->size_written, REG_SIZE) == 2 &&
              inst->regs_read(i) != 0 && inst->regs_read(i) != 2 &&
              !is_uniform(inst->src[i]) &&
              !(type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
-               type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1))
-            max_width = MIN2(max_width, inst->exec_size /
-                             inst->regs_written);
+               type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1)) {
+            const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
+            max_width = MIN2(max_width, inst->exec_size / reg_count);
+         }
       }
    }
 
@@ -4681,9 +4683,10 @@ get_fpu_lowered_simd_width(const struct gen_device_info *devinfo,
     * In this situation we calculate the maximum size of the split
     * instructions so they only ever write to a single register.
     */
-   if (devinfo->gen < 8 && inst->regs_written > 1 &&
+   if (devinfo->gen < 8 && inst->size_written > REG_SIZE &&
        !inst->force_writemask_all) {
-      const unsigned channels_per_grf = inst->exec_size / inst->regs_written;
+      const unsigned channels_per_grf = inst->exec_size /
+         DIV_ROUND_UP(inst->size_written, REG_SIZE);
       unsigned exec_type_size = 0;
       for (int i = 0; i < inst->sources; i++) {
          if (inst->src[i].file != BAD_FILE)
@@ -5087,8 +5090,7 @@ needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
     * the results of multiple lowered instructions in order to make sure that
     * they end up arranged correctly in the original destination region.
     */
-   if (inst->regs_written * REG_SIZE >
-       inst->dst.component_size(inst->exec_size))
+   if (inst->size_written > inst->dst.component_size(inst->exec_size))
       return true;
 
    /* If the lowered execution size is larger than the original the result of
@@ -5111,7 +5113,7 @@ needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
        * group which could cause one of the lowered instructions to overwrite
        * the data read from the same source by other lowered instructions.
        */
-      if (regions_overlap(inst->dst, inst->regs_written * REG_SIZE,
+      if (regions_overlap(inst->dst, inst->size_written,
                           inst->src[i], inst->regs_read(i) * REG_SIZE) &&
           !inst->dst.equals(inst->src[i]))
         return true;
@@ -5138,8 +5140,8 @@ emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst)
 
    /* Specified channel group from the destination region. */
    const fs_reg dst = horiz_offset(inst->dst, lbld.group());
-   const unsigned dst_size = inst->regs_written * REG_SIZE /
-            inst->dst.component_size(inst->exec_size);
+   const unsigned dst_size = inst->size_written /
+      inst->dst.component_size(inst->exec_size);
 
    if (needs_dst_copy(lbld, inst)) {
       const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size);
@@ -5191,7 +5193,7 @@ fs_visitor::lower_simd_width()
           * original or the lowered instruction, whichever is lower.
           */
          const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
-         const unsigned dst_size = inst->regs_written * REG_SIZE /
+         const unsigned dst_size = inst->size_written /
             inst->dst.component_size(inst->exec_size);
 
          assert(!inst->writes_accumulator && !inst->mlen);
@@ -5215,9 +5217,8 @@ fs_visitor::lower_simd_width()
                split_inst.src[j] = emit_unzip(lbld, block, inst, j);
 
             split_inst.dst = emit_zip(lbld, block, inst);
-            split_inst.regs_written = DIV_ROUND_UP(
-               split_inst.dst.component_size(lower_width) * dst_size,
-               REG_SIZE);
+            split_inst.size_written =
+               split_inst.dst.component_size(lower_width) * dst_size;
 
             lbld.emit(split_inst);
          }
@@ -5314,7 +5315,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
    switch (inst->dst.file) {
    case VGRF:
       fprintf(file, "vgrf%d", inst->dst.nr);
-      if (alloc.sizes[inst->dst.nr] != inst->regs_written ||
+      if (alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written ||
           inst->dst.offset % REG_SIZE)
          fprintf(file, "+%d.%d",
                  inst->dst.offset / REG_SIZE, inst->dst.offset % REG_SIZE);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index 483672fbd96..bae151ca042 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -569,11 +569,11 @@ namespace brw {
       {
          instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
          inst->header_size = header_size;
-         inst->regs_written = header_size;
+         inst->size_written = header_size * REG_SIZE;
          for (unsigned i = header_size; i < sources; i++) {
-            inst->regs_written +=
-               DIV_ROUND_UP(dispatch_width() * type_sz(src[i].type) *
-                            dst.stride, REG_SIZE);
+            inst->size_written +=
+               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
+                     REG_SIZE);
          }
 
          return inst;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index 10f0a5b2820..0e239d28d44 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -43,7 +43,7 @@ namespace { /* avoid conflict with opt_copy_propagation_elements */
 struct acp_entry : public exec_node {
    fs_reg dst;
    fs_reg src;
-   uint8_t regs_written;
+   uint8_t size_written;
    uint8_t regs_read;
    enum opcode opcode;
    bool saturate;
@@ -368,7 +368,8 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
     * that entry is writing.
     */
    if (!region_contained_in(inst->src[arg], inst->regs_read(arg),
-                            entry->dst, entry->regs_written))
+                            entry->dst, DIV_ROUND_UP(entry->size_written,
+                                                     REG_SIZE)))
       return false;
 
    /* we can't generally copy-propagate UD negations because we
@@ -524,7 +525,8 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
        * that entry is writing.
        */
       if (!region_contained_in(inst->src[i], inst->regs_read(i),
-                               entry->dst, entry->regs_written))
+                               entry->dst, DIV_ROUND_UP(entry->size_written,
+                                                        REG_SIZE)))
          continue;
 
       /* If the type sizes don't match each channel of the instruction is
@@ -770,8 +772,8 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
       /* kill the destination from the ACP */
       if (inst->dst.file == VGRF) {
          foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) {
-            if (regions_overlap(entry->dst, entry->regs_written * REG_SIZE,
-                                inst->dst, inst->regs_written * REG_SIZE))
+            if (regions_overlap(entry->dst, entry->size_written,
+                                inst->dst, inst->size_written))
                entry->remove();
          }
 
@@ -784,7 +786,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
                 * _any_ of the registers that it reads
                 */
                if (regions_overlap(entry->src, entry->regs_read * REG_SIZE,
-                                   inst->dst, inst->regs_written * REG_SIZE))
+                                   inst->dst, inst->size_written))
                   entry->remove();
             }
 	 }
@@ -797,7 +799,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
          acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
          entry->dst = inst->dst;
          entry->src = inst->src[0];
-         entry->regs_written = inst->regs_written;
+         entry->size_written = inst->size_written;
          entry->regs_read = inst->regs_read(0);
          entry->opcode = inst->opcode;
          entry->saturate = inst->saturate;
@@ -808,14 +810,14 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
          for (int i = 0; i < inst->sources; i++) {
             int effective_width = i < inst->header_size ? 8 : inst->exec_size;
             assert(effective_width * type_sz(inst->src[i].type) % REG_SIZE == 0);
-            int regs_written = effective_width *
-               type_sz(inst->src[i].type) / REG_SIZE;
+            const unsigned size_written = effective_width *
+                                          type_sz(inst->src[i].type);
             if (inst->src[i].file == VGRF) {
                acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
                entry->dst = inst->dst;
                entry->dst.offset += offset * REG_SIZE;
                entry->src = inst->src[i];
-               entry->regs_written = regs_written;
+               entry->size_written = size_written;
                entry->regs_read = inst->regs_read(i);
                entry->opcode = inst->opcode;
                if (!entry->dst.equals(inst->src[i])) {
@@ -824,7 +826,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
                   ralloc_free(entry);
                }
             }
-            offset += regs_written;
+            offset += DIV_ROUND_UP(size_written, REG_SIZE);
          }
       }
    }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 4744142a4b6..2acbfea71f0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -185,7 +185,7 @@ instructions_match(fs_inst *a, fs_inst *b, bool *negate)
           a->dst.type == b->dst.type &&
           a->offset == b->offset &&
           a->mlen == b->mlen &&
-          a->regs_written == b->regs_written &&
+          a->size_written == b->size_written &&
           a->base_mrf == b->base_mrf &&
           a->eot == b->eot &&
           a->header_size == b->header_size &&
@@ -296,7 +296,7 @@ fs_visitor::opt_cse_local(bblock_t *block)
 
             /* dest <- temp */
             if (!inst->dst.is_null()) {
-               assert(inst->regs_written == entry->generator->regs_written);
+               assert(inst->size_written == entry->generator->size_written);
                assert(inst->dst.type == entry->tmp.type);
                const fs_builder ibld(this, block, inst);
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 12ab7b3fe66..8a581c9f02c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -357,13 +357,14 @@ void
 fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
                                struct brw_reg payload)
 {
+   assert(inst->size_written % REG_SIZE == 0);
    brw_wm_prog_data *prog_data =
       reinterpret_cast<brw_wm_prog_data *>(this->prog_data);
    const unsigned surf_index =
       prog_data->binding_table.render_target_start + inst->target;
 
    gen9_fb_READ(p, dst, payload, surf_index,
-                inst->header_size, inst->regs_written,
+                inst->header_size, inst->size_written / REG_SIZE,
                 prog_data->persample_dispatch);
 
    brw_mark_surface_used(&prog_data->base, surf_index);
@@ -452,6 +453,7 @@ fs_generator::generate_urb_read(fs_inst *inst,
                                 struct brw_reg dst,
                                 struct brw_reg header)
 {
+   assert(inst->size_written % REG_SIZE == 0);
    assert(header.file == BRW_GENERAL_REGISTER_FILE);
    assert(header.type == BRW_REGISTER_TYPE_UD);
 
@@ -467,7 +469,7 @@ fs_generator::generate_urb_read(fs_inst *inst,
       brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);
 
    brw_inst_set_mlen(p->devinfo, send, inst->mlen);
-   brw_inst_set_rlen(p->devinfo, send, inst->regs_written);
+   brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE);
    brw_inst_set_header_present(p->devinfo, send, true);
    brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
 }
@@ -625,6 +627,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
                            struct brw_reg surface_index,
                            struct brw_reg sampler_index)
 {
+   assert(inst->size_written % REG_SIZE == 0);
    int msg_type = -1;
    uint32_t simd_mode;
    uint32_t return_format;
@@ -895,7 +898,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
                  surface + base_binding_table_index,
                  sampler % 16,
                  msg_type,
-                 inst->regs_written,
+                 inst->size_written / REG_SIZE,
                  inst->mlen,
                  inst->header_size != 0,
                  simd_mode,
@@ -932,7 +935,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
                               0 /* surface */,
                               0 /* sampler */,
                               msg_type,
-                              inst->regs_written,
+                              inst->size_written / REG_SIZE,
                               inst->mlen /* mlen */,
                               inst->header_size != 0 /* header */,
                               simd_mode,
@@ -1263,7 +1266,7 @@ fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
        */
       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
       assert(inst->mlen == 3);
-      assert(inst->regs_written == 8);
+      assert(inst->size_written == 8 * REG_SIZE);
       rlen = 8;
       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
    }
@@ -1399,6 +1402,7 @@ fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
                                                 struct brw_reg msg_data,
                                                 unsigned msg_type)
 {
+   assert(inst->size_written % REG_SIZE == 0);
    assert(msg_data.type == BRW_REGISTER_TYPE_UD);
 
    brw_pixel_interpolator_query(p,
@@ -1408,7 +1412,7 @@ fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
          msg_type,
          msg_data,
          inst->mlen,
-         inst->regs_written);
+         inst->size_written / REG_SIZE);
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index cd4005c0e60..42ed131854e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1661,7 +1661,7 @@ emit_pixel_interpolater_send(const fs_builder &bld,
    inst = bld.emit(opcode, dst, payload, desc);
    inst->mlen = mlen;
    /* 2 floats per slot returned */
-   inst->regs_written = 2 * bld.dispatch_width() / 8;
+   inst->size_written = 2 * bld.dispatch_width() / 8 * REG_SIZE;
    inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
 
    wm_prog_data->pulls_bary = true;
@@ -2144,7 +2144,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
             unsigned read_components = num_components + first_component;
             fs_reg tmp = bld.vgrf(dst.type, read_components);
             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
-            inst->regs_written = read_components * type_sz(tmp_dst.type) / 4;
+            inst->size_written = read_components * type_sz(tmp_dst.type) / 4 * REG_SIZE;
             for (unsigned i = 0; i < num_components; i++) {
                bld.MOV(offset(tmp_dst, bld, i),
                        offset(tmp, bld, i + first_component));
@@ -2152,7 +2152,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
          } else {
             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst,
                             icp_handle);
-            inst->regs_written = num_components * type_sz(tmp_dst.type) / 4;
+            inst->size_written = num_components * type_sz(tmp_dst.type) / 4 * REG_SIZE;
          }
          inst->offset = base_offset + offset_const->u32[0];
          inst->mlen = 1;
@@ -2166,7 +2166,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
          if (first_component != 0) {
             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
                             payload);
-            inst->regs_written = read_components * type_sz(tmp_dst.type) / 4;
+            inst->size_written = read_components * type_sz(tmp_dst.type) / 4 * REG_SIZE;
             for (unsigned i = 0; i < num_components; i++) {
                bld.MOV(offset(tmp_dst, bld, i),
                        offset(tmp, bld, i + first_component));
@@ -2174,7 +2174,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
          } else {
             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst,
                          payload);
-            inst->regs_written = num_components * type_sz(tmp_dst.type) / 4;
+            inst->size_written = num_components * type_sz(tmp_dst.type) / 4 * REG_SIZE;
          }
          inst->offset = base_offset;
          inst->mlen = 2;
@@ -2204,7 +2204,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
       /* Read the whole VUE header (because of alignment) and read .w. */
       fs_reg tmp = bld.vgrf(dst.type, 4);
       inst->dst = tmp;
-      inst->regs_written = 4;
+      inst->size_written = 4 * REG_SIZE;
       bld.MOV(dst, offset(tmp, bld, 3));
    }
 }
@@ -2510,8 +2510,8 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
             inst->offset = imm_offset;
             inst->mlen = 2;
          }
-         inst->regs_written =
-            ((num_components + first_component) * type_sz(dst.type) / 4);
+         inst->size_written =
+            ((num_components + first_component) * type_sz(dst.type) / 4) * REG_SIZE;
 
          /* If we are reading 64-bit data using 32-bit read messages we need
           * build proper 64-bit data elements by shuffling the low and high
@@ -2535,7 +2535,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
          if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
             assert(type_sz(dst.type) < 8);
             inst->dst = bld.vgrf(dst.type, 4);
-            inst->regs_written = 4;
+            inst->size_written = 4 * REG_SIZE;
             bld.MOV(dst, offset(inst->dst, bld, 3));
          }
 
@@ -2576,7 +2576,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle);
                inst->offset = 0;
                inst->mlen = 1;
-               inst->regs_written = 4;
+               inst->size_written = 4 * REG_SIZE;
 
                /* dst.xy = tmp.wz */
                bld.MOV(dst,                 offset(tmp, bld, 3));
@@ -2584,11 +2584,11 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
                break;
             }
             case GL_TRIANGLES:
-               /* DWord 4; hardcode offset = 1 and regs_written = 1 */
+               /* DWord 4; hardcode offset = 1 and size_written = REG_SIZE */
                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, patch_handle);
                inst->offset = 1;
                inst->mlen = 1;
-               inst->regs_written = 1;
+               inst->size_written = REG_SIZE;
                break;
             case GL_ISOLINES:
                /* All channels are undefined. */
@@ -2606,7 +2606,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle);
             inst->offset = 1;
             inst->mlen = 1;
-            inst->regs_written = 4;
+            inst->size_written = 4 * REG_SIZE;
 
             /* Reswizzle: WZYX */
             fs_reg srcs[4] = {
@@ -2641,7 +2641,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
                fs_reg tmp = bld.vgrf(dst.type, read_components);
                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
                                patch_handle);
-               inst->regs_written = read_components;
+               inst->size_written = read_components * REG_SIZE;
                for (unsigned i = 0; i < instr->num_components; i++) {
                   bld.MOV(offset(dst, bld, i),
                           offset(tmp, bld, i + first_component));
@@ -2649,7 +2649,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
             } else {
                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
                                patch_handle);
-               inst->regs_written = instr->num_components;
+               inst->size_written = instr->num_components * REG_SIZE;
             }
             inst->offset = imm_offset;
             inst->mlen = 1;
@@ -2668,7 +2668,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
             fs_reg tmp = bld.vgrf(dst.type, read_components);
             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
                             payload);
-            inst->regs_written = read_components;
+            inst->size_written = read_components * REG_SIZE;
             for (unsigned i = 0; i < instr->num_components; i++) {
                bld.MOV(offset(dst, bld, i),
                        offset(tmp, bld, i + first_component));
@@ -2676,7 +2676,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
          } else {
             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
                             payload);
-            inst->regs_written = instr->num_components;
+            inst->size_written = instr->num_components * REG_SIZE;
          }
          inst->offset = imm_offset;
          inst->mlen = 2;
@@ -2976,7 +2976,7 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
                fs_reg tmp = bld.vgrf(dest.type, read_components);
                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
                                patch_handle);
-               inst->regs_written = read_components;
+               inst->size_written = read_components * REG_SIZE;
                for (unsigned i = 0; i < instr->num_components; i++) {
                   bld.MOV(offset(dest, bld, i),
                           offset(tmp, bld, i + first_component));
@@ -2984,7 +2984,7 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
             } else {
                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
                                patch_handle);
-               inst->regs_written = instr->num_components;
+               inst->size_written = instr->num_components * REG_SIZE;
             }
             inst->mlen = 1;
             inst->offset = imm_offset;
@@ -3032,8 +3032,9 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
             }
             inst->mlen = 2;
             inst->offset = imm_offset;
-            inst->regs_written =
-               ((num_components + first_component) * type_sz(dest.type) / 4);
+            inst->size_written =
+               ((num_components + first_component) * type_sz(dest.type) / 4) *
+               REG_SIZE;
 
             /* If we are reading 64-bit data using 32-bit read messages we need
              * build proper 64-bit data elements by shuffling the low and high
@@ -3207,8 +3208,7 @@ fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
    STATIC_ASSERT(ARRAY_SIZE(srcs) == TEX_LOGICAL_NUM_SRCS);
 
    fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
-   inst->regs_written = 4 * inst->dst.component_size(inst->exec_size) /
-                        REG_SIZE;
+   inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
 
    return inst;
 }
@@ -3223,8 +3223,7 @@ emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
    assert(bld.shader->devinfo->gen >= 9);
    fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
    inst->target = target;
-   inst->regs_written = 4 * inst->dst.component_size(inst->exec_size) /
-                        REG_SIZE;
+   inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
 
    return inst;
 }
@@ -3903,7 +3902,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       const fs_builder ubld = bld.group(8, 0);
       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
       ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
-         ->regs_written = 2;
+         ->size_written = 2 * REG_SIZE;
       break;
    }
 
@@ -4338,7 +4337,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
                                 src_payload, brw_imm_ud(index));
       inst->header_size = 0;
       inst->mlen = 1;
-      inst->regs_written = 4;
+      inst->size_written = 4 * REG_SIZE;
 
       bld.MOV(retype(dest, ret_payload.type), component(ret_payload, 0));
       brw_mark_surface_used(prog_data, index);
@@ -4685,9 +4684,9 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
                             nir_ssa_def_components_read(&instr->dest.ssa):
                             (1 << dest_size) - 1;
       assert(write_mask != 0); /* dead code should have been eliminated */
-      inst->regs_written = util_last_bit(write_mask) * dispatch_width / 8;
+      inst->size_written = util_last_bit(write_mask) * dispatch_width / 8 * REG_SIZE;
    } else {
-      inst->regs_written = 4 * dispatch_width / 8;
+      inst->size_written = 4 * dispatch_width / 8 * REG_SIZE;
    }
 
    if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 572735a379a..5c6f3d490f0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -826,7 +826,8 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)
       }
 
       if (inst->dst.file == VGRF)
-         spill_costs[inst->dst.nr] += inst->regs_written * loop_scale;
+         spill_costs[inst->dst.nr] += DIV_ROUND_UP(inst->size_written, REG_SIZE)
+                                      * loop_scale;
 
       switch (inst->opcode) {
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
index 3dd0fbfc1c1..310e8019fcb 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
@@ -204,7 +204,7 @@ fs_visitor::register_coalesce()
             continue;
          }
          dst_reg_offset[offset] = inst->dst.offset / REG_SIZE;
-         if (inst->regs_written > 1)
+         if (inst->size_written > REG_SIZE)
             dst_reg_offset[offset + 1] = inst->dst.offset / REG_SIZE + 1;
          mov[offset] = inst;
          channels_remaining -= regs_written(inst);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
index 5fa7c42d2ee..37e893bb89e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
@@ -50,7 +50,7 @@ namespace brw {
             const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
             fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
 
-            inst->regs_written = rsize * bld.dispatch_width() / 8;
+            inst->size_written = rsize * bld.dispatch_width() / 8 * REG_SIZE;
             inst->predicate = pred;
             return dst;
          }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index d0f504c1a2e..5aea62c4cbc 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -97,7 +97,7 @@ fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
    /* We only care about one or two regs of response, but the sampler always
     * writes 4/8.
     */
-   inst->regs_written = 4 * dispatch_width / 8;
+   inst->size_written = 4 * dispatch_width / 8 * REG_SIZE;
 
    return dest;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h
index de08a691055..cea81e4646a 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
@@ -421,7 +421,8 @@ inline unsigned
 regs_written(const fs_inst *inst)
 {
    /* XXX - Take into account register-misaligned offsets correctly. */
-   return inst->regs_written;
+   assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
+   return DIV_ROUND_UP(inst->size_written, REG_SIZE);
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index c12bf09e835..5e1e61683a2 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -1494,11 +1494,11 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
                 * single-result send is probably actually reducing register
                 * pressure.
                 */
-               if (inst->regs_written <= inst->exec_size / 8 &&
-                   chosen_inst->regs_written > chosen_inst->exec_size / 8) {
+               if (inst->size_written <= inst->exec_size / 8 * REG_SIZE &&
+                   chosen_inst->size_written > chosen_inst->exec_size / 8 * REG_SIZE) {
                   chosen = n;
                   continue;
-               } else if (inst->regs_written > chosen_inst->regs_written) {
+               } else if (inst->size_written > chosen_inst->size_written) {
                   continue;
                }
             }
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 66264b4ea7e..2173f3226e1 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -138,6 +138,7 @@ struct backend_instruction {
    int8_t base_mrf; /**< First MRF in the SEND message, if mlen is nonzero. */
    uint8_t target; /**< MRT target. */
    uint8_t regs_written; /**< Number of registers written by the instruction. */
+   unsigned size_written; /**< Data written to the destination register in bytes. */
 
    enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */
    enum brw_conditional_mod conditional_mod; /**< BRW_CONDITIONAL_* */
diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
index 8ba7bc59481..f71c6ee1e42 100644
--- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
@@ -281,7 +281,7 @@ TEST_F(cmod_propagation_test, intervening_dest_write)
    fs_reg zero(brw_imm_f(0.0f));
    bld.ADD(offset(dest, bld, 2), src0, src1);
    bld.emit(SHADER_OPCODE_TEX, dest, src2)
-      ->regs_written = 4;
+      ->size_written = 4 * REG_SIZE;
    bld.CMP(bld.null_reg_f(), offset(dest, bld, 2), zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
diff --git a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
index fd623a5e00f..680fe72dfd5 100644
--- a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
@@ -525,7 +525,7 @@ TEST_F(saturate_propagation_test, intervening_dest_write)
    fs_reg src2 = v->vgrf(glsl_type::vec2_type);
    bld.ADD(offset(dst0, bld, 2), src0, src1);
    bld.emit(SHADER_OPCODE_TEX, dst0, src2)
-      ->regs_written = 4;
+      ->size_written = 4 * REG_SIZE;
    set_saturate(true, bld.MOV(dst1, offset(dst0, bld, 2)));
 
    /* = Before =