From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Mon, 18 Aug 2014 21:27:55 +0000 (-0700)
Subject: i965/fs_reg: Allocate double the number of vgrfs in SIMD16 mode
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=7210583eb;p=mesa.git

i965/fs_reg: Allocate double the number of vgrfs in SIMD16 mode

This is actually the squash of a bunch of different changes.  Individual
commit titles follow:

i965/fs: Always 2-align registers SIMD16 for gen <= 5

i965/fs: Use the register width when applying offsets

   This reworks both byte_offset() and offset() to be more intelligent.
   The byte_offset() function now supports offsets bigger than 32. The
   offset() function uses the byte_offset() function together with the
   register width and the type size to offset the register by the correct
   amount.

i965/fs: Change regs_read to be in hardware registers

i965/fs: Change regs_written to be actual hardware registers

i965/fs: Properly handle register widths in LOAD_PAYLOAD

   The LOAD_PAYLOAD instruction is a bit special because it collects a
   bunch of registers (with possibly different widths) into a single
   payload block.  Once the payload is constructed, it's treated as a
   single block of data and most of the information such as register widths
   doesn't matter anymore.  In particular, the offset of any particular
   source register is the accumulation of the sizes of the previous source
   registers.

i965/fs: Properly set writemasks in LOAD_PAYLOAD

i965/fs: Handle register widths in demote_pull_constants

i965/fs: Get rid of implicit register doubling in the allocator

i965/fs: Reserve enough registers for PLN instructions

i965/fs: Make sources and destinations interfere in 16-wide

i965/fs: Properly handle register widths in CSE

i965/fs: Properly handle register widths in register_coalesce

i965/fs: Properly handle widths in copy propagation

i965/fs: Properly handle register widths in VARYING_PULL_CONSTANT_LOAD

i965/fs: Properly handle register widths and odd register sizes in spilling

i965/fs: Don't waste a register on texture lookups for gen >= 7

   Previously, we were waisting a register in SIMD16 mode because we could
   only allocate registers in pairs.  Now that we can allocate and address
   odd-sized registers, let's get rid of this special-case.

Signed-off-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 5d7e8670532..9a9dbda4619 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -65,7 +65,21 @@ fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
    this->conditional_mod = BRW_CONDITIONAL_NONE;
 
    /* This will be the case for almost all instructions. */
-   this->regs_written = 1;
+   switch (dst.file) {
+   case GRF:
+   case HW_REG:
+   case MRF:
+      this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
+      break;
+   case BAD_FILE:
+      this->regs_written = 0;
+      break;
+   case IMM:
+   case UNIFORM:
+      unreachable("Invalid destination register file");
+   default:
+      unreachable("Invalid register file");
+   }
 
    this->writes_accumulator = false;
 }
@@ -252,7 +266,16 @@ fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
 {
    fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
                                         sources);
-   inst->regs_written = sources;
+   inst->regs_written = 0;
+   for (int i = 0; i < sources; ++i) {
+      /* The LOAD_PAYLOAD instruction only really makes sense if we are
+       * dealing with whole registers.  If this ever changes, we can deal
+       * with it later.
+       */
+      int size = src[i].effective_width(this) * type_sz(src[i].type);
+      assert(size % 32 == 0);
+      inst->regs_written += (size + 31) / 32;
+   }
 
    return inst;
 }
@@ -282,7 +305,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
                               varying_offset, fs_reg(const_offset & ~3)));
 
    int scale = 1;
-   if (brw->gen == 4 && dispatch_width == 8) {
+   if (brw->gen == 4 && dst.width == 8) {
       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
        * u, v, r) as parameters, or we can just use the SIMD16 message
        * consisting of (header, u).  We choose the second, at the cost of a
@@ -296,9 +319,13 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
    else
       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
-   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
+
+   assert(dst.width % 8 == 0);
+   int regs_written = 4 * (dst.width / 8) * scale;
+   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
+                               dst.type, dst.width);
    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
-   inst->regs_written = 4 * scale;
+   inst->regs_written = regs_written;
    instructions.push_tail(inst);
 
    if (brw->gen < 7) {
@@ -802,12 +829,27 @@ int
 fs_inst::regs_read(fs_visitor *v, int arg) const
 {
    if (is_tex() && arg == 0 && src[0].file == GRF) {
-      if (v->dispatch_width == 16)
-	 return (mlen + 1) / 2;
-      else
-	 return mlen;
+      return mlen;
+   }
+
+   switch (src[arg].file) {
+   case BAD_FILE:
+   case UNIFORM:
+   case IMM:
+      return 1;
+   case GRF:
+   case HW_REG:
+      if (src[arg].stride == 0) {
+         return 1;
+      } else {
+         int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
+         return (size + 31) / 32;
+      }
+   case MRF:
+      unreachable("MRF registers are not allowed as sources");
+   default:
+      unreachable("Invalid register file");
    }
-   return 1;
 }
 
 bool
@@ -948,9 +990,10 @@ fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
 {
    init();
+   int reg_width = v->dispatch_width / 8;
 
    this->file = GRF;
-   this->reg = v->virtual_grf_alloc(v->type_size(type));
+   this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
    this->reg_offset = 0;
    this->type = brw_type_for_base_type(type);
    this->width = v->dispatch_width;
@@ -2096,6 +2139,7 @@ fs_visitor::demote_pull_constants()
          inst->src[i].file = GRF;
          inst->src[i].reg = dst.reg;
          inst->src[i].reg_offset = 0;
+         inst->src[i].width = dispatch_width;
       }
    }
    invalidate_live_intervals();
@@ -2241,12 +2285,12 @@ fs_visitor::opt_register_renaming()
 
       if (depth == 0 &&
           inst->dst.file == GRF &&
-          virtual_grf_sizes[inst->dst.reg] == 1 &&
+          virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
           !inst->is_partial_write()) {
          if (remap[dst] == -1) {
             remap[dst] = dst;
          } else {
-            remap[dst] = virtual_grf_alloc(1);
+            remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
             inst->dst.reg = remap[dst];
             progress = true;
          }
@@ -2338,7 +2382,7 @@ fs_visitor::compute_to_mrf()
             /* Things returning more than one register would need us to
              * understand coalescing out more than one MOV at a time.
              */
-            if (scan_inst->regs_written > 1)
+            if (scan_inst->regs_written > scan_inst->dst.width / 8)
                break;
 
 	    /* SEND instructions can't have MRF as a destination. */
@@ -2599,8 +2643,7 @@ void
 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
                                                         fs_inst *inst)
 {
-   int reg_size = dispatch_width / 8;
-   int write_len = inst->regs_written * reg_size;
+   int write_len = inst->regs_written;
    int first_write_grf = inst->dst.reg;
    bool needs_dep[BRW_MAX_MRF];
    assert(write_len < (int)sizeof(needs_dep) - 1);
@@ -2639,7 +2682,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
        */
       if (scan_inst->dst.file == GRF) {
          for (int i = 0; i < scan_inst->regs_written; i++) {
-            int reg = scan_inst->dst.reg + i * reg_size;
+            int reg = scan_inst->dst.reg + i;
 
             if (reg >= first_write_grf &&
                 reg < first_write_grf + write_len &&
@@ -2677,7 +2720,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
 void
 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
 {
-   int write_len = inst->regs_written * dispatch_width / 8;
+   int write_len = inst->regs_written;
    int first_write_grf = inst->dst.reg;
    bool needs_dep[BRW_MAX_MRF];
    assert(write_len < (int)sizeof(needs_dep) - 1);
@@ -2829,19 +2872,77 @@ fs_visitor::lower_load_payload()
 {
    bool progress = false;
 
+   int vgrf_to_reg[virtual_grf_count];
+   int reg_count = 16; /* Leave room for MRF */
+   for (int i = 0; i < virtual_grf_count; ++i) {
+      vgrf_to_reg[i] = reg_count;
+      reg_count += virtual_grf_sizes[i];
+   }
+
+   struct {
+      bool written:1; /* Whether this register has ever been written */
+      bool force_writemask_all:1;
+      bool force_sechalf:1;
+   } metadata[reg_count];
+   memset(metadata, 0, sizeof(metadata));
+
    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      int dst_reg;
+      if (inst->dst.file == MRF) {
+         dst_reg = inst->dst.reg;
+      } else if (inst->dst.file == GRF) {
+         dst_reg = vgrf_to_reg[inst->dst.reg];
+      }
+
+      if (inst->dst.file == MRF || inst->dst.file == GRF) {
+         bool force_sechalf = inst->force_sechalf;
+         bool toggle_sechalf = inst->dst.width == 16 &&
+                               type_sz(inst->dst.type) == 4;
+         for (int i = 0; i < inst->regs_written; ++i) {
+            metadata[dst_reg + i].written = true;
+            metadata[dst_reg + i].force_sechalf = force_sechalf;
+            metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
+            force_sechalf = (toggle_sechalf != force_sechalf);
+         }
+      }
+
       if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
+         assert(inst->dst.file == MRF || inst->dst.file == GRF);
          fs_reg dst = inst->dst;
 
-         /* src[0] represents the (optional) message header. */
-         if (inst->src[0].file != BAD_FILE) {
-            inst->insert_before(block, MOV(dst, inst->src[0]));
-         }
-         dst.reg_offset++;
+         for (int i = 0; i < inst->sources; i++) {
+            dst.width = inst->src[i].effective_width(this);
+            dst.type = inst->src[i].type;
+
+            if (inst->src[i].file == BAD_FILE) {
+               /* Do nothing but otherwise increment as normal */
+            } else {
+               fs_inst *mov = MOV(dst, inst->src[i]);
+               if (inst->src[i].file == GRF) {
+                  int src_reg = vgrf_to_reg[inst->src[i].reg] +
+                                inst->src[i].reg_offset;
+                  mov->force_sechalf = metadata[src_reg].force_sechalf;
+                  mov->force_writemask_all = metadata[src_reg].force_writemask_all;
+                  metadata[dst_reg] = metadata[src_reg];
+                  if (dst.width * type_sz(dst.type) > 32) {
+                     assert((!metadata[src_reg].written ||
+                             !metadata[src_reg].force_sechalf) &&
+                            (!metadata[src_reg + 1].written ||
+                             metadata[src_reg + 1].force_sechalf));
+                     metadata[dst_reg + 1] = metadata[src_reg + 1];
+                  }
+               } else {
+                  metadata[dst_reg].force_writemask_all = false;
+                  metadata[dst_reg].force_sechalf = false;
+                  if (dst.width == 16) {
+                     metadata[dst_reg + 1].force_writemask_all = false;
+                     metadata[dst_reg + 1].force_sechalf = true;
+                  }
+               }
+               inst->insert_before(block, mov);
+            }
 
-         for (int i = 1; i < inst->sources; i++) {
-            inst->insert_before(block, MOV(dst, inst->src[i]));
-            dst.reg_offset++;
+            dst = offset(dst, 1);
          }
 
          inst->remove(block);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 0d3931e479f..05fb71d724c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -124,18 +124,40 @@ retype(fs_reg reg, enum brw_reg_type type)
 }
 
 static inline fs_reg
-offset(fs_reg reg, unsigned delta)
+byte_offset(fs_reg reg, unsigned delta)
 {
-   assert(delta == 0 || (reg.file != HW_REG && reg.file != IMM));
-   reg.reg_offset += delta;
+   switch (reg.file) {
+   case BAD_FILE:
+      break;
+   case GRF:
+      reg.reg_offset += delta / 32;
+      break;
+   case MRF:
+      reg.reg += delta / 32;
+      break;
+   default:
+      assert(delta == 0);
+   }
+   reg.subreg_offset += delta % 32;
    return reg;
 }
 
 static inline fs_reg
-byte_offset(fs_reg reg, unsigned delta)
+offset(fs_reg reg, unsigned delta)
 {
-   assert(delta == 0 || (reg.file != HW_REG && reg.file != IMM));
-   reg.subreg_offset += delta;
+   assert(reg.stride > 0);
+   switch (reg.file) {
+   case BAD_FILE:
+      break;
+   case GRF:
+   case MRF:
+      return byte_offset(reg, delta * reg.width * reg.stride * type_sz(reg.type));
+   case UNIFORM:
+      reg.reg_offset += delta;
+      break;
+   default:
+      assert(delta == 0);
+   }
    return reg;
 }
 
@@ -426,6 +448,8 @@ public:
    void emit_if_gen6(ir_if *ir);
    void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg,
                      uint32_t spill_offset, int count);
+   void emit_spill(bblock_t *block, fs_inst *inst, fs_reg reg,
+                   uint32_t spill_offset, int count);
 
    void emit_fragment_program_code();
    void setup_fp_regs();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index bd502c4ca80..b4f4431a10c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -42,6 +42,7 @@ namespace { /* avoid conflict with opt_copy_propagation_elements */
 struct acp_entry : public exec_node {
    fs_reg dst;
    fs_reg src;
+   uint8_t regs_written;
    enum opcode opcode;
    bool saturate;
 };
@@ -295,11 +296,10 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
    /* Bail if inst is reading a range that isn't contained in the range
     * that entry is writing.
     */
-   int reg_size = dispatch_width * sizeof(float);
    if (inst->src[arg].reg_offset < entry->dst.reg_offset ||
-       (inst->src[arg].reg_offset * reg_size + inst->src[arg].subreg_offset +
-        inst->regs_read(this, arg) * inst->src[arg].stride * reg_size) >
-       (entry->dst.reg_offset + 1) * reg_size)
+       (inst->src[arg].reg_offset * 32 + inst->src[arg].subreg_offset +
+        inst->regs_read(this, arg) * inst->src[arg].stride * 32) >
+       (entry->dst.reg_offset + entry->regs_written) * 32)
       return false;
 
    /* See resolve_ud_negate() and comment in brw_fs_emit.cpp. */
@@ -371,16 +371,25 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
    inst->saturate = inst->saturate || entry->saturate;
 
    switch (entry->src.file) {
+   case UNIFORM:
+      assert(entry->src.width == 1);
    case BAD_FILE:
    case HW_REG:
-   case UNIFORM:
+      inst->src[arg].width = entry->src.width;
       inst->src[arg].reg_offset = entry->src.reg_offset;
       inst->src[arg].subreg_offset = entry->src.subreg_offset;
       break;
    case GRF:
       {
-         /* In this case, we have to deal with mapping parts of vgrfs to
-          * other parts of vgrfs so we have to do some reg_offset magic.
+         assert(entry->src.width % inst->src[arg].width == 0);
+         /* In this case, we'll just leave the width alone.  The source
+          * register could have different widths depending on how it is
+          * being used.  For instance, if only half of the register was
+          * used then we want to preserve that and continue to only use
+          * half.
+          *
+          * Also, we have to deal with mapping parts of vgrfs to other
+          * parts of vgrfs so we have to do some reg_offset magic.
           */
 
          /* Compute the offset of inst->src[arg] relative to inst->dst */
@@ -389,10 +398,10 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
          int rel_suboffset = inst->src[arg].subreg_offset;
 
          /* Compute the final register offset (in bytes) */
-         int offset = entry->src.reg_offset * reg_size + entry->src.subreg_offset;
-         offset += rel_offset * reg_size + rel_suboffset;
-         inst->src[arg].reg_offset = offset / reg_size;
-         inst->src[arg].subreg_offset = offset % reg_size;
+         int offset = entry->src.reg_offset * 32 + entry->src.subreg_offset;
+         offset += rel_offset * 32 + rel_suboffset;
+         inst->src[arg].reg_offset = offset / 32;
+         inst->src[arg].subreg_offset = offset % 32;
       }
       break;
    default:
@@ -429,11 +438,10 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
       /* Bail if inst is reading a range that isn't contained in the range
        * that entry is writing.
        */
-      int reg_size = dispatch_width * sizeof(float);
       if (inst->src[i].reg_offset < entry->dst.reg_offset ||
-          (inst->src[i].reg_offset * reg_size + inst->src[i].subreg_offset +
-           inst->regs_read(this, i) * inst->src[i].stride * reg_size) >
-          (entry->dst.reg_offset + 1) * reg_size)
+          (inst->src[i].reg_offset * 32 + inst->src[i].subreg_offset +
+           inst->regs_read(this, i) * inst->src[i].stride * 32) >
+          (entry->dst.reg_offset + entry->regs_written) * 32)
          continue;
 
       /* Don't bother with cases that should have been taken care of by the
@@ -623,17 +631,23 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
 	 acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
 	 entry->dst = inst->dst;
 	 entry->src = inst->src[0];
+         entry->regs_written = inst->regs_written;
          entry->opcode = inst->opcode;
          entry->saturate = inst->saturate;
 	 acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry);
       } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
                  inst->dst.file == GRF) {
+         int offset = 0;
          for (int i = 0; i < inst->sources; i++) {
+            int regs_written = ((inst->src[i].effective_width(this) *
+                                 type_sz(inst->src[i].type)) + 31) / 32;
             if (inst->src[i].file == GRF) {
                acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
                entry->dst = inst->dst;
-               entry->dst.reg_offset = i;
+               entry->dst.reg_offset = offset;
+               entry->dst.width = inst->src[i].effective_width(this);
                entry->src = inst->src[i];
+               entry->regs_written = regs_written;
                entry->opcode = inst->opcode;
                if (!entry->dst.equals(inst->src[i])) {
                   acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry);
@@ -641,6 +655,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
                   ralloc_free(entry);
                }
             }
+            offset += regs_written;
          }
       }
    }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 7edbe1915c2..817fc1f1a1f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -202,19 +202,21 @@ fs_visitor::opt_cse_local(bblock_t *block)
             bool no_existing_temp = entry->tmp.file == BAD_FILE;
             if (no_existing_temp && !entry->generator->dst.is_null()) {
                int written = entry->generator->regs_written;
+               int dst_width = entry->generator->dst.width / 8;
+               assert(written % dst_width == 0);
 
                fs_reg orig_dst = entry->generator->dst;
                fs_reg tmp = fs_reg(GRF, virtual_grf_alloc(written),
-                                   orig_dst.type);
+                                   orig_dst.type, orig_dst.width);
                entry->tmp = tmp;
                entry->generator->dst = tmp;
 
                fs_inst *copy;
-               if (written > 1) {
-                  fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written);
-                  for (int i = 0; i < written; i++)
+               if (written > dst_width) {
+                  fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written / dst_width);
+                  for (int i = 0; i < written / dst_width; i++)
                      sources[i] = offset(tmp, i);
-                  copy = LOAD_PAYLOAD(orig_dst, sources, written);
+                  copy = LOAD_PAYLOAD(orig_dst, sources, written / dst_width);
                } else {
                   copy = MOV(orig_dst, tmp);
                   copy->force_writemask_all =
@@ -226,16 +228,18 @@ fs_visitor::opt_cse_local(bblock_t *block)
             /* dest <- temp */
             if (!inst->dst.is_null()) {
                int written = inst->regs_written;
+               int dst_width = inst->dst.width / 8;
                assert(written == entry->generator->regs_written);
+               assert(dst_width == entry->generator->dst.width / 8);
                assert(inst->dst.type == entry->tmp.type);
                fs_reg dst = inst->dst;
                fs_reg tmp = entry->tmp;
                fs_inst *copy;
-               if (written > 1) {
-                  fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written);
-                  for (int i = 0; i < written; i++)
+               if (written > dst_width) {
+                  fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written / dst_width);
+                  for (int i = 0; i < written / dst_width; i++)
                      sources[i] = offset(tmp, i);
-                  copy = LOAD_PAYLOAD(dst, sources, written);
+                  copy = LOAD_PAYLOAD(dst, sources, written / dst_width);
                } else {
                   copy = MOV(dst, tmp);
                   copy->force_writemask_all = inst->force_writemask_all;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 122a43f9316..5bfc559889f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -542,15 +542,8 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
       dst = vec16(dst);
    }
 
-   if (brw->gen >= 7 && inst->header_present && dispatch_width == 16) {
-      /* The send-from-GRF for SIMD16 texturing with a header has an extra
-       * hardware register allocated to it, which we need to skip over (since
-       * our coordinates in the payload are in the even-numbered registers,
-       * and the header comes right before the first one).
-       */
-      assert(src.file == BRW_GENERAL_REGISTER_FILE);
-      src.nr++;
-   }
+   assert(brw->gen < 7 || !inst->header_present ||
+          src.file == BRW_GENERAL_REGISTER_FILE);
 
    assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index a627b64d328..095b45c6c9a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -31,11 +31,11 @@
 #include "glsl/ir_optimization.h"
 
 static void
-assign_reg(int *reg_hw_locations, fs_reg *reg, int reg_width)
+assign_reg(int *reg_hw_locations, fs_reg *reg)
 {
    if (reg->file == GRF) {
       assert(reg->reg_offset >= 0);
-      reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset * reg_width;
+      reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset;
       reg->reg_offset = 0;
    }
 }
@@ -51,14 +51,14 @@ fs_visitor::assign_regs_trivial()
    hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
    for (i = 1; i <= this->virtual_grf_count; i++) {
       hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
-			   this->virtual_grf_sizes[i - 1] * reg_width);
+			   this->virtual_grf_sizes[i - 1]);
    }
    this->grf_used = hw_reg_mapping[this->virtual_grf_count];
 
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      assign_reg(hw_reg_mapping, &inst->dst, reg_width);
+      assign_reg(hw_reg_mapping, &inst->dst);
       for (i = 0; i < inst->sources; i++) {
-         assign_reg(hw_reg_mapping, &inst->src[i], reg_width);
+         assign_reg(hw_reg_mapping, &inst->src[i]);
       }
    }
 
@@ -75,7 +75,7 @@ static void
 brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
 {
    const struct brw_device_info *devinfo = screen->devinfo;
-   int base_reg_count = BRW_MAX_GRF / reg_width;
+   int base_reg_count = BRW_MAX_GRF;
    int index = reg_width - 1;
 
    /* The registers used to make up almost all values handled in the compiler
@@ -105,8 +105,7 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
    int class_sizes[BRW_MAX_MRF];
 
    if (devinfo->gen >= 7) {
-      for (class_count = 0; class_count < MAX_SAMPLER_MESSAGE_SIZE;
-           class_count++)
+      for (class_count = 0; class_count < BRW_MAX_MRF; class_count++)
          class_sizes[class_count] = class_count + 1;
    } else {
       for (class_count = 0; class_count < 4; class_count++)
@@ -117,7 +116,21 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
    /* Compute the total number of registers across all classes. */
    int ra_reg_count = 0;
    for (int i = 0; i < class_count; i++) {
-      ra_reg_count += base_reg_count - (class_sizes[i] - 1);
+      if (devinfo->gen <= 5 && reg_width == 2) {
+         /* From the G45 PRM:
+          *
+          * In order to reduce the hardware complexity, the following
+          * rules and restrictions apply to the compressed instruction:
+          * ...
+          * * Operand Alignment Rule: With the exceptions listed below, a
+          *   source/destination operand in general should be aligned to
+          *   even 256-bit physical register with a region size equal to
+          *   two 256-bit physical register
+          */
+         ra_reg_count += (base_reg_count - (class_sizes[i] - 1)) / 2;
+      } else {
+         ra_reg_count += base_reg_count - (class_sizes[i] - 1);
+      }
    }
 
    uint8_t *ra_reg_to_grf = ralloc_array(screen, uint8_t, ra_reg_count);
@@ -134,27 +147,48 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
    int pairs_base_reg = 0;
    int pairs_reg_count = 0;
    for (int i = 0; i < class_count; i++) {
-      int class_reg_count = base_reg_count - (class_sizes[i] - 1);
+      int class_reg_count;
+      if (devinfo->gen <= 5 && reg_width == 2) {
+         class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2;
+      } else {
+         class_reg_count = base_reg_count - (class_sizes[i] - 1);
+      }
       classes[i] = ra_alloc_reg_class(regs);
 
       /* Save this off for the aligned pair class at the end. */
       if (class_sizes[i] == 2) {
-	 pairs_base_reg = reg;
-	 pairs_reg_count = class_reg_count;
+         pairs_base_reg = reg;
+         pairs_reg_count = class_reg_count;
       }
 
-      for (int j = 0; j < class_reg_count; j++) {
-	 ra_class_add_reg(regs, classes[i], reg);
+      if (devinfo->gen <= 5 && reg_width == 2) {
+         for (int j = 0; j < class_reg_count; j++) {
+            ra_class_add_reg(regs, classes[i], reg);
 
-	 ra_reg_to_grf[reg] = j;
+            ra_reg_to_grf[reg] = j * 2;
 
-	 for (int base_reg = j;
-	      base_reg < j + class_sizes[i];
-	      base_reg++) {
-	    ra_add_transitive_reg_conflict(regs, base_reg, reg);
-	 }
+            for (int base_reg = j * 2;
+                 base_reg < j * 2 + class_sizes[i];
+                 base_reg++) {
+               ra_add_transitive_reg_conflict(regs, base_reg, reg);
+            }
+
+            reg++;
+         }
+      } else {
+         for (int j = 0; j < class_reg_count; j++) {
+            ra_class_add_reg(regs, classes[i], reg);
+
+            ra_reg_to_grf[reg] = j;
 
-	 reg++;
+            for (int base_reg = j;
+                 base_reg < j + class_sizes[i];
+                 base_reg++) {
+               ra_add_transitive_reg_conflict(regs, base_reg, reg);
+            }
+
+            reg++;
+         }
       }
    }
    assert(reg == ra_reg_count);
@@ -162,7 +196,7 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
    /* Add a special class for aligned pairs, which we'll put delta_x/y
     * in on gen5 so that we can do PLN.
     */
-   if (devinfo->has_pln && reg_width == 1 && devinfo->gen < 6) {
+   if (devinfo->has_pln && devinfo->gen < 6) {
       aligned_pairs_class = ra_alloc_reg_class(regs);
 
       for (int i = 0; i < pairs_reg_count; i++) {
@@ -236,7 +270,6 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
                                        int payload_node_count,
                                        int first_payload_node)
 {
-   int reg_width = dispatch_width / 8;
    int loop_depth = 0;
    int loop_end_ip = 0;
 
@@ -276,7 +309,7 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
       for (int i = 0; i < inst->sources; i++) {
          if (inst->src[i].file == HW_REG &&
              inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
-            int node_nr = inst->src[i].fixed_hw_reg.nr / reg_width;
+            int node_nr = inst->src[i].fixed_hw_reg.nr;
             if (node_nr >= payload_node_count)
                continue;
 
@@ -292,25 +325,26 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
           * sideband.  It also really freaks out driver developers to see g0
           * used in unusual places, so just always reserve it.
           */
-         payload_last_use_ip[0 / reg_width] = use_ip;
-         payload_last_use_ip[1 / reg_width] = use_ip;
+         payload_last_use_ip[0] = use_ip;
+         payload_last_use_ip[1] = use_ip;
          break;
 
       case FS_OPCODE_LINTERP:
-         /* On gen6+ in SIMD16, there are 4 adjacent registers (so 2 nodes)
-          * used by PLN's sourcing of the deltas, while we list only the first
-          * two in the arguments (1 node).  Pre-gen6, the deltas are computed
-          * in normal VGRFs.
+         /* On gen6+ in SIMD16, there are 4 adjacent registers used by
+          * PLN's sourcing of the deltas, while we list only the first one
+          * in the arguments.  Pre-gen6, the deltas are computed in normal
+          * VGRFs.
           */
          if (brw->gen >= 6) {
             int delta_x_arg = 0;
             if (inst->src[delta_x_arg].file == HW_REG &&
                 inst->src[delta_x_arg].fixed_hw_reg.file ==
                 BRW_GENERAL_REGISTER_FILE) {
-               int sechalf_node = (inst->src[delta_x_arg].fixed_hw_reg.nr /
-                                   reg_width) + 1;
-               assert(sechalf_node < payload_node_count);
-               payload_last_use_ip[sechalf_node] = use_ip;
+               for (int i = 1; i < 4; ++i) {
+                  int node = inst->src[delta_x_arg].fixed_hw_reg.nr + i;
+                  assert(node < payload_node_count);
+                  payload_last_use_ip[node] = use_ip;
+               }
             }
          }
          break;
@@ -391,8 +425,6 @@ fs_visitor::get_used_mrfs(bool *mrf_used)
 void
 fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
 {
-   int reg_width = dispatch_width / 8;
-
    bool mrf_used[BRW_MAX_MRF];
    get_used_mrfs(mrf_used);
 
@@ -402,8 +434,7 @@ fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
        * The alternative would be to have per-physical-register classes, which
        * would just be silly.
        */
-      ra_set_node_reg(g, first_mrf_node + i,
-                      (GEN7_MRF_HACK_START + i) / reg_width);
+      ra_set_node_reg(g, first_mrf_node + i, GEN7_MRF_HACK_START + i);
 
       /* Since we don't have any live/dead analysis on the MRFs, just mark all
        * that are used as conflicting with all virtual GRFs.
@@ -428,8 +459,7 @@ fs_visitor::assign_regs(bool allow_spilling)
     */
    int reg_width = dispatch_width / 8;
    int hw_reg_mapping[this->virtual_grf_count];
-   int payload_node_count = (ALIGN(this->first_non_payload_grf, reg_width) /
-                            reg_width);
+   int payload_node_count = ALIGN(this->first_non_payload_grf, reg_width);
    int rsi = reg_width - 1; /* Which screen->wm_reg_sets[] to use */
    calculate_live_intervals();
 
@@ -478,6 +508,30 @@ fs_visitor::assign_regs(bool allow_spilling)
    if (brw->gen >= 7)
       setup_mrf_hack_interference(g, first_mrf_hack_node);
 
+   if (dispatch_width > 8) {
+      /* In 16-wide dispatch we have an issue where a compressed
+       * instruction is actually two instructions executed simultaneiously.
+       * It's actually ok to have the source and destination registers be
+       * the same.  In this case, each instruction over-writes its own
+       * source and there's no problem.  The real problem here is if the
+       * source and destination registers are off by one.  Then you can end
+       * up in a scenario where the first instruction over-writes the
+       * source of the second instruction.  Since the compiler doesn't know
+       * about this level of granularity, we simply make the source and
+       * destination interfere.
+       */
+      foreach_block_and_inst(block, fs_inst, inst, cfg) {
+         if (inst->dst.file != GRF)
+            continue;
+
+         for (int i = 0; i < inst->sources; ++i) {
+            if (inst->src[i].file == GRF) {
+               ra_add_node_interference(g, inst->dst.reg, inst->src[i].reg);
+            }
+         }
+      }
+   }
+
    /* Debug of register spilling: Go spill everything. */
    if (0) {
       int reg = choose_spill_reg(g);
@@ -511,20 +565,19 @@ fs_visitor::assign_regs(bool allow_spilling)
     * regs in the register classes back down to real hardware reg
     * numbers.
     */
-   this->grf_used = payload_node_count * reg_width;
+   this->grf_used = payload_node_count;
    for (int i = 0; i < this->virtual_grf_count; i++) {
       int reg = ra_get_node_reg(g, i);
 
-      hw_reg_mapping[i] = screen->wm_reg_sets[rsi].ra_reg_to_grf[reg] * reg_width;
+      hw_reg_mapping[i] = screen->wm_reg_sets[rsi].ra_reg_to_grf[reg];
       this->grf_used = MAX2(this->grf_used,
-			    hw_reg_mapping[i] + this->virtual_grf_sizes[i] *
-			    reg_width);
+			    hw_reg_mapping[i] + this->virtual_grf_sizes[i]);
    }
 
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      assign_reg(hw_reg_mapping, &inst->dst, reg_width);
+      assign_reg(hw_reg_mapping, &inst->dst);
       for (int i = 0; i < inst->sources; i++) {
-         assign_reg(hw_reg_mapping, &inst->src[i], reg_width);
+         assign_reg(hw_reg_mapping, &inst->src[i]);
       }
    }
 
@@ -539,7 +592,11 @@ void
 fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
                          uint32_t spill_offset, int count)
 {
-   for (int i = 0; i < count; i++) {
+   int reg_size = 1;
+   if (count % 2 == 0)
+      reg_size = 2;
+
+   for (int i = 0; i < count / reg_size; i++) {
       /* The gen7 descriptor-based offset is 12 bits of HWORD units. */
       bool gen7_read = brw->gen >= 7 && spill_offset < (1 << 12) * REG_SIZE;
 
@@ -558,8 +615,32 @@ fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
       }
       inst->insert_before(block, unspill_inst);
 
-      dst = offset(dst, 1);
-      spill_offset += dispatch_width * sizeof(float);
+      dst.reg_offset += reg_size;
+      spill_offset += reg_size * 8 * sizeof(float);
+   }
+}
+
+void
+fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
+                       uint32_t spill_offset, int count)
+{
+   int spill_base_mrf = dispatch_width > 8 ? 13 : 14;
+
+   int reg_size = 1;
+   if (count % 2 == 0)
+      reg_size = 2;
+
+   for (int i = 0; i < count / reg_size; i++) {
+      fs_inst *spill_inst =
+         new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
+                              reg_null_f, src);
+      src.reg_offset += reg_size;
+      spill_inst->offset = spill_offset + i * reg_size;
+      spill_inst->ir = inst->ir;
+      spill_inst->annotation = inst->annotation;
+      spill_inst->mlen = 1 + reg_size; /* header, value */
+      spill_inst->base_mrf = spill_base_mrf;
+      inst->insert_after(block, spill_inst);
    }
 }
 
@@ -712,18 +793,8 @@ fs_visitor::spill_reg(int spill_reg)
                          inst->regs_written);
 	 }
 
-	 for (int chan = 0; chan < inst->regs_written; chan++) {
-	    fs_inst *spill_inst =
-               new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
-                                    reg_null_f, spill_src);
-	    spill_src = offset(spill_src, 1);
-	    spill_inst->offset = subset_spill_offset + chan * reg_size;
-	    spill_inst->ir = inst->ir;
-	    spill_inst->annotation = inst->annotation;
-	    spill_inst->mlen = 1 + dispatch_width / 8; /* header, value */
-	    spill_inst->base_mrf = spill_base_mrf;
-	    inst->insert_after(block, spill_inst);
-	 }
+         emit_spill(block, inst, spill_src, subset_spill_offset,
+                    inst->regs_written);
       }
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
index 73f18f9d62b..9546dcdf06a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
@@ -69,16 +69,12 @@ is_copy_payload(const fs_visitor *v, const fs_inst *inst)
    if (v->virtual_grf_sizes[inst->src[0].reg] != inst->regs_written)
       return false;
 
-   const int reg = inst->src[0].reg;
-   if (inst->src[0].reg_offset != 0)
-      return false;
+   fs_reg reg = inst->src[0];
 
-   for (int i = 1; i < inst->sources; i++) {
-      if (inst->src[i].reg != reg ||
-          inst->src[i].reg_offset != i) {
+   for (int i = 0; i < inst->sources; i++)
+      if (!inst->src[i].equals(offset(reg, i)))
          return false;
-      }
-   }
+
    return true;
 }
 
@@ -186,6 +182,7 @@ fs_visitor::register_coalesce()
          src_size = virtual_grf_sizes[inst->src[0].reg];
          assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
 
+         assert(inst->src[0].width % 8 == 0);
          channels_remaining = src_size;
          memset(mov, 0, sizeof(mov));
 
@@ -200,12 +197,14 @@ fs_visitor::register_coalesce()
             reg_to_offset[i] = i;
          }
          mov[0] = inst;
-         channels_remaining -= inst->sources;
+         channels_remaining -= inst->regs_written;
       } else {
          const int offset = inst->src[0].reg_offset;
          reg_to_offset[offset] = inst->dst.reg_offset;
+         if (inst->src[0].width == 16)
+            reg_to_offset[offset + 1] = inst->dst.reg_offset + 1;
          mov[offset] = inst;
-         channels_remaining--;
+         channels_remaining -= inst->regs_written;
       }
 
       if (channels_remaining)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 72ffe1ff04f..3b31e341e9c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -287,7 +287,8 @@ fs_visitor::try_emit_saturate(ir_expression *ir)
     * src, just set the saturate flag instead of emmitting a separate mov.
     */
    fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
-   if (modify && modify->regs_written == 1 && modify->can_do_saturate()) {
+   if (modify && modify->regs_written == modify->dst.width / 8 &&
+       modify->can_do_saturate()) {
       modify->saturate = true;
       this->result = src;
       return true;
@@ -1434,7 +1435,7 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
    inst->base_mrf = base_mrf;
    inst->mlen = mlen;
    inst->header_present = header_present;
-   inst->regs_written = 4;
+   inst->regs_written = 4 * reg_width;
 
    if (mlen > MAX_SAMPLER_MESSAGE_SIZE) {
       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
@@ -1480,7 +1481,7 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
        * need to offset the Sampler State Pointer in the header.
        */
       header_present = true;
-      sources[length] = reg_undef;
+      sources[0] = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD);
       length++;
    }
 
@@ -1618,7 +1619,13 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
       }
    }
 
-   fs_reg src_payload = fs_reg(GRF, virtual_grf_alloc(length),
+   int mlen;
+   if (reg_width == 2)
+      mlen = length * reg_width - header_present;
+   else
+      mlen = length * reg_width;
+
+   fs_reg src_payload = fs_reg(GRF, virtual_grf_alloc(mlen),
                                BRW_REGISTER_TYPE_F);
    emit(LOAD_PAYLOAD(src_payload, sources, length));
 
@@ -1645,12 +1652,9 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
    }
    fs_inst *inst = emit(opcode, dst, src_payload, sampler);
    inst->base_mrf = -1;
-   if (reg_width == 2)
-      inst->mlen = length * reg_width - header_present;
-   else
-      inst->mlen = length * reg_width;
+   inst->mlen = mlen;
    inst->header_present = header_present;
-   inst->regs_written = 4;
+   inst->regs_written = 4 * reg_width;
 
    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
@@ -1784,7 +1788,7 @@ fs_visitor::emit_mcs_fetch(ir_texture *ir, fs_reg coordinate, fs_reg sampler)
 {
    int reg_width = dispatch_width / 8;
    int length = ir->coordinate->type->vector_elements;
-   fs_reg payload = fs_reg(GRF, virtual_grf_alloc(length),
+   fs_reg payload = fs_reg(GRF, virtual_grf_alloc(length * reg_width),
                            BRW_REGISTER_TYPE_F);
    fs_reg dest = fs_reg(this, glsl_type::uvec4_type);
    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, length);
@@ -1802,9 +1806,10 @@ fs_visitor::emit_mcs_fetch(ir_texture *ir, fs_reg coordinate, fs_reg sampler)
    inst->base_mrf = -1;
    inst->mlen = length * reg_width;
    inst->header_present = false;
-   inst->regs_written = 4; /* we only care about one reg of response,
-                            * but the sampler always writes 4/8
-                            */
+   inst->regs_written = 4 * reg_width; /* we only care about one reg of
+                                        * response, but the sampler always
+                                        * writes 4/8
+                                        */
 
    return dest;
 }
@@ -1979,14 +1984,15 @@ fs_visitor::visit(ir_texture *ir)
          emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
 
          fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
-         for (int i = 0; i < inst->regs_written; i++) {
+         int components = inst->regs_written / (dst.width / 8);
+         for (int i = 0; i < components; i++) {
             if (i == 2) {
                fixed_payload[i] = fixed_depth;
             } else {
                fixed_payload[i] = offset(dst, i);
             }
          }
-         emit(LOAD_PAYLOAD(dst, fixed_payload, inst->regs_written));
+         emit(LOAD_PAYLOAD(dst, fixed_payload, components));
       }
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index b963bdaa9f9..5e8c98a36df 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -795,7 +795,7 @@ fs_instruction_scheduler::calculate_deps()
       for (int i = 0; i < inst->sources; i++) {
 	 if (inst->src[i].file == GRF) {
             if (post_reg_alloc) {
-               for (int r = 0; r < reg_width * inst->regs_read(v, i); r++)
+               for (int r = 0; r < inst->regs_read(v, i); r++)
                   add_dep(last_grf_write[inst->src[i].reg + r], n);
             } else {
                for (int r = 0; r < inst->regs_read(v, i); r++) {
@@ -847,7 +847,7 @@ fs_instruction_scheduler::calculate_deps()
       /* write-after-write deps. */
       if (inst->dst.file == GRF) {
          if (post_reg_alloc) {
-            for (int r = 0; r < inst->regs_written * reg_width; r++) {
+            for (int r = 0; r < inst->regs_written; r++) {
                add_dep(last_grf_write[inst->dst.reg + r], n);
                last_grf_write[inst->dst.reg + r] = n;
             }
@@ -923,7 +923,7 @@ fs_instruction_scheduler::calculate_deps()
       for (int i = 0; i < inst->sources; i++) {
 	 if (inst->src[i].file == GRF) {
             if (post_reg_alloc) {
-               for (int r = 0; r < reg_width * inst->regs_read(v, i); r++)
+               for (int r = 0; r < inst->regs_read(v, i); r++)
                   add_dep(n, last_grf_write[inst->src[i].reg + r]);
             } else {
                for (int r = 0; r < inst->regs_read(v, i); r++) {
@@ -977,7 +977,7 @@ fs_instruction_scheduler::calculate_deps()
        */
       if (inst->dst.file == GRF) {
          if (post_reg_alloc) {
-            for (int r = 0; r < inst->regs_written * reg_width; r++)
+            for (int r = 0; r < inst->regs_written; r++)
                last_grf_write[inst->dst.reg + r] = n;
          } else {
             for (int r = 0; r < inst->regs_written; r++) {
@@ -1300,7 +1300,8 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
                 * single-result send is probably actually reducing register
                 * pressure.
                 */
-               if (inst->regs_written <= 1 && chosen_inst->regs_written > 1) {
+               if (inst->regs_written <= inst->dst.width / 8 &&
+                   chosen_inst->regs_written > chosen_inst->dst.width / 8) {
                   chosen = n;
                   continue;
                } else if (inst->regs_written > chosen_inst->regs_written) {