* 12.5 (p356).
*/
-#define ACP_HASH_SIZE 16
+#define ACP_HASH_SIZE 64
#include "util/bitset.h"
+#include "util/u_math.h"
#include "brw_fs.h"
+#include "brw_fs_live_variables.h"
#include "brw_cfg.h"
#include "brw_eu.h"
+using namespace brw;
+
namespace { /* avoid conflict with opt_copy_propagation_elements */
struct acp_entry : public exec_node {
fs_reg dst;
fs_reg src;
- uint8_t size_written;
- uint8_t size_read;
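+ /* Index of this entry within fs_copy_prop_dataflow::acp; used to
+ * address the per-block COPY/KILL/undef bitsets.
+ */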
+ unsigned global_idx;
+ unsigned size_written;
+ unsigned size_read;
enum opcode opcode;
bool saturate;
};
* course of this block.
*/
BITSET_WORD *kill;
+
+ /**
+ * Which entries in the fs_copy_prop_dataflow acp table are guaranteed to
+ * have a fully uninitialized destination at the end of this block.
+ */
+ BITSET_WORD *undef;
};
class fs_copy_prop_dataflow
{
public:
fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
+ const fs_live_variables &live,
exec_list *out_acp[ACP_HASH_SIZE]);
void setup_initial_values();
void *mem_ctx;
cfg_t *cfg;
+ const fs_live_variables &live;
acp_entry **acp;
int num_acp;
} /* anonymous namespace */
fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
+ const fs_live_variables &live,
exec_list *out_acp[ACP_HASH_SIZE])
- : mem_ctx(mem_ctx), cfg(cfg)
+ : mem_ctx(mem_ctx), cfg(cfg), live(live)
{
bd = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks);
bd[block->num].liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
bd[block->num].copy = rzalloc_array(bd, BITSET_WORD, bitset_words);
bd[block->num].kill = rzalloc_array(bd, BITSET_WORD, bitset_words);
+ bd[block->num].undef = rzalloc_array(bd, BITSET_WORD, bitset_words);
for (int i = 0; i < ACP_HASH_SIZE; i++) {
foreach_in_list(acp_entry, entry, &out_acp[block->num][i]) {
acp[next_acp] = entry;
+ entry->global_idx = next_acp;
+
/* opt_copy_propagation_local populates out_acp with copies created
* in a block which are still live at the end of the block. This
* is exactly what we want in the COPY set.
fs_copy_prop_dataflow::setup_initial_values()
{
/* Initialize the COPY and KILL sets. */
- foreach_block (block, cfg) {
- foreach_inst_in_block(fs_inst, inst, block) {
- if (inst->dst.file != VGRF)
- continue;
+ {
+ /* Create a temporary table of ACP entries which we'll use for efficient
+ * look-up. Unfortunately, we have to do this in two steps because we
+ * have to match both sources and destinations, and an ACP entry can
+ * only be in one list at a time.
+ *
+ * We choose to make the table size between num_acp/4 and num_acp/2 to
+ * try to trade off between the time it takes to initialize the table
+ * via exec_list constructors or make_empty() and the cost of
+ * collisions. In practice, it doesn't appear to matter too much what
+ * size we make the table as long as it's roughly the same order of
+ * magnitude as num_acp. We get most of the benefit of the table
+ * approach even if we use a table of size ACP_HASH_SIZE, though a
+ * full-sized table is 1-2% faster.
+ */
+ unsigned acp_table_size = util_next_power_of_two(num_acp) / 4;
+ acp_table_size = MAX2(acp_table_size, ACP_HASH_SIZE);
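+ /* For illustration (hypothetical count, not from a real shader): with
+ * num_acp == 1500, util_next_power_of_two() returns 2048 and the table
+ * gets 512 buckets, squarely inside the [num_acp/4, num_acp/2] window
+ * described above.
+ */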
+ exec_list *acp_table = new exec_list[acp_table_size];
+
+ /* First, get all the KILLs for instructions which overwrite ACP
+ * destinations.
+ */
+ for (int i = 0; i < num_acp; i++) {
+ unsigned idx = reg_space(acp[i]->dst) & (acp_table_size - 1);
+ acp_table[idx].push_tail(acp[i]);
+ }
+
+ foreach_block (block, cfg) {
+ foreach_inst_in_block(fs_inst, inst, block) {
+ if (inst->dst.file != VGRF)
+ continue;
+
+ unsigned idx = reg_space(inst->dst) & (acp_table_size - 1);
+ foreach_in_list(acp_entry, entry, &acp_table[idx]) {
+ if (regions_overlap(inst->dst, inst->size_written,
+ entry->dst, entry->size_written))
+ BITSET_SET(bd[block->num].kill, entry->global_idx);
+ }
+ }
+ }
+
+ /* Clear the table for the second pass */
+ for (unsigned i = 0; i < acp_table_size; i++)
+ acp_table[i].make_empty();
+
+ /* Next, get all the KILLs for instructions which overwrite ACP
+ * sources.
+ */
+ for (int i = 0; i < num_acp; i++) {
+ unsigned idx = reg_space(acp[i]->src) & (acp_table_size - 1);
+ acp_table[idx].push_tail(acp[i]);
+ }
- /* Mark ACP entries which are killed by this instruction. */
- for (int i = 0; i < num_acp; i++) {
- if (regions_overlap(inst->dst, inst->size_written,
- acp[i]->dst, acp[i]->size_written) ||
- regions_overlap(inst->dst, inst->size_written,
- acp[i]->src, acp[i]->size_read)) {
- BITSET_SET(bd[block->num].kill, i);
+ foreach_block (block, cfg) {
+ foreach_inst_in_block(fs_inst, inst, block) {
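+ /* Unlike the first pass, FIXED_GRF destinations matter here: ACP
+ * destinations are always VGRFs, but ACP sources may be FIXED_GRF
+ * regions, which a write to a fixed GRF can clobber.
+ */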
+ if (inst->dst.file != VGRF &&
+ inst->dst.file != FIXED_GRF)
+ continue;
+
+ unsigned idx = reg_space(inst->dst) & (acp_table_size - 1);
+ foreach_in_list(acp_entry, entry, &acp_table[idx]) {
+ if (regions_overlap(inst->dst, inst->size_written,
+ entry->src, entry->size_read))
+ BITSET_SET(bd[block->num].kill, entry->global_idx);
}
}
}
+
+ delete [] acp_table;
}
/* Populate the initial values for the livein and liveout sets. For the
* block at the start of the program, livein = 0 and liveout = copy.
- * For the others, set liveout to 0 (the empty set) and livein to ~0
- * (the universal set).
+ * For the others, set liveout and livein to ~0 (the universal set).
*/
foreach_block (block, cfg) {
if (block->parents.is_empty()) {
}
} else {
for (int i = 0; i < bitset_words; i++) {
- bd[block->num].liveout[i] = 0u;
+ bd[block->num].liveout[i] = ~0u;
bd[block->num].livein[i] = ~0u;
}
}
}
+
+ /* Initialize the undef set. */
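+ /* A sketch of why this is useful: if an ACP entry's destination is
+ * only written inside one branch of an if/else, the blocks on the
+ * other path leave it fully undefined, so the entry lands in their
+ * undef sets and the dataflow pass below can still treat it as
+ * available at the merge point.
+ */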
+ foreach_block (block, cfg) {
+ for (int i = 0; i < num_acp; i++) {
+ BITSET_SET(bd[block->num].undef, i);
+ for (unsigned off = 0; off < acp[i]->size_written; off += REG_SIZE) {
+ if (BITSET_TEST(live.block_data[block->num].defout,
+ live.var_from_reg(byte_offset(acp[i]->dst, off))))
+ BITSET_CLEAR(bd[block->num].undef, i);
+ }
+ }
+ }
}
/**
do {
progress = false;
- /* Update liveout for all blocks. */
foreach_block (block, cfg) {
if (block->parents.is_empty())
continue;
for (int i = 0; i < bitset_words; i++) {
const BITSET_WORD old_liveout = bd[block->num].liveout[i];
+ BITSET_WORD livein_from_any_block = 0;
- bd[block->num].liveout[i] =
- bd[block->num].copy[i] | (bd[block->num].livein[i] &
- ~bd[block->num].kill[i]);
-
- if (old_liveout != bd[block->num].liveout[i])
- progress = true;
- }
- }
-
- /* Update livein for all blocks. If a copy is live out of all parent
- * blocks, it's live coming in to this block.
- */
- foreach_block (block, cfg) {
- if (block->parents.is_empty())
- continue;
-
- for (int i = 0; i < bitset_words; i++) {
- const BITSET_WORD old_livein = bd[block->num].livein[i];
-
+ /* Update livein for this block. If a copy is live out of all
+ * parent blocks, it's live coming in to this block.
+ */
bd[block->num].livein[i] = ~0u;
foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
bblock_t *parent = parent_link->block;
- bd[block->num].livein[i] &= bd[parent->num].liveout[i];
+ /* Consider ACP entries with a known-undefined destination to
+ * be available from the parent. This is valid because we're
+ * free to set the undefined variable equal to the source of
+ * the ACP entry without breaking the application's
+ * expectations, since the variable is undefined.
+ */
+ bd[block->num].livein[i] &= (bd[parent->num].liveout[i] |
+ bd[parent->num].undef[i]);
+ livein_from_any_block |= bd[parent->num].liveout[i];
}
- if (old_livein != bd[block->num].livein[i])
+ /* Limit to the set of ACP entries that can possibly be available
+ * at the start of the block, since propagating from a variable
+ * which is guaranteed to be undefined (rather than potentially
+ * undefined for some dynamic control-flow paths) doesn't seem
+ * particularly useful.
+ */
+ bd[block->num].livein[i] &= livein_from_any_block;
+
+ /* Update liveout for this block. */
+ bd[block->num].liveout[i] =
+ bd[block->num].copy[i] | (bd[block->num].livein[i] &
+ ~bd[block->num].kill[i]);
+
+ if (old_liveout != bd[block->num].liveout[i])
progress = true;
}
}
if (stride > 4)
return false;
+ /* Bail if the channels of the source need to be aligned to the byte offset
+ * of the corresponding channel of the destination, and the provided stride
+ * would break this restriction.
+ */
+ if (has_dst_aligned_region_restriction(devinfo, inst) &&
+ !(type_sz(inst->src[arg].type) * stride ==
+ type_sz(inst->dst.type) * inst->dst.stride ||
+ stride == 0))
+ return false;
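+ /* Worked example (hypothetical types): a stride-2 HF source feeding an
+ * instruction with a stride-1 F destination is acceptable since
+ * 2 bytes * 2 == 4 bytes * 1, whereas a stride-1 HF source would fail
+ * the check above.
+ */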
+
/* 3-source instructions can only be Align16, which restricts what strides
* they can take. They can only take a stride of 1 (the usual case), or 0
* with a special "repctrl" bit. But the repctrl bit doesn't work for
return true;
}
+static bool
+instruction_requires_packed_data(fs_inst *inst)
+{
+ switch (inst->opcode) {
+ case FS_OPCODE_DDX_FINE:
+ case FS_OPCODE_DDX_COARSE:
+ case FS_OPCODE_DDY_FINE:
+ case FS_OPCODE_DDY_COARSE:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool
fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
{
if (entry->src.file == IMM)
return false;
assert(entry->src.file == VGRF || entry->src.file == UNIFORM ||
- entry->src.file == ATTR);
-
+ entry->src.file == ATTR || entry->src.file == FIXED_GRF);
+
+ /* Avoid propagating a LOAD_PAYLOAD instruction into another if there is a
+ * good chance that we'll be able to eliminate the latter through register
+ * coalescing. If only part of the sources of the second LOAD_PAYLOAD can
+ * be simplified through copy propagation, we would make register
+ * coalescing impossible, ending up with unnecessary copies in the program.
+ * This is also the case for is_multi_copy_payload() copies that can only
+ * be coalesced when the instruction is lowered into a sequence of MOVs.
+ *
+ * Worse -- in cases where the ACP entry was the result of CSE combining
+ * multiple LOAD_PAYLOAD subexpressions, propagating the first LOAD_PAYLOAD
+ * into the second would undo the work of CSE, leading to an infinite
+ * optimization loop. Avoid this by detecting LOAD_PAYLOAD copies from CSE
+ * temporaries, which should match is_coalescing_payload().
+ */
if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
- inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD)
+ (is_coalescing_payload(alloc, inst) || is_multi_copy_payload(inst)))
return false;
assert(entry->dst.file == VGRF);
entry->dst, entry->size_written))
return false;
+ /* Avoid propagating a FIXED_GRF register into an EOT instruction, so
+ * that any register allocation restrictions on its sources can still be
+ * applied.
+ */
+ if (entry->src.file == FIXED_GRF && inst->eot)
+ return false;
+
+ /* Avoid propagating odd-numbered FIXED_GRF registers into the first source
+ * of a LINTERP instruction on platforms where the PLN instruction has
+ * register alignment restrictions.
+ */
+ if (devinfo->has_pln && devinfo->gen <= 6 &&
+ entry->src.file == FIXED_GRF && (entry->src.nr & 1) &&
+ inst->opcode == FS_OPCODE_LINTERP && arg == 0)
+ return false;
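+ /* (PLN reads the barycentric deltas from two consecutive GRFs, so its
+ * first source presumably needs to start on an even register, which an
+ * odd FIXED_GRF number would violate.)
+ */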
+
/* we can't generally copy-propagate UD negations because we
* can end up accessing the resulting values as signed integers
* instead. See also resolve_ud_negate() and comment in
inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE)
return false;
+ /* Some instructions implemented in the generator backend, such as
+ * derivatives, assume that their operands are packed so we can't
+ * generally propagate strided regions to them.
+ */
+ const unsigned entry_stride = (entry->src.file == FIXED_GRF ? 1 :
+ entry->src.stride);
+ if (instruction_requires_packed_data(inst) && entry_stride != 1)
+ return false;
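+ /* (E.g. the DDX/DDY implementations difference neighboring channels,
+ * which only works when those channels sit next to each other in the
+ * register.)
+ */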
+
/* Bail if the result of composing both strides would exceed the
* hardware limit.
*/
- if (!can_take_stride(inst, arg, entry->src.stride * inst->src[arg].stride,
+ if (!can_take_stride(inst, arg, entry_stride * inst->src[arg].stride,
devinfo))
return false;
+ /* Bail if the source FIXED_GRF region of the copy cannot be trivially
+ * composed with the source region of the instruction -- e.g. because the
+ * copy uses some extended stride greater than 4 not supported natively by
+ * the hardware as a horizontal stride, or because instruction compression
+ * could require us to use a vertical stride shorter than a GRF.
+ */
+ if (entry->src.file == FIXED_GRF &&
+ (inst->src[arg].stride > 4 ||
+ inst->dst.component_size(inst->exec_size) >
+ inst->src[arg].component_size(inst->exec_size)))
+ return false;
+
/* Bail if the instruction type is larger than the execution type of the
* copy, which implies that each channel is reading multiple channels of the
* destination of the copy, and simply replacing the sources would give a
*
* Which would have different semantics.
*/
- if (entry->src.stride != 1 &&
+ if (entry_stride != 1 &&
(inst->src[arg].stride *
type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0)
return false;
}
}
+ /* Save the offset of inst->src[arg] relative to entry->dst so it can be
+ * applied later.
+ */
+ const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset;
+
+ /* Fold the copy into the instruction consuming it. */
inst->src[arg].file = entry->src.file;
inst->src[arg].nr = entry->src.nr;
- inst->src[arg].stride *= entry->src.stride;
- inst->saturate = inst->saturate || entry->saturate;
+ inst->src[arg].subnr = entry->src.subnr;
+ inst->src[arg].offset = entry->src.offset;
+
+ /* Compose the strides of both regions. */
+ if (entry->src.file == FIXED_GRF) {
+ if (inst->src[arg].stride) {
+ const unsigned orig_width = 1 << entry->src.width;
+ const unsigned reg_width = REG_SIZE / (type_sz(inst->src[arg].type) *
+ inst->src[arg].stride);
+ inst->src[arg].width = cvt(MIN2(orig_width, reg_width)) - 1;
+ inst->src[arg].hstride = cvt(inst->src[arg].stride);
+ inst->src[arg].vstride = inst->src[arg].hstride + inst->src[arg].width;
+ } else {
+ inst->src[arg].vstride = inst->src[arg].hstride =
+ inst->src[arg].width = 0;
+ }
- /* Compute the offset of inst->src[arg] relative to entry->dst */
- const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset;
+ inst->src[arg].stride = 1;
+
+ /* Hopefully no Align16 around here... */
+ assert(entry->src.swizzle == BRW_SWIZZLE_XYZW);
+ inst->src[arg].swizzle = entry->src.swizzle;
+ } else {
+ inst->src[arg].stride *= entry->src.stride;
+ }
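+
+ /* Worked example (hypothetical region): composing a width-8 FIXED_GRF
+ * copy with a float source of stride 2 gives reg_width = 32 / (4 * 2)
+ * = 4, i.e. the <8;4,2> region. Keeping vstride equal to
+ * width * hstride makes the region equivalent to a uniform stride-2
+ * walk across the GRF.
+ */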
+
+ /* Compose any saturate modifiers. */
+ inst->saturate = inst->saturate || entry->saturate;
/* Compute the first component of the copy that the instruction is
* reading, and the base byte offset within that component.
/* Calculate the byte offset at the origin of the copy of the given
* component and suboffset.
*/
- inst->src[arg].offset = suboffset +
- component * entry->src.stride * type_sz(entry->src.type) +
- entry->src.offset;
+ inst->src[arg] = byte_offset(inst->src[arg],
+ component * entry_stride * type_sz(entry->src.type) + suboffset);
if (has_source_modifiers) {
if (entry->dst.type != inst->src[arg].type) {
if (i == 1) {
inst->src[i] = val;
progress = true;
- } else if (i == 0 && inst->src[1].file != IMM) {
+ } else if (i == 0 && inst->src[1].file != IMM &&
+ (inst->conditional_mod == BRW_CONDITIONAL_NONE ||
+ /* Only GE and L are commutative. */
+ inst->conditional_mod == BRW_CONDITIONAL_GE ||
+ inst->conditional_mod == BRW_CONDITIONAL_L)) {
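+ /* GE and L are the conditions used for max/min; for those, swapping
+ * the sources doesn't change the result, which is likely not true of
+ * the other conditional modifiers once NaN handling is considered.
+ */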
inst->src[0] = inst->src[1];
inst->src[1] = val;
}
break;
- case SHADER_OPCODE_UNTYPED_ATOMIC:
- case SHADER_OPCODE_UNTYPED_SURFACE_READ:
- case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
- case SHADER_OPCODE_TYPED_ATOMIC:
- case SHADER_OPCODE_TYPED_SURFACE_READ:
- case SHADER_OPCODE_TYPED_SURFACE_WRITE:
- case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
- /* We only propagate into the surface argument of the
- * instruction. Everything else goes through LOAD_PAYLOAD.
- */
- if (i == 1) {
- inst->src[i] = val;
- progress = true;
- }
- break;
-
case FS_OPCODE_FB_WRITE_LOGICAL:
/* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are
* bit-cast using a strided region, so they cannot be immediates.
case SHADER_OPCODE_TG4_LOGICAL:
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+ case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
+ case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
inst->src[i] = val;
progress = true;
break;
inst->src[0], inst->size_read(0))) ||
inst->src[0].file == ATTR ||
inst->src[0].file == UNIFORM ||
- inst->src[0].file == IMM) &&
+ inst->src[0].file == IMM ||
+ (inst->src[0].file == FIXED_GRF &&
+ inst->src[0].is_contiguous())) &&
inst->src[0].type == inst->dst.type &&
- !inst->is_partial_write());
+ !inst->is_partial_write()) ||
+ is_identity_payload(FIXED_GRF, inst);
}
/* Walks a basic block and does copy propagation on it using the acp
}
/* kill the destination from the ACP */
- if (inst->dst.file == VGRF) {
+ if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) {
if (regions_overlap(entry->dst, entry->size_written,
inst->dst, inst->size_written))
* operand of another instruction, add it to the ACP.
*/
if (can_propagate_from(inst)) {
- acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
+ acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry);
entry->dst = inst->dst;
entry->src = inst->src[0];
entry->size_written = inst->size_written;
- entry->size_read = inst->size_read(0);
+ for (unsigned i = 0; i < inst->sources; i++)
+ entry->size_read += inst->size_read(i);
entry->opcode = inst->opcode;
entry->saturate = inst->saturate;
acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
assert(effective_width * type_sz(inst->src[i].type) % REG_SIZE == 0);
const unsigned size_written = effective_width *
type_sz(inst->src[i].type);
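+ /* Only contiguous FIXED_GRF regions behave like the plain block copy
+ * that the ACP entry created below is assumed to describe.
+ */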
- if (inst->src[i].file == VGRF) {
+ if (inst->src[i].file == VGRF ||
+ (inst->src[i].file == FIXED_GRF &&
+ inst->src[i].is_contiguous())) {
acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry);
entry->dst = byte_offset(inst->dst, offset);
entry->src = inst->src[i];
for (int i = 0; i < cfg->num_blocks; i++)
out_acp[i] = new exec_list [ACP_HASH_SIZE];
+ const fs_live_variables &live = live_analysis.require();
+
/* First, walk through each block doing local copy propagation and getting
* the set of copies available at the end of the block.
*/
foreach_block (block, cfg) {
progress = opt_copy_propagation_local(copy_prop_ctx, block,
out_acp[block->num]) || progress;
+
+ /* If the destination of an ACP entry exists only within this block,
+ * then there's no need to keep it for dataflow analysis. We can delete
+ * it from the out_acp table and avoid growing the bitsets any bigger
+ * than we absolutely have to.
+ *
+ * Because nothing in opt_copy_propagation_local touches the block
+ * start/end IPs and opt_copy_propagation_local is incapable of
+ * extending the live range of an ACP destination beyond the block,
+ * it's safe to use the liveness information in this way.
+ */
+ for (unsigned a = 0; a < ACP_HASH_SIZE; a++) {
+ foreach_in_list_safe(acp_entry, entry, &out_acp[block->num][a]) {
+ assert(entry->dst.file == VGRF);
+ if (block->start_ip <= live.vgrf_start[entry->dst.nr] &&
+ live.vgrf_end[entry->dst.nr] <= block->end_ip)
+ entry->remove();
+ }
+ }
}
/* Do dataflow analysis for those available copies. */
- fs_copy_prop_dataflow dataflow(copy_prop_ctx, cfg, out_acp);
+ fs_copy_prop_dataflow dataflow(copy_prop_ctx, cfg, live, out_acp);
/* Next, re-run local copy propagation, this time with the set of copies
* provided by the dataflow analysis available at the start of a block.
ralloc_free(copy_prop_ctx);
if (progress)
- invalidate_live_intervals();
+ invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
+ DEPENDENCY_INSTRUCTION_DETAIL);
return progress;
}