else
op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
+ /* The pull load message will load a vec4 (16 bytes). If we are loading
+ * a double this means we are only loading 2 elements worth of data.
+ * We also want to use a 32-bit data type for the dst of the load operation
+ * so other parts of the driver don't get confused about the size of the
+ * result.
+ */
int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
- fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written), dst.type);
+ fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written),
+ BRW_REGISTER_TYPE_F);
fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
inst->regs_written = regs_written;
inst->mlen = 1 + bld.dispatch_width() / 8;
}
- bld.MOV(dst, offset(vec4_result, bld, ((const_offset & 0xf) / 4) * scale));
+ if (type_sz(dst.type) == 8) {
+ assert(scale == 1);
+ shuffle_32bit_load_result_to_64bit_data(
+ bld, retype(vec4_result, dst.type), vec4_result, 2);
+ }
+
+ vec4_result.type = dst.type;
+ bld.MOV(dst, offset(vec4_result, bld,
+ (const_offset & 0xf) / type_sz(vec4_result.type) * scale));
}
/**
return 4 * type_size_vec4(type);
}
+/* Attribute arrays are loaded as one vec4 per element (or matrix column),
+ * except for double-precision types, which are loaded as one dvec4.
+ */
+extern "C" int
+type_size_vs_input(const struct glsl_type *type)
+{
+   if (type->is_double()) {
+      /* NOTE(review): presumably type_size_dvec4() counts each dvec4 as two
+       * vec4-sized input slots — confirm against its definition.
+       */
+      return type_size_dvec4(type);
+   } else {
+      return type_size_vec4(type);
+   }
+}
+
/**
* Create a MOV to read the timestamp register.
*
case FS_OPCODE_TXB:
case SHADER_OPCODE_TXD:
case SHADER_OPCODE_TXF:
+ case SHADER_OPCODE_TXF_LZ:
case SHADER_OPCODE_TXF_CMS:
case SHADER_OPCODE_TXF_CMS_W:
case SHADER_OPCODE_TXF_MCS:
case SHADER_OPCODE_TG4:
case SHADER_OPCODE_TG4_OFFSET:
case SHADER_OPCODE_TXL:
+ case SHADER_OPCODE_TXL_LZ:
case SHADER_OPCODE_TXS:
case SHADER_OPCODE_LOD:
case SHADER_OPCODE_SAMPLEINFO:
}
fs_reg *
-fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
- bool origin_upper_left)
+fs_visitor::emit_fragcoord_interpolation()
{
assert(stage == MESA_SHADER_FRAGMENT);
- brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
fs_reg wpos = *reg;
- bool flip = !origin_upper_left ^ key->render_to_fbo;
/* gl_FragCoord.x */
- if (pixel_center_integer) {
- bld.MOV(wpos, this->pixel_x);
- } else {
- bld.ADD(wpos, this->pixel_x, brw_imm_f(0.5f));
- }
+ bld.MOV(wpos, this->pixel_x);
wpos = offset(wpos, bld, 1);
/* gl_FragCoord.y */
- if (!flip && pixel_center_integer) {
- bld.MOV(wpos, this->pixel_y);
- } else {
- fs_reg pixel_y = this->pixel_y;
- float offset = (pixel_center_integer ? 0.0f : 0.5f);
-
- if (flip) {
- pixel_y.negate = true;
- offset += key->drawable_height - 1.0f;
- }
-
- bld.ADD(wpos, pixel_y, brw_imm_f(offset));
- }
+ bld.MOV(wpos, this->pixel_y);
wpos = offset(wpos, bld, 1);
/* gl_FragCoord.z */
inst->no_dd_clear = true;
inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode,
- mod_centroid && !key->persample_shading,
- mod_sample || key->persample_shading);
+ mod_centroid && !key->persample_interp,
+ mod_sample || key->persample_interp);
inst->predicate = BRW_PREDICATE_NORMAL;
inst->predicate_inverse = false;
if (devinfo->has_pln)
} else {
emit_linterp(*attr, fs_reg(interp), interpolation_mode,
- mod_centroid && !key->persample_shading,
- mod_sample || key->persample_shading);
+ mod_centroid && !key->persample_interp,
+ mod_sample || key->persample_interp);
}
if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
bld.MUL(*attr, *attr, this->pixel_w);
fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
{
assert(stage == MESA_SHADER_FRAGMENT);
- brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+ brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
assert(dst.type == BRW_REGISTER_TYPE_F);
- if (key->compute_pos_offset) {
+ if (wm_prog_data->persample_dispatch) {
/* Convert int_sample_pos to floating point */
bld.MOV(dst, int_sample_pos);
/* Scale to the range [0, 1] */
fs_visitor::emit_samplemaskin_setup()
{
assert(stage == MESA_SHADER_FRAGMENT);
- brw_wm_prog_key *key = (brw_wm_prog_key *) this->key;
+ brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
assert(devinfo->gen >= 6);
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
BRW_REGISTER_TYPE_D));
- if (key->persample_shading) {
+ if (wm_prog_data->persample_dispatch) {
/* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
* and a mask representing which sample is being processed by the
* current shader invocation.
void
fs_visitor::assign_curb_setup()
{
- if (dispatch_width == 8) {
- prog_data->dispatch_grf_start_reg = payload.num_regs;
- } else {
- if (stage == MESA_SHADER_FRAGMENT) {
- brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
- prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
- } else if (stage == MESA_SHADER_COMPUTE) {
- brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
- prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
- } else {
- unreachable("Unsupported shader type!");
- }
- }
-
prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
/* Map the offsets in the UNIFORM file to fixed HW regs. */
inst->src[i].nr +
inst->src[i].reg_offset;
- unsigned width = inst->src[i].stride == 0 ? 1 : inst->exec_size;
+ /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
+ *
+ * VertStride must be used to cross GRF register boundaries. This
+ * rule implies that elements within a 'Width' cannot cross GRF
+ * boundaries.
+ *
+ * So, for registers that are large enough, we have to split the exec
+ * size in two and trust the compression state to sort it out.
+ */
+ unsigned total_size = inst->exec_size *
+ inst->src[i].stride *
+ type_sz(inst->src[i].type);
+
+ assert(total_size <= 2 * REG_SIZE);
+ const unsigned exec_size =
+ (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
+
+ unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
struct brw_reg reg =
stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
inst->src[i].subreg_offset),
- inst->exec_size * inst->src[i].stride,
+ exec_size * inst->src[i].stride,
width, inst->src[i].stride);
reg.abs = inst->src[i].abs;
reg.negate = inst->src[i].negate;
assert(stage == MESA_SHADER_VERTEX);
/* Each attribute is 4 regs. */
- this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes;
+ this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
assert(vs_prog_data->base.urb_read_length <= 15);
push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+ /* Default to -1 meaning no location */
+ memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
+ memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
+
int chunk_start = -1;
/* First push 64-bit uniforms to ensure they are properly aligned */
if (!is_live[u] || !is_live_64bit[u])
continue;
- pull_constant_loc[u] = -1;
- push_constant_loc[u] = -1;
-
set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
push_constant_loc, pull_constant_loc,
&num_push_constants, &num_pull_constants,
if (!is_live[u] || is_live_64bit[u])
continue;
- pull_constant_loc[u] = -1;
- push_constant_loc[u] = -1;
-
set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
push_constant_loc, pull_constant_loc,
&num_push_constants, &num_pull_constants,
inst->src[i].file = VGRF;
inst->src[i].nr = dst.nr;
inst->src[i].reg_offset = 0;
- inst->src[i].set_smear(pull_index & 3);
+ inst->src[i].set_smear((pull_index & 3) * 4 /
+ type_sz(inst->src[i].type));
brw_mark_surface_used(prog_data, index);
}
fs_visitor::emit_repclear_shader()
{
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
- int base_mrf = 1;
+ int base_mrf = 0;
int color_mrf = base_mrf + 2;
fs_inst *mov;
switch (op) {
case FS_OPCODE_TXB:
case SHADER_OPCODE_TXL:
+ if (devinfo->gen >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
+ op = SHADER_OPCODE_TXL_LZ;
+ break;
+ }
bld.MOV(sources[length], lod);
length++;
break;
length++;
}
- bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
- length++;
+ if (devinfo->gen >= 9 && lod.is_zero()) {
+ op = SHADER_OPCODE_TXF_LZ;
+ } else {
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
+ length++;
+ }
for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
coordinate_done = true;
break;
+
case SHADER_OPCODE_TXF_CMS:
case SHADER_OPCODE_TXF_CMS_W:
case SHADER_OPCODE_TXF_UMS:
}
}
-/**
- * The \p rows array of registers represents a \p num_rows by \p num_columns
- * matrix in row-major order, write it in column-major order into the register
- * passed as destination. \p stride gives the separation between matrix
- * elements in the input in fs_builder::dispatch_width() units.
- */
-static void
-emit_transpose(const fs_builder &bld,
- const fs_reg &dst, const fs_reg *rows,
- unsigned num_rows, unsigned num_columns, unsigned stride)
-{
- fs_reg *const components = new fs_reg[num_rows * num_columns];
-
- for (unsigned i = 0; i < num_columns; ++i) {
- for (unsigned j = 0; j < num_rows; ++j)
- components[num_rows * i + j] = offset(rows[j], bld, stride * i);
- }
-
- bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0);
-
- delete[] components;
-}
-
bool
fs_visitor::lower_simd_width()
{
if (inst->src[j].file != BAD_FILE &&
!is_uniform(inst->src[j])) {
/* Get the i-th copy_width-wide chunk of the source. */
- const fs_reg src = horiz_offset(inst->src[j], copy_width * i);
+ const fs_builder cbld = lbld.group(copy_width, 0);
+ const fs_reg src = offset(inst->src[j], cbld, i);
const unsigned src_size = inst->components_read(j);
- /* Use a trivial transposition to copy one every n
- * copy_width-wide components of the register into a
- * temporary passed as source to the lowered instruction.
+ /* Copy one every n copy_width-wide components of the
+ * register into a temporary passed as source to the lowered
+ * instruction.
*/
split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size);
- emit_transpose(lbld.group(copy_width, 0),
- split_inst.src[j], &src, 1, src_size, n);
+
+ for (unsigned k = 0; k < src_size; ++k)
+ cbld.MOV(offset(split_inst.src[j], lbld, k),
+ offset(src, cbld, n * k));
}
}
}
if (inst->regs_written) {
- /* Distance between useful channels in the temporaries, skipping
- * garbage if the lowered instruction is wider than the original.
- */
- const unsigned m = lower_width / copy_width;
+ const fs_builder lbld = ibld.group(lower_width, 0);
/* Interleave the components of the result from the lowered
- * instructions. We need to set exec_all() when copying more than
- * one half per component, because LOAD_PAYLOAD (in terms of which
- * emit_transpose is implemented) can only use the same channel
- * enable signals for all of its non-header sources.
+ * instructions.
*/
- emit_transpose(ibld.exec_all(inst->exec_size > copy_width)
- .group(copy_width, 0),
- inst->dst, dsts, n, dst_size, m);
+ for (unsigned i = 0; i < dst_size; ++i) {
+ for (unsigned j = 0; j < n; ++j) {
+ const fs_builder cbld = ibld.group(copy_width, j);
+ cbld.MOV(offset(inst->dst, cbld, n * i + j),
+ offset(dsts[j], lbld, i));
+ }
+ }
}
inst->remove(block);
{
assert(stage == MESA_SHADER_FRAGMENT);
brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
- brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
unsigned barycentric_interp_modes =
(stage == MESA_SHADER_FRAGMENT) ?
}
}
- prog_data->uses_pos_offset = key->compute_pos_offset;
/* R31: MSAA position offsets. */
- if (prog_data->uses_pos_offset) {
+ if (prog_data->persample_dispatch &&
+ (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS)) {
+ /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
+ *
+ * "MSDISPMODE_PERSAMPLE is required in order to select
+ * POSOFFSET_SAMPLE"
+ *
+ * So we can only really get sample positions if we are doing real
+ * per-sample dispatch. If we need gl_SamplePosition and we don't have
+ * persample dispatch, we hard-code it to 0.5.
+ */
+ prog_data->uses_pos_offset = true;
payload.sample_pos_reg = payload.num_regs;
payload.num_regs++;
}
payload.num_regs++;
}
- /* Use a maximum of 32 registers for push-model inputs. */
- const unsigned max_push_components = 32;
+ /* Use a maximum of 24 registers for push-model inputs. */
+ const unsigned max_push_components = 24;
/* If pushing our inputs would take too many registers, reduce the URB read
* length (which is in HWords, or 8 registers), and resort to pulling.
}
void
-fs_visitor::allocate_registers()
+fs_visitor::allocate_registers(bool allow_spilling)
{
bool allocated_without_spills;
SCHEDULE_PRE_LIFO,
};
+ bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS);
+
/* Try each scheduling heuristic to see if it can successfully register
* allocate without spilling. They should be ordered by decreasing
* performance but increasing likelihood of allocating.
assign_regs_trivial();
allocated_without_spills = true;
} else {
- allocated_without_spills = assign_regs(false);
+ allocated_without_spills = assign_regs(false, spill_all);
}
if (allocated_without_spills)
break;
/* Since we're out of heuristics, just go spill registers until we
* get an allocation.
*/
- while (!assign_regs(true)) {
+ while (!assign_regs(true, spill_all)) {
if (failed)
break;
}
}
+ assert(last_scratch == 0 || allow_spilling);
+
/* This must come after all optimization and register allocation, since
* it inserts dead code that happens to have side effects, and it does
* so based on the actual physical registers in use.
assign_vs_urb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(true);
return !failed;
}
assign_tcs_single_patch_urb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(true);
return !failed;
}
assign_tes_urb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(true);
return !failed;
}
assign_gs_urb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(true);
return !failed;
}
bool
-fs_visitor::run_fs(bool do_rep_send)
+fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
{
brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
assign_urb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(allow_spilling);
if (failed)
return false;
}
- if (dispatch_width == 8)
- wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
- else
- wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
-
return !failed;
}
assign_curb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(true);
if (failed)
return false;
const nir_shader *src_shader,
struct gl_program *prog,
int shader_time_index8, int shader_time_index16,
+ bool allow_spilling,
bool use_rep_send,
unsigned *final_assembly_size,
char **error_str)
prog_data->computed_stencil =
shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
+ prog_data->persample_dispatch =
+ key->multisample_fbo &&
+ (key->persample_interp ||
+ (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
+ SYSTEM_BIT_SAMPLE_POS)) ||
+ shader->info.fs.uses_sample_qualifier);
+
prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
prog_data->barycentric_interp_modes =
brw_compute_barycentric_interp_modes(compiler->devinfo,
key->flat_shade,
- key->persample_shading,
+ key->persample_interp,
shader);
- fs_visitor v(compiler, log_data, mem_ctx, key,
- &prog_data->base, prog, shader, 8,
- shader_time_index8);
- if (!v.run_fs(false /* do_rep_send */)) {
+ cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL;
+ uint8_t simd8_grf_start = 0, simd16_grf_start = 0;
+ unsigned simd8_grf_used = 0, simd16_grf_used = 0;
+
+ fs_visitor v8(compiler, log_data, mem_ctx, key,
+ &prog_data->base, prog, shader, 8,
+ shader_time_index8);
+ if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) {
if (error_str)
- *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+ *error_str = ralloc_strdup(mem_ctx, v8.fail_msg);
return NULL;
+ } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
+ simd8_cfg = v8.cfg;
+ simd8_grf_start = v8.payload.num_regs;
+ simd8_grf_used = v8.grf_used;
}
- cfg_t *simd16_cfg = NULL;
- fs_visitor v2(compiler, log_data, mem_ctx, key,
- &prog_data->base, prog, shader, 16,
- shader_time_index16);
- if (likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
- if (!v.simd16_unsupported) {
- /* Try a SIMD16 compile */
- v2.import_uniforms(&v);
- if (!v2.run_fs(use_rep_send)) {
- compiler->shader_perf_log(log_data,
- "SIMD16 shader failed to compile: %s",
- v2.fail_msg);
- } else {
- simd16_cfg = v2.cfg;
- }
+ if (!v8.simd16_unsupported &&
+ likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
+ /* Try a SIMD16 compile */
+ fs_visitor v16(compiler, log_data, mem_ctx, key,
+ &prog_data->base, prog, shader, 16,
+ shader_time_index16);
+ v16.import_uniforms(&v8);
+ if (!v16.run_fs(allow_spilling, use_rep_send)) {
+ compiler->shader_perf_log(log_data,
+ "SIMD16 shader failed to compile: %s",
+ v16.fail_msg);
+ } else {
+ simd16_cfg = v16.cfg;
+ simd16_grf_start = v16.payload.num_regs;
+ simd16_grf_used = v16.grf_used;
+ }
+ }
+
+ /* When the caller requests a repclear shader, they want SIMD16-only */
+ if (use_rep_send)
+ simd8_cfg = NULL;
+
+ /* Prior to Iron Lake, the PS had a single shader offset with a jump table
+ * at the top to select the shader. We've never implemented that.
+ * Instead, we just give them exactly one shader and we pick the widest one
+ * available.
+ */
+ if (compiler->devinfo->gen < 5 && simd16_cfg)
+ simd8_cfg = NULL;
+
+ if (prog_data->persample_dispatch) {
+ /* Starting with SandyBridge (where we first get MSAA), the different
+ * pixel dispatch combinations are grouped into classifications A
+ * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On all hardware
+ * generations, the only configurations supporting persample dispatch
+    * are those in which only one dispatch width is enabled.
+ *
+ * If computed depth is enabled, SNB only allows SIMD8 while IVB+
+ * allow SIMD8 or SIMD16 so we choose SIMD16 if available.
+ */
+ if (compiler->devinfo->gen == 6 &&
+ prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) {
+ simd16_cfg = NULL;
+ } else if (simd16_cfg) {
+ simd8_cfg = NULL;
}
}
*/
brw_compute_flat_inputs(prog_data, key->flat_shade, shader);
- cfg_t *simd8_cfg;
- int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send;
- if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) {
- simd8_cfg = NULL;
- prog_data->no_8 = true;
- } else {
- simd8_cfg = v.cfg;
- prog_data->no_8 = false;
- }
-
fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
- v.promoted_constants, v.runtime_check_aads_emit,
+ v8.promoted_constants, v8.runtime_check_aads_emit,
MESA_SHADER_FRAGMENT);
if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
shader->info.name));
}
- if (simd8_cfg)
+ if (simd8_cfg) {
+ prog_data->dispatch_8 = true;
g.generate_code(simd8_cfg, 8);
- if (simd16_cfg)
- prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
+ prog_data->base.dispatch_grf_start_reg = simd8_grf_start;
+ prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used);
+
+ if (simd16_cfg) {
+ prog_data->dispatch_16 = true;
+ prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16);
+ prog_data->dispatch_grf_start_reg_2 = simd16_grf_start;
+ prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used);
+ }
+ } else if (simd16_cfg) {
+ prog_data->dispatch_16 = true;
+ g.generate_code(simd16_cfg, 16);
+ prog_data->base.dispatch_grf_start_reg = simd16_grf_start;
+ prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used);
+ }
return g.get_assembly(final_assembly_size);
}
} else {
cfg = v8.cfg;
prog_data->simd_size = 8;
+ prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
}
}
} else {
cfg = v16.cfg;
prog_data->simd_size = 16;
+ prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
}
}