From: Jason Ekstrand Date: Thu, 28 Apr 2016 22:37:39 +0000 (-0700) Subject: i965/fs: Organize prog_data by ksp number rather than SIMD width X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=bee160b31be9e09eeab83f62d26ac331f08955fa;p=mesa.git i965/fs: Organize prog_data by ksp number rather than SIMD width The hardware packets organize kernel pointers and GRF start by slots that don't map directly to dispatch width. This means that all of the state setup code has to re-arrange the data from prog_data into these slots. This logic has been duplicated 4 times in the GL driver and one more time in the Vulkan driver. Let's just put it all in brw_fs.cpp. Reviewed-by: Kenneth Graunke --- diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index f55069ee747..a8e31b13cf1 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -585,17 +585,17 @@ anv_pipeline_compile_fs(struct anv_pipeline *pipeline, const struct brw_stage_prog_data *stage_prog_data; struct anv_pipeline_bind_map map; struct brw_wm_prog_key key; - uint32_t kernel = NO_KERNEL; unsigned char sha1[20]; populate_wm_prog_key(&pipeline->device->info, info, extra, &key); if (module->size > 0) { anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint, spec_info); - kernel = anv_pipeline_cache_search(cache, sha1, &stage_prog_data, &map); + pipeline->ps_ksp0 = + anv_pipeline_cache_search(cache, sha1, &stage_prog_data, &map); } - if (kernel == NO_KERNEL) { + if (pipeline->ps_ksp0 == NO_KERNEL) { struct brw_wm_prog_data prog_data = { 0, }; struct anv_pipeline_binding surface_to_descriptor[256]; struct anv_pipeline_binding sampler_to_descriptor[256]; @@ -682,43 +682,16 @@ anv_pipeline_compile_fs(struct anv_pipeline *pipeline, } stage_prog_data = &prog_data.base; - kernel = anv_pipeline_cache_upload_kernel(cache, - module->size > 0 ? sha1 : NULL, - shader_code, code_size, + pipeline->ps_ksp0 = + anv_pipeline_cache_upload_kernel(cache, + module->size > 0 ? 
sha1 : NULL, + shader_code, code_size, &stage_prog_data, sizeof(prog_data), &map); ralloc_free(mem_ctx); } - const struct brw_wm_prog_data *wm_prog_data = - (const struct brw_wm_prog_data *) stage_prog_data; - - if (wm_prog_data->no_8) - pipeline->ps_simd8 = NO_KERNEL; - else - pipeline->ps_simd8 = kernel; - - if (wm_prog_data->no_8 || wm_prog_data->prog_offset_16) { - pipeline->ps_simd16 = kernel + wm_prog_data->prog_offset_16; - } else { - pipeline->ps_simd16 = NO_KERNEL; - } - - pipeline->ps_ksp2 = 0; - pipeline->ps_grf_start2 = 0; - if (pipeline->ps_simd8 != NO_KERNEL) { - pipeline->ps_ksp0 = pipeline->ps_simd8; - pipeline->ps_grf_start0 = wm_prog_data->base.dispatch_grf_start_reg; - if (pipeline->ps_simd16 != NO_KERNEL) { - pipeline->ps_ksp2 = pipeline->ps_simd16; - pipeline->ps_grf_start2 = wm_prog_data->dispatch_grf_start_reg_16; - } - } else if (pipeline->ps_simd16 != NO_KERNEL) { - pipeline->ps_ksp0 = pipeline->ps_simd16; - pipeline->ps_grf_start0 = wm_prog_data->dispatch_grf_start_reg_16; - } - anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_FRAGMENT, stage_prog_data, &map); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index d8a21942d83..c55f1db5180 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1418,12 +1418,7 @@ struct anv_pipeline { struct anv_state blend_state; uint32_t vs_simd8; uint32_t vs_vec4; - uint32_t ps_simd8; - uint32_t ps_simd16; uint32_t ps_ksp0; - uint32_t ps_ksp2; - uint32_t ps_grf_start0; - uint32_t ps_grf_start2; uint32_t gs_kernel; uint32_t cs_simd; diff --git a/src/intel/vulkan/gen7_pipeline.c b/src/intel/vulkan/gen7_pipeline.c index d4797c59977..285b191352c 100644 --- a/src/intel/vulkan/gen7_pipeline.c +++ b/src/intel/vulkan/gen7_pipeline.c @@ -375,19 +375,21 @@ genX(graphics_pipeline_create)( POSOFFSET_SAMPLE : POSOFFSET_NONE; ps._32PixelDispatchEnable = false; - ps._16PixelDispatchEnable = pipeline->ps_simd16 != NO_KERNEL; - ps._8PixelDispatchEnable = pipeline->ps_simd8 != NO_KERNEL; + ps._16PixelDispatchEnable = wm_prog_data->dispatch_16; + ps._8PixelDispatchEnable = wm_prog_data->dispatch_8; - ps.DispatchGRFStartRegisterforConstantSetupData0 = pipeline->ps_grf_start0, + ps.DispatchGRFStartRegisterforConstantSetupData0 = + wm_prog_data->base.dispatch_grf_start_reg, ps.DispatchGRFStartRegisterforConstantSetupData1 = 0, - ps.DispatchGRFStartRegisterforConstantSetupData2 = pipeline->ps_grf_start2, + ps.DispatchGRFStartRegisterforConstantSetupData2 = + wm_prog_data->dispatch_grf_start_reg_2, /* Haswell requires the sample mask to be set in this packet as well as * in 3DSTATE_SAMPLE_MASK; the values should match. */ /* _NEW_BUFFERS, _NEW_MULTISAMPLE */ ps.KernelStartPointer1 = 0; - ps.KernelStartPointer2 = pipeline->ps_ksp2; + ps.KernelStartPointer2 = pipeline->ps_ksp0 + wm_prog_data->prog_offset_2; } /* FIXME-GEN7: This needs a lot more work, cf gen7 upload_wm_state(). 
*/ diff --git a/src/intel/vulkan/gen8_pipeline.c b/src/intel/vulkan/gen8_pipeline.c index 857f9798111..d96669494a2 100644 --- a/src/intel/vulkan/gen8_pipeline.c +++ b/src/intel/vulkan/gen8_pipeline.c @@ -502,9 +502,9 @@ genX(graphics_pipeline_create)( anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS), ps) { ps.KernelStartPointer0 = pipeline->ps_ksp0; ps.KernelStartPointer1 = 0; - ps.KernelStartPointer2 = pipeline->ps_ksp2; - ps._8PixelDispatchEnable = pipeline->ps_simd8 != NO_KERNEL; - ps._16PixelDispatchEnable = pipeline->ps_simd16 != NO_KERNEL; + ps.KernelStartPointer2 = pipeline->ps_ksp0 + wm_prog_data->prog_offset_2; + ps._8PixelDispatchEnable = wm_prog_data->dispatch_8; + ps._16PixelDispatchEnable = wm_prog_data->dispatch_16; ps._32PixelDispatchEnable = false; ps.SingleProgramFlow = false; ps.VectorMaskEnable = true; @@ -518,9 +518,11 @@ genX(graphics_pipeline_create)( ps.ScratchSpaceBasePointer = pipeline->scratch_start[MESA_SHADER_FRAGMENT]; ps.PerThreadScratchSpace = scratch_space(&wm_prog_data->base); - ps.DispatchGRFStartRegisterForConstantSetupData0 = pipeline->ps_grf_start0; + ps.DispatchGRFStartRegisterForConstantSetupData0 = + wm_prog_data->base.dispatch_grf_start_reg; ps.DispatchGRFStartRegisterForConstantSetupData1 = 0; - ps.DispatchGRFStartRegisterForConstantSetupData2 = pipeline->ps_grf_start2; + ps.DispatchGRFStartRegisterForConstantSetupData2 = + wm_prog_data->dispatch_grf_start_reg_2; } bool per_sample_ps = pCreateInfo->pMultisampleState && diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index 3fcd7e87c4e..a2148ae2656 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -367,9 +367,11 @@ struct brw_wm_prog_data { GLuint num_varying_inputs; - GLuint dispatch_grf_start_reg_16; - GLuint reg_blocks; - GLuint reg_blocks_16; + uint8_t reg_blocks_0; + uint8_t reg_blocks_2; + + uint8_t dispatch_grf_start_reg_2; + uint32_t prog_offset_2; struct { /** @{ @@ -383,7 +385,8 @@ struct brw_wm_prog_data { bool computed_stencil; bool early_fragment_tests; - bool no_8; + bool dispatch_8; + bool dispatch_16; bool dual_src_blend; bool persample_dispatch; bool uses_pos_offset; @@ -393,7 +396,6 @@ struct brw_wm_prog_data { bool uses_src_w; bool uses_sample_mask; bool pulls_bary; - uint32_t prog_offset_16; /** * Mask of which interpolation modes are required by the fragment shader. 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index f66ba473411..1e84b101a8e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -5800,11 +5800,6 @@ fs_visitor::run_fs(bool do_rep_send) return false; } - if (dispatch_width == 8) - wm_prog_data->reg_blocks = brw_register_blocks(grf_used); - else - wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used); - return !failed; } @@ -6004,6 +5999,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, shader); cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL; + uint8_t simd8_grf_start, simd16_grf_start; + unsigned simd8_grf_used, simd16_grf_used; fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base, prog, shader, 8, @@ -6015,7 +6012,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, return NULL; } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) { simd8_cfg = v8.cfg; - prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs; + simd8_grf_start = v8.payload.num_regs; + simd8_grf_used = v8.grf_used; } if (!v8.simd16_unsupported && @@ -6031,7 +6029,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, v16.fail_msg); } else { simd16_cfg = v16.cfg; - prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs; + simd16_grf_start = v16.payload.num_regs; + simd16_grf_used = v16.grf_used; } } @@ -6047,6 +6046,24 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, if (compiler->devinfo->gen < 5 && simd16_cfg) simd8_cfg = NULL; + if (prog_data->persample_dispatch) { + /* Starting with SandyBridge (where we first get MSAA), the different + * pixel dispatch combinations are grouped into classifications A + * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On all hardware + * generations, the only configurations supporting persample dispatch + * are those in which only one dispatch width is enabled. + * + * If computed depth is enabled, SNB only allows SIMD8 while IVB+ + * allow SIMD8 or SIMD16 so we choose SIMD16 if available. + */ + if (compiler->devinfo->gen == 6 && + prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) { + simd16_cfg = NULL; + } else if (simd16_cfg) { + simd8_cfg = NULL; + } + } + /* We have to compute the flat inputs after the visitor is finished running * because it relies on prog_data->urb_setup which is computed in * fs_visitor::calculate_urb_setup().
@@ -6065,15 +6082,24 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, } if (simd8_cfg) { + prog_data->dispatch_8 = true; g.generate_code(simd8_cfg, 8); - prog_data->no_8 = false; - } else { - prog_data->no_8 = true; + prog_data->base.dispatch_grf_start_reg = simd8_grf_start; + prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used); + + if (simd16_cfg) { + prog_data->dispatch_16 = true; + prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16); + prog_data->dispatch_grf_start_reg_2 = simd16_grf_start; + prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used); + } + } else if (simd16_cfg) { + prog_data->dispatch_16 = true; + g.generate_code(simd16_cfg, 16); + prog_data->base.dispatch_grf_start_reg = simd16_grf_start; + prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used); } - if (simd16_cfg) - prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16); - return g.get_assembly(final_assembly_size); } diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 58faf2f4694..012492c0e0d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -169,7 +169,7 @@ fs_visitor::emit_dummy_fs() stage_prog_data->nr_pull_params = 0; stage_prog_data->curb_read_length = 0; stage_prog_data->dispatch_grf_start_reg = 2; - wm_prog_data->dispatch_grf_start_reg_16 = 2; + wm_prog_data->dispatch_grf_start_reg_2 = 2; grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */ calculate_cfg(); diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c index 91b35cd681a..bf1bdc9948f 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_state.c @@ -86,48 +86,37 @@ brw_upload_wm_unit(struct brw_context *brw) sizeof(*wm), 32, &brw->wm.base.state_offset); memset(wm, 0, sizeof(*wm)); - if (prog_data->prog_offset_16) { + if (prog_data->dispatch_8 && prog_data->dispatch_16) { /* These two fields should be the same pre-gen6, which is why we * only have one hardware field to program for both dispatch * widths. 
*/ assert(prog_data->base.dispatch_grf_start_reg == - prog_data->dispatch_grf_start_reg_16); + prog_data->dispatch_grf_start_reg_2); } /* BRW_NEW_PROGRAM_CACHE | BRW_NEW_FS_PROG_DATA */ - if (prog_data->no_8) { - wm->wm5.enable_16_pix = 1; - wm->thread0.grf_reg_count = prog_data->reg_blocks_16; - wm->thread0.kernel_start_pointer = - brw_program_reloc(brw, - brw->wm.base.state_offset + - offsetof(struct brw_wm_unit_state, thread0), - brw->wm.base.prog_offset + - prog_data->prog_offset_16 + - (prog_data->reg_blocks_16 << 1)) >> 6; - - } else { - wm->thread0.grf_reg_count = prog_data->reg_blocks; - wm->wm9.grf_reg_count_2 = prog_data->reg_blocks_16; - - wm->wm5.enable_8_pix = 1; - if (prog_data->prog_offset_16) - wm->wm5.enable_16_pix = 1; + wm->wm5.enable_8_pix = prog_data->dispatch_8; + wm->wm5.enable_16_pix = prog_data->dispatch_16; + if (prog_data->dispatch_8 || prog_data->dispatch_16) { + wm->thread0.grf_reg_count = prog_data->reg_blocks_0; wm->thread0.kernel_start_pointer = brw_program_reloc(brw, brw->wm.base.state_offset + offsetof(struct brw_wm_unit_state, thread0), brw->wm.base.prog_offset + (wm->thread0.grf_reg_count << 1)) >> 6; + } + if (prog_data->prog_offset_2) { + wm->wm9.grf_reg_count_2 = prog_data->reg_blocks_2; wm->wm9.kernel_start_pointer_2 = brw_program_reloc(brw, brw->wm.base.state_offset + offsetof(struct brw_wm_unit_state, wm9), brw->wm.base.prog_offset + - prog_data->prog_offset_16 + + prog_data->prog_offset_2 + (wm->wm9.grf_reg_count_2 << 1)) >> 6; } diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c index 4a5aa129d41..3e872af4894 100644 --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c @@ -129,29 +129,19 @@ gen6_upload_wm_state(struct brw_context *brw, dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT; - if (prog_data->prog_offset_16 || prog_data->no_8) { + if (prog_data->dispatch_8) + dw5 |= GEN6_WM_8_DISPATCH_ENABLE; + + if (prog_data->dispatch_16) dw5 |= GEN6_WM_16_DISPATCH_ENABLE; - if (!prog_data->no_8 && !prog_data->persample_dispatch) { - dw5 |= GEN6_WM_8_DISPATCH_ENABLE; - dw4 |= (prog_data->base.dispatch_grf_start_reg << - GEN6_WM_DISPATCH_START_GRF_SHIFT_0); - dw4 |= (prog_data->dispatch_grf_start_reg_16 << - GEN6_WM_DISPATCH_START_GRF_SHIFT_2); - ksp0 = stage_state->prog_offset; - ksp2 = stage_state->prog_offset + prog_data->prog_offset_16; - } else { - dw4 |= (prog_data->dispatch_grf_start_reg_16 << - GEN6_WM_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset + prog_data->prog_offset_16; - } - } - else { - dw5 |= GEN6_WM_8_DISPATCH_ENABLE; - dw4 |= (prog_data->base.dispatch_grf_start_reg << - GEN6_WM_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset; - } + dw4 |= prog_data->base.dispatch_grf_start_reg << + GEN6_WM_DISPATCH_START_GRF_SHIFT_0; + dw4 |= prog_data->dispatch_grf_start_reg_2 << + GEN6_WM_DISPATCH_START_GRF_SHIFT_2; + + ksp0 = stage_state->prog_offset; + ksp2 = stage_state->prog_offset + prog_data->prog_offset_2; if (dual_source_blend_enable) dw5 |= GEN6_WM_DUAL_SOURCE_BLEND_ENABLE; @@ -200,37 +190,6 @@ gen6_upload_wm_state(struct brw_context *brw, dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE; else { dw6 |= GEN6_WM_MSDISPMODE_PERPIXEL; - - /* From the Sandy Bridge PRM, Vol 2 part 1, 7.7.1 ("Pixel Grouping - * (Dispatch Size) Control"), p.334: - * - * Note: in the table below, the Valid column indicates which - * products that combination is supported on. 
Combinations of - * dispatch enables not listed in the table are not available on - * any product. - * - * A: Valid on all products - * - * B: Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader - * computed depth. - * - * D: Valid on all products, except when in non-1x PERSAMPLE mode - * (applies to [DevSNB+] only). Not valid on [DevSNB] if 4x - * PERPIXEL mode with pixel shader computed depth. - * - * E: Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader - * computed depth. - * - * F: Valid on all products, except not valid on [DevSNB] if 4x - * PERPIXEL mode with pixel shader computed depth. - * - * In the table that follows, the only entry with "A" in the Valid - * column is the entry where only 8 pixel dispatch is enabled. - * Therefore, when we are in PERPIXEL mode with pixel shader computed - * depth, we need to disable SIMD16 dispatch. - */ - if (dw5 & GEN6_WM_COMPUTED_DEPTH) - dw5 &= ~GEN6_WM_16_DISPATCH_ENABLE; } } else { dw6 |= GEN6_WM_MSRAST_OFF_PIXEL; diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c index 8d2e2c32bb4..a618c3ed87b 100644 --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c @@ -216,34 +216,19 @@ gen7_upload_ps_state(struct brw_context *brw, dw4 |= fast_clear_op; - if (prog_data->prog_offset_16 || prog_data->no_8) { + if (prog_data->dispatch_16) dw4 |= GEN7_PS_16_DISPATCH_ENABLE; - /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16 - * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader - * is successfully compiled. In majority of the cases that bring us - * better performance than 'SIMD8 only' dispatch. - */ - if (!prog_data->no_8 && !prog_data->persample_dispatch) { - dw4 |= GEN7_PS_8_DISPATCH_ENABLE; - dw5 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - dw5 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_2); - ksp0 = stage_state->prog_offset; - ksp2 = stage_state->prog_offset + prog_data->prog_offset_16; - } else { - dw5 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset + prog_data->prog_offset_16; - } - } - else { + if (prog_data->dispatch_8) dw4 |= GEN7_PS_8_DISPATCH_ENABLE; - dw5 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset; - } + + dw5 |= prog_data->base.dispatch_grf_start_reg << + GEN7_PS_DISPATCH_START_GRF_SHIFT_0; + dw5 |= prog_data->dispatch_grf_start_reg_2 << + GEN7_PS_DISPATCH_START_GRF_SHIFT_2; + + ksp0 = stage_state->prog_offset; + ksp2 = stage_state->prog_offset + prog_data->prog_offset_2; BEGIN_BATCH(8); OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2)); diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index b677a8e1793..c475a52afe0 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -234,34 +234,19 @@ gen8_upload_ps_state(struct brw_context *brw, dw6 |= fast_clear_op; - if (prog_data->prog_offset_16 || prog_data->no_8) { + if (prog_data->dispatch_8) + dw6 |= GEN7_PS_8_DISPATCH_ENABLE; + + if (prog_data->dispatch_16) dw6 |= GEN7_PS_16_DISPATCH_ENABLE; - /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16 - * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader - * is successfully compiled. 
In majority of the cases that bring us - * better performance than 'SIMD8 only' dispatch. - */ - if (!prog_data->no_8 && !prog_data->persample_dispatch) { - dw6 |= GEN7_PS_8_DISPATCH_ENABLE; - dw7 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - dw7 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_2); - ksp0 = stage_state->prog_offset; - ksp2 = stage_state->prog_offset + prog_data->prog_offset_16; - } else { - dw7 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - - ksp0 = stage_state->prog_offset + prog_data->prog_offset_16; - } - } else { - dw6 |= GEN7_PS_8_DISPATCH_ENABLE; - dw7 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset; - } + dw7 |= prog_data->base.dispatch_grf_start_reg << + GEN7_PS_DISPATCH_START_GRF_SHIFT_0; + dw7 |= prog_data->dispatch_grf_start_reg_2 << + GEN7_PS_DISPATCH_START_GRF_SHIFT_2; + + ksp0 = stage_state->prog_offset; + ksp2 = stage_state->prog_offset + prog_data->prog_offset_2; BEGIN_BATCH(12); OUT_BATCH(_3DSTATE_PS << 16 | (12 - 2));
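For reference, every consumer above now derives the hardware kernel-start-pointer slots from prog_data in the same way: slot 0 describes whichever program sits at offset 0 (SIMD8 if present, otherwise SIMD16) and slot 2 describes the SIMD16 program when both widths were compiled. A minimal sketch of that mapping follows; ps_kernel_slots and ps_get_kernel_slots() are hypothetical names used only for illustration and are not part of this patch.

/* Hypothetical helper restating the slot convention consumed by the
 * gen6/7/8 and Vulkan state setup above.  Assumes the prog_data fields
 * added in brw_compiler.h by this patch.
 */
#include <stdbool.h>
#include <stdint.h>
#include "brw_compiler.h"

struct ps_kernel_slots {
   uint32_t ksp0, ksp2;              /* kernel start pointers */
   uint32_t grf_start0, grf_start2;  /* dispatch GRF start registers */
   bool enable_8, enable_16;         /* 8/16-pixel dispatch enables */
};

static inline struct ps_kernel_slots
ps_get_kernel_slots(const struct brw_wm_prog_data *prog_data,
                    uint32_t prog_offset)
{
   return (struct ps_kernel_slots) {
      /* Slot 0 is always the start of the fragment program. */
      .ksp0       = prog_offset,
      /* Slot 2 only matters when both widths are enabled; prog_offset_2
       * is 0 otherwise.
       */
      .ksp2       = prog_offset + prog_data->prog_offset_2,
      .grf_start0 = prog_data->base.dispatch_grf_start_reg,
      .grf_start2 = prog_data->dispatch_grf_start_reg_2,
      .enable_8   = prog_data->dispatch_8,
      .enable_16  = prog_data->dispatch_16,
   };
}

Gen4-5 additionally program reg_blocks_0/reg_blocks_2 into the WM unit state, as in brw_wm_state.c above, but the slot assignment is the same.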