ilo: add ilo_state_ps
authorChia-I Wu <olvaffe@gmail.com>
Fri, 12 Jun 2015 06:56:56 +0000 (14:56 +0800)
committerChia-I Wu <olvaffe@gmail.com>
Sun, 14 Jun 2015 17:06:44 +0000 (01:06 +0800)
We want to make ilo_shader_cso a union of ilo_state_{vs,hs,ds,gs,ps}.

src/gallium/drivers/ilo/Makefile.sources
src/gallium/drivers/ilo/core/ilo_state_shader.h
src/gallium/drivers/ilo/core/ilo_state_shader_ps.c [new file with mode: 0644]

index 2a6307798487ce782fd03c0cb28e0f2890d2da1b..3b38277af196756683a0c76b7c9f228420515888 100644 (file)
@@ -32,6 +32,7 @@ C_SOURCES := \
        core/ilo_state_sbe.c \
        core/ilo_state_sbe.h \
        core/ilo_state_shader.c \
+       core/ilo_state_shader_ps.c \
        core/ilo_state_shader.h \
        core/ilo_state_sol.c \
        core/ilo_state_sol.h \
index c1dbf23dfb37848e08799c69522d12bb00d02a97..44690c5b0bb42340cb2a6901d74ea1c7e8d99d4a 100644 (file)
@@ -123,6 +123,54 @@ struct ilo_state_gs_info {
    bool stats_enable;
 };
 
+struct ilo_state_ps_io_info {
+   /* inputs */
+   enum gen_position_offset posoffset;
+   uint8_t attr_count;
+   bool use_z;
+   bool use_w;
+   bool use_coverage_mask;
+
+   /* outputs */
+   enum gen_pscdepth_mode pscdepth;
+   bool has_rt_write;
+   bool write_pixel_mask;
+   bool write_omask;
+};
+
+struct ilo_state_ps_params_info {
+   /* compatibility with raster states */
+   uint32_t sample_mask;
+   bool earlyz_control_psexec;
+
+   /* compatibility with cc states */
+   bool alpha_may_kill;
+   bool dual_source_blending;
+   bool has_writeable_rt;
+};
+
+struct ilo_state_ps_info {
+   struct ilo_state_shader_kernel_info kernel_8;
+   struct ilo_state_shader_kernel_info kernel_16;
+   struct ilo_state_shader_kernel_info kernel_32;
+   struct ilo_state_shader_resource_info resource;
+
+   struct ilo_state_ps_io_info io;
+   struct ilo_state_ps_params_info params;
+
+   /* bitmask of GEN6_PS_DISPATCH_x */
+   uint8_t valid_kernels;
+   bool per_sample_dispatch;
+   bool sample_count_one;
+   bool cv_per_sample_interp;
+   bool cv_has_earlyz_op;
+
+   bool rt_clear_enable;
+   bool rt_resolve_enable;
+
+   bool cv_has_depth_buffer;
+};
+
 struct ilo_state_vs {
    uint32_t vs[5];
 };
@@ -140,6 +188,20 @@ struct ilo_state_gs {
    uint32_t gs[5];
 };
 
+struct ilo_state_ps {
+   uint32_t ps[8];
+
+   struct ilo_state_ps_dispatch_conds {
+      bool ps_valid;
+
+      bool has_rt_write;
+      bool write_odepth;
+      bool write_ostencil;
+      bool has_uav_write;
+      bool ps_may_kill;
+   } conds;
+};
+
 bool
 ilo_state_vs_init(struct ilo_state_vs *vs,
                   const struct ilo_dev *dev,
@@ -177,4 +239,18 @@ bool
 ilo_state_gs_init_disabled(struct ilo_state_gs *gs,
                            const struct ilo_dev *dev);
 
+bool
+ilo_state_ps_init(struct ilo_state_ps *ps,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_ps_info *info);
+
+bool
+ilo_state_ps_init_disabled(struct ilo_state_ps *ps,
+                           const struct ilo_dev *dev);
+
+bool
+ilo_state_ps_set_params(struct ilo_state_ps *ps,
+                        const struct ilo_dev *dev,
+                        const struct ilo_state_ps_params_info *params);
+
 #endif /* ILO_STATE_SHADER_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c b/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c
new file mode 100644 (file)
index 0000000..f4d801e
--- /dev/null
@@ -0,0 +1,771 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_shader.h"
+
+struct pixel_ff {
+   uint8_t dispatch_modes;
+
+   uint32_t kernel_offsets[3];
+   uint8_t grf_starts[3];
+   bool pcb_enable;
+   uint8_t scratch_space;
+
+   uint8_t sampler_count;
+   uint8_t surface_count;
+   bool has_uav;
+
+   uint16_t thread_count;
+
+   struct ilo_state_ps_dispatch_conds conds;
+
+   bool kill_pixel;
+   bool dispatch_enable;
+   bool dual_source_blending;
+   uint32_t sample_mask;
+};
+
+static bool
+ps_kernel_validate_gen6(const struct ilo_dev *dev,
+                        const struct ilo_state_shader_kernel_info *kernel)
+{
+   /* "Dispatch GRF Start Register for Constant/Setup Data" is U7 */
+   const uint8_t max_grf_start = 128;
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 271:
+    *
+    *     "(Per-Thread Scratch Space)
+    *      Range  [0,11] indicating [1k bytes, 2M bytes] in powers of two"
+    */
+   const uint32_t max_scratch_size = 2 * 1024 * 1024;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* "Kernel Start Pointer" is 64-byte aligned */
+   assert(kernel->offset % 64 == 0);
+
+   assert(kernel->grf_start < max_grf_start);
+   assert(kernel->scratch_size <= max_scratch_size);
+
+   return true;
+}
+
+static bool
+ps_validate_gen6(const struct ilo_dev *dev,
+                 const struct ilo_state_ps_info *info)
+{
+   const struct ilo_state_shader_kernel_info *kernel_8 = &info->kernel_8;
+   const struct ilo_state_shader_kernel_info *kernel_16 = &info->kernel_16;
+   const struct ilo_state_shader_kernel_info *kernel_32 = &info->kernel_32;
+   const struct ilo_state_ps_io_info *io = &info->io;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!ps_kernel_validate_gen6(dev, kernel_8) ||
+       !ps_kernel_validate_gen6(dev, kernel_16) ||
+       !ps_kernel_validate_gen6(dev, kernel_32))
+      return false;
+
+   /* unsupported on Gen6 */
+   if (ilo_dev_gen(dev) == ILO_GEN(6))
+      assert(!io->use_coverage_mask);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 275:
+    *
+    *     "If a NULL Depth Buffer is selected, the Pixel Shader Computed Depth
+    *      field must be set to disabled."
+    */
+   if (ilo_dev_gen(dev) == ILO_GEN(6) && io->pscdepth != GEN7_PSCDEPTH_OFF)
+      assert(info->cv_has_depth_buffer);
+
+   if (!info->per_sample_dispatch) {
+      /*
+       * From the Sandy Bridge PRM, volume 2 part 1, page 281:
+       *
+       *     "MSDISPMODE_PERSAMPLE is required in order to select
+       *      POSOFFSET_SAMPLE."
+       */
+      assert(io->posoffset != GEN6_POSOFFSET_SAMPLE);
+
+      /*
+       * From the Sandy Bridge PRM, volume 2 part 1, page 282:
+       *
+       *     "MSDISPMODE_PERSAMPLE is required in order to select
+       *      INTERP_SAMPLE."
+       *
+       * From the Sandy Bridge PRM, volume 2 part 1, page 283:
+       *
+       *     "MSDISPMODE_PERSAMPLE is required in order to select Perspective
+       *      Sample or Non-perspective Sample barycentric coordinates."
+       */
+      assert(!info->cv_per_sample_interp);
+   }
+
+   /*
+    *
+    * From the Sandy Bridge PRM, volume 2 part 1, page 314:
+    *
+    *     "Pixel Shader Dispatch, Alpha... must all be disabled."
+    *
+    * Simply disallow any valid kernel when there is early-z op.  Also, when
+    * there is no valid kernel, io should be zeroed.
+    */
+   if (info->valid_kernels)
+      assert(!info->cv_has_earlyz_op);
+   else
+      assert(ilo_is_zeroed(io, sizeof(*io)));
+
+   return true;
+}
+
+static uint8_t
+ps_get_gen6_dispatch_modes(const struct ilo_dev *dev,
+                           const struct ilo_state_ps_info *info)
+{
+   const struct ilo_state_ps_io_info *io = &info->io;
+   uint8_t dispatch_modes = info->valid_kernels;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!dispatch_modes)
+      return 0;
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 334:
+    *
+    *     "Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader
+    *      computed depth."
+    *
+    *     "Valid on all products, except when in non-1x PERSAMPLE mode
+    *      (applies to [DevSNB+] only)"
+    *
+    * From the Sandy Bridge PRM, volume 4 part 1, page 239:
+    *
+    *     "[DevSNB]: When Pixel Shader outputs oDepth and PS invocation mode
+    *      is PERPIXEL, Message Type for Render Target Write must be SIMD8.
+    *
+    *      Errata: [DevSNB+]: When Pixel Shader outputs oMask, this message
+    *      type is not supported: SIMD8 (including SIMD8_DUALSRC_xx)."
+    *
+    * It is really hard to follow what combinations are valid on what
+    * platforms.  Judging from the restrictions on RT write messages on Gen6,
+    * oDepth and oMask related issues should be Gen6-specific.  PERSAMPLE
+    * issue should be universal, and disallows multiple dispatch modes.
+    */
+   if (ilo_dev_gen(dev) == ILO_GEN(6)) {
+      if (io->pscdepth != GEN7_PSCDEPTH_OFF && !info->per_sample_dispatch)
+         dispatch_modes &= GEN6_PS_DISPATCH_8;
+      if (io->write_omask)
+         dispatch_modes &= ~GEN6_PS_DISPATCH_8;
+   }
+   if (info->per_sample_dispatch && !info->sample_count_one) {
+      /* prefer 32 over 16 over 8 */
+      if (dispatch_modes & GEN6_PS_DISPATCH_32)
+         dispatch_modes &= GEN6_PS_DISPATCH_32;
+      else if (dispatch_modes & GEN6_PS_DISPATCH_16)
+         dispatch_modes &= GEN6_PS_DISPATCH_16;
+      else
+         dispatch_modes &= GEN6_PS_DISPATCH_8;
+   }
+
+   /*
+    * From the Broadwell PRM, volume 2b, page 149:
+    *
+    *     "When Render Target Fast Clear Enable is ENABLED or Render Target
+    *      Resolve Type = RESOLVE_PARTIAL or RESOLVE_FULL, this bit (8 Pixel
+    *      Dispatch or Dual-8 Pixel Dispatch Enable) must be DISABLED."
+    */
+   if (info->rt_clear_enable || info->rt_resolve_enable)
+      dispatch_modes &= ~GEN6_PS_DISPATCH_8;
+
+   assert(dispatch_modes);
+
+   return dispatch_modes;
+}
+
+static uint16_t
+ps_get_gen6_thread_count(const struct ilo_dev *dev,
+                         const struct ilo_state_ps_info *info)
+{
+   uint16_t thread_count;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* Maximum Number of Threads of 3DSTATE_PS */
+   switch (ilo_dev_gen(dev)) {
+   case ILO_GEN(8):
+      /* scaled automatically */
+      thread_count = 64 - 1;
+      break;
+   case ILO_GEN(7.5):
+      thread_count = (dev->gt == 3) ? 408 :
+                     (dev->gt == 2) ? 204 : 102;
+      break;
+   case ILO_GEN(7):
+      thread_count = (dev->gt == 2) ? 172 : 48;
+      break;
+   case ILO_GEN(6):
+   default:
+      /* from the classic driver instead of the PRM */
+      thread_count = (dev->gt == 2) ? 80 : 40;
+      break;
+   }
+
+   return thread_count - 1;
+}
+
+static bool
+ps_params_get_gen6_kill_pixel(const struct ilo_dev *dev,
+                              const struct ilo_state_ps_params_info *params,
+                              const struct ilo_state_ps_dispatch_conds *conds)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 275:
+    *
+    *     "This bit (Pixel Shader Kill Pixel), if ENABLED, indicates that the
+    *      PS kernel or color calculator has the ability to kill (discard)
+    *      pixels or samples, other than due to depth or stencil testing.
+    *      This bit is required to be ENABLED in the following situations:
+    *
+    *      The API pixel shader program contains "killpix" or "discard"
+    *      instructions, or other code in the pixel shader kernel that can
+    *      cause the final pixel mask to differ from the pixel mask received
+    *      on dispatch.
+    *
+    *      A sampler with chroma key enabled with kill pixel mode is used by
+    *      the pixel shader.
+    *
+    *      Any render target has Alpha Test Enable or AlphaToCoverage Enable
+    *      enabled.
+    *
+    *      The pixel shader kernel generates and outputs oMask.
+    *
+    *      Note: As ClipDistance clipping is fully supported in hardware and
+    *      therefore not via PS instructions, there should be no need to
+    *      ENABLE this bit due to ClipDistance clipping."
+    */
+   return (conds->ps_may_kill || params->alpha_may_kill);
+}
+
+static bool
+ps_params_get_gen6_dispatch_enable(const struct ilo_dev *dev,
+                                   const struct ilo_state_ps_params_info *params,
+                                   const struct ilo_state_ps_dispatch_conds *conds)
+{
+   /*
+    * We want to skip dispatching when EarlyZ suffices.  The conditions that
+    * require dispatching are
+    *
+    *  - PS writes RTs and RTs are writeable
+    *  - PS changes depth value and depth test/write is enabled
+    *  - PS changes stencil value and stencil test is enabled
+    *  - PS writes UAVs
+    *  - PS or CC kills pixels
+    *  - EDSC is PSEXEC, and depth test/write or stencil test is enabled
+    */
+   bool dispatch_required =
+      ((conds->has_rt_write && params->has_writeable_rt) ||
+       conds->write_odepth ||
+       conds->write_ostencil ||
+       conds->has_uav_write ||
+       ps_params_get_gen6_kill_pixel(dev, params, conds) ||
+       params->earlyz_control_psexec);
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 280:
+    *
+    *     "If EDSC_PSEXEC mode is selected, Thread Dispatch Enable must be
+    *      set."
+    */
+   if (ilo_dev_gen(dev) < ILO_GEN(8) && params->earlyz_control_psexec)
+      dispatch_required = true;
+
+   /* assert it is valid to dispatch */
+   if (dispatch_required)
+      assert(conds->ps_valid);
+
+   return dispatch_required;
+}
+
+static bool
+ps_get_gen6_ff_kernels(const struct ilo_dev *dev,
+                       const struct ilo_state_ps_info *info,
+                       struct pixel_ff *ff)
+{
+   const struct ilo_state_shader_kernel_info *kernel_8 = &info->kernel_8;
+   const struct ilo_state_shader_kernel_info *kernel_16 = &info->kernel_16;
+   const struct ilo_state_shader_kernel_info *kernel_32 = &info->kernel_32;
+   uint32_t scratch_size;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   ff->dispatch_modes = ps_get_gen6_dispatch_modes(dev, info);
+
+   /* initialize kernel offsets and GRF starts */
+   if (util_is_power_of_two(ff->dispatch_modes)) {
+      if (ff->dispatch_modes & GEN6_PS_DISPATCH_8) {
+         ff->kernel_offsets[0] = kernel_8->offset;
+         ff->grf_starts[0] = kernel_8->grf_start;
+      } else if (ff->dispatch_modes & GEN6_PS_DISPATCH_16) {
+         ff->kernel_offsets[0] = kernel_16->offset;
+         ff->grf_starts[0] = kernel_16->grf_start;
+      } else if (ff->dispatch_modes & GEN6_PS_DISPATCH_32) {
+         ff->kernel_offsets[0] = kernel_32->offset;
+         ff->grf_starts[0] = kernel_32->grf_start;
+      }
+   } else {
+      ff->kernel_offsets[0] = kernel_8->offset;
+      ff->kernel_offsets[1] = kernel_32->offset;
+      ff->kernel_offsets[2] = kernel_16->offset;
+
+      ff->grf_starts[0] = kernel_8->grf_start;
+      ff->grf_starts[1] = kernel_32->grf_start;
+      ff->grf_starts[2] = kernel_16->grf_start;
+   }
+
+   /* we do not want to save it */
+   assert(ff->kernel_offsets[0] == 0);
+
+   ff->pcb_enable = (((ff->dispatch_modes & GEN6_PS_DISPATCH_8) &&
+                      kernel_8->pcb_attr_count) ||
+                     ((ff->dispatch_modes & GEN6_PS_DISPATCH_16) &&
+                      kernel_16->pcb_attr_count) ||
+                     ((ff->dispatch_modes & GEN6_PS_DISPATCH_32) &&
+                      kernel_32->pcb_attr_count));
+
+   scratch_size = 0;
+   if ((ff->dispatch_modes & GEN6_PS_DISPATCH_8) &&
+       scratch_size < kernel_8->scratch_size)
+      scratch_size = kernel_8->scratch_size;
+   if ((ff->dispatch_modes & GEN6_PS_DISPATCH_16) &&
+       scratch_size < kernel_16->scratch_size)
+      scratch_size = kernel_16->scratch_size;
+   if ((ff->dispatch_modes & GEN6_PS_DISPATCH_32) &&
+       scratch_size < kernel_32->scratch_size)
+      scratch_size = kernel_32->scratch_size;
+
+   /* next power of two, starting from 1KB */
+   ff->scratch_space = (scratch_size > 1024) ?
+      (util_last_bit(scratch_size - 1) - 10): 0;
+
+   /* GPU hangs on Haswell if none of the dispatch mode bits is set */
+   if (ilo_dev_gen(dev) == ILO_GEN(7.5) && !ff->dispatch_modes)
+      ff->dispatch_modes |= GEN6_PS_DISPATCH_8;
+
+   return true;
+}
+
+static bool
+ps_get_gen6_ff(const struct ilo_dev *dev,
+               const struct ilo_state_ps_info *info,
+               struct pixel_ff *ff)
+{
+   const struct ilo_state_shader_resource_info *resource = &info->resource;
+   const struct ilo_state_ps_io_info *io = &info->io;
+   const struct ilo_state_ps_params_info *params = &info->params;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   memset(ff, 0, sizeof(*ff));
+
+   if (!ps_validate_gen6(dev, info) || !ps_get_gen6_ff_kernels(dev, info, ff))
+      return false;
+
+   ff->sampler_count = (resource->sampler_count <= 12) ?
+      (resource->sampler_count + 3) / 4 : 4;
+   ff->surface_count = resource->surface_count;
+   ff->has_uav = resource->has_uav;
+
+   ff->thread_count = ps_get_gen6_thread_count(dev, info);
+
+   ff->conds.ps_valid = (info->valid_kernels != 0x0);
+   ff->conds.has_rt_write = io->has_rt_write;
+   ff->conds.write_odepth = (io->pscdepth != GEN7_PSCDEPTH_OFF);
+   ff->conds.write_ostencil = false;
+   ff->conds.has_uav_write = resource->has_uav;
+   ff->conds.ps_may_kill = (io->write_pixel_mask || io->write_omask);
+
+   ff->kill_pixel = ps_params_get_gen6_kill_pixel(dev, params, &ff->conds);
+   ff->dispatch_enable =
+      ps_params_get_gen6_dispatch_enable(dev, params, &ff->conds);
+   ff->dual_source_blending = params->dual_source_blending;
+   ff->sample_mask = params->sample_mask;
+
+   return true;
+}
+
+static bool
+ps_set_gen6_3dstate_wm(struct ilo_state_ps *ps,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_ps_info *info,
+                       const struct pixel_ff *ff)
+{
+   const struct ilo_state_ps_io_info *io = &info->io;
+   uint32_t dw2, dw3, dw4, dw5, dw6;
+
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   dw2 = ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+         ff->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+   if (false)
+      dw2 |= GEN6_THREADDISP_FP_MODE_ALT;
+
+   dw3 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+   dw4 = ff->grf_starts[0] << GEN6_WM_DW4_URB_GRF_START0__SHIFT |
+         ff->grf_starts[1] << GEN6_WM_DW4_URB_GRF_START1__SHIFT |
+         ff->grf_starts[2] << GEN6_WM_DW4_URB_GRF_START2__SHIFT;
+
+   dw5 = ff->thread_count << GEN6_WM_DW5_MAX_THREADS__SHIFT |
+         ff->dispatch_modes << GEN6_WM_DW5_PS_DISPATCH_MODE__SHIFT;
+
+   if (ff->kill_pixel)
+      dw5 |= GEN6_WM_DW5_PS_KILL_PIXEL;
+
+   if (io->pscdepth != GEN7_PSCDEPTH_OFF)
+      dw5 |= GEN6_WM_DW5_PS_COMPUTE_DEPTH;
+   if (io->use_z)
+      dw5 |= GEN6_WM_DW5_PS_USE_DEPTH;
+
+   if (ff->dispatch_enable)
+      dw5 |= GEN6_WM_DW5_PS_DISPATCH_ENABLE;
+
+   if (io->write_omask)
+      dw5 |= GEN6_WM_DW5_PS_COMPUTE_OMASK;
+   if (io->use_w)
+      dw5 |= GEN6_WM_DW5_PS_USE_W;
+
+   if (ff->dual_source_blending)
+      dw5 |= GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND;
+
+   dw6 = io->attr_count << GEN6_WM_DW6_SF_ATTR_COUNT__SHIFT |
+         io->posoffset << GEN6_WM_DW6_PS_POSOFFSET__SHIFT;
+
+   dw6 |= (info->per_sample_dispatch) ?
+      GEN6_WM_DW6_MSDISPMODE_PERSAMPLE : GEN6_WM_DW6_MSDISPMODE_PERPIXEL;
+
+   STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 7);
+   ps->ps[0] = dw2;
+   ps->ps[1] = dw3;
+   ps->ps[2] = dw4;
+   ps->ps[3] = dw5;
+   ps->ps[4] = dw6;
+   ps->ps[5] = ff->kernel_offsets[1];
+   ps->ps[6] = ff->kernel_offsets[2];
+
+   return true;
+}
+
+static bool
+ps_set_gen7_3dstate_wm(struct ilo_state_ps *ps,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_ps_info *info,
+                       const struct pixel_ff *ff)
+{
+   const struct ilo_state_ps_io_info *io = &info->io;
+   uint32_t dw1, dw2;
+
+   ILO_DEV_ASSERT(dev, 7, 7.5);
+
+   dw1 = io->pscdepth << GEN7_WM_DW1_PSCDEPTH__SHIFT;
+
+   if (ff->dispatch_enable)
+      dw1 |= GEN7_WM_DW1_PS_DISPATCH_ENABLE;
+   if (ff->kill_pixel)
+      dw1 |= GEN7_WM_DW1_PS_KILL_PIXEL;
+
+   if (io->use_z)
+      dw1 |= GEN7_WM_DW1_PS_USE_DEPTH;
+   if (io->use_w)
+      dw1 |= GEN7_WM_DW1_PS_USE_W;
+   if (io->use_coverage_mask)
+      dw1 |= GEN7_WM_DW1_PS_USE_COVERAGE_MASK;
+
+   dw2 = (info->per_sample_dispatch) ?
+      GEN7_WM_DW2_MSDISPMODE_PERSAMPLE : GEN7_WM_DW2_MSDISPMODE_PERPIXEL;
+
+   STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 2);
+   ps->ps[0] = dw1;
+   ps->ps[1] = dw2;
+
+   return true;
+}
+
+static bool
+ps_set_gen7_3DSTATE_PS(struct ilo_state_ps *ps,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_ps_info *info,
+                       const struct pixel_ff *ff)
+{
+   const struct ilo_state_ps_io_info *io = &info->io;
+   uint32_t dw2, dw3, dw4, dw5;
+
+   ILO_DEV_ASSERT(dev, 7, 7.5);
+
+   dw2 = ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+         ff->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+   if (false)
+      dw2 |= GEN6_THREADDISP_FP_MODE_ALT;
+
+   dw3 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+   dw4 = io->posoffset << GEN7_PS_DW4_POSOFFSET__SHIFT |
+         ff->dispatch_modes << GEN7_PS_DW4_DISPATCH_MODE__SHIFT;
+
+   if (ilo_dev_gen(dev) == ILO_GEN(7.5)) {
+      dw4 |= ff->thread_count << GEN75_PS_DW4_MAX_THREADS__SHIFT |
+             (ff->sample_mask & 0xff) << GEN75_PS_DW4_SAMPLE_MASK__SHIFT;
+   } else {
+      dw4 |= ff->thread_count << GEN7_PS_DW4_MAX_THREADS__SHIFT;
+   }
+
+   if (ff->pcb_enable)
+      dw4 |= GEN7_PS_DW4_PUSH_CONSTANT_ENABLE;
+   if (io->attr_count)
+      dw4 |= GEN7_PS_DW4_ATTR_ENABLE;
+   if (io->write_omask)
+      dw4 |= GEN7_PS_DW4_COMPUTE_OMASK;
+   if (info->rt_clear_enable)
+      dw4 |= GEN7_PS_DW4_RT_FAST_CLEAR;
+   if (ff->dual_source_blending)
+      dw4 |= GEN7_PS_DW4_DUAL_SOURCE_BLEND;
+   if (info->rt_resolve_enable)
+      dw4 |= GEN7_PS_DW4_RT_RESOLVE;
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff->has_uav)
+      dw4 |= GEN75_PS_DW4_ACCESS_UAV;
+
+   dw5 = ff->grf_starts[0] << GEN7_PS_DW5_URB_GRF_START0__SHIFT |
+         ff->grf_starts[1] << GEN7_PS_DW5_URB_GRF_START1__SHIFT |
+         ff->grf_starts[2] << GEN7_PS_DW5_URB_GRF_START2__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 8);
+   ps->ps[2] = dw2;
+   ps->ps[3] = dw3;
+   ps->ps[4] = dw4;
+   ps->ps[5] = dw5;
+   ps->ps[6] = ff->kernel_offsets[1];
+   ps->ps[7] = ff->kernel_offsets[2];
+
+   return true;
+}
+
+static bool
+ps_set_gen8_3DSTATE_PS(struct ilo_state_ps *ps,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_ps_info *info,
+                       const struct pixel_ff *ff)
+{
+   const struct ilo_state_ps_io_info *io = &info->io;
+   uint32_t dw3, dw4, dw6, dw7;
+
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   dw3 = ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+         ff->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+   if (false)
+      dw3 |= GEN6_THREADDISP_FP_MODE_ALT;
+
+   dw4 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+   dw6 = ff->thread_count << GEN8_PS_DW6_MAX_THREADS__SHIFT |
+         io->posoffset << GEN8_PS_DW6_POSOFFSET__SHIFT |
+         ff->dispatch_modes << GEN8_PS_DW6_DISPATCH_MODE__SHIFT;
+
+   if (ff->pcb_enable)
+      dw6 |= GEN8_PS_DW6_PUSH_CONSTANT_ENABLE;
+
+   if (info->rt_clear_enable)
+      dw6 |= GEN8_PS_DW6_RT_FAST_CLEAR;
+   if (info->rt_resolve_enable)
+      dw6 |= GEN8_PS_DW6_RT_RESOLVE;
+
+   dw7 = ff->grf_starts[0] << GEN8_PS_DW7_URB_GRF_START0__SHIFT |
+         ff->grf_starts[1] << GEN8_PS_DW7_URB_GRF_START1__SHIFT |
+         ff->grf_starts[2] << GEN8_PS_DW7_URB_GRF_START2__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 6);
+   ps->ps[0] = dw3;
+   ps->ps[1] = dw4;
+   ps->ps[2] = dw6;
+   ps->ps[3] = dw7;
+   ps->ps[4] = ff->kernel_offsets[1];
+   ps->ps[5] = ff->kernel_offsets[2];
+
+   return true;
+}
+
+static bool
+ps_set_gen8_3DSTATE_PS_EXTRA(struct ilo_state_ps *ps,
+                             const struct ilo_dev *dev,
+                             const struct ilo_state_ps_info *info,
+                             const struct pixel_ff *ff)
+{
+   const struct ilo_state_ps_io_info *io = &info->io;
+   uint32_t dw1;
+
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   dw1 = io->pscdepth << GEN8_PSX_DW1_PSCDEPTH__SHIFT;
+
+   if (info->valid_kernels)
+      dw1 |= GEN8_PSX_DW1_VALID;
+   if (!io->has_rt_write)
+      dw1 |= GEN8_PSX_DW1_UAV_ONLY;
+   if (io->write_omask)
+      dw1 |= GEN8_PSX_DW1_COMPUTE_OMASK;
+   if (io->write_pixel_mask)
+      dw1 |= GEN8_PSX_DW1_KILL_PIXEL;
+
+   if (io->use_z)
+      dw1 |= GEN8_PSX_DW1_USE_DEPTH;
+   if (io->use_w)
+      dw1 |= GEN8_PSX_DW1_USE_W;
+   if (io->attr_count)
+      dw1 |= GEN8_PSX_DW1_ATTR_ENABLE;
+
+   if (info->per_sample_dispatch)
+      dw1 |= GEN8_PSX_DW1_PER_SAMPLE;
+   if (ff->has_uav)
+      dw1 |= GEN8_PSX_DW1_ACCESS_UAV;
+   if (io->use_coverage_mask)
+      dw1 |= GEN8_PSX_DW1_USE_COVERAGE_MASK;
+
+   /*
+    * From the Broadwell PRM, volume 2b, page 151:
+    *
+    *     "When this bit (Pixel Shader Valid) clear the rest of this command
+    *      should also be clear.
+    */
+   if (!info->valid_kernels)
+      dw1 = 0;
+
+   STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 5);
+   ps->ps[4] = dw1;
+
+   return true;
+}
+
+bool
+ilo_state_ps_init(struct ilo_state_ps *ps,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_ps_info *info)
+{
+   struct pixel_ff ff;
+   bool ret = true;
+
+   assert(ilo_is_zeroed(ps, sizeof(*ps)));
+
+   ret &= ps_get_gen6_ff(dev, info, &ff);
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+      ret &= ps_set_gen8_3DSTATE_PS(ps, dev, info, &ff);
+      ret &= ps_set_gen8_3DSTATE_PS_EXTRA(ps, dev, info, &ff);
+   } else if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      ret &= ps_set_gen7_3dstate_wm(ps, dev, info, &ff);
+      ret &= ps_set_gen7_3DSTATE_PS(ps, dev, info, &ff);
+   } else {
+      ret &= ps_set_gen6_3dstate_wm(ps, dev, info, &ff);
+   }
+
+   /* save conditions */
+   ps->conds = ff.conds;
+
+   assert(ret);
+
+   return ret;
+}
+
+bool
+ilo_state_ps_init_disabled(struct ilo_state_ps *ps,
+                           const struct ilo_dev *dev)
+{
+   struct ilo_state_ps_info info;
+
+   memset(&info, 0, sizeof(info));
+
+   return ilo_state_ps_init(ps, dev, &info);
+}
+
+bool
+ilo_state_ps_set_params(struct ilo_state_ps *ps,
+                        const struct ilo_dev *dev,
+                        const struct ilo_state_ps_params_info *params)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* modify sample mask */
+   if (ilo_dev_gen(dev) == ILO_GEN(7.5)) {
+      ps->ps[4] = (ps->ps[4] & ~GEN75_PS_DW4_SAMPLE_MASK__MASK) |
+         (params->sample_mask & 0xff) << GEN75_PS_DW4_SAMPLE_MASK__SHIFT;
+   }
+
+   /* modify dispatch enable, pixel kill, and dual source blending */
+   if (ilo_dev_gen(dev) < ILO_GEN(8)) {
+      if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+         if (ps_params_get_gen6_dispatch_enable(dev, params, &ps->conds))
+            ps->ps[0] |= GEN7_WM_DW1_PS_DISPATCH_ENABLE;
+         else
+            ps->ps[0] &= ~GEN7_WM_DW1_PS_DISPATCH_ENABLE;
+
+         if (ps_params_get_gen6_kill_pixel(dev, params, &ps->conds))
+            ps->ps[0] |= GEN7_WM_DW1_PS_KILL_PIXEL;
+         else
+            ps->ps[0] &= ~GEN7_WM_DW1_PS_KILL_PIXEL;
+
+         if (params->dual_source_blending)
+            ps->ps[4] |= GEN7_PS_DW4_DUAL_SOURCE_BLEND;
+         else
+            ps->ps[4] &= ~GEN7_PS_DW4_DUAL_SOURCE_BLEND;
+      } else {
+         if (ps_params_get_gen6_dispatch_enable(dev, params, &ps->conds))
+            ps->ps[3] |= GEN6_WM_DW5_PS_DISPATCH_ENABLE;
+         else
+            ps->ps[3] &= ~GEN6_WM_DW5_PS_DISPATCH_ENABLE;
+
+         if (ps_params_get_gen6_kill_pixel(dev, params, &ps->conds))
+            ps->ps[3] |= GEN6_WM_DW5_PS_KILL_PIXEL;
+         else
+            ps->ps[3] &= ~GEN6_WM_DW5_PS_KILL_PIXEL;
+
+         if (params->dual_source_blending)
+            ps->ps[3] |= GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND;
+         else
+            ps->ps[3] &= ~GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND;
+      }
+   }
+
+   return true;
+}