iris: actually set KSP offsets
[mesa.git] / src / gallium / drivers / iris / iris_state.c
index 69bdaafc79631159acc5cd2e77abb95eddf80a48..10b81859bd305dc86b0b724f97438eb58f572f48 100644 (file)
 #include "pipe/p_screen.h"
 #include "util/u_inlines.h"
 #include "util/u_transfer.h"
+#include "i915_drm.h"
 #include "intel/compiler/brw_compiler.h"
 #include "intel/common/gen_sample_positions.h"
 #include "iris_batch.h"
 #include "iris_context.h"
+#include "iris_pipe.h"
 #include "iris_resource.h"
 
-#define __gen_address_type unsigned
-#define __gen_user_data void
+#define __gen_address_type struct iris_address
+#define __gen_user_data struct iris_batch
 
+/**
+ * Address hook matching the __gen_address_type / __gen_user_data
+ * definitions above (presumably invoked by the generated pack functions).
+ *
+ * \param batch     batch the relocation is recorded against
+ * \param location  where the address is being written; its distance from
+ *                  batch->cmdbuf.map is passed on as the relocation offset
+ * \param addr      BO, offset and relocation flags to encode
+ * \param delta     extra amount added to addr.offset
+ *
+ * \return addr.offset + delta when addr.bo is NULL, otherwise whatever
+ *         iris_batch_reloc() returns.
+ */
 static uint64_t
-__gen_combine_address(void *user_data, void *location,
-                      unsigned address, uint32_t delta)
+__gen_combine_address(struct iris_batch *batch, void *location,
+                      struct iris_address addr, uint32_t delta)
 {
-   return delta;
+   /* No BO attached: nothing to relocate, `batch` is not touched. */
+   if (addr.bo == NULL)
+      return addr.offset + delta;
+
+   return iris_batch_reloc(batch, location - batch->cmdbuf.map, addr.bo,
+                           addr.offset + delta, addr.reloc_flags);
 }
 
 #define __genxml_cmd_length(cmd) cmd ## _length
@@ -59,14 +65,25 @@ __gen_combine_address(void *user_data, void *location,
 #define __genxml_cmd_header(cmd) cmd ## _header
 #define __genxml_cmd_pack(cmd) cmd ## _pack
 
-#define iris_pack_command(cmd, dst, name)                         \
+/**
+ * Reserve `bytes` bytes of command space in the batch.
+ *
+ * iris_require_command_space() runs before map_next is sampled, so the
+ * pointer handed back is whatever map_next holds after that call; map_next
+ * is then advanced by `bytes`.
+ */
+static void *
+get_command_space(struct iris_batch *batch, unsigned bytes)
+{
+   iris_require_command_space(batch, bytes);
+
+   void *const start = batch->cmdbuf.map_next;
+
+   batch->cmdbuf.map_next += bytes;
+
+   return start;
+}
+
+/*
+ * Declares a `struct cmd` called `name`, initialized with the command
+ * header, and runs the statement or block that follows exactly once.
+ * When that body finishes, the struct is packed into `dst`, with `batch`
+ * handed to the pack function as its first argument.  The body is skipped
+ * entirely if `dst` is NULL.
+ */
+#define _iris_pack_command(batch, cmd, dst, name)                 \
    for (struct cmd name = { __genxml_cmd_header(cmd) },           \
         *_dst = (void *)(dst); __builtin_expect(_dst != NULL, 1); \
-        ({ __genxml_cmd_pack(cmd)(NULL, (void *)_dst, &name);     \
-           VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __genxml_cmd_length(cmd) * 4)); \
+        ({ __genxml_cmd_pack(cmd)(batch, (void *)_dst, &name);    \
            _dst = NULL;                                           \
            }))
 
+/*
+ * Pack a command into `dst` with a NULL batch.
+ * NOTE(review): presumably only meant for packets whose address fields
+ * carry no BO, since __gen_combine_address() would otherwise pass the NULL
+ * batch on to iris_batch_reloc() - confirm.
+ */
+#define iris_pack_command(cmd, dst, name) \
+   _iris_pack_command(NULL, cmd, dst, name)
+
 #define iris_pack_state(cmd, dst, name)                           \
    for (struct cmd name = {},                                     \
         *_dst = (void *)(dst); __builtin_expect(_dst != NULL, 1); \
@@ -74,22 +91,38 @@ __gen_combine_address(void *user_data, void *location,
         _dst = NULL)
 
+/*
+ * Reserve room in the batch for one `cmd` packet (4 bytes per DWord of its
+ * length) and pack it there.  Must be followed by a statement or block that
+ * fills in `name`.
+ */
 #define iris_emit_cmd(batch, cmd, name) \
-   iris_require_command_space(batch, 4 * __genxml_cmd_length(cmd)); \
-   iris_pack_command(cmd, batch->cmdbuf.map_next, name)
+   _iris_pack_command(batch, cmd, get_command_space(batch, 4 * __genxml_cmd_length(cmd)), name)
+
+/*
+ * Emit `num_dwords` DWords into the batch, each one the bitwise OR of the
+ * corresponding entries of two pre-packed DWord arrays.
+ */
+#define iris_emit_merge(batch, dwords0, dwords1, num_dwords)     \
+   do {                                                          \
+      uint32_t *dw = get_command_space(batch, 4 * (num_dwords)); \
+      for (uint32_t i = 0; i < (num_dwords); i++)                \
+         dw[i] = (dwords0)[i] | (dwords1)[i];                    \
+      /* The Valgrind check takes a length in bytes. */          \
+      VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, 4 * (num_dwords)));   \
+   } while (0)
 
-#define iris_emit_merge(batch, dwords0, dwords1) \
+/*
+ * Emit `num_dw` pre-packed DWords into the batch, storing a relocation
+ * against `addr` in the 64-bit slot found at bit offset GENX(addr_field).
+ *
+ * The value returned by iris_batch_reloc() is written through `qw`; the
+ * relocation offset is the byte distance of that slot from the start of
+ * the command buffer map.
+ *
+ * NOTE(review): the relocation delta is read from dwords[addr_idx + 1] and
+ * the trailing copy restarts at addr_idx + 1, which rewrites the upper
+ * DWord of the address just stored.  Taking the delta from
+ * dwords[addr_idx] and resuming at addr_idx + 2 looks like the intent -
+ * confirm before changing.
+ */
+#define iris_emit_with_addr(batch, dwords, num_dw, addr_field, addr)    \
    do {                                                                 \
-      STATIC_ASSERT(ARRAY_SIZE(dwords0) == ARRAY_SIZE(dwords1));        \
-                                                                        \
-      iris_require_command_space(batch, ARRAY_SIZE(dwords0));           \
-      uint32_t *dw = batch->cmdbuf.map_next;                            \
-      for (uint32_t i = 0; i < ARRAY_SIZE(dwords0); i++)                \
-         dw[i] = (dwords0)[i] | (dwords1)[i];                           \
-      VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, ARRAY_SIZE(dwords0) * 4));   \
+      STATIC_ASSERT((GENX(addr_field) % 64) == 0);                      \
+      assert(num_dw <= ARRAY_SIZE(dwords));                             \
+      int addr_idx = GENX(addr_field) / 32;                             \
+      uint32_t *dw = get_command_space(batch, 4 * num_dw);              \
+      for (uint32_t i = 0; i < addr_idx; i++) {                         \
+         dw[i] = (dwords)[i];                                           \
+      }                                                                 \
+      uint64_t *qw = (uint64_t *) &dw[addr_idx];                        \
+      *qw = iris_batch_reloc(batch, (void *) qw - batch->cmdbuf.map,    \
+                             addr.bo,                                   \
+                             addr.offset + (dwords)[addr_idx + 1],      \
+                             addr.reloc_flags);                         \
+      for (uint32_t i = addr_idx + 1; i < num_dw; i++) {                \
+         dw[i] = (dwords)[i];                                           \
+      }                                                                 \
+      VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, num_dw * 4));                \
    } while (0)
 
 #include "genxml/genX_pack.h"
 #include "genxml/gen_macros.h"
+#include "genxml/genX_bits.h"
 
 #define MOCS_WB (2 << 1)
 
@@ -158,8 +191,6 @@ UNUSED static void pipe_asserts()
 static unsigned
 translate_prim_type(enum pipe_prim_type prim, uint8_t verts_per_patch)
 {
-   assert(prim == PIPE_PRIM_PATCHES || verts_per_patch == 0);
-
    static const unsigned map[] = {
       [PIPE_PRIM_POINTS]                   = _3DPRIM_POINTLIST,
       [PIPE_PRIM_LINES]                    = _3DPRIM_LINELIST,
@@ -178,7 +209,7 @@ translate_prim_type(enum pipe_prim_type prim, uint8_t verts_per_patch)
       [PIPE_PRIM_PATCHES]                  = _3DPRIM_PATCHLIST_1 - 1,
    };
 
-   return map[prim] + verts_per_patch;
+   return map[prim] + (prim == PIPE_PRIM_PATCHES ? verts_per_patch : 0);
 }
 
 static unsigned
@@ -249,10 +280,58 @@ translate_fill_mode(unsigned pipe_polymode)
    return map[pipe_polymode];
 }
 
+/**
+ * Build an iris_address referring to `bo` at `offset`; every other field
+ * of the address is left zero-initialized.
+ */
+static struct iris_address
+ro_bo(struct iris_bo *bo, uint32_t offset)
+{
+   struct iris_address addr = {
+      .bo = bo,
+      .offset = offset,
+   };
+
+   return addr;
+}
+
+/**
+ * Emit STATE_BASE_ADDRESS into `batch`.
+ *
+ * Every base-address and buffer-size "modify enable" bit is set.  The
+ * surface state and dynamic state bases both point at offset 0 of the
+ * batch's state buffer BO; the remaining base addresses are left at their
+ * zero defaults.  MOCS programming is currently compiled out.
+ */
 static void
-iris_upload_initial_gpu_state(struct iris_context *ice,
-                              struct iris_batch *batch)
+iris_emit_state_base_address(struct iris_batch *batch)
 {
+   /* XXX: PIPE_CONTROLs */
+
+   iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
+   #if 0
+   // XXX: MOCS is stupid for this.
+      sba.GeneralStateMemoryObjectControlState            = MOCS_WB;
+      sba.StatelessDataPortAccessMemoryObjectControlState = MOCS_WB;
+      sba.SurfaceStateMemoryObjectControlState            = MOCS_WB;
+      sba.DynamicStateMemoryObjectControlState            = MOCS_WB;
+      sba.IndirectObjectMemoryObjectControlState          = MOCS_WB;
+      sba.InstructionMemoryObjectControlState             = MOCS_WB;
+      sba.BindlessSurfaceStateMemoryObjectControlState    = MOCS_WB;
+   #endif
+
+      sba.GeneralStateBaseAddressModifyEnable   = true;
+      sba.SurfaceStateBaseAddressModifyEnable   = true;
+      sba.DynamicStateBaseAddressModifyEnable   = true;
+      sba.IndirectObjectBaseAddressModifyEnable = true;
+      sba.InstructionBaseAddressModifyEnable    = true;
+      sba.GeneralStateBufferSizeModifyEnable    = true;
+      sba.DynamicStateBufferSizeModifyEnable    = true;
+      sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
+      sba.IndirectObjectBufferSizeModifyEnable  = true;
+      sba.InstructionBuffersizeModifyEnable     = true;
+
+      /* Both bases go through ro_bo(), i.e. a relocation against the
+       * state buffer BO at offset 0.
+       */
+      sba.SurfaceStateBaseAddress = ro_bo(batch->statebuf.bo, 0);
+      sba.DynamicStateBaseAddress = ro_bo(batch->statebuf.bo, 0);
+
+      sba.GeneralStateBufferSize   = 0xfffff;
+      sba.IndirectObjectBufferSize = 0xfffff;
+      sba.InstructionBufferSize    = 0xfffff;
+      sba.DynamicStateBufferSize   = ALIGN(MAX_STATE_SIZE, 4096);
+   }
+}
+
+static void
+iris_init_render_context(struct iris_screen *screen,
+                         struct iris_batch *batch,
+                         struct pipe_debug_callback *dbg)
+{
+   batch->emit_state_base_address = iris_emit_state_base_address;
+   iris_init_batch(batch, screen, dbg, I915_EXEC_RENDER);
+
    iris_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
       rect.ClippedDrawingRectangleXMax = UINT16_MAX;
       rect.ClippedDrawingRectangleYMax = UINT16_MAX;
@@ -269,11 +348,15 @@ iris_upload_initial_gpu_state(struct iris_context *ice,
    iris_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
    /* XXX: may need to set an offset for origin-UL framebuffers */
    iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);
-}
 
-static void
-iris_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
-{
+   /* Just assign a static partitioning. */
+   for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
+      iris_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
+         alloc._3DCommandSubOpcode = 18 + i;
+         alloc.ConstantBufferOffset = 6 * i;
+         alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? 8 : 6;
+      }
+   }
 }
 
 static void
@@ -296,6 +379,8 @@ struct iris_blend_state {
    uint32_t blend_state[GENX(BLEND_STATE_length)];
    uint32_t blend_entries[BRW_MAX_DRAW_BUFFERS *
                           GENX(BLEND_STATE_ENTRY_length)];
+
+   bool alpha_to_coverage; /* for shader key */
 };
 
 static void *
@@ -304,6 +389,8 @@ iris_create_blend_state(struct pipe_context *ctx,
 {
    struct iris_blend_state *cso = malloc(sizeof(struct iris_blend_state));
 
+   cso->alpha_to_coverage = state->alpha_to_coverage;
+
    iris_pack_state(GENX(BLEND_STATE), cso->blend_state, bs) {
       bs.AlphaToCoverageEnable = state->alpha_to_coverage;
       bs.IndependentAlphaBlendEnable = state->independent_blend_enable;
@@ -444,15 +531,14 @@ struct iris_rasterizer_state {
    uint32_t clip[GENX(3DSTATE_CLIP_length)];
    uint32_t raster[GENX(3DSTATE_RASTER_length)];
    uint32_t wm[GENX(3DSTATE_WM_length)];
+   uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];
 
    bool flatshade; /* for shader state */
+   bool clamp_fragment_color; /* for shader state */
    bool light_twoside; /* for shader state */
    bool rasterizer_discard; /* for 3DSTATE_STREAMOUT */
    bool half_pixel_center; /* for 3DSTATE_MULTISAMPLE */
    enum pipe_sprite_coord_mode sprite_coord_mode; /* PIPE_SPRITE_* */
-
-   uint8_t line_stipple_factor;
-   uint16_t line_stipple_pattern;
 };
 
 static void *
@@ -475,17 +561,12 @@ iris_create_rasterizer_state(struct pipe_context *ctx,
 
       offset_units_unscaled - cap not exposed
    }
-
-   unsigned line_stipple_factor:8;  /**< [1..256] actually */
-   unsigned line_stipple_pattern:16;
    #endif
 
    cso->flatshade = state->flatshade;
+   cso->clamp_fragment_color = state->clamp_fragment_color;
    cso->light_twoside = state->light_twoside;
    cso->rasterizer_discard = state->rasterizer_discard;
-   cso->line_stipple_factor = state->line_stipple_factor;
-   cso->line_stipple_pattern = state->line_stipple_pattern;
-   // for 3DSTATE_MULTISAMPLE, if we want it.
    cso->half_pixel_center = state->half_pixel_center;
 
    iris_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
@@ -531,6 +612,9 @@ iris_create_rasterizer_state(struct pipe_context *ctx,
    }
 
    iris_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
+      /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
+       * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
+       */
       cl.StatisticsEnable = true;
       cl.EarlyCullEnable = true;
       cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
@@ -542,8 +626,6 @@ iris_create_rasterizer_state(struct pipe_context *ctx,
       cl.ViewportXYClipTestEnable = state->point_tri_clip;
       cl.MinimumPointWidth = 0.125;
       cl.MaximumPointWidth = 255.875;
-      //.NonPerspectiveBarycentricEnable = <comes from FS prog> :(
-      //.ForceZeroRTAIndexEnable = <comes from FB layers being 0>
 
       if (state->flatshade_first) {
          cl.TriangleStripListProvokingVertexSelect = 2;
@@ -555,14 +637,24 @@ iris_create_rasterizer_state(struct pipe_context *ctx,
    }
 
    iris_pack_command(GENX(3DSTATE_WM), cso->wm, wm) {
+      /* wm.BarycentricInterpolationMode and wm.EarlyDepthStencilControl are
+       * filled in at draw time from the FS program.
+       */
       wm.LineAntialiasingRegionWidth = _10pixels;
       wm.LineEndCapAntialiasingRegionWidth = _05pixels;
       wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
       wm.StatisticsEnable = true;
       wm.LineStippleEnable = state->line_stipple_enable;
       wm.PolygonStippleEnable = state->poly_stipple_enable;
-      // wm.BarycentricInterpolationMode = <comes from FS program> :(
-      // wm.EarlyDepthStencilControl = <comes from FS program> :(
+   }
+
+   /* Remap from 0..255 back to 1..256 */
+   const unsigned line_stipple_factor = state->line_stipple_factor + 1;
+
+   iris_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
+      line.LineStipplePattern = state->line_stipple_pattern;
+      line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
+      line.LineStippleRepeatCount = line_stipple_factor;
    }
 
    return cso;
@@ -577,9 +669,8 @@ iris_bind_rasterizer_state(struct pipe_context *ctx, void *state)
 
    if (new_cso) {
       /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
-      if (!old_cso ||
-          old_cso->line_stipple_factor != new_cso->line_stipple_factor ||
-          old_cso->line_stipple_pattern != new_cso->line_stipple_pattern) {
+      if (!old_cso || memcmp(old_cso->line_stipple, new_cso->line_stipple,
+                             sizeof(old_cso->line_stipple)) != 0) {
          ice->state.dirty |= IRIS_DIRTY_LINE_STIPPLE;
       }
 
@@ -710,6 +801,24 @@ iris_create_sampler_state(struct pipe_context *pctx,
    return cso;
 }
 
+/**
+ * Record `count` sampler state objects for one shader stage.
+ *
+ * states[0..count) are stored into ice->state.samplers[stage] starting at
+ * slot `start`, and the per-stage sampler dirty bit
+ * (IRIS_DIRTY_SAMPLER_STATES_VS shifted by the stage) is set.
+ *
+ * \param ctx      context, cast to struct iris_context
+ * \param p_stage  pipe shader type, translated with stage_from_pipe()
+ * \param start    first slot to write
+ * \param count    number of entries in `states`
+ * \param states   sampler state pointers, stored as-is
+ */
+static void
+iris_bind_sampler_states(struct pipe_context *ctx,
+                         enum pipe_shader_type p_stage,
+                         unsigned start, unsigned count,
+                         void **states)
+{
+   struct iris_context *ice = (struct iris_context *) ctx;
+   gl_shader_stage stage = stage_from_pipe(p_stage);
+
+   assert(start + count <= IRIS_MAX_TEXTURE_SAMPLERS);
+
+   /* Unsigned index: `count` is unsigned, avoid a mixed-sign comparison. */
+   for (unsigned i = 0; i < count; i++) {
+      ice->state.samplers[stage][start + i] = states[i];
+   }
+
+   ice->state.dirty |= IRIS_DIRTY_SAMPLER_STATES_VS << stage;
+}
+
 struct iris_sampler_view {
    struct pipe_sampler_view pipe;
    struct isl_view view;
@@ -840,14 +949,6 @@ iris_set_sampler_views(struct pipe_context *ctx,
 {
 }
 
-static void
-iris_bind_sampler_states(struct pipe_context *ctx,
-                         enum pipe_shader_type shader,
-                         unsigned start, unsigned count,
-                         void **states)
-{
-}
-
 static void
 iris_set_clip_state(struct pipe_context *ctx,
                     const struct pipe_clip_state *state)
@@ -876,15 +977,14 @@ static void
 iris_set_scissor_states(struct pipe_context *ctx,
                         unsigned start_slot,
                         unsigned num_scissors,
-                        const struct pipe_scissor_state *state)
+                        const struct pipe_scissor_state *states)
 {
    struct iris_context *ice = (struct iris_context *) ctx;
 
-   // XXX: start_slot
    ice->state.num_scissors = num_scissors;
 
-   for (unsigned i = start_slot; i < start_slot + num_scissors; i++) {
-      ice->state.scissors[i] = *state;
+   for (unsigned i = 0; i < num_scissors; i++) {
+      ice->state.scissors[start_slot + i] = states[i];
    }
 
    ice->state.dirty |= IRIS_DIRTY_SCISSOR_RECT;
@@ -1002,6 +1102,7 @@ iris_set_viewport_states(struct pipe_context *ctx,
    struct iris_viewport_state *cso =
       malloc(sizeof(struct iris_viewport_state));
 
+   // XXX: sf_cl_vp is only big enough for one slot, we don't iterate right
    for (unsigned i = start_slot; i < start_slot + num_viewports; i++) {
       float x_extent = extent_from_matrix(&state[i], 0);
       float y_extent = extent_from_matrix(&state[i], 1);
@@ -1105,9 +1206,20 @@ iris_delete_state(struct pipe_context *ctx, void *state)
 
 struct iris_vertex_buffer_state {
+   /* Packed 3DSTATE_VERTEX_BUFFERS: one header DWord followed by room for
+    * 33 VERTEX_BUFFER_STATE entries.
+    */
    uint32_t vertex_buffers[1 + 33 * GENX(VERTEX_BUFFER_STATE_length)];
-   unsigned length; /* length of 3DSTATE_VERTEX_BUFFERS in DWords */
+   /* BO and offset behind each entry; iris_free_vertex_buffers() drops one
+    * reference per BO.
+    */
+   struct iris_address bos[33];
+   /* Number of valid entries in bos[]. */
+   unsigned num_buffers;
 };
 
+/**
+ * Release a vertex buffer state object: unreference the BO of each of its
+ * num_buffers entries, then free the object itself.  NULL is a no-op.
+ */
+static void
+iris_free_vertex_buffers(struct iris_vertex_buffer_state *cso)
+{
+   if (!cso)
+      return;
+
+   for (unsigned slot = 0; slot < cso->num_buffers; slot++) {
+      iris_bo_unreference(cso->bos[slot].bo);
+   }
+
+   free(cso);
+}
+
 static void
 iris_set_vertex_buffers(struct pipe_context *ctx,
                         unsigned start_slot, unsigned count,
@@ -1117,12 +1229,6 @@ iris_set_vertex_buffers(struct pipe_context *ctx,
    struct iris_vertex_buffer_state *cso =
       malloc(sizeof(struct iris_vertex_buffer_state));
 
-   cso->length = 4 * count - 1;
-
-   iris_pack_state(GENX(3DSTATE_VERTEX_BUFFERS), cso->vertex_buffers, vb) {
-      vb.DWordLength = cso->length;
-   }
-
    /* If there are no buffers, do nothing.  We can leave the stale
     * 3DSTATE_VERTEX_BUFFERS in place - as long as there are no vertex
     * elements that point to them, it should be fine.
@@ -1130,23 +1236,36 @@ iris_set_vertex_buffers(struct pipe_context *ctx,
    if (!buffers)
       return;
 
+   iris_free_vertex_buffers(ice->state.cso_vertex_buffers);
+
+   cso->num_buffers = count;
+
+   iris_pack_command(GENX(3DSTATE_VERTEX_BUFFERS), cso->vertex_buffers, vb) {
+      vb.DWordLength = 4 * cso->num_buffers - 1;
+   }
+
    uint32_t *vb_pack_dest = &cso->vertex_buffers[1];
 
    for (unsigned i = 0; i < count; i++) {
       assert(!buffers[i].is_user_buffer);
 
+      struct iris_resource *res = (void *) buffers[i].buffer.resource;
+      iris_bo_reference(res->bo);
+      cso->bos[i] = ro_bo(res->bo, buffers[i].buffer_offset);
+
       iris_pack_state(GENX(VERTEX_BUFFER_STATE), vb_pack_dest, vb) {
          vb.VertexBufferIndex = start_slot + i;
          vb.MOCS = MOCS_WB;
          vb.AddressModifyEnable = true;
          vb.BufferPitch = buffers[i].stride;
-         //vb.BufferStartingAddress = ro_bo(bo, buffers[i].buffer_offset);
-         //vb.BufferSize = bo->size;
+         vb.BufferSize = res->bo->size;
+         /* vb.BufferStartingAddress is filled in at draw time */
       }
 
       vb_pack_dest += GENX(VERTEX_BUFFER_STATE_length);
    }
 
+   ice->state.cso_vertex_buffers = cso;
    ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS;
 }
 
@@ -1171,7 +1290,7 @@ iris_create_vertex_elements(struct pipe_context *ctx,
     *  - create SGV ones
     *  - if those are necessary, use count + 1/2/3... OR in the length
     */
-   iris_pack_state(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve);
+   iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve);
 
    uint32_t *ve_pack_dest = &cso->vertex_elements[1];
 
@@ -1184,7 +1303,7 @@ iris_create_vertex_elements(struct pipe_context *ctx,
             iris_isl_format_for_pipe_format(state[i].src_format);
       }
 
-      iris_pack_state(GENX(3DSTATE_VF_INSTANCING), cso->vf_instancing[i], vi) {
+      iris_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->vf_instancing[i], vi) {
          vi.VertexElementIndex = i;
          vi.InstancingEnable = state[i].instance_divisor > 0;
          vi.InstanceDataStepRate = state[i].instance_divisor;
@@ -1246,24 +1365,346 @@ iris_set_stream_output_targets(struct pipe_context *ctx,
 {
 }
 
-void
-iris_upload_render_state(struct iris_context *ice,
-                         struct iris_batch *batch,
-                         struct pipe_draw_info *draw)
+/* Stub: the body is empty, so binding compute state does nothing yet. */
+static void
+iris_bind_compute_state(struct pipe_context *ctx, void *state)
 {
-   const uint64_t dirty = ice->state.dirty;
+}
 
-   if (dirty & IRIS_DIRTY_WM_DEPTH_STENCIL) {
-      struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
-      struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
+/**
+ * Fill in the vertex shader program key.  Nothing is derived from `ice`
+ * yet: the whole key, padding included, is simply zeroed.
+ */
+static void
+iris_populate_vs_key(const struct iris_context *ice,
+                     struct brw_vs_prog_key *key)
+{
+   memset(key, 0, sizeof *key);
+}
 
-      uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
-      iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
-         wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
-         wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
+/**
+ * Fill in the tessellation control shader program key.  Nothing is
+ * derived from `ice` yet: the whole key, padding included, is zeroed.
+ */
+static void
+iris_populate_tcs_key(const struct iris_context *ice,
+                      struct brw_tcs_prog_key *key)
+{
+   memset(key, 0, sizeof *key);
+}
+
+/**
+ * Fill in the tessellation evaluation shader program key.  Nothing is
+ * derived from `ice` yet: the whole key, padding included, is zeroed.
+ */
+static void
+iris_populate_tes_key(const struct iris_context *ice,
+                      struct brw_tes_prog_key *key)
+{
+   memset(key, 0, sizeof *key);
+}
+
+/**
+ * Fill in the geometry shader program key.  Nothing is derived from `ice`
+ * yet: the whole key, padding included, is zeroed.
+ */
+static void
+iris_populate_gs_key(const struct iris_context *ice,
+                     struct brw_gs_prog_key *key)
+{
+   memset(key, 0, sizeof *key);
+}
+
+/**
+ * Fill in the fragment shader program key from current context state.
+ *
+ * The key is zeroed first (padding included) and then populated from the
+ * framebuffer, depth/stencil/alpha, rasterizer and blend state.
+ *
+ * NOTE(review): cso_rast is dereferenced unconditionally, and cso_zsa /
+ * cso_blend whenever more than one color buffer is bound - presumably
+ * callers guarantee these are set; confirm.
+ */
+static void
+iris_populate_fs_key(const struct iris_context *ice,
+                     struct brw_wm_prog_key *key)
+{
+   memset(key, 0, sizeof(*key));
+
+   /* XXX: dirty flags? */
+   /* State is only read here, so keep the const that `ice` carries rather
+    * than dropping the qualifier on &ice->state.framebuffer.
+    */
+   const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
+   const struct iris_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
+   const struct iris_rasterizer_state *rast = ice->state.cso_rast;
+   const struct iris_blend_state *blend = ice->state.cso_blend;
+
+   key->nr_color_regions = fb->nr_cbufs;
+
+   key->clamp_fragment_color = rast->clamp_fragment_color;
+
+   /* Only relevant with multiple color buffers. */
+   key->replicate_alpha = fb->nr_cbufs > 1 &&
+      (zsa->alpha.enabled || blend->alpha_to_coverage);
+
+   // key->force_dual_color_blend for unigine
+#if 0
+   if (cso_rast->multisample) {
+      key->persample_interp =
+         ctx->Multisample.SampleShading &&
+         (ctx->Multisample.MinSampleShadingValue *
+          _mesa_geometric_samples(ctx->DrawBuffer) > 1);
+
+      key->multisample_fbo = fb->samples > 1;
+   }
+#endif
+
+   key->coherent_fb_fetch = true;
+}
+
+   //pkt.SamplerCount =                                                     \
+      //DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
+   //pkt.PerThreadScratchSpace = prog_data->total_scratch == 0 ? 0 :        \
+      //ffs(stage_state->per_thread_scratch) - 11;                          \
+
+/**
+ * Kernel start pointer for a compiled shader: the gtt_offset of the BO
+ * behind shader->buffer plus the shader's offset.
+ */
+static uint64_t
+KSP(const struct iris_compiled_shader *shader)
+{
+   const struct iris_resource *program_res = (void *) shader->buffer;
+   const struct iris_bo *program_bo = program_res->bo;
+
+   return program_bo->gtt_offset + shader->offset;
+}
+
+/*
+ * Thread-dispatch fields shared by the VS, HS, DS and GS packets.
+ *
+ * Expands to a run of plain statements (no do/while wrapper) and expects
+ * `shader`, `prog_data` and `vue_prog_data` to be in scope at the use site.
+ * `prefix` selects the <prefix>URBEntryReadLength / <prefix>URBEntryReadOffset
+ * field names of the packet.
+ */
+#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix)                          \
+   pkt.KernelStartPointer = KSP(shader);                                  \
+   pkt.BindingTableEntryCount = prog_data->binding_table.size_bytes / 4;  \
+   pkt.FloatingPointMode = prog_data->use_alt_mode;                       \
+                                                                          \
+   pkt.DispatchGRFStartRegisterForURBData =                               \
+      prog_data->dispatch_grf_start_reg;                                  \
+   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;       \
+   pkt.prefix##URBEntryReadOffset = 0;                                    \
+                                                                          \
+   pkt.StatisticsEnable = true;                                           \
+   pkt.Enable           = true;
+
+/**
+ * Pack 3DSTATE_VS for a compiled vertex shader into shader->derived_data.
+ */
+static void
+iris_set_vs_state(const struct gen_device_info *devinfo,
+                  struct iris_compiled_shader *shader)
+{
+   /* Both names are used implicitly by INIT_THREAD_DISPATCH_FIELDS. */
+   struct brw_stage_prog_data *prog_data = shader->prog_data;
+   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
+
+   iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {
+      INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);
+      vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
+      vs.SIMD8DispatchEnable = true;
+      vs.UserClipDistanceCullTestEnableBitmask =
+         vue_prog_data->cull_distance_mask;
+   }
+}
+
+/**
+ * Pack 3DSTATE_HS for a compiled tessellation control shader into
+ * shader->derived_data.
+ */
+static void
+iris_set_tcs_state(const struct gen_device_info *devinfo,
+                   struct iris_compiled_shader *shader)
+{
+   /* prog_data and vue_prog_data are used implicitly by
+    * INIT_THREAD_DISPATCH_FIELDS.
+    */
+   struct brw_stage_prog_data *prog_data = shader->prog_data;
+   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
+   struct brw_tcs_prog_data *tcs_prog_data = (void *) prog_data;
+
+   iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) {
+      INIT_THREAD_DISPATCH_FIELDS(hs, Vertex);
+
+      hs.InstanceCount = tcs_prog_data->instances - 1;
+      hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
+      hs.IncludeVertexHandles = true;
+   }
+}
+
+/**
+ * Pack 3DSTATE_TE followed by 3DSTATE_DS for a compiled tessellation
+ * evaluation shader into shader->derived_data.
+ */
+static void
+iris_set_tes_state(const struct gen_device_info *devinfo,
+                   struct iris_compiled_shader *shader)
+{
+   /* prog_data and vue_prog_data are used implicitly by
+    * INIT_THREAD_DISPATCH_FIELDS.
+    */
+   struct brw_stage_prog_data *prog_data = shader->prog_data;
+   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
+   struct brw_tes_prog_data *tes_prog_data = (void *) prog_data;
+
+   /* The DS packet is stored right after the TE packet. */
+   uint32_t *te_state = (void *) shader->derived_data;
+   uint32_t *ds_state = te_state + GENX(3DSTATE_TE_length);
+
+   iris_pack_command(GENX(3DSTATE_TE), te_state, te) {
+      te.Partitioning = tes_prog_data->partitioning;
+      te.OutputTopology = tes_prog_data->output_topology;
+      te.TEDomain = tes_prog_data->domain;
+      te.TEEnable = true;
+      te.MaximumTessellationFactorOdd = 63.0;
+      te.MaximumTessellationFactorNotOdd = 64.0;
+   }
+
+   iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
+      INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
+
+      ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
+      ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
+      ds.ComputeWCoordinateEnable =
+         tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
+
+      ds.UserClipDistanceCullTestEnableBitmask =
+         vue_prog_data->cull_distance_mask;
+   }
+
+}
+
+/**
+ * Pack 3DSTATE_GS for a compiled geometry shader into
+ * shader->derived_data.
+ */
+static void
+iris_set_gs_state(const struct gen_device_info *devinfo,
+                  struct iris_compiled_shader *shader)
+{
+   /* prog_data and vue_prog_data are used implicitly by
+    * INIT_THREAD_DISPATCH_FIELDS.
+    */
+   struct brw_stage_prog_data *prog_data = shader->prog_data;
+   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
+   struct brw_gs_prog_data *gs_prog_data = (void *) prog_data;
+
+   iris_pack_command(GENX(3DSTATE_GS), shader->derived_data, gs) {
+      INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);
+
+      gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
+      gs.OutputTopology = gs_prog_data->output_topology;
+      gs.ControlDataHeaderSize =
+         gs_prog_data->control_data_header_size_hwords;
+      gs.InstanceControl = gs_prog_data->invocations - 1;
+      gs.DispatchMode = SIMD8;
+      gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
+      gs.ControlDataFormat = gs_prog_data->control_data_format;
+      gs.ReorderMode = TRAILING;
+      gs.ExpectedVertexCount = gs_prog_data->vertices_in;
+      /* The thread count is halved when building for GEN_GEN == 8. */
+      gs.MaximumNumberofThreads =
+         GEN_GEN == 8 ? (devinfo->max_gs_threads / 2 - 1)
+                      : (devinfo->max_gs_threads - 1);
+
+      /* -1 is treated as "no static vertex count". */
+      if (gs_prog_data->static_vertex_count != -1) {
+         gs.StaticOutput = true;
+         gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
       }
-      iris_emit_merge(batch, cso->wmds, stencil_refs);
+      gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
+
+      gs.UserClipDistanceCullTestEnableBitmask =
+         vue_prog_data->cull_distance_mask;
+
+      /* Output length is half the VUE slot count (rounded up) minus the
+       * write offset, clamped below to 1.
+       */
+      const int urb_entry_write_offset = 1;
+      const uint32_t urb_entry_output_length =
+         DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
+         urb_entry_write_offset;
+
+      gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
+      gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
    }
+}
+
+/**
+ * Pack 3DSTATE_PS followed by 3DSTATE_PS_EXTRA for a compiled fragment
+ * shader into shader->derived_data.
+ */
+static void
+iris_set_fs_state(const struct gen_device_info *devinfo,
+                  struct iris_compiled_shader *shader)
+{
+   struct brw_stage_prog_data *prog_data = shader->prog_data;
+   struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
+
+   /* The PS_EXTRA packet is stored right after the PS packet. */
+   uint32_t *ps_state = (void *) shader->derived_data;
+   uint32_t *psx_state = ps_state + GENX(3DSTATE_PS_length);
+
+   iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {
+      ps.VectorMaskEnable = true;
+      //ps.SamplerCount = ...
+      ps.BindingTableEntryCount = prog_data->binding_table.size_bytes / 4;
+      ps.FloatingPointMode = prog_data->use_alt_mode;
+      ps.MaximumNumberofThreadsPerPSD = 64 - (GEN_GEN == 8 ? 2 : 1);
+
+      ps.PushConstantEnable = prog_data->nr_params > 0 ||
+                              prog_data->ubo_ranges[0].length > 0;
+
+      /* From the documentation for this packet:
+       * "If the PS kernel does not need the Position XY Offsets to
+       *  compute a Position Value, then this field should be programmed
+       *  to POSOFFSET_NONE."
+       *
+       * "SW Recommendation: If the PS kernel needs the Position Offsets
+       *  to compute a Position XY value, this field should match Position
+       *  ZW Interpolation Mode to ensure a consistent position.xyzw
+       *  computation."
+       *
+       * We only require XY sample offsets. So, this recommendation doesn't
+       * look useful at the moment.  We might need this in future.
+       */
+      ps.PositionXYOffsetSelect =
+         wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
+      ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
+      ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
+      ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;
+
+      // XXX: Disable SIMD32 with 16x MSAA
+
+      ps.DispatchGRFStartRegisterForConstantSetupData0 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
+      ps.DispatchGRFStartRegisterForConstantSetupData1 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
+      ps.DispatchGRFStartRegisterForConstantSetupData2 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
+
+      /* Each kernel slot is the shader's KSP plus a per-slot offset. */
+      ps.KernelStartPointer0 =
+         KSP(shader) + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
+      ps.KernelStartPointer1 =
+         KSP(shader) + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
+      ps.KernelStartPointer2 =
+         KSP(shader) + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
+   }
+
+   iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
+      psx.PixelShaderValid = true;
+      psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
+      psx.PixelShaderKillsPixel = wm_prog_data->uses_kill;
+      psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
+      psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
+      psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
+      psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;
+
+      /* InputCoverageMaskState keeps its zero default unless the shader
+       * uses the sample mask.
+       */
+      if (wm_prog_data->uses_sample_mask) {
+         /* TODO: conservative rasterization */
+         if (wm_prog_data->post_depth_coverage)
+            psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
+         else
+            psx.InputCoverageMaskState = ICMS_NORMAL;
+      }
+
+      psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
+      psx.PixelShaderPullsBary = wm_prog_data->pulls_bary;
+      psx.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
+
+      // XXX: UAV bit
+   }
+}
+
+/**
+ * Size in bytes of the derived (pre-packed) state stored alongside a
+ * compiled shader of the given cache type.  Types with no derived packets
+ * report 0.
+ */
+static unsigned
+iris_derived_program_state_size(enum iris_program_cache_id cache_id)
+{
+   assert(cache_id <= IRIS_CACHE_CS);
+
+   unsigned num_dwords;
+
+   switch (cache_id) {
+   case IRIS_CACHE_VS:
+      num_dwords = GENX(3DSTATE_VS_length);
+      break;
+   case IRIS_CACHE_TCS:
+      num_dwords = GENX(3DSTATE_HS_length);
+      break;
+   case IRIS_CACHE_TES:
+      /* TE and DS are stored back to back. */
+      num_dwords = GENX(3DSTATE_TE_length) + GENX(3DSTATE_DS_length);
+      break;
+   case IRIS_CACHE_GS:
+      num_dwords = GENX(3DSTATE_GS_length);
+      break;
+   case IRIS_CACHE_FS:
+      /* PS and PS_EXTRA are stored back to back. */
+      num_dwords = GENX(3DSTATE_PS_length) + GENX(3DSTATE_PS_EXTRA_length);
+      break;
+   default:
+      num_dwords = 0;
+      break;
+   }
+
+   return sizeof(uint32_t) * num_dwords;
+}
+
+/**
+ * Pack the derived state for a compiled shader by dispatching on its
+ * cache type.  Compute and any other type have nothing to pack.
+ */
+static void
+iris_set_derived_program_state(const struct gen_device_info *devinfo,
+                               enum iris_program_cache_id cache_id,
+                               struct iris_compiled_shader *shader)
+{
+   switch (cache_id) {
+   case IRIS_CACHE_VS:
+      iris_set_vs_state(devinfo, shader);
+      return;
+   case IRIS_CACHE_TCS:
+      iris_set_tcs_state(devinfo, shader);
+      return;
+   case IRIS_CACHE_TES:
+      iris_set_tes_state(devinfo, shader);
+      return;
+   case IRIS_CACHE_GS:
+      iris_set_gs_state(devinfo, shader);
+      return;
+   case IRIS_CACHE_FS:
+      iris_set_fs_state(devinfo, shader);
+      return;
+   case IRIS_CACHE_CS:
+   default:
+      /* No derived packets for these. */
+      return;
+   }
+}
+
+static void
+iris_upload_render_state(struct iris_context *ice,
+                         struct iris_batch *batch,
+                         const struct pipe_draw_info *draw)
+{
+   const uint64_t dirty = ice->state.dirty;
+
+   struct brw_wm_prog_data *wm_prog_data = (void *)
+      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
 
    if (dirty & IRIS_DIRTY_CC_VIEWPORT) {
       struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
@@ -1273,42 +1714,180 @@ iris_upload_render_state(struct iris_context *ice,
       }
    }
 
-   if (dirty & IRIS_DIRTY_PS_BLEND) {
-      struct iris_blend_state *cso = ice->state.cso_blend;
-      iris_batch_emit(batch, cso->ps_blend, sizeof(cso->ps_blend));
+   if (dirty & IRIS_DIRTY_SF_CL_VIEWPORT) {
+      struct iris_viewport_state *cso = ice->state.cso_vp;
+      iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
+         ptr.SFClipViewportPointer =
+            iris_emit_state(batch, cso->sf_cl_vp, sizeof(cso->sf_cl_vp), 64);
+      }
+   }
+
+   /* XXX: L3 State */
+
+   if (dirty & IRIS_DIRTY_URB) {
+      /* XXX: 3DSTATE_URB */
    }
 
    if (dirty & IRIS_DIRTY_BLEND_STATE) {
-      //struct iris_blend_state *cso = ice->state.cso_blend;
+      struct iris_blend_state *cso = ice->state.cso_blend;
       // XXX: 3DSTATE_BLEND_STATE_POINTERS - BLEND_STATE
       // -> from iris_blend_state (most) + iris_depth_stencil_alpha_state
       //    (alpha test function/enable) + has writeable RT from ???????
    }
 
-   if (dirty & IRIS_DIRTY_SF_CL_VIEWPORT) {
-      struct iris_viewport_state *cso = ice->state.cso_vp;
-      iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
-         ptr.SFClipViewportPointer =
-            iris_emit_state(batch, cso->sf_cl_vp, sizeof(cso->sf_cl_vp), 64);
+   if (dirty & IRIS_DIRTY_COLOR_CALC_STATE) {
+      struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
+      uint32_t cc_offset;
+      void *cc_map =
+         iris_alloc_state(batch,
+                          sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
+                          64, &cc_offset);
+      iris_pack_state(GENX(COLOR_CALC_STATE), cc_map, cc) {
+         cc.AlphaTestFormat = ALPHATEST_FLOAT32;
+         cc.AlphaReferenceValueAsFLOAT32 = cso->alpha.ref_value;
+         cc.BlendConstantColorRed   = ice->state.blend_color.color[0];
+         cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
+         cc.BlendConstantColorBlue  = ice->state.blend_color.color[2];
+         cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
+      }
+      iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
+         ptr.ColorCalcStatePointer = cc_offset;
+         ptr.ColorCalcStatePointerValid = true;
+      }
+   }
+
+   // XXX: 3DSTATE_CONSTANT_XS
+   // Surfaces:
+   // - pull constants
+   // - ubos/ssbos/abos
+   // - images
+   // - textures
+   // - render targets - write and read
+   // XXX: 3DSTATE_BINDING_TABLE_POINTERS_XS
+
+   /* For every stage whose sampler-state dirty bit is set, build a table of
+    * SAMPLER_STATE entries in freshly allocated state space and emit the
+    * matching 3DSTATE_SAMPLER_STATE_POINTERS_* packet.
+    *
+    * Slot i of the table must hold sampler i, so the write cursor advances
+    * for every index, bound or not; otherwise samplers following an unbound
+    * one would land in the wrong slots.
+    */
+   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+      if (!(dirty & (IRIS_DIRTY_SAMPLER_STATES_VS << stage)))
+         continue;
+
+      // XXX: get sampler count from shader; don't emit them all...
+      const int count = IRIS_MAX_TEXTURE_SAMPLERS;
+
+      uint32_t offset;
+      uint32_t *map = iris_alloc_state(batch,
+                                       count * 4 * GENX(SAMPLER_STATE_length),
+                                       32, &offset);
+
+      for (int i = 0; i < count; i++) {
+         // XXX: when we have a correct count, these better be bound
+         if (ice->state.samplers[stage][i]) {
+            memcpy(map, ice->state.samplers[stage][i]->sampler_state,
+                   4 * GENX(SAMPLER_STATE_length));
+         } else {
+            /* Zero unbound slots rather than leaving whatever the
+             * allocation happened to contain.
+             */
+            memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
+         }
+         map += GENX(SAMPLER_STATE_length);
+      }
+
+      /* The VS packet template is reused for every stage by bumping the
+       * sub-opcode; presumably the per-stage packets are consecutive
+       * starting at 43 -- confirm against the genxml definitions.
+       */
+      iris_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
+         ptr._3DCommandSubOpcode = 43 + stage;
+         ptr.PointertoVSSamplerState = offset;
       }
    }
 
+   if (dirty & IRIS_DIRTY_MULTISAMPLE) {
+      iris_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
+         ms.PixelLocation =
+            ice->state.cso_rast->half_pixel_center ? CENTER : UL_CORNER;
+         if (ice->state.framebuffer.samples > 0)
+            ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
+      }
+   }
+
+   if (dirty & IRIS_DIRTY_SAMPLE_MASK) {
+      iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
+         ms.SampleMask = ice->state.sample_mask;
+      }
+   }
+
+   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+      if (!(dirty & (IRIS_DIRTY_VS << stage)))
+         continue;
+
+      if (ice->shaders.prog[stage]) {
+         iris_batch_emit(batch, ice->shaders.prog[stage]->derived_data,
+                         iris_derived_program_state_size(stage));
+      } else {
+         if (stage == MESA_SHADER_TESS_EVAL) {
+            iris_emit_cmd(batch, GENX(3DSTATE_HS), hs);
+            iris_emit_cmd(batch, GENX(3DSTATE_TE), te);
+            iris_emit_cmd(batch, GENX(3DSTATE_DS), ds);
+         } else if (stage == MESA_SHADER_GEOMETRY) {
+            iris_emit_cmd(batch, GENX(3DSTATE_GS), gs);
+         }
+      }
+   }
+
+   // XXX: SOL:
+   // 3DSTATE_STREAMOUT
+   // 3DSTATE_SO_BUFFER
+   // 3DSTATE_SO_DECL_LIST
+
    if (dirty & IRIS_DIRTY_CLIP) {
-      struct iris_rasterizer_state *cso = ice->state.cso_rast;
+      struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
+      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
 
       uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
       iris_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
-         //.NonPerspectiveBarycentricEnable = <comes from FS prog> :(
-         //.ForceZeroRTAIndexEnable = <comes from FB layers being 0>
-         // also userclip stuffs...
+         if (wm_prog_data->barycentric_interp_modes &
+             BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
+            cl.NonPerspectiveBarycentricEnable = true;
+
+         cl.ForceZeroRTAIndexEnable = cso_fb->layers == 0;
       }
-      iris_emit_merge(batch, cso->clip, dynamic_clip);
+      iris_emit_merge(batch, cso_rast->clip, dynamic_clip,
+                      ARRAY_SIZE(cso_rast->clip));
    }
 
    if (dirty & IRIS_DIRTY_RASTER) {
       struct iris_rasterizer_state *cso = ice->state.cso_rast;
       iris_batch_emit(batch, cso->raster, sizeof(cso->raster));
       iris_batch_emit(batch, cso->sf, sizeof(cso->sf));
+
+   }
+
+   if (dirty & (IRIS_DIRTY_RASTER | IRIS_DIRTY_FS)) {
+      struct iris_rasterizer_state *cso = ice->state.cso_rast;
+      uint32_t dynamic_wm[GENX(3DSTATE_WM_length)];
+
+      iris_pack_command(GENX(3DSTATE_WM), &dynamic_wm, wm) {
+         wm.BarycentricInterpolationMode =
+            wm_prog_data->barycentric_interp_modes;
+
+         if (wm_prog_data->early_fragment_tests)
+            wm.EarlyDepthStencilControl = EDSC_PREPS;
+         else if (wm_prog_data->has_side_effects)
+            wm.EarlyDepthStencilControl = EDSC_PSEXEC;
+      }
+      iris_emit_merge(batch, cso->wm, dynamic_wm, ARRAY_SIZE(cso->wm));
+   }
+
+   // XXX: 3DSTATE_SBE, 3DSTATE_SBE_SWIZ
+   // -> iris_raster_state (point sprite texture coordinate origin)
+   // -> bunch of shader state...
+
+   if (dirty & IRIS_DIRTY_PS_BLEND) {
+      struct iris_blend_state *cso = ice->state.cso_blend;
+      iris_batch_emit(batch, cso->ps_blend, sizeof(cso->ps_blend));
+   }
+
+   if (dirty & IRIS_DIRTY_WM_DEPTH_STENCIL) {
+      struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
+      struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
+
+      uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
+      iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
+         wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
+         wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
+      }
+      iris_emit_merge(batch, cso->wmds, stencil_refs, ARRAY_SIZE(cso->wmds));
    }
 
    if (dirty & IRIS_DIRTY_SCISSOR) {
@@ -1322,6 +1901,11 @@ iris_upload_render_state(struct iris_context *ice,
       }
    }
 
+   // XXX: 3DSTATE_DEPTH_BUFFER
+   // XXX: 3DSTATE_HIER_DEPTH_BUFFER
+   // XXX: 3DSTATE_STENCIL_BUFFER
+   // XXX: 3DSTATE_CLEAR_PARAMS
+
    if (dirty & IRIS_DIRTY_POLYGON_STIPPLE) {
       iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
          for (int i = 0; i < 32; i++) {
@@ -1332,39 +1916,50 @@ iris_upload_render_state(struct iris_context *ice,
 
    if (dirty & IRIS_DIRTY_LINE_STIPPLE) {
       struct iris_rasterizer_state *cso = ice->state.cso_rast;
-      iris_emit_cmd(batch, GENX(3DSTATE_LINE_STIPPLE), line) {
-         line.LineStipplePattern = cso->line_stipple_pattern;
-         line.LineStippleInverseRepeatCount = 1.0f / cso->line_stipple_factor;
-         line.LineStippleRepeatCount = cso->line_stipple_factor;
-      }
+      iris_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
    }
 
-   if (dirty & IRIS_DIRTY_COLOR_CALC_STATE) {
-      struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
-      uint32_t cc_offset;
-      void *cc_map =
-         iris_alloc_state(batch,
-                          sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
-                          64, &cc_offset);
-      iris_pack_state(GENX(COLOR_CALC_STATE), cc_map, cc) {
-         cc.AlphaTestFormat = ALPHATEST_FLOAT32;
-         cc.AlphaReferenceValueAsFLOAT32 = cso->alpha.ref_value;
-         cc.BlendConstantColorRed   = ice->state.blend_color.color[0];
-         cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
-         cc.BlendConstantColorBlue  = ice->state.blend_color.color[2];
-         cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
+   if (1) {
+      iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
+         topo.PrimitiveTopologyType =
+            translate_prim_type(draw->mode, draw->vertices_per_patch);
       }
-      iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
-         ptr.ColorCalcStatePointer = cc_offset;
-         ptr.ColorCalcStatePointerValid = true;
+   }
+
+   /* Indexed draw: point the hardware at the index buffer resource.
+    * User-pointer index buffers are not handled here (see the assert).
+    */
+   if (draw->index_size > 0) {
+      struct iris_resource *res = (struct iris_resource *)draw->index.resource;
+
+      assert(!draw->has_user_indices);
+
+      iris_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
+         /* index_size is the index width in bytes (1, 2 or 4), whereas
+          * IndexFormat is the hardware enum (BYTE = 0, WORD = 1, DWORD = 2),
+          * so convert rather than passing the byte count straight through.
+          */
+         ib.IndexFormat = draw->index_size >> 1;
+         ib.MOCS = MOCS_WB;
+         ib.BufferSize = res->bo->size;
+         ib.BufferStartingAddress = ro_bo(res->bo, 0);
       }
    }
 
    if (dirty & IRIS_DIRTY_VERTEX_BUFFERS) {
       struct iris_vertex_buffer_state *cso = ice->state.cso_vertex_buffers;
-      // XXX: address!!!
+
+      STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_length) == 4);
+      STATIC_ASSERT((GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) % 32) == 0);
+
+      uint64_t *addr = batch->cmdbuf.map_next + sizeof(uint32_t) *
+         (GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) / 32);
+      uint32_t *delta = cso->vertex_buffers +
+         (1 + GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) / 32);
+
       iris_batch_emit(batch, cso->vertex_buffers,
-                      sizeof(uint32_t) * cso->length);
+                      sizeof(uint32_t) * (1 + 4 * cso->num_buffers));
+
+      for (unsigned i = 0; i < cso->num_buffers; i++) {
+         *addr = iris_batch_reloc(batch, (void *) addr - batch->cmdbuf.map,
+                                  cso->bos[i].bo, cso->bos[i].offset +
+                                  *delta, cso->bos[i].reloc_flags);
+         addr = (void *) addr + 16;
+         delta = (void *) delta + 16;
+      }
    }
 
    if (dirty & IRIS_DIRTY_VERTEX_ELEMENTS) {
@@ -1372,8 +1967,8 @@ iris_upload_render_state(struct iris_context *ice,
       iris_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
                       (1 + cso->count * GENX(VERTEX_ELEMENT_STATE_length)));
       for (int i = 0; i < cso->count; i++) {
-         iris_batch_emit(batch, cso->vf_instancing[i],
-                         sizeof(cso->vf_instancing[0]));
+         iris_batch_emit(batch, cso->vf_instancing[i], sizeof(uint32_t) *
+                         (cso->count * GENX(3DSTATE_VF_INSTANCING_length)));
       }
       for (int i = 0; i < cso->count; i++) {
          /* TODO: vertexid, instanceid support */
@@ -1381,121 +1976,39 @@ iris_upload_render_state(struct iris_context *ice,
       }
    }
 
-   if (dirty & IRIS_DIRTY_MULTISAMPLE) {
-      iris_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
-         ms.PixelLocation =
-            ice->state.cso_rast->half_pixel_center ? CENTER : UL_CORNER;
-         ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
-      }
-   }
-
-   if (dirty & IRIS_DIRTY_SAMPLE_MASK) {
-      iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
-         ms.SampleMask = ice->state.sample_mask;
-      }
-   }
-
-   if (1) {
-      iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
-         topo.PrimitiveTopologyType =
-            translate_prim_type(draw->mode, draw->vertices_per_patch);
-      }
-   }
-
    if (1) {
       iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
-         vf.IndexedDrawCutIndexEnable = draw->primitive_restart;
-         vf.CutIndex = draw->restart_index;
+         if (draw->primitive_restart) {
+            vf.IndexedDrawCutIndexEnable = true;
+            vf.CutIndex = draw->restart_index;
+         }
       }
    }
 
-   // draw->index_size > 0
-   if (1) {
-      struct iris_resource *res = (struct iris_resource *)draw->index.resource;
-
-      assert(!draw->has_user_indices);
+   // XXX: Gen8 - PMA fix
 
-      iris_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
-         ib.IndexFormat = draw->index_size;
-         ib.MOCS = MOCS_WB;
-         ib.BufferSize = res->bo->size;
-         // XXX: gah, addresses :(  need two different combine address funcs
-         // ib.BufferStartingAddress = res->bo;
-      }
+   assert(!draw->indirect); // XXX: indirect support
 
-      assert(!draw->indirect); // XXX: indirect support
+   iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
+      prim.StartInstanceLocation = draw->start_instance;
+      prim.InstanceCount = draw->instance_count;
+      prim.VertexCountPerInstance = draw->count;
+      prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
 
-      iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
-         prim.StartInstanceLocation = draw->start_instance;
-         prim.InstanceCount = draw->instance_count;
+      // XXX: this is probably bonkers.
+      prim.StartVertexLocation = draw->start;
 
-         // XXX: this is probably bonkers.
-         prim.StartVertexLocation = draw->start;
-
-         if (draw->index_size) {
-            prim.BaseVertexLocation += draw->index_bias;
-         } else {
-            prim.StartVertexLocation += draw->index_bias;
-         }
-
-         //prim.BaseVertexLocation = ...;
+      if (draw->index_size) {
+         prim.BaseVertexLocation += draw->index_bias;
+      } else {
+         prim.StartVertexLocation += draw->index_bias;
       }
+
+      //prim.BaseVertexLocation = ...;
    }
-#if 0
-   l3 configuration
-
-   3DSTATE_PUSH_CONSTANT_ALLOC_*
-   3DSTATE_URB_*
-     -> TODO
-
-   3DSTATE_CONSTANT_* - push constants
-     -> TODO
-
-   Surfaces:
-   - pull constants
-   - ubos/ssbos/abos
-   - images
-   - textures
-   - render targets - write and read
-   3DSTATE_BINDING_TABLE_POINTERS_*
-     -> TODO
-
-   3DSTATE_SAMPLER_STATE_POINTERS_*
-     -> TODO
-
-   3DSTATE_VS
-   3DSTATE_HS
-   3DSTATE_TE
-   3DSTATE_DS
-   3DSTATE_GS
-   3DSTATE_PS_EXTRA
-   3DSTATE_PS
-   3DSTATE_STREAMOUT
-   3DSTATE_SO_BUFFER
-   3DSTATE_SO_DECL_LIST
-
-   3DSTATE_WM
-     -> iris_raster_state + FS state (barycentric, EDSC)
-   3DSTATE_SBE
-     -> iris_raster_state (point sprite texture coordinate origin)
-     -> bunch of shader state...
-   3DSTATE_SBE_SWIZ
-     -> FS state
-
-   3DSTATE_DEPTH_BUFFER
-   3DSTATE_HIER_DEPTH_BUFFER
-   3DSTATE_STENCIL_BUFFER
-   3DSTATE_CLEAR_PARAMS
-     -> iris_framebuffer_state?
-#endif
 }
 
 static void
-iris_bind_state(struct pipe_context *ctx, void *state)
-{
-}
-
-void
 iris_destroy_state(struct iris_context *ice)
 {
    // XXX: unreference resources/surfaces.
@@ -1506,8 +2019,10 @@ iris_destroy_state(struct iris_context *ice)
 }
 
 void
-iris_init_state_functions(struct pipe_context *ctx)
+genX(init_state)(struct iris_context *ice)
 {
+   struct pipe_context *ctx = &ice->ctx;
+
    ctx->create_blend_state = iris_create_blend_state;
    ctx->create_depth_stencil_alpha_state = iris_create_zsa_state;
    ctx->create_rasterizer_state = iris_create_rasterizer_state;
@@ -1519,14 +2034,9 @@ iris_init_state_functions(struct pipe_context *ctx)
    ctx->bind_blend_state = iris_bind_blend_state;
    ctx->bind_depth_stencil_alpha_state = iris_bind_zsa_state;
    ctx->bind_sampler_states = iris_bind_sampler_states;
-   ctx->bind_fs_state = iris_bind_state;
    ctx->bind_rasterizer_state = iris_bind_rasterizer_state;
    ctx->bind_vertex_elements_state = iris_bind_vertex_elements_state;
-   ctx->bind_compute_state = iris_bind_state;
-   ctx->bind_tcs_state = iris_bind_state;
-   ctx->bind_tes_state = iris_bind_state;
-   ctx->bind_gs_state = iris_bind_state;
-   ctx->bind_vs_state = iris_bind_state;
+   ctx->bind_compute_state = iris_bind_compute_state;
    ctx->delete_blend_state = iris_delete_state;
    ctx->delete_depth_stencil_alpha_state = iris_delete_state;
    ctx->delete_fs_state = iris_delete_state;
@@ -1556,4 +2066,18 @@ iris_init_state_functions(struct pipe_context *ctx)
    ctx->create_stream_output_target = iris_create_stream_output_target;
    ctx->stream_output_target_destroy = iris_stream_output_target_destroy;
    ctx->set_stream_output_targets = iris_set_stream_output_targets;
+
+   ice->state.destroy_state = iris_destroy_state;
+   ice->state.init_render_context = iris_init_render_context;
+   ice->state.upload_render_state = iris_upload_render_state;
+   ice->state.derived_program_state_size = iris_derived_program_state_size;
+   ice->state.set_derived_program_state = iris_set_derived_program_state;
+   ice->state.populate_vs_key = iris_populate_vs_key;
+   ice->state.populate_tcs_key = iris_populate_tcs_key;
+   ice->state.populate_tes_key = iris_populate_tes_key;
+   ice->state.populate_gs_key = iris_populate_gs_key;
+   ice->state.populate_fs_key = iris_populate_fs_key;
+
+
+   ice->state.dirty = ~0ull;
 }