iris: actually set cube bit properly
[mesa.git] / src / gallium / drivers / iris / iris_state.c
index 773ea358d42af42a535a4c73cde337ec8758ae99..fa80c5d4db9c1aa33158b5f9ed555fea96f15d85 100644 (file)
 #include <stdio.h>
 #include <errno.h>
 
-#ifdef HAVE_VALGRIND
+#if HAVE_VALGRIND
 #include <valgrind.h>
 #include <memcheck.h>
 #define VG(x) x
+#ifndef NDEBUG
 #define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
+#endif
 #else
 #define VG(x)
 #endif
 #include "pipe/p_context.h"
 #include "pipe/p_screen.h"
 #include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_framebuffer.h"
 #include "util/u_transfer.h"
+#include "util/u_upload_mgr.h"
 #include "i915_drm.h"
+#include "nir.h"
 #include "intel/compiler/brw_compiler.h"
 #include "intel/common/gen_l3_config.h"
 #include "intel/common/gen_sample_positions.h"
@@ -56,11 +62,15 @@ static uint64_t
 __gen_combine_address(struct iris_batch *batch, void *location,
                       struct iris_address addr, uint32_t delta)
 {
-   if (addr.bo == NULL)
-      return addr.offset + delta;
+   uint64_t result = addr.offset + delta;
+
+   if (addr.bo) {
+      iris_use_pinned_bo(batch, addr.bo, addr.write);
+      /* Assume this is a general address, not relative to a base. */
+      result += addr.bo->gtt_offset;
+   }
 
-   return iris_batch_reloc(batch, location - batch->cmdbuf.map, addr.bo,
-                           addr.offset + delta, addr.reloc_flags);
+   return result;
 }
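+
+/* Since iris pins BOs at fixed GPU addresses, combining an address reduces
+ * to adding the BO's gtt_offset instead of emitting a relocation; the only
+ * remaining bookkeeping is putting the BO on the batch's validation list.
+ * A hedged sketch of how the genxml-generated pack functions use this
+ * helper (illustrative, not part of this patch):
+ *
+ *    uint64_t *qw = (uint64_t *) &dw[2];
+ *    *qw = __gen_combine_address(batch, qw, ro_bo(bo, 0), delta);
+ */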
 
 #define __genxml_cmd_length(cmd) cmd ## _length
@@ -68,15 +78,6 @@ __gen_combine_address(struct iris_batch *batch, void *location,
 #define __genxml_cmd_header(cmd) cmd ## _header
 #define __genxml_cmd_pack(cmd) cmd ## _pack
 
-static void *
-get_command_space(struct iris_batch *batch, unsigned bytes)
-{
-   iris_require_command_space(batch, bytes);
-   void *map = batch->cmdbuf.map_next;
-   batch->cmdbuf.map_next += bytes;
-   return map;
-}
-
 #define _iris_pack_command(batch, cmd, dst, name)                 \
    for (struct cmd name = { __genxml_cmd_header(cmd) },           \
         *_dst = (void *)(dst); __builtin_expect(_dst != NULL, 1); \
@@ -94,36 +95,16 @@ get_command_space(struct iris_batch *batch, unsigned bytes)
         _dst = NULL)
 
 #define iris_emit_cmd(batch, cmd, name) \
-   _iris_pack_command(batch, cmd, get_command_space(batch, 4 * __genxml_cmd_length(cmd)), name)
+   _iris_pack_command(batch, cmd, iris_get_command_space(batch, 4 * __genxml_cmd_length(cmd)), name)
 
 #define iris_emit_merge(batch, dwords0, dwords1, num_dwords)   \
    do {                                                        \
-      uint32_t *dw = get_command_space(batch, 4 * num_dwords); \
+      uint32_t *dw = iris_get_command_space(batch, 4 * num_dwords); \
       for (uint32_t i = 0; i < num_dwords; i++)                \
          dw[i] = (dwords0)[i] | (dwords1)[i];                  \
       VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, num_dwords * 4)); \
    } while (0)
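+
+/* iris_emit_merge supports the pattern of packing "partial" commands at CSO
+ * creation time and OR'ing in the dynamic bits at draw time.  A hedged
+ * sketch, with illustrative names (the blend CSO's partial 3DSTATE_PS_BLEND
+ * is defined further below):
+ *
+ *    uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
+ *    iris_pack_command(GENX(3DSTATE_PS_BLEND), dynamic_pb, pb) {
+ *       pb.AlphaTestEnable = cso_zsa->alpha.enabled;
+ *    }
+ *    iris_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
+ *                    GENX(3DSTATE_PS_BLEND_length));
+ */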
 
-#define iris_emit_with_addr(batch, dwords, num_dw, addr_field, addr)    \
-   do {                                                                 \
-      STATIC_ASSERT((GENX(addr_field) % 64) == 0);                      \
-      assert(num_dw <= ARRAY_SIZE(dwords));                             \
-      int addr_idx = GENX(addr_field) / 32;                             \
-      uint32_t *dw = get_command_space(batch, 4 * num_dw);              \
-      for (uint32_t i = 0; i < addr_idx; i++) {                         \
-         dw[i] = (dwords)[i];                                           \
-      }                                                                 \
-      uint64_t *qw = (uint64_t *) &dw[addr_idx];                        \
-      *qw = iris_batch_reloc(batch, (void *)qw - batch->cmdbuf.map,     \
-                             addr.bo,                                   \
-                             addr.offset + (dwords)[addr_idx + 1],      \
-                             addr.reloc_flags);                         \
-      for (uint32_t i = addr_idx + 1; i < num_dw; i++) {                \
-         dw[i] = (dwords)[i];                                           \
-      }                                                                 \
-      VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, num_dw * 4));                \
-   } while (0)
-
 #include "genxml/genX_pack.h"
 #include "genxml/gen_macros.h"
 #include "genxml/genX_bits.h"
@@ -289,14 +270,73 @@ translate_fill_mode(unsigned pipe_polymode)
 }
 
 static struct iris_address
-ro_bo(struct iris_bo *bo, uint32_t offset)
+ro_bo(struct iris_bo *bo, uint64_t offset)
 {
+   /* CSOs must pass NULL for bo!  Otherwise the BO would be referenced at
+    * CSO creation time, instead of being added to the batch's validation
+    * list when the address is combined at draw time.
+    */
    return (struct iris_address) { .bo = bo, .offset = offset };
 }
 
+static void *
+upload_state(struct u_upload_mgr *uploader,
+             struct iris_state_ref *ref,
+             unsigned size,
+             unsigned alignment)
+{
+   void *p = NULL;
+   u_upload_alloc(uploader, 0, size, alignment, &ref->offset, &ref->res, &p);
+   return p;
+}
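+
+/* Note that upload_state returns an offset relative to the start of the
+ * underlying resource, not to the state base address; callers that hand
+ * the offset to hardware are expected to add
+ * iris_bo_offset_from_base_address() themselves, and to pin the BO (see
+ * stream_state below, which does both).
+ */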
+
+static uint32_t *
+stream_state(struct iris_batch *batch,
+             struct u_upload_mgr *uploader,
+             struct pipe_resource **out_res,
+             unsigned size,
+             unsigned alignment,
+             uint32_t *out_offset)
+{
+   void *ptr = NULL;
+
+   u_upload_alloc(uploader, 0, size, alignment, out_offset, out_res, &ptr);
+
+   struct iris_bo *bo = iris_resource_bo(*out_res);
+   iris_use_pinned_bo(batch, bo, false);
+
+   *out_offset += iris_bo_offset_from_base_address(bo);
+
+   return ptr;
+}
+
+static uint32_t
+emit_state(struct iris_batch *batch,
+           struct u_upload_mgr *uploader,
+           struct pipe_resource **out_res,
+           const void *data,
+           unsigned size,
+           unsigned alignment)
+{
+   unsigned offset = 0;
+   uint32_t *map =
+      stream_state(batch, uploader, out_res, size, alignment, &offset);
+
+   if (map)
+      memcpy(map, data, size);
+
+   return offset;
+}
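+
+/* A hedged usage sketch: streaming a complete, pre-packed CC_VIEWPORT into
+ * the dynamic state zone at draw time.  The names here are illustrative;
+ * the real call sites live in the draw-time upload code:
+ *
+ *    uint32_t cc_vp_offset =
+ *       emit_state(batch, ice->state.dynamic_uploader, &cc_vp_res,
+ *                  cso_zsa->cc_vp, sizeof(cso_zsa->cc_vp), 32);
+ *    // ...then point 3DSTATE_VIEWPORT_STATE_POINTERS_CC at cc_vp_offset.
+ */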
+
+#define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
+#define cso_changed_memcmp(x) \
+   (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
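+
+/* These helpers assume locals named old_cso and new_cso are in scope at the
+ * call site (see the bind functions below); the !old_cso check makes the
+ * very first bind treat everything as changed.
+ */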
+
 static void
-iris_emit_state_base_address(struct iris_batch *batch)
+iris_init_render_context(struct iris_screen *screen,
+                         struct iris_batch *batch,
+                         struct iris_vtable *vtbl,
+                         struct pipe_debug_callback *dbg)
 {
+   iris_init_batch(batch, screen, vtbl, dbg, I915_EXEC_RENDER);
+
    /* XXX: PIPE_CONTROLs */
 
    iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
@@ -322,23 +362,15 @@ iris_emit_state_base_address(struct iris_batch *batch)
       sba.IndirectObjectBufferSizeModifyEnable  = true;
       sba.InstructionBuffersizeModifyEnable     = true;
 
-      sba.SurfaceStateBaseAddress = ro_bo(batch->statebuf.bo, 0);
-      sba.DynamicStateBaseAddress = ro_bo(batch->statebuf.bo, 0);
+      sba.InstructionBaseAddress  = ro_bo(NULL, IRIS_MEMZONE_SHADER_START);
+      sba.SurfaceStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_SURFACE_START);
+      sba.DynamicStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_DYNAMIC_START);
 
       sba.GeneralStateBufferSize   = 0xfffff;
       sba.IndirectObjectBufferSize = 0xfffff;
       sba.InstructionBufferSize    = 0xfffff;
-      sba.DynamicStateBufferSize   = ALIGN(MAX_STATE_SIZE, 4096);
+      sba.DynamicStateBufferSize   = 0xfffff;
    }
-}
-
-static void
-iris_init_render_context(struct iris_screen *screen,
-                         struct iris_batch *batch,
-                         struct pipe_debug_callback *dbg)
-{
-   batch->emit_state_base_address = iris_emit_state_base_address;
-   iris_init_batch(batch, screen, dbg, I915_EXEC_RENDER);
 
    iris_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
       rect.ClippedDrawingRectangleXMax = UINT16_MAX;
@@ -367,6 +399,33 @@ iris_init_render_context(struct iris_screen *screen,
    }
 }
 
+struct iris_viewport_state {
+   uint32_t sf_cl_vp[GENX(SF_CLIP_VIEWPORT_length) * IRIS_MAX_VIEWPORTS];
+};
+
+struct iris_vertex_buffer_state {
+   uint32_t vertex_buffers[1 + 33 * GENX(VERTEX_BUFFER_STATE_length)];
+   struct pipe_resource *resources[33];
+   unsigned num_buffers;
+};
+
+struct iris_depth_buffer_state {
+   uint32_t packets[GENX(3DSTATE_DEPTH_BUFFER_length) +
+                    GENX(3DSTATE_STENCIL_BUFFER_length) +
+                    GENX(3DSTATE_HIER_DEPTH_BUFFER_length) +
+                    GENX(3DSTATE_CLEAR_PARAMS_length)];
+};
+
+/**
+ * State that can't be stored directly in iris_context because the data
+ * layout varies per generation.
+ */
+struct iris_genx_state {
+   struct iris_viewport_state viewport;
+   struct iris_vertex_buffer_state vertex_buffers;
+   struct iris_depth_buffer_state depth_buffer;
+};
+
 static void
 iris_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *info)
 {
@@ -383,7 +442,10 @@ iris_set_blend_color(struct pipe_context *ctx,
 }
 
 struct iris_blend_state {
+   /** Partial 3DSTATE_PS_BLEND */
    uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
+
+   /** Partial BLEND_STATE */
    uint32_t blend_state[GENX(BLEND_STATE_length) +
                         BRW_MAX_DRAW_BUFFERS * GENX(BLEND_STATE_ENTRY_length)];
 
@@ -443,10 +505,10 @@ iris_create_blend_state(struct pipe_context *ctx,
          be.DestinationBlendFactor      = state->rt[i].rgb_dst_factor;
          be.DestinationAlphaBlendFactor = state->rt[i].alpha_dst_factor;
 
-         be.WriteDisableRed   = state->rt[i].colormask & PIPE_MASK_R;
-         be.WriteDisableGreen = state->rt[i].colormask & PIPE_MASK_G;
-         be.WriteDisableBlue  = state->rt[i].colormask & PIPE_MASK_B;
-         be.WriteDisableAlpha = state->rt[i].colormask & PIPE_MASK_A;
+         be.WriteDisableRed   = !(state->rt[i].colormask & PIPE_MASK_R);
+         be.WriteDisableGreen = !(state->rt[i].colormask & PIPE_MASK_G);
+         be.WriteDisableBlue  = !(state->rt[i].colormask & PIPE_MASK_B);
+         be.WriteDisableAlpha = !(state->rt[i].colormask & PIPE_MASK_A);
       }
       blend_state += GENX(BLEND_STATE_ENTRY_length);
    }
@@ -459,15 +521,19 @@ iris_bind_blend_state(struct pipe_context *ctx, void *state)
 {
    struct iris_context *ice = (struct iris_context *) ctx;
    ice->state.cso_blend = state;
-   ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
-   ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;
+   ice->state.dirty |= IRIS_DIRTY_PS_BLEND;
+   ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
 }
 
 struct iris_depth_stencil_alpha_state {
+   /** Partial 3DSTATE_WM_DEPTH_STENCIL */
    uint32_t wmds[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
+
+   /** Complete CC_VIEWPORT */
    uint32_t cc_vp[GENX(CC_VIEWPORT_length)];
 
-   struct pipe_alpha_state alpha; /* to BLEND_STATE, 3DSTATE_PS_BLEND */
+   /** Outbound to BLEND_STATE, 3DSTATE_PS_BLEND, COLOR_CALC_STATE */
+   struct pipe_alpha_state alpha;
 };
 
 static void *
@@ -526,9 +592,11 @@ iris_bind_zsa_state(struct pipe_context *ctx, void *state)
    struct iris_depth_stencil_alpha_state *new_cso = state;
 
    if (new_cso) {
-      if (!old_cso || old_cso->alpha.ref_value != new_cso->alpha.ref_value) {
+      if (cso_changed(alpha.ref_value))
          ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
-      }
+
+      if (cso_changed(alpha.enabled))
+         ice->state.dirty |= IRIS_DIRTY_PS_BLEND | IRIS_DIRTY_BLEND_STATE;
    }
 
    ice->state.cso_zsa = new_cso;
@@ -548,6 +616,8 @@ struct iris_rasterizer_state {
    bool light_twoside; /* for shader state */
    bool rasterizer_discard; /* for 3DSTATE_STREAMOUT */
    bool half_pixel_center; /* for 3DSTATE_MULTISAMPLE */
+   bool line_stipple_enable;
+   bool poly_stipple_enable;
    enum pipe_sprite_coord_mode sprite_coord_mode; /* PIPE_SPRITE_* */
    uint16_t sprite_coord_enable;
 };
@@ -579,6 +649,8 @@ iris_create_rasterizer_state(struct pipe_context *ctx,
    cso->half_pixel_center = state->half_pixel_center;
    cso->sprite_coord_mode = state->sprite_coord_mode;
    cso->sprite_coord_enable = state->sprite_coord_enable;
+   cso->line_stipple_enable = state->line_stipple_enable;
+   cso->poly_stipple_enable = state->poly_stipple_enable;
 
    iris_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
       sf.StatisticsEnable = true;
@@ -593,15 +665,14 @@ iris_create_rasterizer_state(struct pipe_context *ctx,
       sf.PointWidth = state->point_size;
 
       if (state->flatshade_first) {
+         sf.TriangleFanProvokingVertexSelect = 1;
+      } else {
          sf.TriangleStripListProvokingVertexSelect = 2;
          sf.TriangleFanProvokingVertexSelect = 2;
          sf.LineStripListProvokingVertexSelect = 1;
-      } else {
-         sf.TriangleFanProvokingVertexSelect = 1;
       }
    }
 
-   /* COMPLETE! */
    iris_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
       rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
       rr.CullMode = translate_cull_mode(state->cull_face);
@@ -611,7 +682,7 @@ iris_create_rasterizer_state(struct pipe_context *ctx,
       rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
       rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
       rr.GlobalDepthOffsetEnablePoint = state->offset_point;
-      rr.GlobalDepthOffsetConstant = state->offset_units;
+      rr.GlobalDepthOffsetConstant = state->offset_units * 2;
       rr.GlobalDepthOffsetScale = state->offset_scale;
       rr.GlobalDepthOffsetClamp = state->offset_clamp;
       rr.SmoothPointEnable = state->point_smooth;
@@ -639,11 +710,11 @@ iris_create_rasterizer_state(struct pipe_context *ctx,
       cl.MaximumPointWidth = 255.875;
 
       if (state->flatshade_first) {
+         cl.TriangleFanProvokingVertexSelect = 1;
+      } else {
          cl.TriangleStripListProvokingVertexSelect = 2;
          cl.TriangleFanProvokingVertexSelect = 2;
          cl.LineStripListProvokingVertexSelect = 1;
-      } else {
-         cl.TriangleFanProvokingVertexSelect = 1;
       }
    }
 
@@ -680,19 +751,19 @@ iris_bind_rasterizer_state(struct pipe_context *ctx, void *state)
 
    if (new_cso) {
       /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
-      if (!old_cso || memcmp(old_cso->line_stipple, new_cso->line_stipple,
-                             sizeof(old_cso->line_stipple)) != 0) {
+      if (cso_changed_memcmp(line_stipple))
          ice->state.dirty |= IRIS_DIRTY_LINE_STIPPLE;
-      }
 
-      if (!old_cso ||
-          old_cso->half_pixel_center != new_cso->half_pixel_center) {
+      if (cso_changed(half_pixel_center))
          ice->state.dirty |= IRIS_DIRTY_MULTISAMPLE;
-      }
+
+      if (cso_changed(line_stipple_enable) || cso_changed(poly_stipple_enable))
+         ice->state.dirty |= IRIS_DIRTY_WM;
    }
 
    ice->state.cso_rast = new_cso;
    ice->state.dirty |= IRIS_DIRTY_RASTER;
+   ice->state.dirty |= IRIS_DIRTY_CLIP;
 }
 
 static uint32_t
@@ -705,8 +776,10 @@ translate_wrap(unsigned pipe_wrap)
       [PIPE_TEX_WRAP_CLAMP_TO_BORDER]        = TCM_CLAMP_BORDER,
       [PIPE_TEX_WRAP_MIRROR_REPEAT]          = TCM_MIRROR,
       [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE]   = TCM_MIRROR_ONCE,
-      [PIPE_TEX_WRAP_MIRROR_CLAMP]           = -1, // XXX: ???
-      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1, // XXX: ???
+
+      /* These are unsupported. */
+      [PIPE_TEX_WRAP_MIRROR_CLAMP]           = -1,
+      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
    };
    return map[pipe_wrap];
 }
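+
+/* Presumably the -1 sentinels must never reach SAMPLER_STATE packing; the
+ * MIRROR_CLAMP wrap modes would need to be rejected via pipe caps or
+ * lowered before translation (an assumption, not something this patch
+ * shows).
+ */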
@@ -740,7 +813,7 @@ struct iris_sampler_state {
 };
 
 static void *
-iris_create_sampler_state(struct pipe_context *pctx,
+iris_create_sampler_state(struct pipe_context *ctx,
                           const struct pipe_sampler_state *state)
 {
    struct iris_sampler_state *cso = CALLOC_STRUCT(iris_sampler_state);
@@ -748,6 +821,8 @@ iris_create_sampler_state(struct pipe_context *pctx,
    if (!cso)
       return NULL;
 
+   memcpy(&cso->base, state, sizeof(*state));
+
    STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
    STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
 
@@ -806,7 +881,7 @@ iris_create_sampler_state(struct pipe_context *pctx,
       samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
       samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);
 
-      //samp.BorderColorPointer = <<comes from elsewhere>>
+      /* .BorderColorPointer is filled in by iris_bind_sampler_states. */
    }
 
    return cso;
@@ -822,22 +897,74 @@ iris_bind_sampler_states(struct pipe_context *ctx,
    gl_shader_stage stage = stage_from_pipe(p_stage);
 
    assert(start + count <= IRIS_MAX_TEXTURE_SAMPLERS);
+   ice->state.num_samplers[stage] =
+      MAX2(ice->state.num_samplers[stage], start + count);
 
    for (int i = 0; i < count; i++) {
       ice->state.samplers[stage][start + i] = states[i];
    }
 
+   /* Assemble the SAMPLER_STATEs into a contiguous table that lives
+    * in the dynamic state memory zone, so we can point to it via the
+    * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
+    */
+   void *map = upload_state(ice->state.dynamic_uploader,
+                            &ice->state.sampler_table[stage],
+                            count * 4 * GENX(SAMPLER_STATE_length), 32);
+   if (unlikely(!map))
+      return;
+
+   struct pipe_resource *res = ice->state.sampler_table[stage].res;
+   ice->state.sampler_table[stage].offset +=
+      iris_bo_offset_from_base_address(iris_resource_bo(res));
+
+   /* Reserve enough space for all of the border colors to land in the
+    * same BO.
+    */
+   iris_border_color_pool_reserve(ice, IRIS_MAX_TEXTURE_SAMPLERS);
+
+   for (int i = 0; i < count; i++) {
+      struct iris_sampler_state *state = states[i];
+
+      /* Save a pointer to the iris_sampler_state; a few of its fields are
+       * needed to inform draw-time decisions.
+       */
+      ice->state.samplers[stage][start + i] = state;
+
+      if (!state) {
+         memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
+      } else if (!state->needs_border_color) {
+         memcpy(map, state->sampler_state, 4 * GENX(SAMPLER_STATE_length));
+      } else {
+         ice->state.need_border_colors = true;
+
+         /* Stream out the border color and merge the pointer. */
+         uint32_t offset =
+            iris_upload_border_color(ice, &state->base.border_color);
+
+         uint32_t dynamic[GENX(SAMPLER_STATE_length)];
+         iris_pack_state(GENX(SAMPLER_STATE), dynamic, dyns) {
+            dyns.BorderColorPointer = offset;
+         }
+
+         for (uint32_t j = 0; j < GENX(SAMPLER_STATE_length); j++)
+            ((uint32_t *) map)[j] = state->sampler_state[j] | dynamic[j];
+      }
+
+      /* map is a void *, so advance it in bytes */
+      map += 4 * GENX(SAMPLER_STATE_length);
+   }
+
    ice->state.dirty |= IRIS_DIRTY_SAMPLER_STATES_VS << stage;
 }
 
 struct iris_sampler_view {
    struct pipe_sampler_view pipe;
    struct isl_view view;
-   uint32_t surface_state[GENX(RENDER_SURFACE_STATE_length)];
+
+   /** The resource (BO) holding our SURFACE_STATE. */
+   struct iris_state_ref surface_state;
 };
 
 /**
- * Convert an swizzle enumeration (i.e. SWIZZLE_X) to one of the Gen7.5+
+ * Convert a swizzle enumeration (e.g. PIPE_SWIZZLE_X) to one of the Gen7.5+
  * "Shader Channel Select" enumerations (i.e. HSW_SCS_RED).  The mappings are
  *
  * SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_ZERO, SWIZZLE_ONE
@@ -860,6 +987,7 @@ iris_create_sampler_view(struct pipe_context *ctx,
                          struct pipe_resource *tex,
                          const struct pipe_sampler_view *tmpl)
 {
+   struct iris_context *ice = (struct iris_context *) ctx;
    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
    struct iris_resource *itex = (struct iris_resource *) tex;
    struct iris_sampler_view *isv = calloc(1, sizeof(struct iris_sampler_view));
@@ -888,34 +1016,38 @@ iris_create_sampler_view(struct pipe_context *ctx,
          .b = pipe_swizzle_to_isl_channel(tmpl->swizzle_b),
          .a = pipe_swizzle_to_isl_channel(tmpl->swizzle_a),
       },
-      .usage = ISL_SURF_USAGE_TEXTURE_BIT,
+      .usage = ISL_SURF_USAGE_TEXTURE_BIT |
+               (itex->surf.usage & ISL_SURF_USAGE_CUBE_BIT),
    };
 
-   isl_surf_fill_state(&screen->isl_dev, isv->surface_state,
+   void *map = upload_state(ice->state.surface_uploader, &isv->surface_state,
+                            4 * GENX(RENDER_SURFACE_STATE_length), 64);
+   if (unlikely(!map))
+      return NULL;
+
+   struct iris_bo *state_bo = iris_resource_bo(isv->surface_state.res);
+   isv->surface_state.offset += iris_bo_offset_from_base_address(state_bo);
+
+   isl_surf_fill_state(&screen->isl_dev, map,
                        .surf = &itex->surf, .view = &isv->view,
-                       .mocs = MOCS_WB);
-                       // .address = ...
+                       .mocs = MOCS_WB,
+                       .address = itex->bo->gtt_offset);
                        // .aux_surf =
                        // .clear_color = clear_color,
 
    return &isv->pipe;
 }
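+
+/* The stored surface_state.offset is relative to Surface State Base Address
+ * (thanks to iris_bo_offset_from_base_address above), so at draw time it
+ * can be copied directly into a binding table entry (see use_sampler_view
+ * below).
+ */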
 
-struct iris_surface {
-   struct pipe_surface pipe;
-   struct isl_view view;
-   uint32_t surface_state[GENX(RENDER_SURFACE_STATE_length)];
-};
-
 static struct pipe_surface *
 iris_create_surface(struct pipe_context *ctx,
                     struct pipe_resource *tex,
                     const struct pipe_surface *tmpl)
 {
+   struct iris_context *ice = (struct iris_context *) ctx;
    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
    struct iris_surface *surf = calloc(1, sizeof(struct iris_surface));
    struct pipe_surface *psurf = &surf->pipe;
-   struct iris_resource *itex = (struct iris_resource *) tex;
+   struct iris_resource *res = (struct iris_resource *) tex;
 
    if (!surf)
       return NULL;
@@ -931,6 +1063,14 @@ iris_create_surface(struct pipe_context *ctx,
    psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
    psurf->u.tex.level = tmpl->u.tex.level;
 
+   unsigned usage = 0;
+   if (tmpl->writable)
+      usage = ISL_SURF_USAGE_STORAGE_BIT;
+   else if (util_format_is_depth_or_stencil(tmpl->format))
+      usage = ISL_SURF_USAGE_DEPTH_BIT;
+   else
+      usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
+
    surf->view = (struct isl_view) {
       .format = iris_isl_format_for_pipe_format(tmpl->format),
       .base_level = tmpl->u.tex.level,
@@ -938,14 +1078,27 @@ iris_create_surface(struct pipe_context *ctx,
       .base_array_layer = tmpl->u.tex.first_layer,
       .array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1,
       .swizzle = ISL_SWIZZLE_IDENTITY,
-      // XXX: DEPTH_BIt, STENCIL_BIT...CUBE_BIT?  Other bits?!
-      .usage = ISL_SURF_USAGE_RENDER_TARGET_BIT,
+      .usage = usage,
    };
 
-   isl_surf_fill_state(&screen->isl_dev, surf->surface_state,
-                       .surf = &itex->surf, .view = &surf->view,
-                       .mocs = MOCS_WB);
-                       // .address = ...
+   /* Bail early for depth/stencil */
+   if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
+                          ISL_SURF_USAGE_STENCIL_BIT))
+      return psurf;
+
+   void *map = upload_state(ice->state.surface_uploader, &surf->surface_state,
+                            4 * GENX(RENDER_SURFACE_STATE_length), 64);
+   if (unlikely(!map))
+      return NULL;
+
+   struct iris_bo *state_bo = iris_resource_bo(surf->surface_state.res);
+   surf->surface_state.offset += iris_bo_offset_from_base_address(state_bo);
+
+   isl_surf_fill_state(&screen->isl_dev, map,
+                       .surf = &res->surf, .view = &surf->view,
+                       .mocs = MOCS_WB,
+                       .address = res->bo->gtt_offset);
                        // .aux_surf =
                        // .clear_color = clear_color,
 
@@ -954,10 +1107,26 @@ iris_create_surface(struct pipe_context *ctx,
 
 static void
 iris_set_sampler_views(struct pipe_context *ctx,
-                       enum pipe_shader_type shader,
+                       enum pipe_shader_type p_stage,
                        unsigned start, unsigned count,
                        struct pipe_sampler_view **views)
 {
+   struct iris_context *ice = (struct iris_context *) ctx;
+   gl_shader_stage stage = stage_from_pipe(p_stage);
+
+   unsigned i;
+   for (i = 0; i < count; i++) {
+      pipe_sampler_view_reference((struct pipe_sampler_view **)
+                                  &ice->state.textures[stage][i], views[i]);
+   }
+   for (; i < ice->state.num_textures[stage]; i++) {
+      pipe_sampler_view_reference((struct pipe_sampler_view **)
+                                  &ice->state.textures[stage][i], NULL);
+   }
+
+   ice->state.num_textures[stage] = count;
+
+   ice->state.dirty |= (IRIS_DIRTY_BINDINGS_VS << stage);
 }
 
 static void
@@ -992,8 +1161,6 @@ iris_set_scissor_states(struct pipe_context *ctx,
 {
    struct iris_context *ice = (struct iris_context *) ctx;
 
-   ice->state.num_scissors = num_scissors;
-
    for (unsigned i = 0; i < num_scissors; i++) {
       ice->state.scissors[start_slot + i] = states[i];
    }
@@ -1010,15 +1177,10 @@ iris_set_stencil_ref(struct pipe_context *ctx,
    ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;
 }
 
-
-struct iris_viewport_state {
-   uint32_t sf_cl_vp[GENX(SF_CLIP_VIEWPORT_length)];
-};
-
 static float
-extent_from_matrix(const struct pipe_viewport_state *state, int axis)
+viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
 {
-   return fabsf(state->scale[axis]) * state->translate[axis];
+   return copysignf(state->scale[axis], sign) + state->translate[axis];
 }
 
 #if 0
@@ -1106,25 +1268,23 @@ calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
 static void
 iris_set_viewport_states(struct pipe_context *ctx,
                          unsigned start_slot,
-                         unsigned num_viewports,
-                         const struct pipe_viewport_state *state)
+                         unsigned count,
+                         const struct pipe_viewport_state *states)
 {
    struct iris_context *ice = (struct iris_context *) ctx;
-   struct iris_viewport_state *cso =
-      malloc(sizeof(struct iris_viewport_state));
+   struct iris_viewport_state *cso = &ice->state.genx->viewport;
+   uint32_t *vp_map =
+      &cso->sf_cl_vp[start_slot * GENX(SF_CLIP_VIEWPORT_length)];
 
-   // XXX: sf_cl_vp is only big enough for one slot, we don't iterate right
-   for (unsigned i = start_slot; i < start_slot + num_viewports; i++) {
-      float x_extent = extent_from_matrix(&state[i], 0);
-      float y_extent = extent_from_matrix(&state[i], 1);
-
-      iris_pack_state(GENX(SF_CLIP_VIEWPORT), cso->sf_cl_vp, vp) {
-         vp.ViewportMatrixElementm00 = state[i].scale[0];
-         vp.ViewportMatrixElementm11 = state[i].scale[1];
-         vp.ViewportMatrixElementm22 = state[i].scale[2];
-         vp.ViewportMatrixElementm30 = state[i].translate[0];
-         vp.ViewportMatrixElementm31 = state[i].translate[1];
-         vp.ViewportMatrixElementm32 = state[i].translate[2];
+   for (unsigned i = 0; i < count; i++) {
+      const struct pipe_viewport_state *state = &states[start_slot + i];
+      iris_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp) {
+         vp.ViewportMatrixElementm00 = state->scale[0];
+         vp.ViewportMatrixElementm11 = state->scale[1];
+         vp.ViewportMatrixElementm22 = state->scale[2];
+         vp.ViewportMatrixElementm30 = state->translate[0];
+         vp.ViewportMatrixElementm31 = state->translate[1];
+         vp.ViewportMatrixElementm32 = state->translate[2];
          /* XXX: in i965 this is computed based on the drawbuffer size,
           * but we don't have that here...
           */
@@ -1132,31 +1292,25 @@ iris_set_viewport_states(struct pipe_context *ctx,
          vp.XMaxClipGuardband = 1.0;
          vp.YMinClipGuardband = -1.0;
          vp.YMaxClipGuardband = 1.0;
-         vp.XMinViewPort = -x_extent;
-         vp.XMaxViewPort =  x_extent;
-         vp.YMinViewPort = -y_extent;
-         vp.YMaxViewPort =  y_extent;
+         vp.XMinViewPort = viewport_extent(state, 0, -1.0f);
+         vp.XMaxViewPort = viewport_extent(state, 0,  1.0f) - 1;
+         vp.YMinViewPort = viewport_extent(state, 1, -1.0f);
+         vp.YMaxViewPort = viewport_extent(state, 1,  1.0f) - 1;
       }
+
+      vp_map += GENX(SF_CLIP_VIEWPORT_length);
    }
 
-   ice->state.cso_vp = cso;
-   // XXX: start_slot
-   ice->state.num_viewports = num_viewports;
    ice->state.dirty |= IRIS_DIRTY_SF_CL_VIEWPORT;
 }
 
-struct iris_depth_state
-{
-   uint32_t depth_buffer[GENX(3DSTATE_DEPTH_BUFFER_length)];
-   uint32_t hier_depth_buffer[GENX(3DSTATE_HIER_DEPTH_BUFFER_length)];
-   uint32_t stencil_buffer[GENX(3DSTATE_STENCIL_BUFFER_length)];
-};
-
 static void
 iris_set_framebuffer_state(struct pipe_context *ctx,
                            const struct pipe_framebuffer_state *state)
 {
    struct iris_context *ice = (struct iris_context *) ctx;
+   struct iris_screen *screen = (struct iris_screen *)ctx->screen;
+   struct isl_device *isl_dev = &screen->isl_dev;
    struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
 
    if (cso->samples != state->samples) {
@@ -1167,50 +1321,138 @@ iris_set_framebuffer_state(struct pipe_context *ctx,
       ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
    }
 
-   cso->width = state->width;
-   cso->height = state->height;
-   cso->layers = state->layers;
-   cso->samples = state->samples;
+   if ((cso->layers == 0) != (state->layers == 0)) {
+      ice->state.dirty |= IRIS_DIRTY_CLIP;
+   }
 
-   unsigned i;
-   for (i = 0; i < state->nr_cbufs; i++)
-      pipe_surface_reference(&cso->cbufs[i], state->cbufs[i]);
-   for (; i < cso->nr_cbufs; i++)
-      pipe_surface_reference(&cso->cbufs[i], NULL);
+   util_copy_framebuffer_state(cso, state);
 
-   cso->nr_cbufs = state->nr_cbufs;
+   struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;
 
-   pipe_surface_reference(&cso->zsbuf, state->zsbuf);
+   struct isl_view view = {
+      .base_level = 0,
+      .levels = 1,
+      .base_array_layer = 0,
+      .array_len = 1,
+      .swizzle = ISL_SWIZZLE_IDENTITY,
+   };
 
    struct isl_depth_stencil_hiz_emit_info info = {
+      .view = &view,
       .mocs = MOCS_WB,
    };
 
-   // XXX: depth buffers
+   struct iris_resource *zres =
+      (void *) (cso->zsbuf ? cso->zsbuf->texture : NULL);
+
+   if (zres) {
+      view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
+
+      info.depth_surf = &zres->surf;
+      info.depth_address = zres->bo->gtt_offset;
+
+      view.format = zres->surf.format;
+
+      view.base_level = cso->zsbuf->u.tex.level;
+      view.base_array_layer = cso->zsbuf->u.tex.first_layer;
+      view.array_len =
+         cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
+
+      info.hiz_usage = ISL_AUX_USAGE_NONE;
+   }
+
+#if 0
+   if (stencil_mt) {
+      view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
+      info.stencil_surf = &stencil_mt->surf;
+
+      if (!depth_mt) {
+         view.base_level = stencil_irb->mt_level - stencil_irb->mt->first_level;
+         view.base_array_layer = stencil_irb->mt_layer;
+         view.array_len = MAX2(stencil_irb->layer_count, 1);
+         view.format = stencil_mt->surf.format;
+      }
+
+      uint32_t stencil_offset = 0;
+      info.stencil_address = stencil_mt->bo->gtt_offset + stencil_mt->offset;
+   }
+#endif
+
+   isl_emit_depth_stencil_hiz_s(isl_dev, cso_z->packets, &info);
+
+   ice->state.dirty |= IRIS_DIRTY_DEPTH_BUFFER;
+
+   /* Render target change */
+   ice->state.dirty |= IRIS_DIRTY_BINDINGS_FS;
 }
 
 static void
 iris_set_constant_buffer(struct pipe_context *ctx,
-                         enum pipe_shader_type shader, uint index,
-                         const struct pipe_constant_buffer *cb)
+                         enum pipe_shader_type p_stage, unsigned index,
+                         const struct pipe_constant_buffer *input)
 {
-}
+   struct iris_context *ice = (struct iris_context *) ctx;
+   struct iris_screen *screen = (struct iris_screen *)ctx->screen;
+   gl_shader_stage stage = stage_from_pipe(p_stage);
+   struct iris_shader_state *shs = &ice->shaders.state[stage];
+   struct iris_const_buffer *cbuf = &shs->constbuf[index];
+
+   if (input && (input->buffer || input->user_buffer)) {
+      if (input->user_buffer) {
+         u_upload_data(ctx->const_uploader, 0, input->buffer_size, 32,
+                       input->user_buffer, &cbuf->data.offset,
+                       &cbuf->data.res);
+      } else {
+         pipe_resource_reference(&cbuf->data.res, input->buffer);
+      }
+
+      // XXX: these are not retained forever, use a separate uploader?
+      void *map =
+         upload_state(ice->state.surface_uploader, &cbuf->surface_state,
+                      4 * GENX(RENDER_SURFACE_STATE_length), 64);
+      if (unlikely(!map)) {
+         pipe_resource_reference(&cbuf->data.res, NULL);
+         return;
+      }
+
+      struct iris_resource *res = (void *) cbuf->data.res;
+      struct iris_bo *surf_bo = iris_resource_bo(cbuf->surface_state.res);
+      cbuf->surface_state.offset += iris_bo_offset_from_base_address(surf_bo);
+
+      isl_buffer_fill_state(&screen->isl_dev, map,
+                            .address = res->bo->gtt_offset + cbuf->data.offset,
+                            .size_B = input->buffer_size,
+                            .format = ISL_FORMAT_R32G32B32A32_FLOAT,
+                            .stride_B = 1,
+                            .mocs = MOCS_WB);
+   } else {
+      pipe_resource_reference(&cbuf->data.res, NULL);
+      pipe_resource_reference(&cbuf->surface_state.res, NULL);
+   }
 
+   ice->state.dirty |= IRIS_DIRTY_CONSTANTS_VS << stage;
+   // XXX: maybe not necessary all the time...?
+   ice->state.dirty |= IRIS_DIRTY_BINDINGS_VS << stage;
+}
 
 static void
 iris_sampler_view_destroy(struct pipe_context *ctx,
                           struct pipe_sampler_view *state)
 {
+   struct iris_sampler_view *isv = (void *) state;
    pipe_resource_reference(&state->texture, NULL);
-   free(state);
+   pipe_resource_reference(&isv->surface_state.res, NULL);
+   free(isv);
 }
 
 
 static void
-iris_surface_destroy(struct pipe_context *ctx, struct pipe_surface *surface)
+iris_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
 {
-   pipe_resource_reference(&surface->texture, NULL);
-   free(surface);
+   struct iris_surface *surf = (void *) p_surf;
+   pipe_resource_reference(&p_surf->texture, NULL);
+   pipe_resource_reference(&surf->surface_state.res, NULL);
+   free(surf);
 }
 
 static void
@@ -1219,20 +1461,11 @@ iris_delete_state(struct pipe_context *ctx, void *state)
    free(state);
 }
 
-struct iris_vertex_buffer_state {
-   uint32_t vertex_buffers[1 + 33 * GENX(VERTEX_BUFFER_STATE_length)];
-   struct iris_address bos[33];
-   unsigned num_buffers;
-};
-
 static void
 iris_free_vertex_buffers(struct iris_vertex_buffer_state *cso)
 {
-   if (cso) {
-      for (unsigned i = 0; i < cso->num_buffers; i++)
-         iris_bo_unreference(cso->bos[i].bo);
-      free(cso);
-   }
+   for (unsigned i = 0; i < cso->num_buffers; i++)
+      pipe_resource_reference(&cso->resources[i], NULL);
 }
 
 static void
@@ -1241,32 +1474,34 @@ iris_set_vertex_buffers(struct pipe_context *ctx,
                         const struct pipe_vertex_buffer *buffers)
 {
    struct iris_context *ice = (struct iris_context *) ctx;
-   struct iris_vertex_buffer_state *cso =
-      malloc(sizeof(struct iris_vertex_buffer_state));
+   struct iris_vertex_buffer_state *cso = &ice->state.genx->vertex_buffers;
 
-   /* If there are no buffers, do nothing.  We can leave the stale
-    * 3DSTATE_VERTEX_BUFFERS in place - as long as there are no vertex
-    * elements that point to them, it should be fine.
-    */
-   if (!buffers)
-      return;
+   iris_free_vertex_buffers(&ice->state.genx->vertex_buffers);
 
-   iris_free_vertex_buffers(ice->state.cso_vertex_buffers);
+   if (!buffers)
+      count = 0;
 
    cso->num_buffers = count;
 
    iris_pack_command(GENX(3DSTATE_VERTEX_BUFFERS), cso->vertex_buffers, vb) {
-      vb.DWordLength = 4 * cso->num_buffers - 1;
+      vb.DWordLength = 4 * MAX2(cso->num_buffers, 1) - 1;
    }
 
    uint32_t *vb_pack_dest = &cso->vertex_buffers[1];
 
+   if (count == 0) {
+      iris_pack_state(GENX(VERTEX_BUFFER_STATE), vb_pack_dest, vb) {
+         vb.VertexBufferIndex = start_slot;
+         vb.NullVertexBuffer = true;
+         vb.AddressModifyEnable = true;
+      }
+   }
+
    for (unsigned i = 0; i < count; i++) {
       assert(!buffers[i].is_user_buffer);
 
-      struct iris_resource *res = (void *) buffers[i].buffer.resource;
-      iris_bo_reference(res->bo);
-      cso->bos[i] = ro_bo(res->bo, buffers[i].buffer_offset);
+      pipe_resource_reference(&cso->resources[i], buffers[i].buffer.resource);
+      struct iris_resource *res = (void *) cso->resources[i];
 
       iris_pack_state(GENX(VERTEX_BUFFER_STATE), vb_pack_dest, vb) {
          vb.VertexBufferIndex = start_slot + i;
@@ -1274,19 +1509,19 @@ iris_set_vertex_buffers(struct pipe_context *ctx,
          vb.AddressModifyEnable = true;
          vb.BufferPitch = buffers[i].stride;
          vb.BufferSize = res->bo->size;
-         /* vb.BufferStartingAddress is filled in at draw time */
+         vb.BufferStartingAddress =
+            ro_bo(NULL, res->bo->gtt_offset + buffers[i].buffer_offset);
       }
 
       vb_pack_dest += GENX(VERTEX_BUFFER_STATE_length);
    }
 
-   ice->state.cso_vertex_buffers = cso;
    ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS;
 }
 
 struct iris_vertex_element_state {
    uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
-   uint32_t vf_instancing[GENX(3DSTATE_VF_INSTANCING_length)][33];
+   uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
    unsigned count;
 };
 
@@ -1298,20 +1533,37 @@ iris_create_vertex_elements(struct pipe_context *ctx,
    struct iris_vertex_element_state *cso =
       malloc(sizeof(struct iris_vertex_element_state));
 
-   cso->count = count;
+   cso->count = MAX2(count, 1);
 
    /* TODO:
     *  - create edge flag one
     *  - create SGV ones
     *  - if those are necessary, use count + 1/2/3... OR in the length
     */
-   iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve);
+   iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
+      ve.DWordLength = 1 + GENX(VERTEX_ELEMENT_STATE_length) * cso->count - 2;
+   }
 
    uint32_t *ve_pack_dest = &cso->vertex_elements[1];
+   uint32_t *vfi_pack_dest = cso->vf_instancing;
+
+   if (count == 0) {
+      iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
+         ve.Valid = true;
+         ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
+         ve.Component0Control = VFCOMP_STORE_0;
+         ve.Component1Control = VFCOMP_STORE_0;
+         ve.Component2Control = VFCOMP_STORE_0;
+         ve.Component3Control = VFCOMP_STORE_1_FP;
+      }
+
+      iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
+      }
+   }
 
    for (int i = 0; i < count; i++) {
       enum isl_format isl_format =
-            iris_isl_format_for_pipe_format(state[i].src_format);
+         iris_isl_format_for_pipe_format(state[i].src_format);
       unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
                            VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
 
@@ -1335,13 +1587,14 @@ iris_create_vertex_elements(struct pipe_context *ctx,
          ve.Component3Control = comp[3];
       }
 
-      iris_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->vf_instancing[i], vi) {
+      iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
          vi.VertexElementIndex = i;
          vi.InstancingEnable = state[i].instance_divisor > 0;
          vi.InstanceDataStepRate = state[i].instance_divisor;
       }
 
       ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
+      vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
    }
 
    return cso;
@@ -1397,20 +1650,190 @@ iris_set_stream_output_targets(struct pipe_context *ctx,
 {
 }
 
-#if 0
 static void
-iris_compute_sbe(const struct iris_context *ice,
-                 const struct brw_wm_prog_data *wm_prog_data)
+iris_compute_sbe_urb_read_interval(uint64_t fs_input_slots,
+                                   const struct brw_vue_map *last_vue_map,
+                                   bool two_sided_color,
+                                   unsigned *out_offset,
+                                   unsigned *out_length)
+{
+   /* The compiler computes the first URB slot without considering COL/BFC
+    * swizzling (because it doesn't know whether it's enabled), so we need
+    * to do that here too.  This may result in a smaller offset, which
+    * should be safe.
+    */
+   const unsigned first_slot =
+      brw_compute_first_urb_slot_required(fs_input_slots, last_vue_map);
+
+   /* This becomes the URB read offset (counted in pairs of slots). */
+   assert(first_slot % 2 == 0);
+   *out_offset = first_slot / 2;
+
+   /* We need to adjust the inputs read to account for front/back color
+    * swizzling, as it can make the URB length longer.
+    */
+   for (int c = 0; c <= 1; c++) {
+      if (fs_input_slots & (VARYING_BIT_COL0 << c)) {
+         /* If two sided color is enabled, the fragment shader's gl_Color
+          * (COL0) input comes from either the gl_FrontColor (COL0) or
+          * gl_BackColor (BFC0) input varyings.  Mark BFC as used, too.
+          */
+         if (two_sided_color)
+            fs_input_slots |= (VARYING_BIT_BFC0 << c);
+
+         /* If front color isn't written, we opt to give them back color
+          * instead of an undefined value.  Switch from COL to BFC.
+          */
+         if (last_vue_map->varying_to_slot[VARYING_SLOT_COL0 + c] == -1) {
+            fs_input_slots &= ~(VARYING_BIT_COL0 << c);
+            fs_input_slots |= (VARYING_BIT_BFC0 << c);
+         }
+      }
+   }
+
+   /* Compute the minimum URB Read Length necessary for the FS inputs.
+    *
+    * From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
+    * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
+    *
+    * "This field should be set to the minimum length required to read the
+    *  maximum source attribute.  The maximum source attribute is indicated
+    *  by the maximum value of the enabled Attribute # Source Attribute if
+    *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
+    *  enable is not set.
+    *  read_length = ceiling((max_source_attr + 1) / 2)
+    *
+    *  [errata] Corruption/Hang possible if length programmed larger than
+    *  recommended"
+    *
+    * Similar text exists for Ivy Bridge.
+    *
+    * We find the last URB slot that's actually read by the FS.
+    */
+   unsigned last_read_slot = last_vue_map->num_slots - 1;
+   while (last_read_slot > first_slot && !(fs_input_slots &
+          (1ull << last_vue_map->slot_to_varying[last_read_slot])))
+      --last_read_slot;
+
+   /* The URB read length is the difference of the two, counted in pairs. */
+   *out_length = DIV_ROUND_UP(last_read_slot - first_slot + 1, 2);
+}
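+
+/* A worked example (hedged): if the last VUE map places COL0 in slot 4 and
+ * the FS reads only COL0, then first_slot = 4, *out_offset = 2 (counted in
+ * pairs of slots), last_read_slot walks back down to 4, and
+ * *out_length = DIV_ROUND_UP(4 - 4 + 1, 2) = 1.
+ */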
+
+static void
+iris_emit_sbe_swiz(struct iris_batch *batch,
+                   const struct iris_context *ice,
+                   unsigned urb_read_offset)
+{
+   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = {};
+   const struct brw_wm_prog_data *wm_prog_data = (void *)
+      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
+   const struct brw_vue_map *vue_map = ice->shaders.last_vue_map;
+   const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
+
+   /* XXX: this should be generated when putting programs in place */
+
+   // XXX: raster->sprite_coord_enable
+
+   for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) {
+      const int input_index = wm_prog_data->urb_setup[fs_attr];
+      if (input_index < 0 || input_index >= 16)
+         continue;
+
+      struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr =
+         &attr_overrides[input_index];
+
+      /* Viewport and Layer are stored in the VUE header.  We need to override
+       * them to zero if earlier stages didn't write them, as GL requires that
+       * they read back as zero when not explicitly set.
+       */
+      switch (fs_attr) {
+      case VARYING_SLOT_VIEWPORT:
+      case VARYING_SLOT_LAYER:
+         attr->ComponentOverrideX = true;
+         attr->ComponentOverrideW = true;
+         attr->ConstantSource = CONST_0000;
+
+         if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
+            attr->ComponentOverrideY = true;
+         if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
+            attr->ComponentOverrideZ = true;
+         continue;
+
+      case VARYING_SLOT_PRIMITIVE_ID:
+         attr->ComponentOverrideX = true;
+         attr->ComponentOverrideY = true;
+         attr->ComponentOverrideZ = true;
+         attr->ComponentOverrideW = true;
+         attr->ConstantSource = PRIM_ID;
+         continue;
+
+      default:
+         break;
+      }
+
+      int slot = vue_map->varying_to_slot[fs_attr];
+
+      /* If there was only a back color written but not front, use back
+       * as the color instead of undefined.
+       */
+      if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
+         slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
+      if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
+         slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
+
+      /* Not written by the previous stage - undefined. */
+      if (slot == -1) {
+         attr->ComponentOverrideX = true;
+         attr->ComponentOverrideY = true;
+         attr->ComponentOverrideZ = true;
+         attr->ComponentOverrideW = true;
+         attr->ConstantSource = CONST_0001_FLOAT;
+         continue;
+      }
+
+      /* Compute the location of the attribute relative to the read offset,
+       * which is counted in 256-bit increments (two 128-bit VUE slots).
+       */
+      const int source_attr = slot - 2 * urb_read_offset;
+      assert(source_attr >= 0 && source_attr <= 32);
+      attr->SourceAttribute = source_attr;
+
+      /* If we are doing two-sided color, and the VUE slot following this one
+       * represents a back-facing color, then we need to instruct the SF unit
+       * to do back-facing swizzling.
+       */
+      if (cso_rast->light_twoside &&
+          ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
+            vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
+           (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
+            vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1)))
+         attr->SwizzleSelect = INPUTATTR_FACING;
+   }
+
+   iris_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
+      for (int i = 0; i < 16; i++)
+         sbes.Attribute[i] = attr_overrides[i];
+   }
+}
+
+static void
+iris_emit_sbe(struct iris_batch *batch, const struct iris_context *ice)
 {
-   uint32_t sbe_map[GENX(3DSTATE_SBE_length)];
-   struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
+   const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
+   const struct brw_wm_prog_data *wm_prog_data = (void *)
+      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
+   struct pipe_shader_state *p_fs =
+      (void *) ice->shaders.uncompiled[MESA_SHADER_FRAGMENT];
+   assert(p_fs->type == PIPE_SHADER_IR_NIR);
+   nir_shader *fs_nir = p_fs->ir.nir;
 
    unsigned urb_read_offset, urb_read_length;
-   brw_compute_sbe_urb_slot_interval(fp->info.inputs_read,
-                                     ice->shaders.last_vue_map,
-                                     &urb_read_offset, &urb_read_length);
+   iris_compute_sbe_urb_read_interval(fs_nir->info.inputs_read,
+                                      ice->shaders.last_vue_map,
+                                      cso_rast->light_twoside,
+                                      &urb_read_offset, &urb_read_length);
 
-   iris_pack_command(GENX(3DSTATE_SBE), sbe_map, sbe) {
+   iris_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
       sbe.AttributeSwizzleEnable = true;
       sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
       sbe.PointSpriteTextureCoordinateOrigin = cso_rast->sprite_coord_mode;
@@ -1420,23 +1843,34 @@ iris_compute_sbe(const struct iris_context *ice,
       sbe.ForceVertexURBEntryReadLength = true;
       sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
 
-      for (int i = 0; i < urb_read_length * 2; i++) {
+      for (int i = 0; i < 32; i++) {
          sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW;
       }
    }
+
+   iris_emit_sbe_swiz(batch, ice, urb_read_offset);
 }
-#endif
 
 static void
 iris_bind_compute_state(struct pipe_context *ctx, void *state)
 {
 }
 
+static void
+iris_populate_sampler_key(const struct iris_context *ice,
+                          struct brw_sampler_prog_key_data *key)
+{
+   for (int i = 0; i < MAX_SAMPLERS; i++) {
+      key->swizzles[i] = 0x688; /* XYZW, i.e. MAKE_SWIZZLE4(X, Y, Z, W) */
+   }
+}
+
 static void
 iris_populate_vs_key(const struct iris_context *ice,
                      struct brw_vs_prog_key *key)
 {
    memset(key, 0, sizeof(*key));
+   iris_populate_sampler_key(ice, &key->tex);
 }
 
 static void
@@ -1444,6 +1878,7 @@ iris_populate_tcs_key(const struct iris_context *ice,
                       struct brw_tcs_prog_key *key)
 {
    memset(key, 0, sizeof(*key));
+   iris_populate_sampler_key(ice, &key->tex);
 }
 
 static void
@@ -1451,6 +1886,7 @@ iris_populate_tes_key(const struct iris_context *ice,
                       struct brw_tes_prog_key *key)
 {
    memset(key, 0, sizeof(*key));
+   iris_populate_sampler_key(ice, &key->tex);
 }
 
 static void
@@ -1458,6 +1894,7 @@ iris_populate_gs_key(const struct iris_context *ice,
                      struct brw_gs_prog_key *key)
 {
    memset(key, 0, sizeof(*key));
+   iris_populate_sampler_key(ice, &key->tex);
 }
 
 static void
@@ -1465,6 +1902,7 @@ iris_populate_fs_key(const struct iris_context *ice,
                      struct brw_wm_prog_key *key)
 {
    memset(key, 0, sizeof(*key));
+   iris_populate_sampler_key(ice, &key->tex);
 
    /* XXX: dirty flags? */
    const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
@@ -1479,6 +1917,9 @@ iris_populate_fs_key(const struct iris_context *ice,
    key->replicate_alpha = fb->nr_cbufs > 1 &&
       (zsa->alpha.enabled || blend->alpha_to_coverage);
 
+   /* XXX: only bother if COL0/1 are read */
+   key->flat_shade = rast->flatshade;
+
    // key->force_dual_color_blend for unigine
 #if 0
    if (cso_rast->multisample) {
@@ -1494,16 +1935,20 @@ iris_populate_fs_key(const struct iris_context *ice,
    key->coherent_fb_fetch = true;
 }
 
-   //pkt.SamplerCount =                                                     \
-      //DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
-   //pkt.PerThreadScratchSpace = prog_data->total_scratch == 0 ? 0 :        \
-      //ffs(stage_state->per_thread_scratch) - 11;                          \
+#if 0
+   // XXX: these need to go in INIT_THREAD_DISPATCH_FIELDS
+   pkt.SamplerCount =                                                     \
+      DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
+   pkt.PerThreadScratchSpace = prog_data->total_scratch == 0 ? 0 :        \
+      ffs(stage_state->per_thread_scratch) - 11;                          \
+
+#endif
 
 static uint64_t
 KSP(const struct iris_compiled_shader *shader)
 {
-   struct iris_resource *res = (void *) shader->buffer;
-   return res->bo->gtt_offset + shader->offset;
+   struct iris_resource *res = (void *) shader->assembly.res;
+   return iris_bo_offset_from_base_address(res->bo) + shader->assembly.offset;
 }
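+
+/* Kernel start pointers are relative to Instruction Base Address, which is
+ * programmed to IRIS_MEMZONE_SHADER_START in STATE_BASE_ADDRESS above;
+ * hence iris_bo_offset_from_base_address() rather than a raw gtt_offset.
+ */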
 
 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix)                          \
@@ -1520,8 +1965,8 @@ KSP(const struct iris_compiled_shader *shader)
    pkt.Enable           = true;
 
 static void
-iris_set_vs_state(const struct gen_device_info *devinfo,
-                  struct iris_compiled_shader *shader)
+iris_store_vs_state(const struct gen_device_info *devinfo,
+                    struct iris_compiled_shader *shader)
 {
    struct brw_stage_prog_data *prog_data = shader->prog_data;
    struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
@@ -1536,8 +1981,8 @@ iris_set_vs_state(const struct gen_device_info *devinfo,
 }
 
 static void
-iris_set_tcs_state(const struct gen_device_info *devinfo,
-                   struct iris_compiled_shader *shader)
+iris_store_tcs_state(const struct gen_device_info *devinfo,
+                     struct iris_compiled_shader *shader)
 {
    struct brw_stage_prog_data *prog_data = shader->prog_data;
    struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
@@ -1553,8 +1998,8 @@ iris_set_tcs_state(const struct gen_device_info *devinfo,
 }
 
 static void
-iris_set_tes_state(const struct gen_device_info *devinfo,
-                   struct iris_compiled_shader *shader)
+iris_store_tes_state(const struct gen_device_info *devinfo,
+                     struct iris_compiled_shader *shader)
 {
    struct brw_stage_prog_data *prog_data = shader->prog_data;
    struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
@@ -1587,8 +2032,8 @@ iris_set_tes_state(const struct gen_device_info *devinfo,
 }
 
 static void
-iris_set_gs_state(const struct gen_device_info *devinfo,
-                  struct iris_compiled_shader *shader)
+iris_store_gs_state(const struct gen_device_info *devinfo,
+                    struct iris_compiled_shader *shader)
 {
    struct brw_stage_prog_data *prog_data = shader->prog_data;
    struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
@@ -1602,7 +2047,7 @@ iris_set_gs_state(const struct gen_device_info *devinfo,
       gs.ControlDataHeaderSize =
          gs_prog_data->control_data_header_size_hwords;
       gs.InstanceControl = gs_prog_data->invocations - 1;
-      gs.DispatchMode = SIMD8;
+      gs.DispatchMode = DISPATCH_MODE_SIMD8;
       gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
       gs.ControlDataFormat = gs_prog_data->control_data_format;
       gs.ReorderMode = TRAILING;
@@ -1631,8 +2076,8 @@ iris_set_gs_state(const struct gen_device_info *devinfo,
 }
 
 static void
-iris_set_fs_state(const struct gen_device_info *devinfo,
-                  struct iris_compiled_shader *shader)
+iris_store_fs_state(const struct gen_device_info *devinfo,
+                    struct iris_compiled_shader *shader)
 {
    struct brw_stage_prog_data *prog_data = shader->prog_data;
    struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
@@ -1714,7 +2159,7 @@ iris_set_fs_state(const struct gen_device_info *devinfo,
 static unsigned
 iris_derived_program_state_size(enum iris_program_cache_id cache_id)
 {
-   assert(cache_id <= IRIS_CACHE_CS);
+   assert(cache_id <= IRIS_CACHE_BLORP);
 
    static const unsigned dwords[] = {
       [IRIS_CACHE_VS] = GENX(3DSTATE_VS_length),
@@ -1724,34 +2169,35 @@ iris_derived_program_state_size(enum iris_program_cache_id cache_id)
       [IRIS_CACHE_FS] =
          GENX(3DSTATE_PS_length) + GENX(3DSTATE_PS_EXTRA_length),
       [IRIS_CACHE_CS] = 0,
-      [IRIS_CACHE_BLORP_BLIT] = 0,
+      [IRIS_CACHE_BLORP] = 0,
    };
 
    return sizeof(uint32_t) * dwords[cache_id];
 }
 
 static void
-iris_set_derived_program_state(const struct gen_device_info *devinfo,
-                               enum iris_program_cache_id cache_id,
-                               struct iris_compiled_shader *shader)
+iris_store_derived_program_state(const struct gen_device_info *devinfo,
+                                 enum iris_program_cache_id cache_id,
+                                 struct iris_compiled_shader *shader)
 {
    switch (cache_id) {
    case IRIS_CACHE_VS:
-      iris_set_vs_state(devinfo, shader);
+      iris_store_vs_state(devinfo, shader);
       break;
    case IRIS_CACHE_TCS:
-      iris_set_tcs_state(devinfo, shader);
+      iris_store_tcs_state(devinfo, shader);
       break;
    case IRIS_CACHE_TES:
-      iris_set_tes_state(devinfo, shader);
+      iris_store_tes_state(devinfo, shader);
       break;
    case IRIS_CACHE_GS:
-      iris_set_gs_state(devinfo, shader);
+      iris_store_gs_state(devinfo, shader);
       break;
    case IRIS_CACHE_FS:
-      iris_set_fs_state(devinfo, shader);
+      iris_store_fs_state(devinfo, shader);
       break;
    case IRIS_CACHE_CS:
+   case IRIS_CACHE_BLORP:
       break;
    default:
       break;
@@ -1803,35 +2249,239 @@ static const uint32_t push_constant_opcodes[] = {
    [MESA_SHADER_COMPUTE]   = 0,
 };
 
+/**
+ * Add a surface to the validation list, as well as the buffer containing
+ * the corresponding SURFACE_STATE.
+ *
+ * Returns the binding table entry (offset to SURFACE_STATE).
+ */
 static uint32_t
-emit_patched_surface_state(struct iris_batch *batch,
-                           uint32_t *surface_state,
-                           const struct iris_resource *res,
-                           unsigned reloc_flags)
+use_surface(struct iris_batch *batch,
+            struct pipe_surface *p_surf,
+            bool writeable)
 {
-   const int num_dwords = GENX(RENDER_SURFACE_STATE_length);
-   uint32_t offset;
-   uint32_t *dw = iris_alloc_state(batch, 4 * num_dwords, 64, &offset);
+   struct iris_surface *surf = (void *) p_surf;
 
-   STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) % 32 == 0);
-   int addr_idx = GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) / 32;
-   for (uint32_t i = 0; i < addr_idx; i++)
-      dw[i] = surface_state[i];
+   iris_use_pinned_bo(batch, iris_resource_bo(p_surf->texture), writeable);
+   iris_use_pinned_bo(batch, iris_resource_bo(surf->surface_state.res), false);
 
-   uint64_t *qw = (uint64_t *) &dw[addr_idx];
-   // XXX: mt->offset, if needed
-   *qw = iris_state_reloc(batch, (void *)qw - batch->statebuf.map, res->bo,
-                          surface_state[addr_idx + 1], reloc_flags);
+   return surf->surface_state.offset;
+}
 
-   for (uint32_t i = addr_idx + 1; i < num_dwords; i++)
-      dw[i] = surface_state[i];
+static uint32_t
+use_sampler_view(struct iris_batch *batch, struct iris_sampler_view *isv)
+{
+   iris_use_pinned_bo(batch, iris_resource_bo(isv->pipe.texture), false);
+   iris_use_pinned_bo(batch, iris_resource_bo(isv->surface_state.res), false);
 
-   return offset;
+   return isv->surface_state.offset;
 }
 
-static void
-iris_upload_render_state(struct iris_context *ice,
-                         struct iris_batch *batch,
+static uint32_t
+use_const_buffer(struct iris_batch *batch, struct iris_const_buffer *cbuf)
+{
+   iris_use_pinned_bo(batch, iris_resource_bo(cbuf->data.res), false);
+   iris_use_pinned_bo(batch, iris_resource_bo(cbuf->surface_state.res), false);
+
+   return cbuf->surface_state.offset;
+}
+
+static uint32_t
+use_null_surface(struct iris_batch *batch, struct iris_context *ice)
+{
+   struct iris_bo *state_bo = iris_resource_bo(ice->state.unbound_tex.res);
+
+   iris_use_pinned_bo(batch, state_bo, false);
+
+   return ice->state.unbound_tex.offset;
+}
+
+static void
+iris_populate_binding_table(struct iris_context *ice,
+                            struct iris_batch *batch,
+                            gl_shader_stage stage)
+{
+   const struct iris_binder *binder = &batch->binder;
+   struct iris_compiled_shader *shader = ice->shaders.prog[stage];
+   if (!shader)
+      return;
+
+   // Surfaces:
+   // - pull constants
+   // - ubos/ssbos/abos
+   // - images
+   // - textures
+   // - render targets - write and read
+
+   //struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
+   uint32_t *bt_map = binder->map + binder->bt_offset[stage];
+   int s = 0;
+
+   if (stage == MESA_SHADER_FRAGMENT) {
+      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+      for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+         bt_map[s++] = use_surface(batch, cso_fb->cbufs[i], true);
+      }
+   }
+
+   //assert(prog_data->binding_table.texture_start ==
+   //       (ice->state.num_textures[stage] ? s : 0xd0d0d0d0));
+
+   for (int i = 0; i < ice->state.num_textures[stage]; i++) {
+      struct iris_sampler_view *view = ice->state.textures[stage][i];
+      bt_map[s++] = view ? use_sampler_view(batch, view)
+                         : use_null_surface(batch, ice);
+   }
+
+   // XXX: want the number of BTEs to shorten this loop
+   struct iris_shader_state *shs = &ice->shaders.state[stage];
+   for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
+      struct iris_const_buffer *cbuf = &shs->constbuf[i];
+      if (!cbuf->surface_state.res)
+         break;
+
+      bt_map[s++] = use_const_buffer(batch, cbuf);
+   }
+#if 0
+      // XXX: not implemented yet
+      assert(prog_data->binding_table.pull_constants_start == 0xd0d0d0d0);
+      assert(prog_data->binding_table.ubo_start == 0xd0d0d0d0);
+      assert(prog_data->binding_table.ssbo_start == 0xd0d0d0d0);
+      assert(prog_data->binding_table.image_start == 0xd0d0d0d0);
+      assert(prog_data->binding_table.shader_time_start == 0xd0d0d0d0);
+      //assert(prog_data->binding_table.plane_start[1] == 0xd0d0d0d0);
+      //assert(prog_data->binding_table.plane_start[2] == 0xd0d0d0d0);
+#endif
+}
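+
+/* A sketch of the table this produces (only the cases handled above):
+ *
+ *    [0 .. nr_cbufs)        render target surfaces (fragment stage only)
+ *    [.. + num_textures)    sampler views, or the null surface if unbound
+ *    [.. onward)            constant buffers, up to the first unbound one
+ */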
+
+static void
+iris_use_optional_res(struct iris_batch *batch,
+                      struct pipe_resource *res,
+                      bool writeable)
+{
+   if (res) {
+      struct iris_bo *bo = iris_resource_bo(res);
+      iris_use_pinned_bo(batch, bo, writeable);
+   }
+}
+
+
+/**
+ * Pin any BOs which were installed by a previous batch, and restored
+ * via the hardware logical context mechanism.
+ *
+ * We don't need to re-emit all state every batch - the hardware context
+ * mechanism will save and restore it for us.  This includes pointers to
+ * various BOs...which won't exist unless we ask the kernel to pin them
+ * by adding them to the validation list.
+ *
+ * We can skip buffers if we've re-emitted those packets, as we're
+ * overwriting those stale pointers with new ones, and don't actually
+ * refer to the old BOs.
+ */
+static void
+iris_restore_context_saved_bos(struct iris_context *ice,
+                               struct iris_batch *batch,
+                               const struct pipe_draw_info *draw)
+{
+   // XXX: whack IRIS_SHADER_DIRTY_BINDING_TABLE on new batch
+
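+   /* Anything "clean" was restored by the hardware context rather than
+    * re-emitted this batch, so the BOs those stale packets point at must
+    * be re-added to the validation list here.
+    */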
+   const uint64_t clean = ~ice->state.dirty;
+
+   if (clean & IRIS_DIRTY_CC_VIEWPORT) {
+      iris_use_optional_res(batch, ice->state.last_res.cc_vp, false);
+   }
+
+   if (clean & IRIS_DIRTY_SF_CL_VIEWPORT) {
+      iris_use_optional_res(batch, ice->state.last_res.sf_cl_vp, false);
+   }
+
+   if (clean & IRIS_DIRTY_BLEND_STATE) {
+      iris_use_optional_res(batch, ice->state.last_res.blend, false);
+   }
+
+   if (clean & IRIS_DIRTY_COLOR_CALC_STATE) {
+      iris_use_optional_res(batch, ice->state.last_res.color_calc, false);
+   }
+
+   if (clean & IRIS_DIRTY_SCISSOR_RECT) {
+      iris_use_optional_res(batch, ice->state.last_res.scissor, false);
+   }
+
+   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+      if (clean & (IRIS_DIRTY_CONSTANTS_VS << stage))
+         continue;
+
+      struct iris_shader_state *shs = &ice->shaders.state[stage];
+      struct iris_compiled_shader *shader = ice->shaders.prog[stage];
+
+      if (!shader)
+         continue;
+
+      struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
+
+      for (int i = 0; i < 4; i++) {
+         const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
+
+         if (range->length == 0)
+            continue;
+
+         struct iris_const_buffer *cbuf = &shs->constbuf[range->block];
+         struct iris_resource *res = (void *) cbuf->data.res;
+
+         if (res)
+            iris_use_pinned_bo(batch, res->bo, false);
+         else
+            iris_use_pinned_bo(batch, batch->screen->workaround_bo, false);
+      }
+   }
+
+   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+      struct pipe_resource *res = ice->state.sampler_table[stage].res;
+      if (res)
+         iris_use_pinned_bo(batch, iris_resource_bo(res), false);
+   }
+
+   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+      if (clean & (IRIS_DIRTY_VS << stage)) {
+         struct iris_compiled_shader *shader = ice->shaders.prog[stage];
+         if (shader) {
+            struct iris_bo *bo = iris_resource_bo(shader->assembly.res);
+            iris_use_pinned_bo(batch, bo, false);
+         }
+
+         // XXX: scratch buffer
+      }
+   }
+
+   // XXX: 3DSTATE_SO_BUFFER
+
+   if (clean & IRIS_DIRTY_DEPTH_BUFFER) {
+      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+
+      if (cso_fb->zsbuf) {
+         struct iris_resource *zres = (void *) cso_fb->zsbuf->texture;
+         // XXX: depth might not be writable...
+         iris_use_pinned_bo(batch, zres->bo, true);
+      }
+   }
+
+   if (draw->index_size > 0) {
+      // XXX: index buffer
+   }
+
+   if (clean & IRIS_DIRTY_VERTEX_BUFFERS) {
+      struct iris_vertex_buffer_state *cso = &ice->state.genx->vertex_buffers;
+      for (unsigned i = 0; i < cso->num_buffers; i++) {
+         struct iris_resource *res = (void *) cso->resources[i];
+         iris_use_pinned_bo(batch, res->bo, false);
+      }
+   }
+}
+
+static void
+iris_upload_render_state(struct iris_context *ice,
+                         struct iris_batch *batch,
                          const struct pipe_draw_info *draw)
 {
    const uint64_t dirty = ice->state.dirty;
@@ -1843,20 +2493,26 @@ iris_upload_render_state(struct iris_context *ice,
       struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
       iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
          ptr.CCViewportPointer =
-            iris_emit_state(batch, cso->cc_vp, sizeof(cso->cc_vp), 32);
+            emit_state(batch, ice->state.dynamic_uploader,
+                       &ice->state.last_res.cc_vp,
+                       cso->cc_vp, sizeof(cso->cc_vp), 32);
       }
    }
 
    if (dirty & IRIS_DIRTY_SF_CL_VIEWPORT) {
-      struct iris_viewport_state *cso = ice->state.cso_vp;
+      struct iris_viewport_state *cso = &ice->state.genx->viewport;
       iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
          ptr.SFClipViewportPointer =
-            iris_emit_state(batch, cso->sf_cl_vp, sizeof(cso->sf_cl_vp), 64);
+            emit_state(batch, ice->state.dynamic_uploader,
+                       &ice->state.last_res.sf_cl_vp,
+                       cso->sf_cl_vp, 4 * GENX(SF_CLIP_VIEWPORT_length) *
+                       ice->state.num_viewports, 64);
       }
    }
 
    /* XXX: L3 State */
 
+   // XXX: this is only flagged at setup; we assume a static URB configuration
    if (dirty & IRIS_DIRTY_URB) {
       iris_upload_urb_config(ice, batch);
    }
@@ -1869,7 +2525,9 @@ iris_upload_render_state(struct iris_context *ice,
          cso_fb->nr_cbufs * GENX(BLEND_STATE_ENTRY_length));
       uint32_t blend_offset;
       uint32_t *blend_map =
-         iris_alloc_state(batch, num_dwords, 64, &blend_offset);
+         stream_state(batch, ice->state.dynamic_uploader,
+                      &ice->state.last_res.blend,
+                      4 * num_dwords, 64, &blend_offset);
 
       uint32_t blend_state_header;
       iris_pack_state(GENX(BLEND_STATE), &blend_state_header, bs) {
@@ -1891,9 +2549,10 @@ iris_upload_render_state(struct iris_context *ice,
       struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
       uint32_t cc_offset;
       void *cc_map =
-         iris_alloc_state(batch,
-                          sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
-                          64, &cc_offset);
+         stream_state(batch, ice->state.dynamic_uploader,
+                      &ice->state.last_res.color_calc,
+                      sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
+                      64, &cc_offset);
       iris_pack_state(GENX(COLOR_CALC_STATE), cc_map, cc) {
          cc.AlphaTestFormat = ALPHATEST_FLOAT32;
          cc.AlphaReferenceValueAsFLOAT32 = cso->alpha.ref_value;
@@ -1909,81 +2568,88 @@ iris_upload_render_state(struct iris_context *ice,
    }
 
    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+      // XXX: wrong dirty tracking...
       if (!(dirty & (IRIS_DIRTY_CONSTANTS_VS << stage)))
          continue;
 
+      struct iris_shader_state *shs = &ice->shaders.state[stage];
+      struct iris_compiled_shader *shader = ice->shaders.prog[stage];
+
+      if (!shader)
+         continue;
+
+      struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
+
       iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
          pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
-         if (ice->shaders.prog[stage]) {
-            // XXX: 3DSTATE_CONSTANT_XS
+         if (prog_data) {
+            /* The Skylake PRM contains the following restriction:
+             *
+             *    "The driver must ensure The following case does not occur
+             *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
+             *     buffer 3 read length equal to zero committed followed by a
+             *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
+             *     zero committed."
+             *
+             * To avoid this, we program the buffers in the highest slots.
+             * This way, slot 0 is only used if slot 3 is also used.
+             */
+            int n = 3;
+
+            for (int i = 3; i >= 0; i--) {
+               const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
+
+               if (range->length == 0)
+                  continue;
+
+               // XXX: is range->block a constbuf index?  it would be nice if it were
+               struct iris_const_buffer *cbuf = &shs->constbuf[range->block];
+               struct iris_resource *res = (void *) cbuf->data.res;
+
+               assert(cbuf->data.offset % 32 == 0);
+
+               pkt.ConstantBody.ReadLength[n] = range->length;
+               pkt.ConstantBody.Buffer[n] =
+                  res ? ro_bo(res->bo, range->start * 32 + cbuf->data.offset)
+                      : ro_bo(batch->screen->workaround_bo, 0);
+               n--;
+            }
          }
       }
    }
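+
+   /* A worked example (hypothetical): if only ubo_ranges[0] and
+    * ubo_ranges[1] have nonzero lengths, the loop above programs them into
+    * Buffer[3] and Buffer[2], leaving ReadLength[0] at zero, so a nonzero
+    * buffer 0 read length is never committed while buffer 3's is zero.
+    */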
 
-   // Surfaces:
-   // - pull constants
-   // - ubos/ssbos/abos
-   // - images
-   // - textures
-   // - render targets - write and read
-   // XXX: 3DSTATE_BINDING_TABLE_POINTERS_XS
+   struct iris_binder *binder = &batch->binder;
 
    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
-      struct iris_compiled_shader *shader = ice->shaders.prog[stage];
-      if (!shader) // XXX: dirty bits...also, emit a disable maybe?
-         continue;
-
-      struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
-      uint32_t bt_offset = 0;
-      uint32_t *bt_map = NULL;
-
-      if (prog_data->binding_table.size_bytes != 0) {
-         bt_map = iris_alloc_state(batch, prog_data->binding_table.size_bytes,
-                                   64, &bt_offset);
-      }
-
-      iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
-         ptr._3DCommandSubOpcode = 38 + stage;
-         ptr.PointertoVSBindingTable = bt_offset;
+      if (dirty & (IRIS_DIRTY_BINDINGS_VS << stage)) {
+         iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
+            ptr._3DCommandSubOpcode = 38 + stage;
+            ptr.PointertoVSBindingTable = binder->bt_offset[stage];
+         }
       }
+   }
 
-      if (stage == MESA_SHADER_FRAGMENT) {
-         struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
-         for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
-            struct iris_surface *surf = (void *) cso_fb->cbufs[i];
-            struct iris_resource *res = (void *) surf->pipe.texture;
-
-            *bt_map++ = emit_patched_surface_state(batch, surf->surface_state,
-                                                   res, RELOC_WRITE);
-         }
+   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+      if (dirty & (IRIS_DIRTY_BINDINGS_VS << stage)) {
+         iris_populate_binding_table(ice, batch, stage);
       }
    }
 
+   if (ice->state.need_border_colors)
+      iris_use_pinned_bo(batch, ice->state.border_color_pool.bo, false);
+
    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
       if (!(dirty & (IRIS_DIRTY_SAMPLER_STATES_VS << stage)) ||
           !ice->shaders.prog[stage])
          continue;
 
-      // XXX: get sampler count from shader; don't emit them all...
-      const int count = IRIS_MAX_TEXTURE_SAMPLERS;
-
-      uint32_t offset;
-      uint32_t *map = iris_alloc_state(batch,
-                                       count * 4 * GENX(SAMPLER_STATE_length),
-                                       32, &offset);
-
-      for (int i = 0; i < count; i++) {
-         // XXX: when we have a correct count, these better be bound
-         if (!ice->state.samplers[stage][i])
-            continue;
-         memcpy(map, ice->state.samplers[stage][i]->sampler_state,
-                4 * GENX(SAMPLER_STATE_length));
-         map += GENX(SAMPLER_STATE_length);
-      }
+      struct pipe_resource *res = ice->state.sampler_table[stage].res;
+      if (res)
+         iris_use_pinned_bo(batch, iris_resource_bo(res), false);
 
       iris_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
          ptr._3DCommandSubOpcode = 43 + stage;
-         ptr.PointertoVSSamplerState = offset;
+         ptr.PointertoVSSamplerState = ice->state.sampler_table[stage].offset;
       }
    }
 
@@ -1998,7 +2664,7 @@ iris_upload_render_state(struct iris_context *ice,
 
    if (dirty & IRIS_DIRTY_SAMPLE_MASK) {
       iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
-         ms.SampleMask = ice->state.sample_mask;
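+         /* Assumption: an all-zero sample mask can hang the hardware, so
+          * clamp to at least one sample rather than emitting zero.
+          */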
+         ms.SampleMask = MAX2(ice->state.sample_mask, 1);
       }
    }
 
@@ -2009,8 +2675,8 @@ iris_upload_render_state(struct iris_context *ice,
       struct iris_compiled_shader *shader = ice->shaders.prog[stage];
 
       if (shader) {
-         struct iris_resource *cache = (void *) shader->buffer;
-         iris_use_pinned_bo(batch, cache->bo);
+         struct iris_resource *cache = (void *) shader->assembly.res;
+         iris_use_pinned_bo(batch, cache->bo, false);
          iris_batch_emit(batch, shader->derived_data,
                          iris_derived_program_state_size(stage));
       } else {
@@ -2040,6 +2706,7 @@ iris_upload_render_state(struct iris_context *ice,
             cl.NonPerspectiveBarycentricEnable = true;
 
          cl.ForceZeroRTAIndexEnable = cso_fb->layers == 0;
+         cl.MaximumVPIndex = ice->state.num_viewports - 1;
       }
       iris_emit_merge(batch, cso_rast->clip, dynamic_clip,
                       ARRAY_SIZE(cso_rast->clip));
@@ -2052,7 +2719,8 @@ iris_upload_render_state(struct iris_context *ice,
 
    }
 
-   if (dirty & (IRIS_DIRTY_RASTER | IRIS_DIRTY_FS)) {
+   /* XXX: FS program updates need to flag IRIS_DIRTY_WM */
+   if (dirty & IRIS_DIRTY_WM) {
       struct iris_rasterizer_state *cso = ice->state.cso_rast;
       uint32_t dynamic_wm[GENX(3DSTATE_WM_length)];
 
@@ -2072,10 +2740,7 @@ iris_upload_render_state(struct iris_context *ice,
       // XXX: 3DSTATE_SBE, 3DSTATE_SBE_SWIZ
       // -> iris_raster_state (point sprite texture coordinate origin)
       // -> bunch of shader state...
-      iris_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
-      }
-      iris_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbe) {
-      }
+      iris_emit_sbe(batch, ice);
    }
 
    if (dirty & IRIS_DIRTY_PS_BLEND) {
@@ -2103,21 +2768,31 @@ iris_upload_render_state(struct iris_context *ice,
       iris_emit_merge(batch, cso->wmds, stencil_refs, ARRAY_SIZE(cso->wmds));
    }
 
-   if (dirty & IRIS_DIRTY_SCISSOR) {
+   if (dirty & IRIS_DIRTY_SCISSOR_RECT) {
       uint32_t scissor_offset =
-         iris_emit_state(batch, ice->state.scissors,
-                         sizeof(struct pipe_scissor_state) *
-                         ice->state.num_scissors, 32);
+         emit_state(batch, ice->state.dynamic_uploader,
+                    &ice->state.last_res.scissor,
+                    ice->state.scissors,
+                    sizeof(struct pipe_scissor_state) *
+                    ice->state.num_viewports, 32);
 
       iris_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
          ptr.ScissorRectPointer = scissor_offset;
       }
    }
 
-   // XXX: 3DSTATE_DEPTH_BUFFER
-   // XXX: 3DSTATE_HIER_DEPTH_BUFFER
-   // XXX: 3DSTATE_STENCIL_BUFFER
-   // XXX: 3DSTATE_CLEAR_PARAMS
+   if (dirty & IRIS_DIRTY_DEPTH_BUFFER) {
+      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+      struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;
+
+      iris_batch_emit(batch, cso_z->packets, sizeof(cso_z->packets));
+
+      if (cso_fb->zsbuf) {
+         struct iris_resource *zres = (void *) cso_fb->zsbuf->texture;
+         // XXX: depth might not be writable...
+         iris_use_pinned_bo(batch, zres->bo, true);
+      }
+   }
 
    if (dirty & IRIS_DIRTY_POLYGON_STIPPLE) {
       iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
@@ -2140,38 +2815,38 @@ iris_upload_render_state(struct iris_context *ice,
    }
 
    if (draw->index_size > 0) {
-      struct iris_resource *res = (struct iris_resource *)draw->index.resource;
+      struct iris_resource *res = NULL;
+      unsigned offset;
 
-      assert(!draw->has_user_indices);
+      if (draw->has_user_indices) {
+         u_upload_data(ice->ctx.stream_uploader, 0,
+                       draw->count * draw->index_size, 4, draw->index.user,
+                       &offset, (struct pipe_resource **) &res);
+      } else {
+         res = (struct iris_resource *) draw->index.resource;
+         offset = 0;
+      }
 
       iris_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
-         ib.IndexFormat = draw->index_size;
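+         /* INDEX_BYTE, INDEX_WORD, and INDEX_DWORD are encoded as 0, 1,
+          * and 2, so a 1/2/4 byte index size maps to the hardware format
+          * via index_size >> 1.
+          */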
+         ib.IndexFormat = draw->index_size >> 1;
          ib.MOCS = MOCS_WB;
          ib.BufferSize = res->bo->size;
-         ib.BufferStartingAddress = ro_bo(res->bo, 0);
+         ib.BufferStartingAddress = ro_bo(res->bo, offset);
       }
    }
 
    if (dirty & IRIS_DIRTY_VERTEX_BUFFERS) {
-      struct iris_vertex_buffer_state *cso = ice->state.cso_vertex_buffers;
-
-      STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_length) == 4);
-      STATIC_ASSERT((GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) % 32) == 0);
+      struct iris_vertex_buffer_state *cso = &ice->state.genx->vertex_buffers;
+      const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
 
-      uint64_t *addr = batch->cmdbuf.map_next + sizeof(uint32_t) *
-         (GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) / 32);
-      uint32_t *delta = cso->vertex_buffers +
-         (1 + GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) / 32);
+      if (cso->num_buffers > 0) {
+         iris_batch_emit(batch, cso->vertex_buffers, sizeof(uint32_t) *
+                         (1 + vb_dwords * cso->num_buffers));
 
-      iris_batch_emit(batch, cso->vertex_buffers,
-                      sizeof(uint32_t) * (1 + 4 * cso->num_buffers));
-
-      for (unsigned i = 0; i < cso->num_buffers; i++) {
-         *addr = iris_batch_reloc(batch, (void *) addr - batch->cmdbuf.map,
-                                  cso->bos[i].bo, cso->bos[i].offset +
-                                  *delta, cso->bos[i].reloc_flags);
-         addr = (void *) addr + 16;
-         delta = (void *) delta + 16;
+         for (unsigned i = 0; i < cso->num_buffers; i++) {
+            struct iris_resource *res = (void *) cso->resources[i];
+            iris_use_pinned_bo(batch, res->bo, false);
+         }
       }
    }
 
@@ -2179,10 +2854,8 @@ iris_upload_render_state(struct iris_context *ice,
       struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;
       iris_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
                       (1 + cso->count * GENX(VERTEX_ELEMENT_STATE_length)));
-      for (int i = 0; i < cso->count; i++) {
-         iris_batch_emit(batch, cso->vf_instancing[i], sizeof(uint32_t) *
-                         (cso->count * GENX(3DSTATE_VF_INSTANCING_length)));
-      }
+      iris_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
+                      cso->count * GENX(3DSTATE_VF_INSTANCING_length));
       for (int i = 0; i < cso->count; i++) {
          /* TODO: vertexid, instanceid support */
          iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgvs);
@@ -2219,22 +2892,449 @@ iris_upload_render_state(struct iris_context *ice,
 
       //prim.BaseVertexLocation = ...;
    }
+
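+   /* Only the first draw in a batch needs this; everything this draw just
+    * emitted already pinned its BOs, and later draws in the same batch see
+    * contains_draw set and skip the restore.
+    */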
+   if (!batch->contains_draw) {
+      iris_restore_context_saved_bos(ice, batch, draw);
+      batch->contains_draw = true;
+   }
 }
 
+/**
+ * State module teardown.
+ */
 static void
 iris_destroy_state(struct iris_context *ice)
 {
+   iris_free_vertex_buffers(&ice->state.genx->vertex_buffers);
+
    // XXX: unreference resources/surfaces.
    for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
       pipe_surface_reference(&ice->state.framebuffer.cbufs[i], NULL);
    }
    pipe_surface_reference(&ice->state.framebuffer.zsbuf, NULL);
+
+   for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
+      pipe_resource_reference(&ice->state.sampler_table[stage].res, NULL);
+   }
+   free(ice->state.genx);
+
+   pipe_resource_reference(&ice->state.last_res.cc_vp, NULL);
+   pipe_resource_reference(&ice->state.last_res.sf_cl_vp, NULL);
+   pipe_resource_reference(&ice->state.last_res.color_calc, NULL);
+   pipe_resource_reference(&ice->state.last_res.scissor, NULL);
+   pipe_resource_reference(&ice->state.last_res.blend, NULL);
+}
+
+static unsigned
+flags_to_post_sync_op(uint32_t flags)
+{
+   if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
+      return WriteImmediateData;
+
+   if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
+      return WritePSDepthCount;
+
+   if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
+      return WriteTimestamp;
+
+   return 0;
+}
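+
+/* Callers pass at most one of these write bits (see get_post_sync_flags
+ * below), so the order of the checks above carries no real priority.
+ */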
+
+/**
+ * Do the given flags have a Post Sync or LRI Post Sync operation?
+ */
+static enum pipe_control_flags
+get_post_sync_flags(enum pipe_control_flags flags)
+{
+   flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
+            PIPE_CONTROL_WRITE_DEPTH_COUNT |
+            PIPE_CONTROL_WRITE_TIMESTAMP |
+            PIPE_CONTROL_LRI_POST_SYNC_OP;
+
+   /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
+    * "LRI Post Sync Operation".  So more than one bit set would be illegal.
+    */
+   assert(util_bitcount(flags) <= 1);
+
+   return flags;
+}
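+
+/* For example, passing PIPE_CONTROL_WRITE_IMMEDIATE together with
+ * PIPE_CONTROL_LRI_POST_SYNC_OP would trip the assert above: the packet
+ * has a single Post Sync Operation field, and "LRI Post Sync Operation"
+ * is mutually exclusive with it.
+ */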
+
+// XXX: compute support
+#define IS_COMPUTE_PIPELINE(batch) (batch->ring != I915_EXEC_RENDER)
+
+/**
+ * Emit a series of PIPE_CONTROL commands, taking into account any
+ * workarounds necessary to actually accomplish the caller's request.
+ *
+ * Unless otherwise noted, spec quotations in this function come from:
+ *
+ * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
+ * Restrictions for PIPE_CONTROL.
+ */
+static void
+iris_emit_raw_pipe_control(struct iris_batch *batch, uint32_t flags,
+                           struct iris_bo *bo, uint32_t offset, uint64_t imm)
+{
+   UNUSED const struct gen_device_info *devinfo = &batch->screen->devinfo;
+   enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
+   enum pipe_control_flags non_lri_post_sync_flags =
+      post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
+
+   /* Recursive PIPE_CONTROL workarounds --------------------------------
+    * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
+    *
+    * We do these first because we want to look at the original operation,
+    * rather than any workarounds we set.
+    */
+   if (GEN_GEN == 9 && (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
+      /* The PIPE_CONTROL "VF Cache Invalidation Enable" bit description
+       * lists several workarounds:
+       *
+       *    "Project: SKL, KBL, BXT
+       *
+       *     If the VF Cache Invalidation Enable is set to a 1 in a
+       *     PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields
+       *     sets to 0, with the VF Cache Invalidation Enable set to 0
+       *     needs to be sent prior to the PIPE_CONTROL with VF Cache
+       *     Invalidation Enable set to a 1."
+       */
+      iris_emit_raw_pipe_control(batch, 0, NULL, 0, 0);
+   }
+
+   if (GEN_GEN == 9 && IS_COMPUTE_PIPELINE(batch) && post_sync_flags) {
+      /* Project: SKL / Argument: LRI Post Sync Operation [23]
+       *
+       * "PIPECONTROL command with “Command Streamer Stall Enable” must be
+       *  programmed prior to programming a PIPECONTROL command with "LRI
+       *  Post Sync Operation" in GPGPU mode of operation (i.e when
+       *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
+       *
+       * The same text exists a few rows below for Post Sync Op.
+       */
+      iris_emit_raw_pipe_control(batch, PIPE_CONTROL_CS_STALL, bo, offset, imm);
+   }
+
+   if (GEN_GEN == 10 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
+      /* Cannonlake:
+       * "Before sending a PIPE_CONTROL command with bit 12 set, SW must issue
+       *  another PIPE_CONTROL with Render Target Cache Flush Enable (bit 12)
+       *  = 0 and Pipe Control Flush Enable (bit 7) = 1"
+       */
+      iris_emit_raw_pipe_control(batch, PIPE_CONTROL_FLUSH_ENABLE, bo,
+                                 offset, imm);
+   }
+
+   /* "Flush Types" workarounds ---------------------------------------------
+    * We do these now because they may add post-sync operations or CS stalls.
+    */
+
+   if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
+      /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
+       *
+       * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
+       *  'Write PS Depth Count' or 'Write Timestamp'."
+       */
+      if (!bo) {
+         flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
+         post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
+         non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
+         bo = batch->screen->workaround_bo;
+      }
+   }
+
+   /* #1130 from Gen10 workarounds page:
+    *
+    *    "Enable Depth Stall on every Post Sync Op if Render target Cache
+    *     Flush is not enabled in same PIPE CONTROL and Enable Pixel score
+    *     board stall if Render target cache flush is enabled."
+    *
+    * Applicable to CNL B0 and C0 steppings only.
+    *
+    * The wording here is unclear, and this workaround doesn't look anything
+    * like the internal bug report recommendations, but leave it be for now...
+    */
+   if (GEN_GEN == 10) {
+      if (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) {
+         flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
+      } else if (flags & non_lri_post_sync_flags) {
+         flags |= PIPE_CONTROL_DEPTH_STALL;
+      }
+   }
+
+   if (flags & PIPE_CONTROL_DEPTH_STALL) {
+      /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
+       *
+       *    "This bit must be DISABLED for operations other than writing
+       *     PS_DEPTH_COUNT."
+       *
+       * This seems like nonsense.  An Ivybridge workaround requires us to
+       * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
+       * operation.  Gen8+ requires us to emit depth stalls and depth cache
+       * flushes together.  So, it's hard to imagine this means anything other
+       * than "we originally intended this to be used for PS_DEPTH_COUNT".
+       *
+       * We ignore the supposed restriction and do nothing.
+       */
+   }
+
+   if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
+      /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
+       *
+       *    "This bit must be DISABLED for End-of-pipe (Read) fences,
+       *     PS_DEPTH_COUNT or TIMESTAMP queries."
+       *
+       * TODO: Implement end-of-pipe checking.
+       */
+      assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
+                                  PIPE_CONTROL_WRITE_TIMESTAMP)));
+   }
+
+   if (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) {
+      /* From the PIPE_CONTROL instruction table, bit 1:
+       *
+       *    "This bit is ignored if Depth Stall Enable is set.
+       *     Further, the render cache is not flushed even if Write Cache
+       *     Flush Enable bit is set."
+       *
+       * We assert that the caller doesn't do this combination, to try and
+       * prevent mistakes.  It shouldn't hurt the GPU, though.
+       */
+      assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
+                        PIPE_CONTROL_RENDER_TARGET_FLUSH)));
+   }
+
+   /* PIPE_CONTROL page workarounds ------------------------------------- */
+
+   if (GEN_GEN <= 8 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
+      /* From the PIPE_CONTROL page itself:
+       *
+       *    "IVB, HSW, BDW
+       *     Restriction: Pipe_control with CS-stall bit set must be issued
+       *     before a pipe-control command that has the State Cache
+       *     Invalidate bit set."
+       */
+      flags |= PIPE_CONTROL_CS_STALL;
+   }
+
+   if (flags & PIPE_CONTROL_FLUSH_LLC) {
+      /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
+       *
+       *    "Project: ALL
+       *     SW must always program Post-Sync Operation to "Write Immediate
+       *     Data" when Flush LLC is set."
+       *
+       * For now, we just require the caller to do it.
+       */
+      assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
+   }
+
+   /* "Post-Sync Operation" workarounds -------------------------------- */
+
+   /* Project: All / Argument: Global Snapshot Count Reset [19]
+    *
+    * "This bit must not be exercised on any product.
+    *  Requires stall bit ([20] of DW1) set."
+    *
+    * We don't use this, so we just assert that it isn't used.  The
+    * PIPE_CONTROL instruction page indicates that they intended this
+    * as a debug feature and don't think it is useful in production,
+    * but it may actually be usable, should we ever want to.
+    */
+   assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
+
+   if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
+                PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
+      /* Project: All / Arguments:
+       *
+       * - Generic Media State Clear [16]
+       * - Indirect State Pointers Disable [16]
+       *
+       *    "Requires stall bit ([20] of DW1) set."
+       *
+       * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
+       * State Clear) says:
+       *
+       *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
+       *     programmed prior to programming a PIPECONTROL command with "Media
+       *     State Clear" set in GPGPU mode of operation"
+       *
+       * This is a subset of the earlier rule, so there's nothing to do.
+       */
+      flags |= PIPE_CONTROL_CS_STALL;
+   }
+
+   if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
+      /* Project: All / Argument: Store Data Index
+       *
+       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
+       *  than '0'."
+       *
+       * For now, we just assert that the caller does this.  We might want to
+       * automatically add a write to the workaround BO...
+       */
+      assert(non_lri_post_sync_flags != 0);
+   }
+
+   if (flags & PIPE_CONTROL_SYNC_GFDT) {
+      /* Project: All / Argument: Sync GFDT
+       *
+       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
+       *  than '0' or 0x2520[13] must be set."
+       *
+       * For now, we just assert that the caller does this.
+       */
+      assert(non_lri_post_sync_flags != 0);
+   }
+
+   if (flags & PIPE_CONTROL_TLB_INVALIDATE) {
+      /* Project: IVB+ / Argument: TLB inv
+       *
+       *    "Requires stall bit ([20] of DW1) set."
+       *
+       * Also, from the PIPE_CONTROL instruction table:
+       *
+       *    "Project: SKL+
+       *     Post Sync Operation or CS stall must be set to ensure a TLB
+       *     invalidation occurs.  Otherwise no cycle will occur to the TLB
+       *     cache to invalidate."
+       *
+       * The CS stall required by the first quote above also satisfies the
+       * second, so there's nothing extra to do.
+       */
+      flags |= PIPE_CONTROL_CS_STALL;
+   }
+
+   if (GEN_GEN == 9 && devinfo->gt == 4) {
+      /* TODO: The big Skylake GT4 post sync op workaround */
+   }
+
+   /* "GPGPU specific workarounds" (both post-sync and flush) ------------ */
+
+   if (IS_COMPUTE_PIPELINE(batch)) {
+      if (GEN_GEN >= 9 && (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE)) {
+         /* Project: SKL+ / Argument: Tex Invalidate
+          * "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
+          */
+         flags |= PIPE_CONTROL_CS_STALL;
+      }
+
+      if (GEN_GEN == 8 && (post_sync_flags ||
+                           (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
+                                     PIPE_CONTROL_DEPTH_STALL |
+                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                     PIPE_CONTROL_DATA_CACHE_FLUSH)))) {
+         /* Project: BDW / Arguments:
+          *
+          * - LRI Post Sync Operation   [23]
+          * - Post Sync Op              [15:14]
+          * - Notify En                 [8]
+          * - Depth Stall               [13]
+          * - Render Target Cache Flush [12]
+          * - Depth Cache Flush         [0]
+          * - DC Flush Enable           [5]
+          *
+          *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
+          *     Workloads."
+          */
+         flags |= PIPE_CONTROL_CS_STALL;
+
+         /* Also, from the PIPE_CONTROL instruction table, bit 20:
+          *
+          *    "Project: BDW
+          *     This bit must be always set when PIPE_CONTROL command is
+          *     programmed by GPGPU and MEDIA workloads, except for the cases
+          *     when only Read Only Cache Invalidation bits are set (State
+          *     Cache Invalidation Enable, Instruction cache Invalidation
+          *     Enable, Texture Cache Invalidation Enable, Constant Cache
+          *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
+          *     need not implemented when FF_DOP_CG is disable via "Fixed
+          *     Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
+          *
+          * It sounds like we could avoid CS stalls in some cases, but we
+          * don't currently bother.  This list isn't exactly the list above,
+          * either...
+          */
+      }
+   }
+
+   /* "Stall" workarounds ----------------------------------------------
+    * These have to come after the earlier ones because we may have added
+    * some additional CS stalls above.
+    */
+
+   if (GEN_GEN < 9 && (flags & PIPE_CONTROL_CS_STALL)) {
+      /* Project: PRE-SKL, VLV, CHV
+       *
+       * "[All Stepping][All SKUs]:
+       *
+       *  One of the following must also be set:
+       *
+       *  - Render Target Cache Flush Enable ([12] of DW1)
+       *  - Depth Cache Flush Enable ([0] of DW1)
+       *  - Stall at Pixel Scoreboard ([1] of DW1)
+       *  - Depth Stall ([13] of DW1)
+       *  - Post-Sync Operation ([13] of DW1)
+       *  - DC Flush Enable ([5] of DW1)"
+       *
+       * If we don't already have one of those bits set, we choose to add
+       * "Stall at Pixel Scoreboard".  Some of the other bits require a
+       * CS stall as a workaround (see above), which would send us into
+       * an infinite recursion of PIPE_CONTROLs.  "Stall at Pixel Scoreboard"
+       * appears to be safe, so we choose that.
+       */
+      const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                               PIPE_CONTROL_WRITE_IMMEDIATE |
+                               PIPE_CONTROL_WRITE_DEPTH_COUNT |
+                               PIPE_CONTROL_WRITE_TIMESTAMP |
+                               PIPE_CONTROL_STALL_AT_SCOREBOARD |
+                               PIPE_CONTROL_DEPTH_STALL |
+                               PIPE_CONTROL_DATA_CACHE_FLUSH;
+      if (!(flags & wa_bits))
+         flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
+   }
+
+   /* Emit --------------------------------------------------------------- */
+
+   iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
+      pc.LRIPostSyncOperation = NoLRIOperation;
+      pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
+      pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
+      pc.StoreDataIndex = 0;
+      pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
+      pc.GlobalSnapshotCountReset =
+         flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
+      pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
+      pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
+      pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
+      pc.RenderTargetCacheFlushEnable =
+         flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
+      pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+      pc.StateCacheInvalidationEnable =
+         flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
+      pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
+      pc.ConstantCacheInvalidationEnable =
+         flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
+      pc.PostSyncOperation = flags_to_post_sync_op(flags);
+      pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
+      pc.InstructionCacheInvalidateEnable =
+         flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
+      pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
+      pc.IndirectStatePointersDisable =
+         flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
+      pc.TextureCacheInvalidationEnable =
+         flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+      pc.Address = ro_bo(bo, offset);
+      pc.ImmediateData = imm;
+   }
 }
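+
+/* A minimal usage sketch (hypothetical BO and value): flush the render
+ * target cache and have the GPU write a fence value when the flush lands;
+ * the workarounds above add whatever stalls the hardware requires:
+ *
+ *    iris_emit_raw_pipe_control(batch,
+ *                               PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ *                               PIPE_CONTROL_CS_STALL |
+ *                               PIPE_CONTROL_WRITE_IMMEDIATE,
+ *                               fence_bo, 0, 42);
+ */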
 
 void
 genX(init_state)(struct iris_context *ice)
 {
    struct pipe_context *ctx = &ice->ctx;
+   struct iris_screen *screen = (struct iris_screen *)ctx->screen;
 
    ctx->create_blend_state = iris_create_blend_state;
    ctx->create_depth_stencil_alpha_state = iris_create_zsa_state;
@@ -2280,17 +3380,26 @@ genX(init_state)(struct iris_context *ice)
    ctx->stream_output_target_destroy = iris_stream_output_target_destroy;
    ctx->set_stream_output_targets = iris_set_stream_output_targets;
 
-   ice->state.destroy_state = iris_destroy_state;
-   ice->state.init_render_context = iris_init_render_context;
-   ice->state.upload_render_state = iris_upload_render_state;
-   ice->state.derived_program_state_size = iris_derived_program_state_size;
-   ice->state.set_derived_program_state = iris_set_derived_program_state;
-   ice->state.populate_vs_key = iris_populate_vs_key;
-   ice->state.populate_tcs_key = iris_populate_tcs_key;
-   ice->state.populate_tes_key = iris_populate_tes_key;
-   ice->state.populate_gs_key = iris_populate_gs_key;
-   ice->state.populate_fs_key = iris_populate_fs_key;
-
+   ice->vtbl.destroy_state = iris_destroy_state;
+   ice->vtbl.init_render_context = iris_init_render_context;
+   ice->vtbl.upload_render_state = iris_upload_render_state;
+   ice->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control;
+   ice->vtbl.derived_program_state_size = iris_derived_program_state_size;
+   ice->vtbl.store_derived_program_state = iris_store_derived_program_state;
+   ice->vtbl.populate_vs_key = iris_populate_vs_key;
+   ice->vtbl.populate_tcs_key = iris_populate_tcs_key;
+   ice->vtbl.populate_tes_key = iris_populate_tes_key;
+   ice->vtbl.populate_gs_key = iris_populate_gs_key;
+   ice->vtbl.populate_fs_key = iris_populate_fs_key;
 
    ice->state.dirty = ~0ull;
+
+   ice->state.num_viewports = 1;
+   ice->state.genx = calloc(1, sizeof(struct iris_genx_state));
+
+   /* Make a 1x1x1 null surface for unbound textures */
+   void *null_surf_map =
+      upload_state(ice->state.surface_uploader, &ice->state.unbound_tex,
+                   4 * GENX(RENDER_SURFACE_STATE_length), 64);
+   isl_null_fill_state(&screen->isl_dev, null_surf_map, isl_extent3d(1, 1, 1));
 }