iris: use vtbl to avoid multiple symbols, fix state base address
author Kenneth Graunke <kenneth@whitecape.org>
Thu, 25 Jan 2018 09:36:49 +0000 (01:36 -0800)
committer Kenneth Graunke <kenneth@whitecape.org>
Thu, 21 Feb 2019 18:26:05 +0000 (10:26 -0800)
src/gallium/drivers/iris/iris_batch.c
src/gallium/drivers/iris/iris_batch.h
src/gallium/drivers/iris/iris_context.c
src/gallium/drivers/iris/iris_context.h
src/gallium/drivers/iris/iris_draw.c
src/gallium/drivers/iris/iris_program_cache.c
src/gallium/drivers/iris/iris_state.c

index 23cff4189606cd4fb682c5f0f03971c717ef9ff2..5d3d6698a37c446476e9c7e61dd7af285000af8a 100644 (file)
@@ -201,6 +201,9 @@ iris_batch_reset(struct iris_batch *batch)
 
    if (batch->state_sizes)
       _mesa_hash_table_clear(batch->state_sizes, NULL);
+
+   if (batch->ring == I915_EXEC_RENDER)
+      batch->emit_state_base_address(batch);
 }
 
 static void
index 47da23baeb11cef8eb1d5493b77901b452441a52..6b67737f614d675b21a9b5270ff72368b3ceac4a 100644 (file)
@@ -89,6 +89,8 @@ struct iris_batch {
 
    /** Map from batch offset to iris_alloc_state data (with DEBUG_BATCH) */
    struct hash_table *state_sizes;
+
+   void (*emit_state_base_address)(struct iris_batch *batch);
 };
 
 void iris_init_batch(struct iris_batch *batch,
index ffebaf3c0a0570bd81c98ecd5c70b087b51a7679..ca2aeeb4942a117c601e266909ba204c737f9559 100644 (file)
@@ -84,10 +84,23 @@ iris_destroy_context(struct pipe_context *ctx)
    ralloc_free(ice);
 }
 
+#define genX_call(devinfo, func, ...)             \
+   switch (devinfo->gen) {                        \
+   case 10:                                       \
+      gen10_##func(__VA_ARGS__);                  \
+      break;                                      \
+   case 9:                                        \
+      gen9_##func(__VA_ARGS__);                   \
+      break;                                      \
+   default:                                       \
+      unreachable("Unknown hardware generation"); \
+   }
+
 struct pipe_context *
 iris_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags)
 {
    struct iris_screen *screen = (struct iris_screen*)pscreen;
+   const struct gen_device_info *devinfo = &screen->devinfo;
    struct iris_context *ice = rzalloc(NULL, struct iris_context);
 
    if (!ice)
@@ -115,11 +128,10 @@ iris_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags)
    iris_init_resource_functions(ctx);
    iris_init_query_functions(ctx);
 
-   iris_init_state(ice);
    iris_init_program_cache(ice);
 
    iris_init_batch(&ice->render_batch, screen, &ice->dbg, I915_EXEC_RENDER);
-   iris_upload_initial_gpu_state(&ice->render_batch);
+   genX_call(devinfo, init_state, ice);
 
    return ctx;
 }
index 46ef2d172690d95491ce200f426f77314a1207a7..b85c63729d28bfd977508b692ea271f4706cb8cc 100644 (file)
@@ -75,6 +75,16 @@ struct iris_batch;
 
 struct iris_depth_stencil_alpha_state;
 
+enum iris_program_cache_id {
+   IRIS_CACHE_VS  = MESA_SHADER_VERTEX,
+   IRIS_CACHE_TCS = MESA_SHADER_TESS_CTRL,
+   IRIS_CACHE_TES = MESA_SHADER_TESS_EVAL,
+   IRIS_CACHE_GS  = MESA_SHADER_GEOMETRY,
+   IRIS_CACHE_FS  = MESA_SHADER_FRAGMENT,
+   IRIS_CACHE_CS  = MESA_SHADER_COMPUTE,
+   IRIS_CACHE_BLORP_BLIT,
+};
+
 struct iris_program_cache {
    struct hash_table *table;
    struct iris_bo *bo;
@@ -131,6 +141,15 @@ struct iris_context {
       struct pipe_framebuffer_state framebuffer;
 
       struct iris_sampler_state *samplers[MESA_SHADER_STAGES][IRIS_MAX_TEXTURE_SAMPLERS];
+
+      void (*upload_render_state)(struct iris_context *ice,
+                                  struct iris_batch *batch,
+                                  const struct pipe_draw_info *draw);
+      unsigned (*derived_program_state_size)(enum iris_program_cache_id id);
+      void (*set_derived_program_state)(const struct gen_device_info *devinfo,
+                                        enum iris_program_cache_id cache_id,
+                                        struct iris_compiled_shader *shader);
+      void (*destroy_state)(struct iris_context *ice);
    } state;
 };
 
@@ -151,38 +170,15 @@ void iris_init_clear_functions(struct pipe_context *ctx);
 void iris_init_program_functions(struct pipe_context *ctx);
 void iris_init_resource_functions(struct pipe_context *ctx);
 void iris_init_query_functions(struct pipe_context *ctx);
-
-void iris_setup_state_base_address(struct iris_context *ice,
-                                   struct iris_batch *batch,
-                                   struct iris_bo *instruction_bo);
-void iris_upload_initial_gpu_state(struct iris_batch *batch);
-void iris_upload_render_state(struct iris_context *ice,
-                              struct iris_batch *batch,
-                              const struct pipe_draw_info *draw);
-void iris_destroy_state(struct iris_context *ice);
-
 void iris_update_compiled_shaders(struct iris_context *ice);
 
 void iris_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info);
 
-enum iris_program_cache_id {
-   IRIS_CACHE_VS  = MESA_SHADER_VERTEX,
-   IRIS_CACHE_TCS = MESA_SHADER_TESS_CTRL,
-   IRIS_CACHE_TES = MESA_SHADER_TESS_EVAL,
-   IRIS_CACHE_GS  = MESA_SHADER_GEOMETRY,
-   IRIS_CACHE_FS  = MESA_SHADER_FRAGMENT,
-   IRIS_CACHE_CS  = MESA_SHADER_COMPUTE,
-   IRIS_CACHE_BLORP_BLIT,
-};
-
-void iris_init_state(struct iris_context *ice);
+void gen9_init_state(struct iris_context *ice);
+void gen10_init_state(struct iris_context *ice);
 void iris_init_program_cache(struct iris_context *ice);
 void iris_destroy_program_cache(struct iris_context *ice);
 void iris_print_program_cache(struct iris_context *ice);
-unsigned iris_derived_program_state_size(enum iris_program_cache_id cache_id);
-void iris_set_derived_program_state(const struct gen_device_info *devinfo,
-                                    enum iris_program_cache_id cache_id,
-                                    struct iris_compiled_shader *shader);
 bool iris_bind_cached_shader(struct iris_context *ice,
                              enum iris_program_cache_id cache_id,
                              const void *key);
index 72537ccb26291f6a61466c554f6ea102a907b8e5..96e05fa5f24612aeb5712f4e0902749bd9948a4c 100644 (file)
@@ -37,5 +37,5 @@ iris_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
    struct iris_context *ice = (struct iris_context *) ctx;
 
    iris_update_compiled_shaders(ice);
-   iris_upload_render_state(ice, &ice->render_batch, info);
+   ice->state.upload_render_state(ice, &ice->render_batch, info);
 }
index acbfba0681a121b32f0c8b38eff51d5ff784280a..c1d7f5fbe05dc0569445698ca14addb503c251b5 100644 (file)
@@ -259,7 +259,7 @@ iris_upload_and_bind_shader(struct iris_context *ice,
    struct iris_program_cache *cache = &ice->shaders.cache;
    struct iris_compiled_shader *shader =
       ralloc_size(cache->table, sizeof(struct iris_compiled_shader) +
-                  iris_derived_program_state_size(cache_id));
+                  ice->state.derived_program_state_size(cache_id));
    const struct iris_compiled_shader *existing =
       find_existing_assembly(cache, assembly, prog_data->program_size);
 
@@ -283,7 +283,7 @@ iris_upload_and_bind_shader(struct iris_context *ice,
    ralloc_steal(shader->prog_data, prog_data->pull_param);
 
    /* Store the 3DSTATE shader packets and other derived state. */
-   iris_set_derived_program_state(devinfo, cache_id, shader);
+   ice->state.set_derived_program_state(devinfo, cache_id, shader);
 
    struct keybox *keybox = make_keybox(cache, cache_id, key);
    _mesa_hash_table_insert(cache->table, keybox, shader);
index 058bcea07eb8c0d5f22fb2e1b91be7f2f1e68fe9..0a05b55fdf322d25e07ce4775a6689148d850574 100644 (file)
@@ -282,7 +282,7 @@ ro_bo(struct iris_bo *bo, uint32_t offset)
    return (struct iris_address) { .bo = bo, .offset = offset };
 }
 
-void
+static void
 iris_upload_initial_gpu_state(struct iris_batch *batch)
 {
    iris_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
@@ -1312,18 +1312,9 @@ iris_set_stream_output_targets(struct pipe_context *ctx,
 {
 }
 
-void
-iris_setup_state_base_address(struct iris_context *ice,
-                              struct iris_batch *batch,
-                              struct iris_bo *instruction_bo)
+static void
+iris_emit_state_base_address(struct iris_batch *batch)
 {
-   if (!(ice->state.dirty & IRIS_DIRTY_STATE_BASE_ADDRESS))
-      return;
-
-   //iris_batchbuffer_flush(...)
-
-   ice->state.dirty &= ~IRIS_DIRTY_STATE_BASE_ADDRESS;
-
    /* XXX: PIPE_CONTROLs */
 
    iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
@@ -1351,611 +1342,611 @@ iris_setup_state_base_address(struct iris_context *ice,
 
       sba.SurfaceStateBaseAddress = ro_bo(batch->statebuf.bo, 0);
       sba.DynamicStateBaseAddress = ro_bo(batch->statebuf.bo, 0);
-      sba.InstructionBaseAddress =  ro_bo(instruction_bo, 0);
 
       sba.GeneralStateBufferSize = 0xfffff000;
-      sba.DynamicStateBufferSize = ALIGN(MAX_STATE_SIZE, 4096);
       sba.IndirectObjectBufferSize = 0xfffff000;
-      sba.InstructionBufferSize = ALIGN(ice->shaders.cache.bo->size, 4096);
-      sba.BindlessSurfaceStateSize = 0;
+      sba.InstructionBufferSize = 0xfffff000;
+      sba.DynamicStateBufferSize = ALIGN(MAX_STATE_SIZE, 4096);
    }
 }
 
-void
-iris_upload_render_state(struct iris_context *ice,
-                         struct iris_batch *batch,
-                         const struct pipe_draw_info *draw)
+static void
+iris_bind_compute_state(struct pipe_context *ctx, void *state)
 {
-   const uint64_t dirty = ice->state.dirty;
-
-   struct brw_wm_prog_data *wm_prog_data = (void *)
-      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
+}
 
-   if (dirty & IRIS_DIRTY_CC_VIEWPORT) {
-      struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
-      iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
-         ptr.CCViewportPointer =
-            iris_emit_state(batch, cso->cc_vp, sizeof(cso->cc_vp), 32);
-      }
-   }
+   //pkt.SamplerCount =                                                     \
+      //DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
+   //pkt.PerThreadScratchSpace = prog_data->total_scratch == 0 ? 0 :        \
+      //ffs(stage_state->per_thread_scratch) - 11;                          \
 
-   if (dirty & IRIS_DIRTY_SF_CL_VIEWPORT) {
-      struct iris_viewport_state *cso = ice->state.cso_vp;
-      iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
-         ptr.SFClipViewportPointer =
-            iris_emit_state(batch, cso->sf_cl_vp, sizeof(cso->sf_cl_vp), 64);
-      }
-   }
+#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix)                          \
+   pkt.KernelStartPointer = shader->prog_offset;                          \
+   pkt.BindingTableEntryCount = prog_data->binding_table.size_bytes / 4;  \
+   pkt.FloatingPointMode = prog_data->use_alt_mode;                       \
+                                                                          \
+   pkt.DispatchGRFStartRegisterForURBData =                               \
+      prog_data->dispatch_grf_start_reg;                                  \
+   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;       \
+   pkt.prefix##URBEntryReadOffset = 0;                                    \
+                                                                          \
+   pkt.StatisticsEnable = true;                                           \
+   pkt.Enable           = true;
 
-   /* XXX: L3 State */
+static void
+iris_set_vs_state(const struct gen_device_info *devinfo,
+                  struct iris_compiled_shader *shader)
+{
+   struct brw_stage_prog_data *prog_data = shader->prog_data;
+   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
 
-   if (dirty & IRIS_DIRTY_URB) {
-      /* XXX: URB */
+   iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {
+      INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);
+      vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
+      vs.SIMD8DispatchEnable = true;
+      vs.UserClipDistanceCullTestEnableBitmask =
+         vue_prog_data->cull_distance_mask;
    }
+}
 
-   if (dirty & IRIS_DIRTY_BLEND_STATE) {
-      struct iris_blend_state *cso = ice->state.cso_blend;
-      // XXX: 3DSTATE_BLEND_STATE_POINTERS - BLEND_STATE
-      // -> from iris_blend_state (most) + iris_depth_stencil_alpha_state
-      //    (alpha test function/enable) + has writeable RT from ???????
-   }
+static void
+iris_set_tcs_state(const struct gen_device_info *devinfo,
+                   struct iris_compiled_shader *shader)
+{
+   struct brw_stage_prog_data *prog_data = shader->prog_data;
+   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
+   struct brw_tcs_prog_data *tcs_prog_data = (void *) prog_data;
 
-   if (dirty & IRIS_DIRTY_COLOR_CALC_STATE) {
-      struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
-      uint32_t cc_offset;
-      void *cc_map =
-         iris_alloc_state(batch,
-                          sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
-                          64, &cc_offset);
-      iris_pack_state(GENX(COLOR_CALC_STATE), cc_map, cc) {
-         cc.AlphaTestFormat = ALPHATEST_FLOAT32;
-         cc.AlphaReferenceValueAsFLOAT32 = cso->alpha.ref_value;
-         cc.BlendConstantColorRed   = ice->state.blend_color.color[0];
-         cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
-         cc.BlendConstantColorBlue  = ice->state.blend_color.color[2];
-         cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
-      }
-      iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
-         ptr.ColorCalcStatePointer = cc_offset;
-         ptr.ColorCalcStatePointerValid = true;
-      }
+   iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) {
+      INIT_THREAD_DISPATCH_FIELDS(hs, Vertex);
+
+      hs.InstanceCount = tcs_prog_data->instances - 1;
+      hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
+      hs.IncludeVertexHandles = true;
    }
+}
 
-   // XXX: 3DSTATE_CONSTANT_XS
-   // XXX: 3DSTATE_BINDING_TABLE_POINTERS_XS
+static void
+iris_set_tes_state(const struct gen_device_info *devinfo,
+                   struct iris_compiled_shader *shader)
+{
+   struct brw_stage_prog_data *prog_data = shader->prog_data;
+   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
+   struct brw_tes_prog_data *tes_prog_data = (void *) prog_data;
 
-   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
-      if (!(dirty & (IRIS_DIRTY_SAMPLER_STATES_VS << stage)))
-         continue;
+   uint32_t *te_state = (void *) shader->derived_data;
+   uint32_t *ds_state = te_state + GENX(3DSTATE_TE_length);
 
-      // XXX: get sampler count from shader; don't emit them all...
-      const int count = IRIS_MAX_TEXTURE_SAMPLERS;
+   iris_pack_command(GENX(3DSTATE_TE), te_state, te) {
+      te.Partitioning = tes_prog_data->partitioning;
+      te.OutputTopology = tes_prog_data->output_topology;
+      te.TEDomain = tes_prog_data->domain;
+      te.TEEnable = true;
+      te.MaximumTessellationFactorOdd = 63.0;
+      te.MaximumTessellationFactorNotOdd = 64.0;
+   }
 
-      uint32_t offset;
-      uint32_t *map = iris_alloc_state(batch,
-                                       count * 4 * GENX(SAMPLER_STATE_length),
-                                       32, &offset);
+   iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
+      INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
 
-      for (int i = 0; i < count; i++) {
-         // XXX: when we have a correct count, these better be bound
-         if (!ice->state.samplers[stage][i])
-            continue;
-         memcpy(map, ice->state.samplers[stage][i]->sampler_state,
-                4 * GENX(SAMPLER_STATE_length));
-         map += GENX(SAMPLER_STATE_length);
-      }
+      ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
+      ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
+      ds.ComputeWCoordinateEnable =
+         tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
 
-      iris_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
-         ptr._3DCommandSubOpcode = 43 + stage;
-         ptr.PointertoVSSamplerState = offset;
-      }
+      ds.UserClipDistanceCullTestEnableBitmask =
+         vue_prog_data->cull_distance_mask;
    }
 
-   if (dirty & IRIS_DIRTY_MULTISAMPLE) {
-      iris_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
-         ms.PixelLocation =
-            ice->state.cso_rast->half_pixel_center ? CENTER : UL_CORNER;
-         if (ice->state.framebuffer.samples > 0)
-            ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
-      }
-   }
+}
 
-   if (dirty & IRIS_DIRTY_SAMPLE_MASK) {
-      iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
-         ms.SampleMask = ice->state.sample_mask;
-      }
-   }
+static void
+iris_set_gs_state(const struct gen_device_info *devinfo,
+                  struct iris_compiled_shader *shader)
+{
+   struct brw_stage_prog_data *prog_data = shader->prog_data;
+   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
+   struct brw_gs_prog_data *gs_prog_data = (void *) prog_data;
 
-   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
-      if (!(dirty & (IRIS_DIRTY_VS << stage)))
-         continue;
+   iris_pack_command(GENX(3DSTATE_GS), shader->derived_data, gs) {
+      INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);
 
-      if (ice->shaders.prog[stage]) {
-         iris_batch_emit(batch, ice->shaders.prog[stage]->derived_data,
-                         iris_derived_program_state_size(stage));
-      } else {
-         if (stage == MESA_SHADER_TESS_EVAL) {
-            iris_emit_cmd(batch, GENX(3DSTATE_HS), hs);
-            iris_emit_cmd(batch, GENX(3DSTATE_TE), te);
-            iris_emit_cmd(batch, GENX(3DSTATE_DS), ds);
-         } else if (stage == MESA_SHADER_GEOMETRY) {
-            iris_emit_cmd(batch, GENX(3DSTATE_GS), gs);
-         }
-      }
-   }
+      gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
+      gs.OutputTopology = gs_prog_data->output_topology;
+      gs.ControlDataHeaderSize =
+         gs_prog_data->control_data_header_size_hwords;
+      gs.InstanceControl = gs_prog_data->invocations - 1;
+      gs.DispatchMode = SIMD8;
+      gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
+      gs.ControlDataFormat = gs_prog_data->control_data_format;
+      gs.ReorderMode = TRAILING;
+      gs.ExpectedVertexCount = gs_prog_data->vertices_in;
+      gs.MaximumNumberofThreads =
+         GEN_GEN == 8 ? (devinfo->max_gs_threads / 2 - 1)
+                      : (devinfo->max_gs_threads - 1);
 
-   // XXX: SOL and so on
+      if (gs_prog_data->static_vertex_count != -1) {
+         gs.StaticOutput = true;
+         gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
+      }
+      gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
 
-   if (dirty & IRIS_DIRTY_CLIP) {
-      struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
-      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+      gs.UserClipDistanceCullTestEnableBitmask =
+         vue_prog_data->cull_distance_mask;
 
-      uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
-      iris_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
-         if (wm_prog_data->barycentric_interp_modes &
-             BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
-            cl.NonPerspectiveBarycentricEnable = true;
+      const int urb_entry_write_offset = 1;
+      const uint32_t urb_entry_output_length =
+         DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
+         urb_entry_write_offset;
 
-         cl.ForceZeroRTAIndexEnable = cso_fb->layers == 0;
-      }
-      iris_emit_merge(batch, cso_rast->clip, dynamic_clip,
-                      ARRAY_SIZE(cso_rast->clip));
+      gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
+      gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
    }
+}
 
-   if (dirty & IRIS_DIRTY_RASTER) {
-      struct iris_rasterizer_state *cso = ice->state.cso_rast;
-      iris_batch_emit(batch, cso->raster, sizeof(cso->raster));
-      iris_batch_emit(batch, cso->sf, sizeof(cso->sf));
+static void
+iris_set_fs_state(const struct gen_device_info *devinfo,
+                  struct iris_compiled_shader *shader)
+{
+   struct brw_stage_prog_data *prog_data = shader->prog_data;
+   struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
 
-   }
+   uint32_t *ps_state = (void *) shader->derived_data;
+   uint32_t *psx_state = ps_state + GENX(3DSTATE_PS_length);
 
-   if (dirty & (IRIS_DIRTY_RASTER | IRIS_DIRTY_FS)) {
-      struct iris_rasterizer_state *cso = ice->state.cso_rast;
-      uint32_t dynamic_wm[GENX(3DSTATE_WM_length)];
+   iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {
+      ps.VectorMaskEnable = true;
+      //ps.SamplerCount = ...
+      ps.BindingTableEntryCount = prog_data->binding_table.size_bytes / 4;
+      ps.FloatingPointMode = prog_data->use_alt_mode;
+      ps.MaximumNumberofThreadsPerPSD = 64 - (GEN_GEN == 8 ? 2 : 1);
 
-      iris_pack_command(GENX(3DSTATE_WM), &dynamic_wm, wm) {
-         wm.BarycentricInterpolationMode =
-            wm_prog_data->barycentric_interp_modes;
+      ps.PushConstantEnable = prog_data->nr_params > 0 ||
+                              prog_data->ubo_ranges[0].length > 0;
 
-         if (wm_prog_data->early_fragment_tests)
-            wm.EarlyDepthStencilControl = EDSC_PREPS;
-         else if (wm_prog_data->has_side_effects)
-            wm.EarlyDepthStencilControl = EDSC_PSEXEC;
-      }
-      iris_emit_merge(batch, cso->wm, dynamic_wm, ARRAY_SIZE(cso->wm));
-   }
+      /* From the documentation for this packet:
+       * "If the PS kernel does not need the Position XY Offsets to
+       *  compute a Position Value, then this field should be programmed
+       *  to POSOFFSET_NONE."
+       *
+       * "SW Recommendation: If the PS kernel needs the Position Offsets
+       *  to compute a Position XY value, this field should match Position
+       *  ZW Interpolation Mode to ensure a consistent position.xyzw
+       *  computation."
+       *
+       * We only require XY sample offsets. So, this recommendation doesn't
+       * look useful at the moment.  We might need this in future.
+       */
+      ps.PositionXYOffsetSelect =
+         wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
+      ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
+      ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
+      ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;
 
-   // XXX: SBE, SBE_SWIZ
+      // XXX: Disable SIMD32 with 16x MSAA
 
-   if (dirty & IRIS_DIRTY_PS_BLEND) {
-      struct iris_blend_state *cso = ice->state.cso_blend;
-      iris_batch_emit(batch, cso->ps_blend, sizeof(cso->ps_blend));
+      ps.DispatchGRFStartRegisterForConstantSetupData0 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
+      ps.DispatchGRFStartRegisterForConstantSetupData1 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
+      ps.DispatchGRFStartRegisterForConstantSetupData2 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
+
+      ps.KernelStartPointer0 =
+         shader->prog_offset + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
+      ps.KernelStartPointer1 =
+         shader->prog_offset + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
+      ps.KernelStartPointer2 =
+         shader->prog_offset + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
    }
 
-   if (dirty & IRIS_DIRTY_WM_DEPTH_STENCIL) {
-      struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
-      struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
+   iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
+      psx.PixelShaderValid = true;
+      psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
+      psx.PixelShaderKillsPixel = wm_prog_data->uses_kill;
+      psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
+      psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
+      psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
+      psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;
 
-      uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
-      iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
-         wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
-         wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
+      if (wm_prog_data->uses_sample_mask) {
+         /* TODO: conservative rasterization */
+         if (wm_prog_data->post_depth_coverage)
+            psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
+         else
+            psx.InputCoverageMaskState = ICMS_NORMAL;
       }
-      iris_emit_merge(batch, cso->wmds, stencil_refs, ARRAY_SIZE(cso->wmds));
-   }
 
-   if (dirty & IRIS_DIRTY_SCISSOR) {
-      uint32_t scissor_offset =
-         iris_emit_state(batch, ice->state.scissors,
-                         sizeof(struct pipe_scissor_state) *
-                         ice->state.num_scissors, 32);
+      psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
+      psx.PixelShaderPullsBary = wm_prog_data->pulls_bary;
+      psx.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
 
-      iris_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
-         ptr.ScissorRectPointer = scissor_offset;
-      }
+      // XXX: UAV bit
    }
+}
 
-   // XXX: 3DSTATE_DEPTH_BUFFER and friends
+static unsigned
+iris_derived_program_state_size(enum iris_program_cache_id cache_id)
+{
+   assert(cache_id <= IRIS_CACHE_CS);
 
-   if (dirty & IRIS_DIRTY_POLYGON_STIPPLE) {
-      iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
-         for (int i = 0; i < 32; i++) {
-            poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
-         }
-      }
+   static const unsigned dwords[] = {
+      [IRIS_CACHE_VS] = GENX(3DSTATE_VS_length),
+      [IRIS_CACHE_TCS] = GENX(3DSTATE_HS_length),
+      [IRIS_CACHE_TES] = GENX(3DSTATE_TE_length) + GENX(3DSTATE_DS_length),
+      [IRIS_CACHE_GS] = GENX(3DSTATE_GS_length),
+      [IRIS_CACHE_FS] =
+         GENX(3DSTATE_PS_length) + GENX(3DSTATE_PS_EXTRA_length),
+      [IRIS_CACHE_CS] = 0,
+      [IRIS_CACHE_BLORP_BLIT] = 0,
+   };
+
+   return sizeof(uint32_t) * dwords[cache_id];
+}
+
+static void
+iris_set_derived_program_state(const struct gen_device_info *devinfo,
+                               enum iris_program_cache_id cache_id,
+                               struct iris_compiled_shader *shader)
+{
+   switch (cache_id) {
+   case IRIS_CACHE_VS:
+      iris_set_vs_state(devinfo, shader);
+      break;
+   case IRIS_CACHE_TCS:
+      iris_set_tcs_state(devinfo, shader);
+      break;
+   case IRIS_CACHE_TES:
+      iris_set_tes_state(devinfo, shader);
+      break;
+   case IRIS_CACHE_GS:
+      iris_set_gs_state(devinfo, shader);
+      break;
+   case IRIS_CACHE_FS:
+      iris_set_fs_state(devinfo, shader);
+      break;
+   case IRIS_CACHE_CS:
+      break;
+   default:
+      break;
    }
+}
 
-   if (dirty & IRIS_DIRTY_LINE_STIPPLE) {
-      struct iris_rasterizer_state *cso = ice->state.cso_rast;
-      iris_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
+static void
+iris_upload_render_state(struct iris_context *ice,
+                         struct iris_batch *batch,
+                         const struct pipe_draw_info *draw)
+{
+   const uint64_t dirty = ice->state.dirty;
+
+   struct brw_wm_prog_data *wm_prog_data = (void *)
+      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
+
+   if (dirty & IRIS_DIRTY_CC_VIEWPORT) {
+      struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
+      iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
+         ptr.CCViewportPointer =
+            iris_emit_state(batch, cso->cc_vp, sizeof(cso->cc_vp), 32);
+      }
    }
 
-   if (1) {
-      iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
-         topo.PrimitiveTopologyType =
-            translate_prim_type(draw->mode, draw->vertices_per_patch);
+   if (dirty & IRIS_DIRTY_SF_CL_VIEWPORT) {
+      struct iris_viewport_state *cso = ice->state.cso_vp;
+      iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
+         ptr.SFClipViewportPointer =
+            iris_emit_state(batch, cso->sf_cl_vp, sizeof(cso->sf_cl_vp), 64);
       }
    }
 
-   if (draw->index_size > 0) {
-      struct iris_resource *res = (struct iris_resource *)draw->index.resource;
+   /* XXX: L3 State */
 
-      assert(!draw->has_user_indices);
+   if (dirty & IRIS_DIRTY_URB) {
+      /* XXX: URB */
+   }
 
-      iris_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
-         ib.IndexFormat = draw->index_size;
-         ib.MOCS = MOCS_WB;
-         ib.BufferSize = res->bo->size;
-         ib.BufferStartingAddress = ro_bo(res->bo, 0);
+   if (dirty & IRIS_DIRTY_BLEND_STATE) {
+      struct iris_blend_state *cso = ice->state.cso_blend;
+      // XXX: 3DSTATE_BLEND_STATE_POINTERS - BLEND_STATE
+      // -> from iris_blend_state (most) + iris_depth_stencil_alpha_state
+      //    (alpha test function/enable) + has writeable RT from ???????
+   }
+
+   if (dirty & IRIS_DIRTY_COLOR_CALC_STATE) {
+      struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
+      uint32_t cc_offset;
+      void *cc_map =
+         iris_alloc_state(batch,
+                          sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
+                          64, &cc_offset);
+      iris_pack_state(GENX(COLOR_CALC_STATE), cc_map, cc) {
+         cc.AlphaTestFormat = ALPHATEST_FLOAT32;
+         cc.AlphaReferenceValueAsFLOAT32 = cso->alpha.ref_value;
+         cc.BlendConstantColorRed   = ice->state.blend_color.color[0];
+         cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
+         cc.BlendConstantColorBlue  = ice->state.blend_color.color[2];
+         cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
+      }
+      iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
+         ptr.ColorCalcStatePointer = cc_offset;
+         ptr.ColorCalcStatePointerValid = true;
       }
    }
 
-   if (dirty & IRIS_DIRTY_VERTEX_BUFFERS) {
-      struct iris_vertex_buffer_state *cso = ice->state.cso_vertex_buffers;
+   // XXX: 3DSTATE_CONSTANT_XS
+   // XXX: 3DSTATE_BINDING_TABLE_POINTERS_XS
 
-      STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_length) == 4);
-      STATIC_ASSERT((GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) % 32) == 0);
+   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+      if (!(dirty & (IRIS_DIRTY_SAMPLER_STATES_VS << stage)))
+         continue;
 
-      uint64_t *addr = batch->cmdbuf.map_next + sizeof(uint32_t) *
-         (GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) / 32);
-      uint32_t *delta = cso->vertex_buffers +
-         (1 + GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) / 32);
+      // XXX: get sampler count from shader; don't emit them all...
+      const int count = IRIS_MAX_TEXTURE_SAMPLERS;
 
-      iris_batch_emit(batch, cso->vertex_buffers,
-                      sizeof(uint32_t) * (1 + 4 * cso->num_buffers));
+      uint32_t offset;
+      uint32_t *map = iris_alloc_state(batch,
+                                       count * 4 * GENX(SAMPLER_STATE_length),
+                                       32, &offset);
 
-      for (unsigned i = 0; i < cso->num_buffers; i++) {
-         *addr = iris_batch_reloc(batch, (void *) addr - batch->cmdbuf.map,
-                                  cso->bos[i].bo, cso->bos[i].offset +
-                                  *delta, cso->bos[i].reloc_flags);
-         addr = (void *) addr + 16;
-         delta = (void *) delta + 16;
+      for (int i = 0; i < count; i++) {
+         // XXX: with a real count all must be bound; skips don't advance map
+         if (!ice->state.samplers[stage][i])
+            continue;
+         memcpy(map, ice->state.samplers[stage][i]->sampler_state,
+                4 * GENX(SAMPLER_STATE_length));
+         map += GENX(SAMPLER_STATE_length);
+      }
+
+      iris_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
+         ptr._3DCommandSubOpcode = 43 + stage;
+         ptr.PointertoVSSamplerState = offset;
       }
    }
 
-   if (dirty & IRIS_DIRTY_VERTEX_ELEMENTS) {
-      struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;
-      iris_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
-                      (1 + cso->count * GENX(VERTEX_ELEMENT_STATE_length)));
-      for (int i = 0; i < cso->count; i++) {
-         iris_batch_emit(batch, cso->vf_instancing[i], sizeof(uint32_t) *
-                         (cso->count * GENX(3DSTATE_VF_INSTANCING_length)));
+   if (dirty & IRIS_DIRTY_MULTISAMPLE) {
+      iris_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
+         ms.PixelLocation =
+            ice->state.cso_rast->half_pixel_center ? CENTER : UL_CORNER;
+         if (ice->state.framebuffer.samples > 0)
+            ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
       }
-      for (int i = 0; i < cso->count; i++) {
-         /* TODO: vertexid, instanceid support */
-         iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgvs);
+   }
+
+   if (dirty & IRIS_DIRTY_SAMPLE_MASK) {
+      iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
+         ms.SampleMask = ice->state.sample_mask;
       }
    }
 
-   if (1) {
-      iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
-         if (draw->primitive_restart) {
-            vf.IndexedDrawCutIndexEnable = true;
-            vf.CutIndex = draw->restart_index;
+   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+      if (!(dirty & (IRIS_DIRTY_VS << stage)))
+         continue;
+
+      if (ice->shaders.prog[stage]) {
+         iris_batch_emit(batch, ice->shaders.prog[stage]->derived_data,
+                         iris_derived_program_state_size(stage));
+      } else {
+         if (stage == MESA_SHADER_TESS_EVAL) {
+            iris_emit_cmd(batch, GENX(3DSTATE_HS), hs);
+            iris_emit_cmd(batch, GENX(3DSTATE_TE), te);
+            iris_emit_cmd(batch, GENX(3DSTATE_DS), ds);
+         } else if (stage == MESA_SHADER_GEOMETRY) {
+            iris_emit_cmd(batch, GENX(3DSTATE_GS), gs);
          }
       }
    }
 
-   // XXX: Gen8 - PMA fix
-
-   assert(!draw->indirect); // XXX: indirect support
+   // XXX: SOL and so on
 
-   iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
-      prim.StartInstanceLocation = draw->start_instance;
-      prim.InstanceCount = draw->instance_count;
-      prim.VertexCountPerInstance = draw->count;
-      prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
+   if (dirty & IRIS_DIRTY_CLIP) {
+      struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
+      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
 
-      // XXX: this is probably bonkers.
-      prim.StartVertexLocation = draw->start;
+      uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
+      iris_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
+         if (wm_prog_data->barycentric_interp_modes &
+             BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
+            cl.NonPerspectiveBarycentricEnable = true;
 
-      if (draw->index_size) {
-         prim.BaseVertexLocation += draw->index_bias;
-      } else {
-         prim.StartVertexLocation += draw->index_bias;
+         cl.ForceZeroRTAIndexEnable = cso_fb->layers == 0;
       }
-
-      //prim.BaseVertexLocation = ...;
+      iris_emit_merge(batch, cso_rast->clip, dynamic_clip,
+                      ARRAY_SIZE(cso_rast->clip));
    }
-#if 0
-   l3 configuration
 
-   3DSTATE_URB_*
-     -> TODO
+   if (dirty & IRIS_DIRTY_RASTER) {
+      struct iris_rasterizer_state *cso = ice->state.cso_rast;
+      iris_batch_emit(batch, cso->raster, sizeof(cso->raster));
+      iris_batch_emit(batch, cso->sf, sizeof(cso->sf));
 
-   3DSTATE_CONSTANT_* - push constants
-     -> TODO
+   }
 
-   Surfaces:
-   - pull constants
-   - ubos/ssbos/abos
-   - images
-   - textures
-   - render targets - write and read
-   3DSTATE_BINDING_TABLE_POINTERS_*
-     -> TODO
+   if (dirty & (IRIS_DIRTY_RASTER | IRIS_DIRTY_FS)) {
+      struct iris_rasterizer_state *cso = ice->state.cso_rast;
+      uint32_t dynamic_wm[GENX(3DSTATE_WM_length)];
 
-   3DSTATE_STREAMOUT
-   3DSTATE_SO_BUFFER
-   3DSTATE_SO_DECL_LIST
+      iris_pack_command(GENX(3DSTATE_WM), &dynamic_wm, wm) {
+         wm.BarycentricInterpolationMode =
+            wm_prog_data->barycentric_interp_modes;
 
-   3DSTATE_SBE
-     -> iris_raster_state (point sprite texture coordinate origin)
-     -> bunch of shader state...
-   3DSTATE_SBE_SWIZ
-     -> FS state
+         if (wm_prog_data->early_fragment_tests)
+            wm.EarlyDepthStencilControl = EDSC_PREPS;
+         else if (wm_prog_data->has_side_effects)
+            wm.EarlyDepthStencilControl = EDSC_PSEXEC;
+      }
+      iris_emit_merge(batch, cso->wm, dynamic_wm, ARRAY_SIZE(cso->wm));
+   }
 
-   3DSTATE_DEPTH_BUFFER
-   3DSTATE_HIER_DEPTH_BUFFER
-   3DSTATE_STENCIL_BUFFER
-   3DSTATE_CLEAR_PARAMS
-     -> iris_framebuffer_state?
-#endif
-}
+   // XXX: 3DSTATE_SBE and 3DSTATE_SBE_SWIZ are not emitted yet
 
-static void
-iris_bind_compute_state(struct pipe_context *ctx, void *state)
-{
-}
+   if (dirty & IRIS_DIRTY_PS_BLEND) {
+      struct iris_blend_state *cso = ice->state.cso_blend;
+      iris_batch_emit(batch, cso->ps_blend, sizeof(cso->ps_blend));
+   }
 
-   //pkt.SamplerCount =                                                     \
-      //DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
-   //pkt.PerThreadScratchSpace = prog_data->total_scratch == 0 ? 0 :        \
-      //ffs(stage_state->per_thread_scratch) - 11;                          \
+   if (dirty & IRIS_DIRTY_WM_DEPTH_STENCIL) {
+      struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
+      struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
 
-#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix)                          \
-   pkt.KernelStartPointer = shader->prog_offset;                          \
-   pkt.BindingTableEntryCount = prog_data->binding_table.size_bytes / 4;  \
-   pkt.FloatingPointMode = prog_data->use_alt_mode;                       \
-                                                                          \
-   pkt.DispatchGRFStartRegisterForURBData =                               \
-      prog_data->dispatch_grf_start_reg;                                  \
-   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;       \
-   pkt.prefix##URBEntryReadOffset = 0;                                    \
-                                                                          \
-   pkt.StatisticsEnable = true;                                           \
-   pkt.Enable           = true;
+      uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
+      iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
+         wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
+         wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
+      }
+      iris_emit_merge(batch, cso->wmds, stencil_refs, ARRAY_SIZE(cso->wmds));
+   }
 
-static void
-iris_set_vs_state(const struct gen_device_info *devinfo,
-                  struct iris_compiled_shader *shader)
-{
-   struct brw_stage_prog_data *prog_data = shader->prog_data;
-   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
+   if (dirty & IRIS_DIRTY_SCISSOR) {
+      uint32_t scissor_offset =
+         iris_emit_state(batch, ice->state.scissors,
+                         sizeof(struct pipe_scissor_state) *
+                         ice->state.num_scissors, 32);
 
-   iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {
-      INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);
-      vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
-      vs.SIMD8DispatchEnable = true;
-      vs.UserClipDistanceCullTestEnableBitmask =
-         vue_prog_data->cull_distance_mask;
+      iris_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
+         ptr.ScissorRectPointer = scissor_offset;
+      }
    }
-}
-
-static void
-iris_set_tcs_state(const struct gen_device_info *devinfo,
-                   struct iris_compiled_shader *shader)
-{
-   struct brw_stage_prog_data *prog_data = shader->prog_data;
-   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
-   struct brw_tcs_prog_data *tcs_prog_data = (void *) prog_data;
 
-   iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) {
-      INIT_THREAD_DISPATCH_FIELDS(hs, Vertex);
+   // XXX: 3DSTATE_DEPTH_BUFFER and friends
 
-      hs.InstanceCount = tcs_prog_data->instances - 1;
-      hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
-      hs.IncludeVertexHandles = true;
+   if (dirty & IRIS_DIRTY_POLYGON_STIPPLE) {
+      iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
+         for (int i = 0; i < 32; i++) {
+            poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
+         }
+      }
    }
-}
-
-static void
-iris_set_tes_state(const struct gen_device_info *devinfo,
-                   struct iris_compiled_shader *shader)
-{
-   struct brw_stage_prog_data *prog_data = shader->prog_data;
-   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
-   struct brw_tes_prog_data *tes_prog_data = (void *) prog_data;
 
-   uint32_t *te_state = (void *) shader->derived_data;
-   uint32_t *ds_state = te_state + GENX(3DSTATE_TE_length);
+   if (dirty & IRIS_DIRTY_LINE_STIPPLE) {
+      struct iris_rasterizer_state *cso = ice->state.cso_rast;
+      iris_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
+   }
 
-   iris_pack_command(GENX(3DSTATE_TE), te_state, te) {
-      te.Partitioning = tes_prog_data->partitioning;
-      te.OutputTopology = tes_prog_data->output_topology;
-      te.TEDomain = tes_prog_data->domain;
-      te.TEEnable = true;
-      te.MaximumTessellationFactorOdd = 63.0;
-      te.MaximumTessellationFactorNotOdd = 64.0;
+   if (1) {
+      iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
+         topo.PrimitiveTopologyType =
+            translate_prim_type(draw->mode, draw->vertices_per_patch);
+      }
    }
 
-   iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
-      INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
+   if (draw->index_size > 0) {
+      struct iris_resource *res = (struct iris_resource *)draw->index.resource;
 
-      ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
-      ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
-      ds.ComputeWCoordinateEnable =
-         tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
+      assert(!draw->has_user_indices);
 
-      ds.UserClipDistanceCullTestEnableBitmask =
-         vue_prog_data->cull_distance_mask;
+      iris_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
+         ib.IndexFormat = draw->index_size;
+         ib.MOCS = MOCS_WB;
+         ib.BufferSize = res->bo->size;
+         ib.BufferStartingAddress = ro_bo(res->bo, 0);
+      }
    }
 
-}
+   if (dirty & IRIS_DIRTY_VERTEX_BUFFERS) {
+      struct iris_vertex_buffer_state *cso = ice->state.cso_vertex_buffers;
 
-static void
-iris_set_gs_state(const struct gen_device_info *devinfo,
-                  struct iris_compiled_shader *shader)
-{
-   struct brw_stage_prog_data *prog_data = shader->prog_data;
-   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
-   struct brw_gs_prog_data *gs_prog_data = (void *) prog_data;
+      STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_length) == 4);
+      STATIC_ASSERT((GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) % 32) == 0);
 
-   iris_pack_command(GENX(3DSTATE_GS), shader->derived_data, gs) {
-      INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);
+      uint64_t *addr = batch->cmdbuf.map_next + sizeof(uint32_t) *
+         (GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) / 32);
+      uint32_t *delta = cso->vertex_buffers +
+         (1 + GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) / 32);
 
-      gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
-      gs.OutputTopology = gs_prog_data->output_topology;
-      gs.ControlDataHeaderSize =
-         gs_prog_data->control_data_header_size_hwords;
-      gs.InstanceControl = gs_prog_data->invocations - 1;
-      gs.DispatchMode = SIMD8;
-      gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
-      gs.ControlDataFormat = gs_prog_data->control_data_format;
-      gs.ReorderMode = TRAILING;
-      gs.ExpectedVertexCount = gs_prog_data->vertices_in;
-      gs.MaximumNumberofThreads =
-         GEN_GEN == 8 ? (devinfo->max_gs_threads / 2 - 1)
-                      : (devinfo->max_gs_threads - 1);
+      iris_batch_emit(batch, cso->vertex_buffers,
+                      sizeof(uint32_t) * (1 + 4 * cso->num_buffers));
 
-      if (gs_prog_data->static_vertex_count != -1) {
-         gs.StaticOutput = true;
-         gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
+      for (unsigned i = 0; i < cso->num_buffers; i++) {
+         *addr = iris_batch_reloc(batch, (void *) addr - batch->cmdbuf.map,
+                                  cso->bos[i].bo, cso->bos[i].offset +
+                                  *delta, cso->bos[i].reloc_flags);
+         addr = (void *) addr + 16;
+         delta = (void *) delta + 16;
       }
-      gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
-
-      gs.UserClipDistanceCullTestEnableBitmask =
-         vue_prog_data->cull_distance_mask;
-
-      const int urb_entry_write_offset = 1;
-      const uint32_t urb_entry_output_length =
-         DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
-         urb_entry_write_offset;
-
-      gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
-      gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
    }
-}
 
-static void
-iris_set_fs_state(const struct gen_device_info *devinfo,
-                  struct iris_compiled_shader *shader)
-{
-   struct brw_stage_prog_data *prog_data = shader->prog_data;
-   struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
+   if (dirty & IRIS_DIRTY_VERTEX_ELEMENTS) {
+      struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;
+      iris_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
+                      (1 + cso->count * GENX(VERTEX_ELEMENT_STATE_length)));
+      for (int i = 0; i < cso->count; i++) {
+         iris_batch_emit(batch, cso->vf_instancing[i], sizeof(uint32_t) *
+                         (cso->count * GENX(3DSTATE_VF_INSTANCING_length)));
+      }
+      for (int i = 0; i < cso->count; i++) {
+         /* TODO: vertexid, instanceid support */
+         iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgvs);
+      }
+   }
 
-   uint32_t *ps_state = (void *) shader->derived_data;
-   uint32_t *psx_state = ps_state + GENX(3DSTATE_PS_length);
+   if (1) {
+      iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
+         if (draw->primitive_restart) {
+            vf.IndexedDrawCutIndexEnable = true;
+            vf.CutIndex = draw->restart_index;
+         }
+      }
+   }
 
-   iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {
-      ps.VectorMaskEnable = true;
-      //ps.SamplerCount = ...
-      ps.BindingTableEntryCount = prog_data->binding_table.size_bytes / 4;
-      ps.FloatingPointMode = prog_data->use_alt_mode;
-      ps.MaximumNumberofThreadsPerPSD = 64 - (GEN_GEN == 8 ? 2 : 1);
+   // XXX: Gen8 - PMA fix
 
-      ps.PushConstantEnable = prog_data->nr_params > 0 ||
-                              prog_data->ubo_ranges[0].length > 0;
+   assert(!draw->indirect); // XXX: indirect support
 
-      /* From the documentation for this packet:
-       * "If the PS kernel does not need the Position XY Offsets to
-       *  compute a Position Value, then this field should be programmed
-       *  to POSOFFSET_NONE."
-       *
-       * "SW Recommendation: If the PS kernel needs the Position Offsets
-       *  to compute a Position XY value, this field should match Position
-       *  ZW Interpolation Mode to ensure a consistent position.xyzw
-       *  computation."
-       *
-       * We only require XY sample offsets. So, this recommendation doesn't
-       * look useful at the moment.  We might need this in future.
-       */
-      ps.PositionXYOffsetSelect =
-         wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
-      ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
-      ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
-      ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;
+   iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
+      prim.StartInstanceLocation = draw->start_instance;
+      prim.InstanceCount = draw->instance_count;
+      prim.VertexCountPerInstance = draw->count;
+      prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
 
-      // XXX: Disable SIMD32 with 16x MSAA
+      // XXX: this is probably bonkers.
+      prim.StartVertexLocation = draw->start;
 
-      ps.DispatchGRFStartRegisterForConstantSetupData0 =
-         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
-      ps.DispatchGRFStartRegisterForConstantSetupData1 =
-         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
-      ps.DispatchGRFStartRegisterForConstantSetupData2 =
-         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
+      if (draw->index_size) {
+         prim.BaseVertexLocation += draw->index_bias;
+      } else {
+         prim.StartVertexLocation += draw->index_bias;
+      }
 
-      ps.KernelStartPointer0 =
-         shader->prog_offset + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
-      ps.KernelStartPointer1 =
-         shader->prog_offset + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
-      ps.KernelStartPointer2 =
-         shader->prog_offset + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
+      //prim.BaseVertexLocation = ...;
    }
+#if 0
+   l3 configuration
 
-   iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
-      psx.PixelShaderValid = true;
-      psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
-      psx.PixelShaderKillsPixel = wm_prog_data->uses_kill;
-      psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
-      psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
-      psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
-      psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;
-
-      if (wm_prog_data->uses_sample_mask) {
-         /* TODO: conservative rasterization */
-         if (wm_prog_data->post_depth_coverage)
-            psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
-         else
-            psx.InputCoverageMaskState = ICMS_NORMAL;
-      }
+   3DSTATE_URB_*
+     -> TODO
 
-      psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
-      psx.PixelShaderPullsBary = wm_prog_data->pulls_bary;
-      psx.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
+   3DSTATE_CONSTANT_* - push constants
+     -> TODO
 
-      // XXX: UAV bit
-   }
-}
+   Surfaces:
+   - pull constants
+   - ubos/ssbos/abos
+   - images
+   - textures
+   - render targets - write and read
+   3DSTATE_BINDING_TABLE_POINTERS_*
+     -> TODO
 
-unsigned
-iris_derived_program_state_size(enum iris_program_cache_id cache_id)
-{
-   assert(cache_id <= IRIS_CACHE_CS);
+   3DSTATE_STREAMOUT
+   3DSTATE_SO_BUFFER
+   3DSTATE_SO_DECL_LIST
 
-   static const unsigned dwords[] = {
-      [IRIS_CACHE_VS] = GENX(3DSTATE_VS_length),
-      [IRIS_CACHE_TCS] = GENX(3DSTATE_HS_length),
-      [IRIS_CACHE_TES] = GENX(3DSTATE_TE_length) + GENX(3DSTATE_DS_length),
-      [IRIS_CACHE_GS] = GENX(3DSTATE_GS_length),
-      [IRIS_CACHE_FS] =
-         GENX(3DSTATE_PS_length) + GENX(3DSTATE_PS_EXTRA_length),
-      [IRIS_CACHE_CS] = 0,
-      [IRIS_CACHE_BLORP_BLIT] = 0,
-   };
+   3DSTATE_SBE
+     -> iris_raster_state (point sprite texture coordinate origin)
+     -> bunch of shader state...
+   3DSTATE_SBE_SWIZ
+     -> FS state
 
-   return sizeof(uint32_t) * dwords[cache_id];
+   3DSTATE_DEPTH_BUFFER
+   3DSTATE_HIER_DEPTH_BUFFER
+   3DSTATE_STENCIL_BUFFER
+   3DSTATE_CLEAR_PARAMS
+     -> iris_framebuffer_state?
+#endif
 }
 
-void
-iris_set_derived_program_state(const struct gen_device_info *devinfo,
-                               enum iris_program_cache_id cache_id,
-                               struct iris_compiled_shader *shader)
-{
-   switch (cache_id) {
-   case IRIS_CACHE_VS:
-      iris_set_vs_state(devinfo, shader);
-      break;
-   case IRIS_CACHE_TCS:
-      iris_set_tcs_state(devinfo, shader);
-      break;
-   case IRIS_CACHE_TES:
-      iris_set_tes_state(devinfo, shader);
-      break;
-   case IRIS_CACHE_GS:
-      iris_set_gs_state(devinfo, shader);
-      break;
-   case IRIS_CACHE_FS:
-      iris_set_fs_state(devinfo, shader);
-      break;
-   case IRIS_CACHE_CS:
-      break;
-   default:
-      break;
-   }
-}
 
-void
+
+static void
 iris_destroy_state(struct iris_context *ice)
 {
    // XXX: unreference resources/surfaces.
@@ -1966,12 +1957,10 @@ iris_destroy_state(struct iris_context *ice)
 }
 
 void
-iris_init_state(struct iris_context *ice)
+genX(init_state)(struct iris_context *ice)
 {
    struct pipe_context *ctx = &ice->ctx;
 
-   ice->state.dirty = ~0ull;
-
    ctx->create_blend_state = iris_create_blend_state;
    ctx->create_depth_stencil_alpha_state = iris_create_zsa_state;
    ctx->create_rasterizer_state = iris_create_rasterizer_state;
@@ -2015,4 +2004,14 @@ iris_init_state(struct iris_context *ice)
    ctx->create_stream_output_target = iris_create_stream_output_target;
    ctx->stream_output_target_destroy = iris_stream_output_target_destroy;
    ctx->set_stream_output_targets = iris_set_stream_output_targets;
+
+   ice->render_batch.emit_state_base_address = iris_emit_state_base_address;
+   ice->state.upload_render_state = iris_upload_render_state;
+   ice->state.derived_program_state_size = iris_derived_program_state_size;
+   ice->state.set_derived_program_state = iris_set_derived_program_state;
+   ice->state.destroy_state = iris_destroy_state;
+
+   ice->state.dirty = ~0ull;
+
+   iris_upload_initial_gpu_state(&ice->render_batch);
 }