i965: Split the gen6 GS binding table to a separate table.

[mesa.git] / src / mesa / drivers / dri / i965 / brw_misc_state.c
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c

index 24041e57b00161755c360a284bbf3e5cc6b85ee1..c86755de6593cc2044ab285804f7246617f490e8 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -32,46 +32,14 @@
  
  
  #include "intel_batchbuffer.h"
+#include "intel_fbo.h"
+#include "intel_mipmap_tree.h"
  #include "intel_regions.h"
  
  #include "brw_context.h"
  #include "brw_state.h"
  #include "brw_defines.h"
  
-
-
-
-
-/***********************************************************************
- * Blend color
- */
-
-static void upload_blend_constant_color(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_blend_constant_color bcc;
-
-   memset(&bcc, 0, sizeof(bcc));      
-   bcc.header.opcode = CMD_BLEND_CONSTANT_COLOR;
-   bcc.header.length = sizeof(bcc)/4-2;
-   bcc.blend_constant_color[0] = ctx->Color.BlendColor[0];
-   bcc.blend_constant_color[1] = ctx->Color.BlendColor[1];
-   bcc.blend_constant_color[2] = ctx->Color.BlendColor[2];
-   bcc.blend_constant_color[3] = ctx->Color.BlendColor[3];
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bcc);
-}
-
-
-const struct brw_tracked_state brw_blend_constant_color = {
-   .dirty = {
-      .mesa = _NEW_COLOR,
-      .brw = BRW_NEW_CONTEXT,
-      .cache = 0
-   },
-   .emit = upload_blend_constant_color
-};
-
  /* Constant single cliprect for framebuffer object or DRI2 drawing */
  static void upload_drawing_rect(struct brw_context *brw)
  {
@@ -79,7 +47,7 @@ static void upload_drawing_rect(struct brw_context *brw)
     struct gl_context *ctx = &intel->ctx;
  
     BEGIN_BATCH(4);
-   OUT_BATCH(_3DSTATE_DRAWRECT_INFO_I965);
+   OUT_BATCH(_3DSTATE_DRAWING_RECTANGLE << 16 | (4 - 2));
     OUT_BATCH(0); /* xmin, ymin */
     OUT_BATCH(((ctx->DrawBuffer->Width - 1) & 0xffff) |
             ((ctx->DrawBuffer->Height - 1) << 16));
@@ -108,19 +76,23 @@ static void upload_binding_table_pointers(struct brw_context *brw)
     struct intel_context *intel = &brw->intel;
  
     BEGIN_BATCH(6);
-   OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
+   OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS << 16 | (6 - 2));
     OUT_BATCH(brw->vs.bind_bo_offset);
     OUT_BATCH(0); /* gs */
     OUT_BATCH(0); /* clip */
     OUT_BATCH(0); /* sf */
-   OUT_BATCH(brw->wm.bind_bo_offset);
+   OUT_BATCH(brw->bind.bo_offset);
     ADVANCE_BATCH();
  }
  
  const struct brw_tracked_state brw_binding_table_pointers = {
     .dirty = {
        .mesa = 0,
-      .brw = BRW_NEW_BATCH | BRW_NEW_BINDING_TABLE,
+      .brw = (BRW_NEW_BATCH |
+             BRW_NEW_STATE_BASE_ADDRESS |
+             BRW_NEW_VS_BINDING_TABLE |
+             BRW_NEW_GS_BINDING_TABLE |
+             BRW_NEW_PS_BINDING_TABLE),
        .cache = 0,
     },
     .emit = upload_binding_table_pointers,
@@ -138,21 +110,25 @@ static void upload_gen6_binding_table_pointers(struct brw_context *brw)
     struct intel_context *intel = &brw->intel;
  
     BEGIN_BATCH(4);
-   OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 |
+   OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS << 16 |
              GEN6_BINDING_TABLE_MODIFY_VS |
              GEN6_BINDING_TABLE_MODIFY_GS |
              GEN6_BINDING_TABLE_MODIFY_PS |
              (4 - 2));
     OUT_BATCH(brw->vs.bind_bo_offset); /* vs */
-   OUT_BATCH(0); /* gs */
-   OUT_BATCH(brw->wm.bind_bo_offset); /* wm/ps */
+   OUT_BATCH(brw->gs.bind_bo_offset); /* gs */
+   OUT_BATCH(brw->bind.bo_offset); /* wm/ps */
     ADVANCE_BATCH();
  }
  
  const struct brw_tracked_state gen6_binding_table_pointers = {
     .dirty = {
        .mesa = 0,
-      .brw = BRW_NEW_BATCH | BRW_NEW_BINDING_TABLE,
+      .brw = (BRW_NEW_BATCH |
+             BRW_NEW_STATE_BASE_ADDRESS |
+             BRW_NEW_VS_BINDING_TABLE |
+             BRW_NEW_GS_BINDING_TABLE |
+             BRW_NEW_PS_BINDING_TABLE),
        .cache = 0,
     },
     .emit = upload_gen6_binding_table_pointers,
@@ -176,32 +152,27 @@ static void upload_pipelined_state_pointers(struct brw_context *brw )
     }
  
     BEGIN_BATCH(7);
-   OUT_BATCH(CMD_PIPELINED_STATE_POINTERS << 16 | (7 - 2));
-   OUT_RELOC(brw->vs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+   OUT_BATCH(_3DSTATE_PIPELINED_POINTERS << 16 | (7 - 2));
+   OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+            brw->vs.state_offset);
     if (brw->gs.prog_active)
-      OUT_RELOC(brw->gs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
+      OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+               brw->gs.state_offset | 1);
     else
        OUT_BATCH(0);
-   OUT_RELOC(brw->clip.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
-   OUT_RELOC(brw->sf.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
-   OUT_RELOC(brw->wm.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
-   OUT_RELOC(brw->cc.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+   OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+            brw->clip.state_offset | 1);
+   OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+            brw->sf.state_offset);
+   OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+            brw->wm.state_offset);
+   OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
              brw->cc.state_offset);
     ADVANCE_BATCH();
  
     brw->state.dirty.brw |= BRW_NEW_PSP;
  }
  
-
-static void prepare_psp_urb_cbs(struct brw_context *brw)
-{
-   brw_add_validated_bo(brw, brw->vs.state_bo);
-   brw_add_validated_bo(brw, brw->gs.state_bo);
-   brw_add_validated_bo(brw, brw->clip.state_bo);
-   brw_add_validated_bo(brw, brw->sf.state_bo);
-   brw_add_validated_bo(brw, brw->wm.state_bo);
-}
-
  static void upload_psp_urb_cbs(struct brw_context *brw )
  {
     upload_pipelined_state_pointers(brw);
@@ -212,7 +183,9 @@ static void upload_psp_urb_cbs(struct brw_context *brw )
  const struct brw_tracked_state brw_psp_urb_cbs = {
     .dirty = {
        .mesa = 0,
-      .brw = BRW_NEW_URB_FENCE | BRW_NEW_BATCH,
+      .brw = (BRW_NEW_URB_FENCE |
+             BRW_NEW_BATCH |
+             BRW_NEW_STATE_BASE_ADDRESS),
        .cache = (CACHE_NEW_VS_UNIT | 
                 CACHE_NEW_GS_UNIT | 
                 CACHE_NEW_GS_PROG | 
@@ -221,23 +194,105 @@ const struct brw_tracked_state brw_psp_urb_cbs = {
                 CACHE_NEW_WM_UNIT | 
                 CACHE_NEW_CC_UNIT)
     },
-   .prepare = prepare_psp_urb_cbs,
     .emit = upload_psp_urb_cbs,
  };
  
-static void prepare_depthbuffer(struct brw_context *brw)
+uint32_t
+brw_depthbuffer_format(struct brw_context *brw)
  {
-   struct intel_region *region = brw->state.depth_region;
+   struct intel_context *intel = &brw->intel;
+   struct gl_context *ctx = &intel->ctx;
+   struct gl_framebuffer *fb = ctx->DrawBuffer;
+   struct intel_renderbuffer *drb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
+   struct intel_renderbuffer *srb;
+
+   if (!drb &&
+       (srb = intel_get_renderbuffer(fb, BUFFER_STENCIL)) &&
+       !srb->mt->stencil_mt &&
+       (intel_rb_format(srb) == MESA_FORMAT_S8_Z24 ||
+       intel_rb_format(srb) == MESA_FORMAT_Z32_FLOAT_X24S8)) {
+      drb = srb;
+   }
  
-   if (region != NULL)
-      brw_add_validated_bo(brw, region->buffer);
+   if (!drb)
+      return BRW_DEPTHFORMAT_D32_FLOAT;
+
+   switch (drb->mt->format) {
+   case MESA_FORMAT_Z16:
+      return BRW_DEPTHFORMAT_D16_UNORM;
+   case MESA_FORMAT_Z32_FLOAT:
+      return BRW_DEPTHFORMAT_D32_FLOAT;
+   case MESA_FORMAT_X8_Z24:
+      if (intel->gen >= 6) {
+        return BRW_DEPTHFORMAT_D24_UNORM_X8_UINT;
+      } else {
+        /* Use D24_UNORM_S8, not D24_UNORM_X8.
+         *
+         * D24_UNORM_X8 was not introduced until Gen5. (See the Ironlake PRM,
+         * Volume 2, Part 1, Section 8.4.6 "Depth/Stencil Buffer State", Bits
+         * 3DSTATE_DEPTH_BUFFER.Surface_Format).
+         *
+         * However, on Gen5, D24_UNORM_X8 may be used only if separate
+         * stencil is enabled, and we never enable it. From the Ironlake PRM,
+         * same section as above, Bit 3DSTATE_DEPTH_BUFFER.Separate_Stencil_Buffer_Enable:
+         *     If this field is disabled, the Surface Format of the depth
+         *     buffer cannot be D24_UNORM_X8_UINT.
+         */
+        return BRW_DEPTHFORMAT_D24_UNORM_S8_UINT;
+      }
+   case MESA_FORMAT_S8_Z24:
+      return BRW_DEPTHFORMAT_D24_UNORM_S8_UINT;
+   case MESA_FORMAT_Z32_FLOAT_X24S8:
+      return BRW_DEPTHFORMAT_D32_FLOAT_S8X24_UINT;
+   default:
+      _mesa_problem(ctx, "Unexpected depth format %s\n",
+                   _mesa_get_format_name(intel_rb_format(drb)));
+      return BRW_DEPTHFORMAT_D16_UNORM;
+   }
  }
  
  static void emit_depthbuffer(struct brw_context *brw)
  {
     struct intel_context *intel = &brw->intel;
-   struct intel_region *region = brw->state.depth_region;
+   struct gl_context *ctx = &intel->ctx;
+   struct gl_framebuffer *fb = ctx->DrawBuffer;
+   /* _NEW_BUFFERS */
+   struct intel_renderbuffer *depth_irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
+   struct intel_renderbuffer *stencil_irb = intel_get_renderbuffer(fb, BUFFER_STENCIL);
+   struct intel_mipmap_tree *stencil_mt = NULL;
+   struct intel_region *hiz_region = NULL;
     unsigned int len;
+   bool separate_stencil = false;
+
+   if (depth_irb &&
+       depth_irb->mt &&
+       depth_irb->mt->hiz_mt) {
+      hiz_region = depth_irb->mt->hiz_mt->region;
+   }
+
+   /* 3DSTATE_DEPTH_BUFFER, 3DSTATE_STENCIL_BUFFER are both
+    * non-pipelined state that will need the PIPE_CONTROL workaround.
+    */
+   if (intel->gen == 6) {
+      intel_emit_post_sync_nonzero_flush(intel);
+      intel_emit_depth_stall_flushes(intel);
+   }
+
+   /* Find the real separate stencil mt if present. */
+   if (stencil_irb) {
+      stencil_mt = stencil_irb->mt;
+      if (stencil_mt->stencil_mt)
+        stencil_mt = stencil_mt->stencil_mt;
+
+      if (stencil_mt->format == MESA_FORMAT_S8)
+        separate_stencil = true;
+   }
+
+   /* If there's a packed depth/stencil bound to stencil only, we need to
+    * emit the packed depth/stencil buffer packet.
+    */
+   if (!depth_irb && stencil_irb && !separate_stencil)
+      depth_irb = stencil_irb;
  
     if (intel->gen >= 6)
        len = 7;
@@ -246,9 +301,9 @@ static void emit_depthbuffer(struct brw_context *brw)
     else
        len = 5;
  
-   if (region == NULL) {
+   if (!depth_irb && !separate_stencil) {
        BEGIN_BATCH(len);
-      OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
+      OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (len - 2));
        OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) |
                 (BRW_SURFACE_NULL << 29));
        OUT_BATCH(0);
@@ -262,45 +317,86 @@ static void emit_depthbuffer(struct brw_context *brw)
          OUT_BATCH(0);
  
        ADVANCE_BATCH();
-   } else {
-      unsigned int format;
-
-      switch (region->cpp) {
-      case 2:
-        format = BRW_DEPTHFORMAT_D16_UNORM;
-        break;
-      case 4:
-        if (intel->depth_buffer_is_float)
-           format = BRW_DEPTHFORMAT_D32_FLOAT;
-        else
-           format = BRW_DEPTHFORMAT_D24_UNORM_S8_UINT;
-        break;
-      default:
-        assert(0);
-        return;
-      }
  
-      assert(region->tiling != I915_TILING_X);
+   } else if (!depth_irb && separate_stencil) {
+      /*
+       * There exists a separate stencil buffer but no depth buffer.
+       *
+       * The stencil buffer inherits most of its fields from
+       * 3DSTATE_DEPTH_BUFFER: namely the tile walk, surface type, width, and
+       * height.
+       *
+       * Since the stencil buffer has quirky pitch requirements, its region
+       * was allocated with half height and double cpp. So we need
+       * a multiplier of 2 to obtain the surface's real height.
+       *
+       * Enable the hiz bit because it and the separate stencil bit must have
+       * the same value. From Section 2.11.5.6.1.1 3DSTATE_DEPTH_BUFFER, Bit
+       * 1.21 "Separate Stencil Enable":
+       *     [DevIL]: If this field is enabled, Hierarchical Depth Buffer
+       *     Enable must also be enabled.
+       *
+       *     [DevGT]: This field must be set to the same value (enabled or
+       *     disabled) as Hierarchical Depth Buffer Enable
+       *
+       * The tiled bit must be set. From the Sandybridge PRM, Volume 2, Part 1,
+       * Section 7.5.5.1.1 3DSTATE_DEPTH_BUFFER, Bit 1.27 Tiled Surface:
+       *     [DevGT+]: This field must be set to TRUE.
+       */
+      assert(intel->has_separate_stencil);
+
+      BEGIN_BATCH(len);
+      OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (len - 2));
+      OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) |
+               (1 << 21) | /* separate stencil enable */
+               (1 << 22) | /* hiz enable */
+               (BRW_TILEWALK_YMAJOR << 26) |
+               (1 << 27) | /* tiled surface */
+               (BRW_SURFACE_2D << 29));
+      OUT_BATCH(0);
+      OUT_BATCH(((stencil_irb->Base.Base.Width - 1) << 6) |
+                (stencil_irb->Base.Base.Height - 1) << 19);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+
        if (intel->gen >= 6)
-        assert(region->tiling != I915_TILING_NONE);
+        OUT_BATCH(0);
+
+      ADVANCE_BATCH();
+
+   } else {
+      struct intel_region *region = depth_irb->mt->region;
+      uint32_t tile_x, tile_y, offset;
+
+      /* If using separate stencil, hiz must be enabled. */
+      assert(!separate_stencil || hiz_region);
+
+      offset = intel_renderbuffer_tile_offsets(depth_irb, &tile_x, &tile_y);
+
+      assert(intel->gen < 6 || region->tiling == I915_TILING_Y);
+      assert(!hiz_region || region->tiling == I915_TILING_Y);
  
        BEGIN_BATCH(len);
-      OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
+      OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (len - 2));
        OUT_BATCH(((region->pitch * region->cpp) - 1) |
-               (format << 18) |
+               (brw_depthbuffer_format(brw) << 18) |
+               ((hiz_region ? 1 : 0) << 21) | /* separate stencil enable */
+               ((hiz_region ? 1 : 0) << 22) | /* hiz enable */
                 (BRW_TILEWALK_YMAJOR << 26) |
                 ((region->tiling != I915_TILING_NONE) << 27) |
                 (BRW_SURFACE_2D << 29));
-      OUT_RELOC(region->buffer,
+      OUT_RELOC(region->bo,
                 I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-               0);
+               offset);
        OUT_BATCH((BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1) |
-               ((region->width - 1) << 6) |
-               ((region->height - 1) << 19));
+               (((depth_irb->Base.Base.Width + tile_x) - 1) << 6) |
+               (((depth_irb->Base.Base.Height + tile_y) - 1) << 19));
        OUT_BATCH(0);
  
        if (intel->is_g4x || intel->gen >= 5)
-         OUT_BATCH(0);
+         OUT_BATCH(tile_x | (tile_y << 16));
+      else
+        assert(tile_x == 0 && tile_y == 0);
  
        if (intel->gen >= 6)
          OUT_BATCH(0);
@@ -308,10 +404,65 @@ static void emit_depthbuffer(struct brw_context *brw)
        ADVANCE_BATCH();
     }
  
-   /* Initialize it for safety. */
-   if (intel->gen >= 6) {
+   if (hiz_region || separate_stencil) {
+      /*
+       * In the 3DSTATE_DEPTH_BUFFER batch emitted above, the 'separate
+       * stencil enable' and 'hiz enable' bits were set. Therefore we must
+       * emit 3DSTATE_HIER_DEPTH_BUFFER and 3DSTATE_STENCIL_BUFFER. Even if
+       * there is no stencil buffer, 3DSTATE_STENCIL_BUFFER must be emitted;
+       * failure to do so causes hangs on gen5 and a stall on gen6.
+       */
+
+      /* Emit hiz buffer. */
+      if (hiz_region) {
+        BEGIN_BATCH(3);
+        OUT_BATCH((_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2));
+        OUT_BATCH(hiz_region->pitch * hiz_region->cpp - 1);
+        OUT_RELOC(hiz_region->bo,
+                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+                  0);
+        ADVANCE_BATCH();
+      } else {
+        BEGIN_BATCH(3);
+        OUT_BATCH((_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2));
+        OUT_BATCH(0);
+        OUT_BATCH(0);
+        ADVANCE_BATCH();
+      }
+
+      /* Emit stencil buffer. */
+      if (separate_stencil) {
+        struct intel_region *region = stencil_mt->region;
+        BEGIN_BATCH(3);
+        OUT_BATCH((_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2));
+        OUT_BATCH(region->pitch * region->cpp - 1);
+        OUT_RELOC(region->bo,
+                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+                  0);
+        ADVANCE_BATCH();
+      } else {
+        BEGIN_BATCH(3);
+        OUT_BATCH((_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2));
+        OUT_BATCH(0);
+        OUT_BATCH(0);
+        ADVANCE_BATCH();
+      }
+   }
+
+   /*
+    * On Gen >= 6, emit clear params for safety. If using hiz, then clear
+    * params must be emitted.
+    *
+    * From Section 2.11.5.6.4.1 3DSTATE_CLEAR_PARAMS:
+    *     3DSTATE_CLEAR_PARAMS packet must follow the DEPTH_BUFFER_STATE packet
+    *     when HiZ is enabled and the DEPTH_BUFFER_STATE changes.
+    */
+   if (intel->gen >= 6 || hiz_region) {
+      if (intel->gen == 6)
+        intel_emit_post_sync_nonzero_flush(intel);
+
        BEGIN_BATCH(2);
-      OUT_BATCH(CMD_3D_CLEAR_PARAMS << 16 | (2 - 2));
+      OUT_BATCH(_3DSTATE_CLEAR_PARAMS << 16 | (2 - 2));
        OUT_BATCH(0);
        ADVANCE_BATCH();
     }
@@ -319,11 +470,10 @@ static void emit_depthbuffer(struct brw_context *brw)
  
  const struct brw_tracked_state brw_depthbuffer = {
     .dirty = {
-      .mesa = 0,
-      .brw = BRW_NEW_DEPTH_BUFFER | BRW_NEW_BATCH,
+      .mesa = _NEW_BUFFERS,
+      .brw = BRW_NEW_BATCH,
        .cache = 0,
     },
-   .prepare = prepare_depthbuffer,
     .emit = emit_depthbuffer,
  };
  
@@ -335,13 +485,19 @@ const struct brw_tracked_state brw_depthbuffer = {
  
  static void upload_polygon_stipple(struct brw_context *brw)
  {
+   struct intel_context *intel = &brw->intel;
     struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_polygon_stipple bps;
     GLuint i;
  
-   memset(&bps, 0, sizeof(bps));
-   bps.header.opcode = CMD_POLY_STIPPLE_PATTERN;
-   bps.header.length = sizeof(bps)/4-2;
+   /* _NEW_POLYGON */
+   if (!ctx->Polygon.StippleFlag)
+      return;
+
+   if (intel->gen == 6)
+      intel_emit_post_sync_nonzero_flush(intel);
+
+   BEGIN_BATCH(33);
+   OUT_BATCH(_3DSTATE_POLY_STIPPLE_PATTERN << 16 | (33 - 2));
  
     /* Polygon stipple is provided in OpenGL order, i.e. bottom
      * row first.  If we're rendering to a window (i.e. the
@@ -352,19 +508,19 @@ static void upload_polygon_stipple(struct brw_context *brw)
      */
     if (ctx->DrawBuffer->Name == 0) {
        for (i = 0; i < 32; i++)
-         bps.stipple[i] = ctx->PolygonStipple[31 - i]; /* invert */
+         OUT_BATCH(ctx->PolygonStipple[31 - i]); /* invert */
     }
     else {
        for (i = 0; i < 32; i++)
-         bps.stipple[i] = ctx->PolygonStipple[i]; /* don't invert */
+        OUT_BATCH(ctx->PolygonStipple[i]);
     }
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bps);
+   CACHED_BATCH();
  }
  
  const struct brw_tracked_state brw_polygon_stipple = {
     .dirty = {
-      .mesa = _NEW_POLYGONSTIPPLE,
+      .mesa = (_NEW_POLYGONSTIPPLE |
+              _NEW_POLYGON),
        .brw = BRW_NEW_CONTEXT,
        .cache = 0
     },
@@ -378,14 +534,22 @@ const struct brw_tracked_state brw_polygon_stipple = {
  
  static void upload_polygon_stipple_offset(struct brw_context *brw)
  {
+   struct intel_context *intel = &brw->intel;
     struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_polygon_stipple_offset bpso;
  
-   memset(&bpso, 0, sizeof(bpso));
-   bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET;
-   bpso.header.length = sizeof(bpso)/4-2;
+   /* _NEW_POLYGON */
+   if (!ctx->Polygon.StippleFlag)
+      return;
+
+   if (intel->gen == 6)
+      intel_emit_post_sync_nonzero_flush(intel);
  
-   /* If we're drawing to a system window (ctx->DrawBuffer->Name == 0),
+   BEGIN_BATCH(2);
+   OUT_BATCH(_3DSTATE_POLY_STIPPLE_OFFSET << 16 | (2-2));
+
+   /* _NEW_BUFFERS
+    *
+    * If we're drawing to a system window (ctx->DrawBuffer->Name == 0),
      * we have to invert the Y axis in order to match the OpenGL
      * pixel coordinate system, and our offset must be matched
      * to the window position.  If we're drawing to a FBO
@@ -393,23 +557,17 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
      * system works just fine, and there's no window system to
      * worry about.
      */
-   if (brw->intel.ctx.DrawBuffer->Name == 0) {
-      bpso.bits0.x_offset = 0;
-      bpso.bits0.y_offset = (32 - (ctx->DrawBuffer->Height & 31)) & 31;
-   }
-   else {
-      bpso.bits0.y_offset = 0;
-      bpso.bits0.x_offset = 0;
-   }
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bpso);
+   if (brw->intel.ctx.DrawBuffer->Name == 0)
+      OUT_BATCH((32 - (ctx->DrawBuffer->Height & 31)) & 31);
+   else
+      OUT_BATCH(0);
+   CACHED_BATCH();
  }
  
-#define _NEW_WINDOW_POS 0x40000000
-
  const struct brw_tracked_state brw_polygon_stipple_offset = {
     .dirty = {
-      .mesa = _NEW_WINDOW_POS,
+      .mesa = (_NEW_BUFFERS |
+              _NEW_POLYGON),
        .brw = BRW_NEW_CONTEXT,
        .cache = 0
     },
@@ -421,22 +579,26 @@ const struct brw_tracked_state brw_polygon_stipple_offset = {
   */
  static void upload_aa_line_parameters(struct brw_context *brw)
  {
-   struct brw_aa_line_parameters balp;
+   struct intel_context *intel = &brw->intel;
+   struct gl_context *ctx = &brw->intel.ctx;
  
-   if (!brw->has_aa_line_parameters)
+   if (!ctx->Line.SmoothFlag || !brw->has_aa_line_parameters)
        return;
  
+   if (intel->gen == 6)
+      intel_emit_post_sync_nonzero_flush(intel);
+
+   BEGIN_BATCH(3);
+   OUT_BATCH(_3DSTATE_AA_LINE_PARAMETERS << 16 | (3 - 2));
     /* use legacy aa line coverage computation */
-   memset(&balp, 0, sizeof(balp));
-   balp.header.opcode = CMD_AA_LINE_PARAMETERS;
-   balp.header.length = sizeof(balp) / 4 - 2;
-   
-   BRW_CACHED_BATCH_STRUCT(brw, &balp);
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   CACHED_BATCH();
  }
  
  const struct brw_tracked_state brw_aa_line_parameters = {
     .dirty = {
-      .mesa = 0,
+      .mesa = _NEW_LINE,
        .brw = BRW_NEW_CONTEXT,
        .cache = 0
     },
@@ -449,25 +610,24 @@ const struct brw_tracked_state brw_aa_line_parameters = {
  
  static void upload_line_stipple(struct brw_context *brw)
  {
+   struct intel_context *intel = &brw->intel;
     struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_line_stipple bls;
     GLfloat tmp;
     GLint tmpi;
  
-   memset(&bls, 0, sizeof(bls));
-   bls.header.opcode = CMD_LINE_STIPPLE_PATTERN;
-   bls.header.length = sizeof(bls)/4 - 2;
+   if (!ctx->Line.StippleFlag)
+      return;
  
-   bls.bits0.pattern = ctx->Line.StipplePattern;
-   bls.bits1.repeat_count = ctx->Line.StippleFactor;
+   if (intel->gen == 6)
+      intel_emit_post_sync_nonzero_flush(intel);
  
+   BEGIN_BATCH(3);
+   OUT_BATCH(_3DSTATE_LINE_STIPPLE_PATTERN << 16 | (3 - 2));
+   OUT_BATCH(ctx->Line.StipplePattern);
     tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor;
     tmpi = tmp * (1<<13);
-
-
-   bls.bits1.inverse_repeat_count = tmpi;
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bls);
+   OUT_BATCH(tmpi << 16 | ctx->Line.StippleFactor);
+   CACHED_BATCH();
  }
  
  const struct brw_tracked_state brw_line_stipple = {
@@ -481,94 +641,78 @@ const struct brw_tracked_state brw_line_stipple = {
  
  
  /***********************************************************************
- * Misc invarient state packets
+ * Misc invariant state packets
   */
  
-static void upload_invarient_state( struct brw_context *brw )
+static void upload_invariant_state( struct brw_context *brw )
  {
     struct intel_context *intel = &brw->intel;
  
-   {
-      /* 0x61040000  Pipeline Select */
-      /*     PipelineSelect            : 0 */
-      struct brw_pipeline_select ps;
+   /* 3DSTATE_SIP, 3DSTATE_MULTISAMPLE, etc. are nonpipelined. */
+   if (intel->gen == 6)
+      intel_emit_post_sync_nonzero_flush(intel);
  
-      memset(&ps, 0, sizeof(ps));
-      ps.header.opcode = brw->CMD_PIPELINE_SELECT;
-      ps.header.pipeline_select = 0;
-      BRW_BATCH_STRUCT(brw, &ps);
-   }
+   /* Select the 3D pipeline (as opposed to media) */
+   BEGIN_BATCH(1);
+   OUT_BATCH(brw->CMD_PIPELINE_SELECT << 16 | 0);
+   ADVANCE_BATCH();
  
     if (intel->gen < 6) {
-      struct brw_global_depth_offset_clamp gdo;
-      memset(&gdo, 0, sizeof(gdo));
-
-      /* Disable depth offset clamping. 
-       */
-      gdo.header.opcode = CMD_GLOBAL_DEPTH_OFFSET_CLAMP;
-      gdo.header.length = sizeof(gdo)/4 - 2;
-      gdo.depth_offset_clamp = 0.0;
-
-      BRW_BATCH_STRUCT(brw, &gdo);
+      /* Disable depth offset clamping. */
+      BEGIN_BATCH(2);
+      OUT_BATCH(_3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP << 16 | (2 - 2));
+      OUT_BATCH_F(0.0);
+      ADVANCE_BATCH();
     }
  
     if (intel->gen >= 6) {
        int i;
+      int len = intel->gen >= 7 ? 4 : 3;
  
-      BEGIN_BATCH(3);
-      OUT_BATCH(CMD_3D_MULTISAMPLE << 16 | (3 - 2));
+      BEGIN_BATCH(len);
+      OUT_BATCH(_3DSTATE_MULTISAMPLE << 16 | (len - 2));
        OUT_BATCH(MS_PIXEL_LOCATION_CENTER |
                 MS_NUMSAMPLES_1);
        OUT_BATCH(0); /* positions for 4/8-sample */
+      if (intel->gen >= 7)
+        OUT_BATCH(0);
        ADVANCE_BATCH();
  
        BEGIN_BATCH(2);
-      OUT_BATCH(CMD_3D_SAMPLE_MASK << 16 | (2 - 2));
+      OUT_BATCH(_3DSTATE_SAMPLE_MASK << 16 | (2 - 2));
        OUT_BATCH(1);
        ADVANCE_BATCH();
  
-      for (i = 0; i < 4; i++) {
-        BEGIN_BATCH(4);
-        OUT_BATCH(CMD_GS_SVB_INDEX << 16 | (4 - 2));
-        OUT_BATCH(i << SVB_INDEX_SHIFT);
-        OUT_BATCH(0);
-        OUT_BATCH(0xffffffff);
-        ADVANCE_BATCH();
+      if (intel->gen < 7) {
+        for (i = 0; i < 4; i++) {
+           BEGIN_BATCH(4);
+           OUT_BATCH(_3DSTATE_GS_SVB_INDEX << 16 | (4 - 2));
+           OUT_BATCH(i << SVB_INDEX_SHIFT);
+           OUT_BATCH(0);
+           OUT_BATCH(0xffffffff);
+           ADVANCE_BATCH();
+        }
        }
     }
  
-   /* 0x61020000  State Instruction Pointer */
-   {
-      struct brw_system_instruction_pointer sip;
-      memset(&sip, 0, sizeof(sip));
-
-      sip.header.opcode = CMD_STATE_INSN_POINTER;
-      sip.header.length = 0;
-      sip.bits0.pad = 0;
-      sip.bits0.system_instruction_pointer = 0;
-      BRW_BATCH_STRUCT(brw, &sip);
-   }
-
-
-   {
-      struct brw_vf_statistics vfs;
-      memset(&vfs, 0, sizeof(vfs));
-
-      vfs.opcode = brw->CMD_VF_STATISTICS;
-      if (INTEL_DEBUG & DEBUG_STATS)
-        vfs.statistics_enable = 1; 
+   BEGIN_BATCH(2);
+   OUT_BATCH(CMD_STATE_SIP << 16 | (2 - 2));
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
  
-      BRW_BATCH_STRUCT(brw, &vfs);
-   }
+   BEGIN_BATCH(1);
+   OUT_BATCH(brw->CMD_VF_STATISTICS << 16 |
+            (unlikely(INTEL_DEBUG & DEBUG_STATS) ? 1 : 0));
+   ADVANCE_BATCH();
  }
  
-const struct brw_tracked_state brw_invarient_state = {
+const struct brw_tracked_state brw_invariant_state = {
     .dirty = {
        .mesa = 0,
        .brw = BRW_NEW_CONTEXT,
        .cache = 0
     },
-   .emit = upload_invarient_state
+   .emit = upload_invariant_state
  };
  
  /**
@@ -585,17 +729,53 @@ static void upload_state_base_address( struct brw_context *brw )
  {
     struct intel_context *intel = &brw->intel;
  
+   /* FINISHME: According to section 3.6.1 "STATE_BASE_ADDRESS" of
+    * vol1a of the G45 PRM, MI_FLUSH with the ISC invalidate should be
+    * programmed prior to STATE_BASE_ADDRESS.
+    *
+    * However, given that the instruction SBA (general state base
+    * address) on this chipset is always set to 0 across X and GL,
+    * maybe this isn't required for us in particular.
+    */
+
     if (intel->gen >= 6) {
+      if (intel->gen == 6)
+        intel_emit_post_sync_nonzero_flush(intel);
+
         BEGIN_BATCH(10);
         OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (10 - 2));
-       OUT_BATCH(1); /* General state base address */
-       OUT_RELOC(intel->batch->buf, I915_GEM_DOMAIN_SAMPLER, 0,
-                1); /* Surface state base address */
-       OUT_BATCH(1); /* Dynamic state base address */
-       OUT_BATCH(1); /* Indirect object base address */
-       OUT_BATCH(1); /* Instruction base address */
+       /* General state base address: stateless DP read/write requests */
+       OUT_BATCH(1);
+       /* Surface state base address:
+       * BINDING_TABLE_STATE
+       * SURFACE_STATE
+       */
+       OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, 1);
+        /* Dynamic state base address:
+        * SAMPLER_STATE
+        * SAMPLER_BORDER_COLOR_STATE
+        * CLIP, SF, WM/CC viewport state
+        * COLOR_CALC_STATE
+        * DEPTH_STENCIL_STATE
+        * BLEND_STATE
+        * Push constants (when INSTPM: CONSTANT_BUFFER Address Offset
+        * Disable is clear, which we rely on)
+        */
+       OUT_RELOC(intel->batch.bo, (I915_GEM_DOMAIN_RENDER |
+                                  I915_GEM_DOMAIN_INSTRUCTION), 0, 1);
+
+       OUT_BATCH(1); /* Indirect object base address: MEDIA_OBJECT data */
+       OUT_RELOC(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+                1); /* Instruction base address: shader kernels (incl. SIP) */
+
         OUT_BATCH(1); /* General state upper bound */
-       OUT_BATCH(1); /* Dynamic state upper bound */
+       /* Dynamic state upper bound.  Although the documentation says that
+       * programming it to zero will cause it to be ignored, that is a lie.
+       * If this isn't programmed to a real bound, the sampler border color
+       * pointer is rejected, causing border color to mysteriously fail.
+       */
+       OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+                intel->batch.bo->size | 1);
         OUT_BATCH(1); /* Indirect object upper bound */
         OUT_BATCH(1); /* Instruction access upper bound */
         ADVANCE_BATCH();
@@ -603,11 +783,12 @@ static void upload_state_base_address( struct brw_context *brw )
         BEGIN_BATCH(8);
         OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2));
         OUT_BATCH(1); /* General state base address */
-       OUT_RELOC(intel->batch->buf, I915_GEM_DOMAIN_SAMPLER, 0,
+       OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
                  1); /* Surface state base address */
         OUT_BATCH(1); /* Indirect object base address */
-       OUT_BATCH(1); /* Instruction base address */
-       OUT_BATCH(1); /* General state upper bound */
+       OUT_RELOC(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+                1); /* Instruction base address */
+       OUT_BATCH(0xfffff001); /* General state upper bound */
         OUT_BATCH(1); /* Indirect object upper bound */
         OUT_BATCH(1); /* Instruction access upper bound */
         ADVANCE_BATCH();
@@ -615,19 +796,44 @@ static void upload_state_base_address( struct brw_context *brw )
         BEGIN_BATCH(6);
         OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
         OUT_BATCH(1); /* General state base address */
-       OUT_RELOC(intel->batch->buf, I915_GEM_DOMAIN_SAMPLER, 0,
+       OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
                  1); /* Surface state base address */
         OUT_BATCH(1); /* Indirect object base address */
         OUT_BATCH(1); /* General state upper bound */
         OUT_BATCH(1); /* Indirect object upper bound */
         ADVANCE_BATCH();
     }
+
+   /* According to section 3.6.1 of VOL1 of the 965 PRM,
+    * STATE_BASE_ADDRESS updates require a reissue of:
+    *
+    * 3DSTATE_PIPELINE_POINTERS
+    * 3DSTATE_BINDING_TABLE_POINTERS
+    * MEDIA_STATE_POINTERS
+    *
+    * and this continues through Ironlake.  The Sandy Bridge PRM, vol
+    * 1 part 1 says that the following packets must be reissued:
+    *
+    * 3DSTATE_CC_POINTERS
+    * 3DSTATE_BINDING_TABLE_POINTERS
+    * 3DSTATE_SAMPLER_STATE_POINTERS
+    * 3DSTATE_VIEWPORT_STATE_POINTERS
+    * MEDIA_STATE_POINTERS
+    *
+    * Those are always reissued following SBA updates anyway (new
+    * batch time), except in the case of the program cache BO
+    * changing.  Having a separate state flag makes the sequence more
+    * obvious.
+    */
+
+   brw->state.dirty.brw |= BRW_NEW_STATE_BASE_ADDRESS;
  }
  
  const struct brw_tracked_state brw_state_base_address = {
     .dirty = {
        .mesa = 0,
-      .brw = BRW_NEW_BATCH,
+      .brw = (BRW_NEW_BATCH |
+             BRW_NEW_PROGRAM_CACHE),
        .cache = 0,
     },
     .emit = upload_state_base_address