radeonsi: kill point size VS output if it's not used by the rasterizer
[mesa.git] / src / gallium / drivers / i915 / i915_state_emit.c
index 7bb7893d93995279e4e17165039608c6c401b429..9f0f9e33ca3fc84ea8645369e1ad2e2ac4b6aaab 100644 (file)
@@ -1,6 +1,6 @@
 /**************************************************************************
  * 
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2003 VMware, Inc.
  * All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -18,7 +18,7 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "i915_context.h"
 #include "i915_batch.h"
 #include "i915_debug.h"
-#include "i915_reg.h"
+#include "i915_fpc.h"
 #include "i915_resource.h"
 
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
+#include "pipe/p_format.h"
+
+#include "util/format/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+struct i915_tracked_hw_state { /* One state atom: a name plus a pair of
+    * validate/emit callbacks. NOTE(review): nothing in the visible code
+    * instantiates this struct - confirm it is used elsewhere. */
+   const char *name;
+   void (*validate)(struct i915_context *, unsigned *batch_space); /* same signature as the validate_*() helpers in this file */
+   void (*emit)(struct i915_context *); /* same signature as the emit_*() helpers in this file */
+   unsigned dirty, batch_space;
+};
 
-static unsigned translate_format( enum pipe_format format )
+/* Report the batch space, in dwords, reserved for emit_flush(): one dword
+ * when i915->flush_dirty is non-zero, none otherwise. */
+static void
+validate_flush(struct i915_context *i915, unsigned *batch_space)
 {
-   switch (format) {
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return COLOR_BUF_ARGB8888;
-   case PIPE_FORMAT_B5G6R5_UNORM:
-      return COLOR_BUF_RGB565;
-   default:
-      assert(0);
-      return 0;
-   }
+   *batch_space = i915->flush_dirty ? 1 : 0;
 }
 
-static unsigned translate_depth_format( enum pipe_format zformat )
+static void
+emit_flush(struct i915_context *i915)
 {
-   switch (zformat) {
-   case PIPE_FORMAT_Z24X8_UNORM:
-   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
-      return DEPTH_FRMT_24_FIXED_8_OTHER;
-   case PIPE_FORMAT_Z16_UNORM:
-      return DEPTH_FRMT_16_FIXED;
-   default:
-      assert(0);
-      return 0;
-   }
+   /* Cache handling is very cheap at the moment. State handling can request
+    * two kinds of flush:
+    * - I915_FLUSH_CACHE, which is a flush-everything request, and
+    * - I915_PIPELINE_FLUSH, which is specifically for the draw_offset flush.
+    * Because the cache handling is so dumb, there is no explicit "invalidate
+    * map cache" request. Also, the first is a strict superset of the latter,
+    * so the if / else-if below works: at most one MI_FLUSH dword is written,
+    * and nothing is written when neither bit is set in flush_dirty. */
+   if (i915->flush_dirty & I915_FLUSH_CACHE)
+      OUT_BATCH(MI_FLUSH | FLUSH_MAP_CACHE);
+   else if (i915->flush_dirty & I915_PIPELINE_FLUSH)
+      OUT_BATCH(MI_FLUSH | INHIBIT_FLUSH_RENDER_CACHE);
 }
 
+uint32_t invariant_state[] = { /* Command dwords that emit_invariant() copies
+    * verbatim into the batch; i915_validate_state() reserves
+    * ARRAY_SIZE(invariant_state) dwords for them.
+    * NOTE(review): the table is neither static nor const - confirm whether
+    * any other file needs the symbol before narrowing its linkage. */
+   _3DSTATE_AA_CMD | AA_LINE_ECAAR_WIDTH_ENABLE | AA_LINE_ECAAR_WIDTH_1_0 |
+             AA_LINE_REGION_WIDTH_ENABLE | AA_LINE_REGION_WIDTH_1_0,
 
-/**
- * Examine framebuffer state to determine width, height.
- */
-static boolean
-framebuffer_size(const struct pipe_framebuffer_state *fb,
-                 uint *width, uint *height)
-{
-   if (fb->cbufs[0]) {
-      *width = fb->cbufs[0]->width;
-      *height = fb->cbufs[0]->height;
-      return TRUE;
-   }
-   else if (fb->zsbuf) {
-      *width = fb->zsbuf->width;
-      *height = fb->zsbuf->height;
-      return TRUE;
-   }
-   else {
-      *width = *height = 0;
-      return FALSE;
-   }
-}
+   _3DSTATE_DFLT_DIFFUSE_CMD, 0,
 
+   _3DSTATE_DFLT_SPEC_CMD, 0,
 
-/* Push the state into the sarea and/or texture memory.
- */
-void
-i915_emit_hardware_state(struct i915_context *i915 )
-{
-   /* XXX: there must be an easier way */
-   const unsigned dwords = ( 14 + 
-                             7 + 
-                             I915_MAX_DYNAMIC + 
-                             8 + 
-                             2 + I915_TEX_UNITS*3 + 
-                             2 + I915_TEX_UNITS*3 +
-                             2 + I915_MAX_CONSTANT*4 + 
-#if 0
-                             i915->current.program_len + 
-#else
-                             i915->fs->program_len + 
-#endif
-                             6 
-                           ) * 3/2; /* plus 50% margin */
-   const unsigned relocs = ( I915_TEX_UNITS +
-                             3
-                           ) * 3/2; /* plus 50% margin */
+   _3DSTATE_DFLT_Z_CMD, 0,
 
-   uintptr_t save_ptr;
-   size_t save_relocs;
+   _3DSTATE_COORD_SET_BINDINGS |
+             CSB_TCB(0, 0) |
+             CSB_TCB(1, 1) |
+             CSB_TCB(2, 2) |
+             CSB_TCB(3, 3) |
+             CSB_TCB(4, 4) |
+             CSB_TCB(5, 5) |
+             CSB_TCB(6, 6) |
+             CSB_TCB(7, 7),
 
-   if (I915_DBG_ON(DBG_ATOMS))
-      i915_dump_hardware_dirty(i915, __FUNCTION__);
+   _3DSTATE_RASTER_RULES_CMD |
+             ENABLE_POINT_RASTER_RULE |
+             OGL_POINT_RASTER_RULE |
+             ENABLE_LINE_STRIP_PROVOKE_VRTX |
+             ENABLE_TRI_FAN_PROVOKE_VRTX |
+             LINE_STRIP_PROVOKE_VRTX(1) |
+             TRI_FAN_PROVOKE_VRTX(2) |
+             ENABLE_TEXKILL_3D_4D |
+             TEXKILL_4D,
 
-   if(!BEGIN_BATCH(dwords, relocs)) {
-      FLUSH_BATCH(NULL);
-      assert(BEGIN_BATCH(dwords, relocs));
-   }
+   _3DSTATE_DEPTH_SUBRECT_DISABLE,
 
-   save_ptr = (uintptr_t)i915->batch->ptr;
-   save_relocs = i915->batch->relocs;
+   /* disable indirect state for now
+    */
+   _3DSTATE_LOAD_INDIRECT | 0, 0};
 
-   /* 14 dwords, 0 relocs */
-   if (i915->hardware_dirty & I915_HW_INVARIENT)
-   {
-      OUT_BATCH(_3DSTATE_AA_CMD |
-                AA_LINE_ECAAR_WIDTH_ENABLE |
-                AA_LINE_ECAAR_WIDTH_1_0 |
-                AA_LINE_REGION_WIDTH_ENABLE | AA_LINE_REGION_WIDTH_1_0);
+static void
+emit_invariant(struct i915_context *i915)
+{  /* Copy the whole invariant_state[] table into the batch with a single
+    * write; the length argument is the table size in bytes. */
+   i915_winsys_batchbuffer_write(i915->batch, invariant_state,
+                                 ARRAY_SIZE(invariant_state)*sizeof(uint32_t));
+}
 
-      OUT_BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
-      OUT_BATCH(0);
+/* Compute the batch space, in dwords, that emit_immediate() will use and
+ * queue the vertex buffer for validation.
+ *
+ * The space is one command dword plus one dword per dirty S0..S6 slot.
+ * When S0 is dirty and a vertex buffer is bound, i915->vbo is appended to
+ * i915->validation_buffers because emit_immediate() relocates against it. */
+static void
+validate_immediate(struct i915_context *i915, unsigned *batch_space)
+{
+   /* Restrict the dirty bits to S0..S6; every slot is listed exactly once. */
+   unsigned dirty = (1 << I915_IMMEDIATE_S0 | 1 << I915_IMMEDIATE_S1 |
+                     1 << I915_IMMEDIATE_S2 | 1 << I915_IMMEDIATE_S3 |
+                     1 << I915_IMMEDIATE_S4 | 1 << I915_IMMEDIATE_S5 |
+                     1 << I915_IMMEDIATE_S6) &
+                    i915->immediate_dirty;
 
-      OUT_BATCH(_3DSTATE_DFLT_SPEC_CMD);
-      OUT_BATCH(0);
-      
-      OUT_BATCH(_3DSTATE_DFLT_Z_CMD);
-      OUT_BATCH(0);
+   if (i915->immediate_dirty & (1 << I915_IMMEDIATE_S0) && i915->vbo)
+      i915->validation_buffers[i915->num_validation_buffers++] = i915->vbo;
 
-      OUT_BATCH(_3DSTATE_COORD_SET_BINDINGS |
-                CSB_TCB(0, 0) |
-                CSB_TCB(1, 1) |
-                CSB_TCB(2, 2) |
-                CSB_TCB(3, 3) |
-                CSB_TCB(4, 4) | 
-                CSB_TCB(5, 5) | 
-                CSB_TCB(6, 6) | 
-                CSB_TCB(7, 7));
-
-      OUT_BATCH(_3DSTATE_RASTER_RULES_CMD |
-                ENABLE_POINT_RASTER_RULE |
-                OGL_POINT_RASTER_RULE |
-                ENABLE_LINE_STRIP_PROVOKE_VRTX |
-                ENABLE_TRI_FAN_PROVOKE_VRTX |
-                LINE_STRIP_PROVOKE_VRTX(1) |
-                TRI_FAN_PROVOKE_VRTX(2) | 
-                ENABLE_TEXKILL_3D_4D | 
-                TEXKILL_4D);
-
-      /* Need to initialize this to zero.
-       */
-      OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | (0));
-      OUT_BATCH(0);
+   /* one LOAD_STATE_IMMEDIATE_1 command dword + one dword per dirty slot */
+   *batch_space = 1 + util_bitcount(dirty);
+}
+/* Map one logical colour channel onto the S5 write-disable bit(s) that cover
+ * it for the render target p.
+ *
+ * component indexes hw_mask[]; emit_immediate_s5() passes 0, 1, 2 and 3 for
+ * its red, green, blue and alpha tests respectively. Formats listed in
+ * fixup_mask[] return their remapped bits; a NULL surface or an unlisted
+ * format falls through to the last table entry, which maps every channel to
+ * its own write-disable bit. */
+static uint target_fixup(struct pipe_surface *p, int component)
+{
+   const struct
+   {
+      enum pipe_format format;
+      uint hw_mask[4];
+   } fixup_mask[] = {
+      { PIPE_FORMAT_R8G8B8A8_UNORM, { S5_WRITEDISABLE_BLUE, S5_WRITEDISABLE_GREEN, S5_WRITEDISABLE_RED, S5_WRITEDISABLE_ALPHA}},
+      { PIPE_FORMAT_R8G8B8X8_UNORM, { S5_WRITEDISABLE_BLUE, S5_WRITEDISABLE_GREEN, S5_WRITEDISABLE_RED, S5_WRITEDISABLE_ALPHA}},
+      { PIPE_FORMAT_L8_UNORM,       { S5_WRITEDISABLE_RED | S5_WRITEDISABLE_GREEN | S5_WRITEDISABLE_BLUE, 0, 0, S5_WRITEDISABLE_ALPHA}},
+      { PIPE_FORMAT_I8_UNORM,       { S5_WRITEDISABLE_RED | S5_WRITEDISABLE_GREEN | S5_WRITEDISABLE_BLUE, 0, 0, S5_WRITEDISABLE_ALPHA}},
+      { PIPE_FORMAT_A8_UNORM,       { 0, 0, 0, S5_WRITEDISABLE_RED | S5_WRITEDISABLE_GREEN | S5_WRITEDISABLE_BLUE | S5_WRITEDISABLE_ALPHA}},
+      { 0,                          { S5_WRITEDISABLE_RED, S5_WRITEDISABLE_GREEN, S5_WRITEDISABLE_BLUE, S5_WRITEDISABLE_ALPHA}}
+   };
+   int i = sizeof(fixup_mask) / sizeof(*fixup_mask) - 1; /* index of the default (last) entry, used as-is when p is NULL */
+
+   if (p)
+      for(i = 0; fixup_mask[i].format != 0; i++) /* stops on the format == 0 sentinel, leaving i on the default entry */
+         if (p->format == fixup_mask[i].format)
+            return fixup_mask[i].hw_mask[component];
+
+   /* Just return default masks */
+   return fixup_mask[i].hw_mask[component];
+}
 
-      OUT_BATCH(_3DSTATE_DEPTH_SUBRECT_DISABLE);
+static void emit_immediate_s5(struct i915_context *i915, uint imm)
+{
+   /* Fix up the write mask for non-BGRA render targets and emit S5.
+    *
+    * The four write-disable bits are first cleared from imm; then, for every
+    * channel that imm had disabled, the bits target_fixup() returns for
+    * colour buffer 0 are OR'ed back in. Exactly one dword is written. */
+   uint fixup_imm = imm & ~( S5_WRITEDISABLE_RED | S5_WRITEDISABLE_GREEN |
+                             S5_WRITEDISABLE_BLUE | S5_WRITEDISABLE_ALPHA );
+   struct pipe_surface *surf = i915->framebuffer.cbufs[0]; /* may be NULL; target_fixup() then uses its default masks */
+
+   if (imm & S5_WRITEDISABLE_RED)
+      fixup_imm |= target_fixup(surf, 0);
+   if (imm & S5_WRITEDISABLE_GREEN)
+      fixup_imm |= target_fixup(surf, 1);
+   if (imm & S5_WRITEDISABLE_BLUE)
+      fixup_imm |= target_fixup(surf, 2);
+   if (imm & S5_WRITEDISABLE_ALPHA)
+      fixup_imm |= target_fixup(surf, 3);
+
+   OUT_BATCH(fixup_imm);
+}
 
-      /* disable indirect state for now
-       */
-      OUT_BATCH(_3DSTATE_LOAD_INDIRECT | 0);
-      OUT_BATCH(0);
+static void emit_immediate_s6(struct i915_context *i915, uint imm)
+{
+   /* Fix up the blend function for A8 destination buffers and emit S6.
+    * When we blend to an A8 buffer, the GPU thinks it's a G8 buffer,
+    * and therefore we need to use the color factor for alphas.
+    * Only the source blend factor field of imm is rewritten, and only when
+    * current.target_fixup_format is PIPE_FORMAT_A8_UNORM; in every case
+    * exactly one dword is written. */
+   uint srcRGB;
+
+   if (i915->current.target_fixup_format == PIPE_FORMAT_A8_UNORM) {
+      srcRGB = (imm >> S6_CBUF_SRC_BLEND_FACT_SHIFT) & BLENDFACT_MASK;
+      if (srcRGB == BLENDFACT_DST_ALPHA)
+         srcRGB = BLENDFACT_DST_COLR;
+      else if (srcRGB == BLENDFACT_INV_DST_ALPHA)
+         srcRGB = BLENDFACT_INV_DST_COLR;
+      imm &= ~SRC_BLND_FACT(BLENDFACT_MASK); /* clear the old source factor ... */
+      imm |= SRC_BLND_FACT(srcRGB); /* ... and store the (possibly remapped) one */
    }
 
-   /* 7 dwords, 1 relocs */
-   if (i915->hardware_dirty & I915_HW_IMMEDIATE)
-   {
-      OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | 
-                I1_LOAD_S(0) |
-                I1_LOAD_S(1) |
-                I1_LOAD_S(2) |
-                I1_LOAD_S(4) |
-                I1_LOAD_S(5) |
-                I1_LOAD_S(6) | 
-                (5));
-      
-      if(i915->vbo)
-         OUT_RELOC(i915->vbo,
-                   I915_USAGE_VERTEX,
+   OUT_BATCH(imm);
+}
+
+static void
+emit_immediate(struct i915_context *i915)
+{
+   /* remove unwanted bits and S7 */
+   unsigned dirty = (1 << I915_IMMEDIATE_S0 | 1 << I915_IMMEDIATE_S1 |
+                     1 << I915_IMMEDIATE_S2 | 1 << I915_IMMEDIATE_S3 |
+                     1 << I915_IMMEDIATE_S3 | 1 << I915_IMMEDIATE_S4 |
+                     1 << I915_IMMEDIATE_S5 | 1 << I915_IMMEDIATE_S6) &
+                    i915->immediate_dirty;
+   int i, num = util_bitcount(dirty);
+   assert(num && num <= I915_MAX_IMMEDIATE);
+
+   OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
+             dirty << 4 | (num - 1));
+
+   if (i915->immediate_dirty & (1 << I915_IMMEDIATE_S0)) {
+      if (i915->vbo)
+         OUT_RELOC(i915->vbo, I915_USAGE_VERTEX,
                    i915->current.immediate[I915_IMMEDIATE_S0]);
       else
-         /* FIXME: we should not do this */
          OUT_BATCH(0);
-      OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S1]);
-      OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S2]);
-      OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S4]);
-      OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S5]);
-      OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S6]);
-   } 
-
-#if 01
-   /* I915_MAX_DYNAMIC dwords, 0 relocs */
-   if (i915->hardware_dirty & I915_HW_DYNAMIC) 
-   {
-      int i;
-      for (i = 0; i < I915_MAX_DYNAMIC; i++) {
-         OUT_BATCH(i915->current.dynamic[i]);
+   }
+
+   for (i = 1; i < I915_MAX_IMMEDIATE; i++) {
+      if (dirty & (1 << i)) {
+         if (i == I915_IMMEDIATE_S5)
+            emit_immediate_s5(i915, i915->current.immediate[i]);
+         else if (i == I915_IMMEDIATE_S6)
+            emit_immediate_s6(i915, i915->current.immediate[i]);
+         else
+            OUT_BATCH(i915->current.immediate[i]);
       }
    }
-#endif
+}
 
-#if 01
-   /* 8 dwords, 2 relocs */
-   if (i915->hardware_dirty & I915_HW_STATIC)
-   {
-      struct pipe_surface *cbuf_surface = i915->framebuffer.cbufs[0];
-      struct pipe_surface *depth_surface = i915->framebuffer.zsbuf;
+static void
+validate_dynamic(struct i915_context *i915, unsigned *batch_space)
+{  /* One dword per dirty dynamic slot; dirty bits at or above
+    * I915_MAX_DYNAMIC are masked off before counting. */
+   *batch_space = util_bitcount(i915->dynamic_dirty & ((1 << I915_MAX_DYNAMIC) - 1));
+}
 
-      if (cbuf_surface) {
-         unsigned ctile = BUF_3D_USE_FENCE;
-         struct i915_texture *tex = i915_texture(cbuf_surface->texture);
-         assert(tex);
+static void
+emit_dynamic(struct i915_context *i915)
+{  /* Write current.dynamic[i] for every slot whose bit is set in
+    * dynamic_dirty, in ascending slot order. */
+   int i;
+   for (i = 0; i < I915_MAX_DYNAMIC; i++) {
+      if (i915->dynamic_dirty & (1 << i))
+         OUT_BATCH(i915->current.dynamic[i]);
+   }
+}
 
-         if (tex && tex->sw_tiled) {
-            ctile = BUF_3D_TILED_SURFACE;
-         }
+static void
+validate_static(struct i915_context *i915, unsigned *batch_space)
+{  /* Sum the dwords that emit_static() and emit_draw_rect() will write for
+    * the dirty bits in static_dirty, and queue the bound colour / depth
+    * buffers for validation. */
+   *batch_space = 0;
 
-         OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
+   if (i915->current.cbuf_bo && (i915->static_dirty & I915_DST_BUF_COLOR)) {
+      i915->validation_buffers[i915->num_validation_buffers++]
+         = i915->current.cbuf_bo;
+      *batch_space += 3; /* BUF_INFO command, flags dword, relocation */
+   }
 
-         OUT_BATCH(BUF_3D_ID_COLOR_BACK |
-                   BUF_3D_PITCH(tex->stride) |  /* pitch in bytes */
-                   ctile);
+   if (i915->current.depth_bo && (i915->static_dirty & I915_DST_BUF_DEPTH)) {
+      i915->validation_buffers[i915->num_validation_buffers++]
+         = i915->current.depth_bo;
+      *batch_space += 3; /* BUF_INFO command, flags dword, relocation */
+   }
 
-         OUT_RELOC(tex->buffer,
-                   I915_USAGE_RENDER,
-                   cbuf_surface->offset);
-      }
+   if (i915->static_dirty & I915_DST_VARS)
+      *batch_space += 2; /* DST_BUF_VARS command + value */
 
-      /* What happens if no zbuf??
-       */
-      if (depth_surface) {
-         unsigned ztile = BUF_3D_USE_FENCE;
-         struct i915_texture *tex = i915_texture(depth_surface->texture);
-         assert(tex);
+   if (i915->static_dirty & I915_DST_RECT)
+      *batch_space += 5; /* the five dwords written by emit_draw_rect() */
+}
 
-         if (tex && tex->sw_tiled) {
-            ztile = BUF_3D_TILED_SURFACE;
-         }
+static void
+emit_static(struct i915_context *i915)
+{  /* Write the destination-buffer state selected by static_dirty: colour
+    * buffer info, depth buffer info and DST_BUF_VARS. The draw rectangle is
+    * written separately by emit_draw_rect(). */
+   if (i915->current.cbuf_bo && (i915->static_dirty & I915_DST_BUF_COLOR)) {
+      OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
+      OUT_BATCH(i915->current.cbuf_flags);
+      OUT_RELOC(i915->current.cbuf_bo,
+                I915_USAGE_RENDER,
+                0);
+   }
 
-         OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
+   /* What happens if no zbuf??
+    * (As written, nothing is emitted for depth when current.depth_bo is
+    * NULL.)
+    */
+   if (i915->current.depth_bo && (i915->static_dirty & I915_DST_BUF_DEPTH)) {
+      OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
+      OUT_BATCH(i915->current.depth_flags);
+      OUT_RELOC(i915->current.depth_bo,
+                I915_USAGE_RENDER,
+                0);
+   }
 
-         assert(tex);
-         OUT_BATCH(BUF_3D_ID_DEPTH |
-                   BUF_3D_PITCH(tex->stride) |  /* pitch in bytes */
-                   ztile);
+   if (i915->static_dirty & I915_DST_VARS) {
+      OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD);
+      OUT_BATCH(i915->current.dst_buf_vars);
+   }
+}
 
-         OUT_RELOC(tex->buffer,
-                   I915_USAGE_RENDER,
-                   depth_surface->offset);
-      }
+static void
+validate_map(struct i915_context *i915, unsigned *batch_space)
+{  /* Reserve space for emit_map() and queue the buffer of every enabled
+    * texture unit for validation. */
+   const uint enabled = i915->current.sampler_enable_flags;
+   uint unit;
+   struct i915_texture *tex;
 
-      {
-         unsigned cformat, zformat = 0;
+   *batch_space = i915->current.sampler_enable_nr ?
+     2 + 3*i915->current.sampler_enable_nr : 0; /* command + enable mask, then 3 dwords per enabled unit */
 
-         if (cbuf_surface)
-            cformat = cbuf_surface->format;
-         else
-            cformat = PIPE_FORMAT_B8G8R8A8_UNORM; /* arbitrary */
-         cformat = translate_format(cformat);
-
-         if (depth_surface) 
-            zformat = translate_depth_format( i915->framebuffer.zsbuf->format );
-
-         OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD);
-         OUT_BATCH(DSTORG_HORT_BIAS(0x8) | /* .5 */
-                   DSTORG_VERT_BIAS(0x8) | /* .5 */
-                   LOD_PRECLAMP_OGL |
-                   TEX_DEFAULT_COLOR_OGL |
-                   cformat |
-                   zformat );
+   for (unit = 0; unit < I915_TEX_UNITS; unit++) {
+      if (enabled & (1 << unit)) {
+         tex = i915_texture(i915->fragment_sampler_views[unit]->texture); /* NOTE(review): assumes an enabled unit always has a sampler view bound - confirm */
+         i915->validation_buffers[i915->num_validation_buffers++] = tex->buffer;
       }
    }
-#endif
+}
 
-#if 01
-      /* texture images */
-      /* 2 + I915_TEX_UNITS*3 dwords, I915_TEX_UNITS relocs */
-      if (i915->hardware_dirty & (I915_HW_MAP | I915_HW_SAMPLER))
-      {
-         const uint nr = i915->current.sampler_enable_nr;
-         if (nr) {
-            const uint enabled = i915->current.sampler_enable_flags;
-            uint unit;
-            uint count = 0;
-            OUT_BATCH(_3DSTATE_MAP_STATE | (3 * nr));
-            OUT_BATCH(enabled);
-            for (unit = 0; unit < I915_TEX_UNITS; unit++) {
-               if (enabled & (1 << unit)) {
-                  struct i915_texture *texture = i915_texture(i915->fragment_sampler_views[unit]->texture);
-                  struct i915_winsys_buffer *buf = texture->buffer;
-                  uint offset = 0;
-                  assert(buf);
-
-                  count++;
-
-                  OUT_RELOC(buf, I915_USAGE_SAMPLER, offset);
-                  OUT_BATCH(i915->current.texbuffer[unit][0]); /* MS3 */
-                  OUT_BATCH(i915->current.texbuffer[unit][1]); /* MS4 */
-               }
-            }
-            assert(count == nr);
+static void
+emit_map(struct i915_context *i915)
+{  /* Write the texture map state: a MAP_STATE command, the enable mask, and
+    * for each enabled unit a relocation plus two texbuffer dwords. Nothing
+    * is written when no sampler is enabled. */
+   const uint nr = i915->current.sampler_enable_nr;
+   if (nr) {
+      const uint enabled = i915->current.sampler_enable_flags;
+      uint unit;
+      uint count = 0;
+      OUT_BATCH(_3DSTATE_MAP_STATE | (3 * nr));
+      OUT_BATCH(enabled);
+      for (unit = 0; unit < I915_TEX_UNITS; unit++) {
+         if (enabled & (1 << unit)) {
+            struct i915_texture *texture = i915_texture(i915->fragment_sampler_views[unit]->texture);
+            struct i915_winsys_buffer *buf = texture->buffer;
+            unsigned offset = i915->current.texbuffer[unit][2]; /* third texbuffer word is used as the relocation offset */
+
+            assert(buf);
+
+            count++;
+
+            OUT_RELOC(buf, I915_USAGE_SAMPLER, offset);
+            OUT_BATCH(i915->current.texbuffer[unit][0]); /* MS3 */
+            OUT_BATCH(i915->current.texbuffer[unit][1]); /* MS4 */
          }
       }
-#endif
+      assert(count == nr); /* the enable mask and the enabled count must agree */
+   }
+}
 
-#if 01
-   /* samplers */
-   /* 2 + I915_TEX_UNITS*3 dwords, 0 relocs */
-   if (i915->hardware_dirty & I915_HW_SAMPLER) 
-   {
-      if (i915->current.sampler_enable_nr) {
-         int i;
+static void
+validate_sampler(struct i915_context *i915, unsigned *batch_space)
+{  /* Space for emit_sampler(): command + enable mask, then three dwords per
+    * enabled sampler; zero when no sampler is enabled. */
+   *batch_space = i915->current.sampler_enable_nr ?
+     2 + 3*i915->current.sampler_enable_nr : 0;
+}
 
-         OUT_BATCH( _3DSTATE_SAMPLER_STATE | 
-                    (3 * i915->current.sampler_enable_nr) );
+static void
+emit_sampler(struct i915_context *i915)
+{  /* Write the sampler state: a SAMPLER_STATE command, the enable mask, and
+    * the three sampler dwords of each enabled unit in ascending unit order.
+    * Nothing is written when no sampler is enabled. */
+   if (i915->current.sampler_enable_nr) {
+      int i;
 
-         OUT_BATCH( i915->current.sampler_enable_flags );
+      OUT_BATCH( _3DSTATE_SAMPLER_STATE |
+                 (3 * i915->current.sampler_enable_nr) );
 
-         for (i = 0; i < I915_TEX_UNITS; i++) {
-            if (i915->current.sampler_enable_flags & (1<<i)) {
-               OUT_BATCH( i915->current.sampler[i][0] );
-               OUT_BATCH( i915->current.sampler[i][1] );
-               OUT_BATCH( i915->current.sampler[i][2] );
-            }
+      OUT_BATCH( i915->current.sampler_enable_flags );
+
+      for (i = 0; i < I915_TEX_UNITS; i++) {
+         if (i915->current.sampler_enable_flags & (1<<i)) {
+            OUT_BATCH( i915->current.sampler[i][0] );
+            OUT_BATCH( i915->current.sampler[i][1] );
+            OUT_BATCH( i915->current.sampler[i][2] );
          }
       }
    }
-#endif
+}
 
-#if 01
-   /* constants */
-   /* 2 + I915_MAX_CONSTANT*4 dwords, 0 relocs */
-   if (i915->hardware_dirty & I915_HW_CONSTANTS)
-   {
-      /* Collate the user-defined constants with the fragment shader's
-       * immediates according to the constant_flags[] array.
-       */
-      const uint nr = i915->fs->num_constants;
-      if (nr) {
-         uint i;
-
-         OUT_BATCH( _3DSTATE_PIXEL_SHADER_CONSTANTS | (nr * 4) );
-         OUT_BATCH( (1 << (nr - 1)) | ((1 << (nr - 1)) - 1) );
-
-         for (i = 0; i < nr; i++) {
-            const uint *c;
-            if (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER) {
-               /* grab user-defined constant */
-               c = (uint *) i915->current.constants[PIPE_SHADER_FRAGMENT][i];
-            }
-            else {
-               /* emit program constant */
-               c = (uint *) i915->fs->constants[i];
-            }
+static void
+validate_constants(struct i915_context *i915, unsigned *batch_space)
+{  /* Space for emit_constants(): command + mask dword, then four dwords per
+    * fragment shader constant; zero when the shader has no constants. */
+   int nr = i915->fs->num_constants ?
+      2 + 4*i915->fs->num_constants : 0;
+
+   *batch_space = nr;
+}
+
+static void
+emit_constants(struct i915_context *i915)
+{
+   /* Collate the user-defined constants with the fragment shader's
+    * immediates according to the constant_flags[] array.
+    */
+   const uint nr = i915->fs->num_constants;
+
+   assert(nr < I915_MAX_CONSTANT);
+   if (nr) {
+      uint i;
+
+      OUT_BATCH( _3DSTATE_PIXEL_SHADER_CONSTANTS | (nr * 4) );
+      OUT_BATCH((1 << nr) - 1);
+
+      for (i = 0; i < nr; i++) {
+         const uint *c;
+         if (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER) {
+            /* grab user-defined constant */
+            c = (uint *) i915_buffer(i915->constants[PIPE_SHADER_FRAGMENT])->data;
+            c += 4 * i;
+         }
+         else {
+            /* emit program constant */
+            c = (uint *) i915->fs->constants[i];
+         }
 #if 0 /* debug */
-            {
-               float *f = (float *) c;
-               printf("Const %2d: %f %f %f %f %s\n", i, f[0], f[1], f[2], f[3],
-                      (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER
-                       ? "user" : "immediate"));
-            }
-#endif
-            OUT_BATCH(*c++);
-            OUT_BATCH(*c++);
-            OUT_BATCH(*c++);
-            OUT_BATCH(*c++);
+         {
+            float *f = (float *) c;
+            printf("Const %2d: %f %f %f %f %s\n", i, f[0], f[1], f[2], f[3],
+                   (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER
+                    ? "user" : "immediate"));
          }
+#endif
+         OUT_BATCH(*c++);
+         OUT_BATCH(*c++);
+         OUT_BATCH(*c++);
+         OUT_BATCH(*c++);
       }
    }
-#endif
+}
+/* Report the dwords emit_program() will write: the shader's declarations and
+ * instructions, plus three more when current.target_fixup_format is non-zero
+ * (the extra swizzling MOV that emit_program() appends). */
+static void
+validate_program(struct i915_context *i915, unsigned *batch_space)
+{
+   uint additional_size = 0;
+
+   additional_size += i915->current.target_fixup_format ? 3 : 0;
 
-#if 01
-   /* Fragment program */
-   /* i915->current.program_len dwords, 0 relocs */
-   if (i915->hardware_dirty & I915_HW_PROGRAM)
+   /* we need more batch space if we want to emulate rgba framebuffers */
+   *batch_space = i915->fs->decl_len + i915->fs->program_len + additional_size;
+}
+/* Emit the fragment program: the declaration dwords (with the size held in
+ * the first one increased by any extra dwords), the instructions three
+ * dwords at a time, and - when current.target_fixup_format is non-zero - one
+ * extra three-dword MOV that swizzles the output colour. */
+static void
+emit_program(struct i915_context *i915)
+{
+   uint additional_size = 0;
+   uint i;
+
+   /* count how much additional space we'll need: validate_program() returns
+    * the total, so subtracting the shader's own length leaves only the
+    * extra dwords */
+   validate_program(i915, &additional_size);
+   additional_size -= i915->fs->decl_len + i915->fs->program_len;
+
+   /* we should always have, at least, a pass-through program */
+   assert(i915->fs->program_len > 0);
+
+   /* output the declarations */
    {
-      uint i;
-      /* we should always have, at least, a pass-through program */
-      assert(i915->fs->program_len > 0);
-      for (i = 0; i < i915->fs->program_len; i++) {
-         OUT_BATCH(i915->fs->program[i]);
-      }
+      /* first word has the size, we have to adjust that */
+      uint size = (i915->fs->decl[0]);
+      size += additional_size;
+      OUT_BATCH(size);
    }
-#endif
 
-#if 01
-   /* drawing surface size */
-   /* 6 dwords, 0 relocs */
-   {
-      uint w, h;
-      boolean k = framebuffer_size(&i915->framebuffer, &w, &h);
-      (void)k;
-      assert(k);
+   for (i = 1 ; i < i915->fs->decl_len; i++)
+      OUT_BATCH(i915->fs->decl[i]);
 
-      OUT_BATCH(_3DSTATE_DRAW_RECT_CMD);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(((w - 1) & 0xffff) | ((h - 1) << 16));
-      OUT_BATCH(0);
+   /* output the program */
+   assert(i915->fs->program_len % 3 == 0);
+   for (i = 0 ; i < i915->fs->program_len; i+=3) {
+      OUT_BATCH(i915->fs->program[i]);
+      OUT_BATCH(i915->fs->program[i+1]);
+      OUT_BATCH(i915->fs->program[i+2]);
+   }
+
+   /* we emit an additional mov with swizzle to fake RGBA framebuffers */
+   if (i915->current.target_fixup_format) {
+      /* mov out_color, out_color.zyxw */
+      OUT_BATCH(A0_MOV |
+                (REG_TYPE_OC << A0_DEST_TYPE_SHIFT) |
+                A0_DEST_CHANNEL_ALL |
+                (REG_TYPE_OC << A0_SRC0_TYPE_SHIFT) |
+                (T_DIFFUSE << A0_SRC0_NR_SHIFT));
+      OUT_BATCH(i915->current.fixup_swizzle);
       OUT_BATCH(0);
    }
+}
+/* Emit the five-dword drawing rectangle command. Nothing is written unless
+ * I915_DST_RECT is set in static_dirty; validate_static() reserves the
+ * matching five dwords. */
+static void
+emit_draw_rect(struct i915_context *i915)
+{
+   if (i915->static_dirty & I915_DST_RECT) {
+      OUT_BATCH(_3DSTATE_DRAW_RECT_CMD);
+      OUT_BATCH(DRAW_RECT_DIS_DEPTH_OFS);
+      OUT_BATCH(i915->current.draw_offset);
+      OUT_BATCH(i915->current.draw_size);
+      OUT_BATCH(i915->current.draw_offset); /* draw_offset is written a second time as the last dword */
+   }
+}
+/* Work out how many dwords the dirty state atoms need and validate the
+ * buffers they reference.
+ *
+ * Resets i915->num_validation_buffers, then stores in *batch_space the sum of
+ * ARRAY_SIZE(invariant_state) (when I915_HW_INVARIANT is dirty) and the
+ * result of each validate_<atom>() whose bit is set in hardware_dirty; those
+ * helpers also append to i915->validation_buffers.
+ *
+ * Returns TRUE when no buffer was queued or i915_winsys_validate_buffers()
+ * accepts the queued list, FALSE otherwise. */
+static boolean
+i915_validate_state(struct i915_context *i915, unsigned *batch_space)
+{
+   unsigned tmp;
+
+   i915->num_validation_buffers = 0;
+   if (i915->hardware_dirty & I915_HW_INVARIANT)
+      *batch_space = ARRAY_SIZE(invariant_state);
+   else
+      *batch_space = 0;
+
+#if 0
+static int counter_total = 0;
+#define VALIDATE_ATOM(atom, hw_dirty) \
+   if (i915->hardware_dirty & hw_dirty) { \
+      static int counter_##atom = 0;\
+      validate_##atom(i915, &tmp); \
+      *batch_space += tmp;\
+      counter_##atom += tmp;\
+      counter_total += tmp;\
+      printf("%s: \t%d/%d \t%2.2f\n",#atom, counter_##atom, counter_total, counter_##atom*100.f/counter_total);}
+#else
+#define VALIDATE_ATOM(atom, hw_dirty) \
+   if (i915->hardware_dirty & hw_dirty) { \
+      validate_##atom(i915, &tmp); \
+      *batch_space += tmp; }
 #endif
+   VALIDATE_ATOM(flush, I915_HW_FLUSH);
+   VALIDATE_ATOM(immediate, I915_HW_IMMEDIATE);
+   VALIDATE_ATOM(dynamic, I915_HW_DYNAMIC);
+   VALIDATE_ATOM(static, I915_HW_STATIC); /* also accounts for the draw rectangle emitted by emit_draw_rect() */
+   VALIDATE_ATOM(map, I915_HW_MAP);
+   VALIDATE_ATOM(sampler, I915_HW_SAMPLER);
+   VALIDATE_ATOM(constants, I915_HW_CONSTANTS);
+   VALIDATE_ATOM(program, I915_HW_PROGRAM);
+#undef VALIDATE_ATOM
+
+   if (i915->num_validation_buffers == 0)
+      return TRUE;
+
+   if (!i915_winsys_validate_buffers(i915->batch, i915->validation_buffers,
+                                     i915->num_validation_buffers))
+      return FALSE;
+
+   return TRUE;
+}
+
+/* Push the state into the sarea and/or texture memory.
+ */
+void
+i915_emit_hardware_state(struct i915_context *i915 )
+{
+   unsigned batch_space;
+   uintptr_t save_ptr;
+
+   assert(i915->dirty == 0);
+
+   if (I915_DBG_ON(DBG_ATOMS))
+      i915_dump_hardware_dirty(i915, __FUNCTION__);
+
+   if (!i915_validate_state(i915, &batch_space)) {
+      FLUSH_BATCH(NULL, I915_FLUSH_ASYNC);
+      assert(i915_validate_state(i915, &batch_space));
+   }
+
+   if(!BEGIN_BATCH(batch_space)) {
+      FLUSH_BATCH(NULL, I915_FLUSH_ASYNC);
+      assert(i915_validate_state(i915, &batch_space));
+      assert(BEGIN_BATCH(batch_space));
+   }
+
+   save_ptr = (uintptr_t)i915->batch->ptr;
 
-   I915_DBG(DBG_EMIT, "%s: used %d dwords, %d relocs\n", __FUNCTION__,
+#define EMIT_ATOM(atom, hw_dirty) \
+   if (i915->hardware_dirty & hw_dirty) \
+      emit_##atom(i915);
+   EMIT_ATOM(flush, I915_HW_FLUSH);
+   EMIT_ATOM(invariant, I915_HW_INVARIANT);
+   EMIT_ATOM(immediate, I915_HW_IMMEDIATE);
+   EMIT_ATOM(dynamic, I915_HW_DYNAMIC);
+   EMIT_ATOM(static, I915_HW_STATIC);
+   EMIT_ATOM(map, I915_HW_MAP);
+   EMIT_ATOM(sampler, I915_HW_SAMPLER);
+   EMIT_ATOM(constants, I915_HW_CONSTANTS);
+   EMIT_ATOM(program, I915_HW_PROGRAM);
+   EMIT_ATOM(draw_rect, I915_HW_STATIC);
+#undef EMIT_ATOM
+
+   I915_DBG(DBG_EMIT, "%s: used %d dwords, %d dwords reserved\n", __FUNCTION__,
             ((uintptr_t)i915->batch->ptr - save_ptr) / 4,
-            i915->batch->relocs - save_relocs);
+            batch_space);
+   assert(((uintptr_t)i915->batch->ptr - save_ptr) / 4 == batch_space);
 
    i915->hardware_dirty = 0;
+   i915->immediate_dirty = 0;
+   i915->dynamic_dirty = 0;
+   i915->static_dirty = 0;
+   i915->flush_dirty = 0;
 }