From aac120977d1ead319141d48d65c9bba626ec03b8 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Sun, 20 Feb 2011 13:23:47 +0000
Subject: [PATCH] i965: Move repeat-instruction-suppression to batchbuffer core

Move the tracking of the last emitted instructions into the core
batchbuffer routines and take advantage of the shadow batch copy to
avoid extra memory allocations and copies.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 src/mesa/drivers/dri/i965/brw_cc.c            | 22 +++---
 src/mesa/drivers/dri/i965/brw_curbe.c         | 20 +++---
 src/mesa/drivers/dri/i965/brw_misc_state.c    | 66 +++++++-----------
 src/mesa/drivers/dri/i965/brw_state.h         |  6 --
 src/mesa/drivers/dri/i965/brw_state_batch.c   | 65 ------------------
 src/mesa/drivers/dri/i965/brw_state_upload.c  |  4 --
 .../drivers/dri/intel/intel_batchbuffer.c     | 68 ++++++++++++++++++-
 .../drivers/dri/intel/intel_batchbuffer.h     |  7 +-
 src/mesa/drivers/dri/intel/intel_context.h    | 14 +---
 9 files changed, 120 insertions(+), 152 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index c37376ef0de..b7048ecb39b 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -233,18 +233,16 @@ const struct brw_tracked_state brw_cc_unit = {
 
 static void upload_blend_constant_color(struct brw_context *brw)
 {
-   struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_blend_constant_color bcc;
-
-   memset(&bcc, 0, sizeof(bcc));
-   bcc.header.opcode = _3DSTATE_BLEND_CONSTANT_COLOR;
-   bcc.header.length = sizeof(bcc)/4-2;
-   bcc.blend_constant_color[0] = ctx->Color.BlendColor[0];
-   bcc.blend_constant_color[1] = ctx->Color.BlendColor[1];
-   bcc.blend_constant_color[2] = ctx->Color.BlendColor[2];
-   bcc.blend_constant_color[3] = ctx->Color.BlendColor[3];
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bcc);
+   struct intel_context *intel = &brw->intel;
+   struct gl_context *ctx = &intel->ctx;
+
+   BEGIN_BATCH(5);
+   OUT_BATCH(_3DSTATE_BLEND_CONSTANT_COLOR << 16 | (5-2));
+   OUT_BATCH(ctx->Color.BlendColor[0]);
+   OUT_BATCH(ctx->Color.BlendColor[1]);
+   OUT_BATCH(ctx->Color.BlendColor[2]);
+   OUT_BATCH(ctx->Color.BlendColor[3]);
+   CACHED_BATCH();
 }
 
 const struct brw_tracked_state brw_blend_constant_color = {
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 877b22fec19..ae11c487a2c 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -146,22 +146,24 @@ const struct brw_tracked_state brw_curbe_offsets = {
  */
 void brw_upload_cs_urb_state(struct brw_context *brw)
 {
-   struct brw_cs_urb_state cs_urb;
-   memset(&cs_urb, 0, sizeof(cs_urb));
+   struct intel_context *intel = &brw->intel;
 
+   BEGIN_BATCH(2);
    /* It appears that this is the state packet for the CS unit, ie. the
     * urb entries detailed here are housed in the CS range from the
     * URB_FENCE command.
     */
-   cs_urb.header.opcode = CMD_CS_URB_STATE;
-   cs_urb.header.length = sizeof(cs_urb)/4 - 2;
+   OUT_BATCH(CMD_CS_URB_STATE << 16 | (2-2));
 
    /* BRW_NEW_URB_FENCE */
-   cs_urb.bits0.nr_urb_entries = brw->urb.nr_cs_entries;
-   cs_urb.bits0.urb_entry_size = brw->urb.csize - 1;
-
-   assert(brw->urb.nr_cs_entries);
-   BRW_CACHED_BATCH_STRUCT(brw, &cs_urb);
+   if (brw->urb.csize == 0) {
+      OUT_BATCH(0);
+   } else {
+      /* BRW_NEW_URB_FENCE */
+      assert(brw->urb.nr_cs_entries);
+      OUT_BATCH((brw->urb.csize - 1) << 4 | brw->urb.nr_cs_entries);
+   }
+   CACHED_BATCH();
 }
 
 static GLfloat fixed_plane[6][4] = {
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index c5c74098246..c768be23fa7 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -301,16 +301,15 @@ const struct brw_tracked_state brw_depthbuffer = {
 
 static void upload_polygon_stipple(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_polygon_stipple bps;
    GLuint i;
 
    if (!ctx->Polygon.StippleFlag)
       return;
 
-   memset(&bps, 0, sizeof(bps));
-   bps.header.opcode = _3DSTATE_POLY_STIPPLE_PATTERN;
-   bps.header.length = sizeof(bps)/4-2;
+   BEGIN_BATCH(33);
+   OUT_BATCH(_3DSTATE_POLY_STIPPLE_PATTERN << 16 | (33 - 2));
 
    /* Polygon stipple is provided in OpenGL order, i.e. bottom
     * row first.  If we're rendering to a window (i.e. the
@@ -321,14 +320,13 @@ static void upload_polygon_stipple(struct brw_context *brw)
     */
    if (ctx->DrawBuffer->Name == 0) {
       for (i = 0; i < 32; i++)
-         bps.stipple[i] = ctx->PolygonStipple[31 - i]; /* invert */
+	  OUT_BATCH(ctx->PolygonStipple[31 - i]); /* invert */
    }
    else {
       for (i = 0; i < 32; i++)
-         bps.stipple[i] = ctx->PolygonStipple[i]; /* don't invert */
+	 OUT_BATCH(ctx->PolygonStipple[i]);
    }
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bps);
+   CACHED_BATCH();
 }
 
 const struct brw_tracked_state brw_polygon_stipple = {
@@ -347,15 +345,14 @@ const struct brw_tracked_state brw_polygon_stipple = {
 
 static void upload_polygon_stipple_offset(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_polygon_stipple_offset bpso;
 
    if (!ctx->Polygon.StippleFlag)
       return;
 
-   memset(&bpso, 0, sizeof(bpso));
-   bpso.header.opcode = _3DSTATE_POLY_STIPPLE_OFFSET;
-   bpso.header.length = sizeof(bpso)/4-2;
+   BEGIN_BATCH(2);
+   OUT_BATCH(_3DSTATE_POLY_STIPPLE_OFFSET << 16 | (2-2));
 
    /* If we're drawing to a system window (ctx->DrawBuffer->Name == 0),
     * we have to invert the Y axis in order to match the OpenGL
@@ -365,16 +362,11 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
     * system works just fine, and there's no window system to
     * worry about.
     */
-   if (brw->intel.ctx.DrawBuffer->Name == 0) {
-      bpso.bits0.x_offset = 0;
-      bpso.bits0.y_offset = (32 - (ctx->DrawBuffer->Height & 31)) & 31;
-   }
-   else {
-      bpso.bits0.y_offset = 0;
-      bpso.bits0.x_offset = 0;
-   }
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bpso);
+   if (brw->intel.ctx.DrawBuffer->Name == 0)
+      OUT_BATCH((32 - (ctx->DrawBuffer->Height & 31)) & 31);
+   else
+      OUT_BATCH(0);
+   CACHED_BATCH();
 }
 
 #define _NEW_WINDOW_POS 0x40000000
@@ -393,18 +385,17 @@ const struct brw_tracked_state brw_polygon_stipple_offset = {
  */
 static void upload_aa_line_parameters(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_aa_line_parameters balp;
 
    if (!ctx->Line.SmoothFlag || !brw->has_aa_line_parameters)
       return;
 
+   OUT_BATCH(_3DSTATE_AA_LINE_PARAMETERS << 16 | (3 - 2));
    /* use legacy aa line coverage computation */
-   memset(&balp, 0, sizeof(balp));
-   balp.header.opcode = _3DSTATE_AA_LINE_PARAMETERS;
-   balp.header.length = sizeof(balp) / 4 - 2;
-   
-   BRW_CACHED_BATCH_STRUCT(brw, &balp);
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   CACHED_BATCH();
 }
 
 const struct brw_tracked_state brw_aa_line_parameters = {
@@ -422,28 +413,21 @@ const struct brw_tracked_state brw_aa_line_parameters = {
 
 static void upload_line_stipple(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_line_stipple bls;
    GLfloat tmp;
    GLint tmpi;
 
    if (!ctx->Line.StippleFlag)
       return;
 
-   memset(&bls, 0, sizeof(bls));
-   bls.header.opcode = _3DSTATE_LINE_STIPPLE_PATTERN;
-   bls.header.length = sizeof(bls)/4 - 2;
-
-   bls.bits0.pattern = ctx->Line.StipplePattern;
-   bls.bits1.repeat_count = ctx->Line.StippleFactor;
-
+   BEGIN_BATCH(3);
+   OUT_BATCH(_3DSTATE_LINE_STIPPLE_PATTERN << 16 | (3 - 2));
+   OUT_BATCH(ctx->Line.StipplePattern);
    tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor;
    tmpi = tmp * (1<<13);
-
-
-   bls.bits1.inverse_repeat_count = tmpi;
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bls);
+   OUT_BATCH(tmpi << 16 | ctx->Line.StippleFactor);
+   CACHED_BATCH();
 }
 
 const struct brw_tracked_state brw_line_stipple = {
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 23d9e90c6e4..86b0caa4a4e 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -166,13 +166,7 @@ void brw_destroy_caches( struct brw_context *brw );
  */
 #define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data(&brw->intel, (s), \
 							sizeof(*(s)), false)
-#define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) )
 
-GLboolean brw_cached_batch_struct( struct brw_context *brw,
-				   const void *data,
-				   GLuint sz );
-void brw_destroy_batch_cache( struct brw_context *brw );
-void brw_clear_batch_cache( struct brw_context *brw );
 void *brw_state_batch(struct brw_context *brw,
 		      int size,
 		      int alignment,
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index f363a922c1d..213c7a38d8c 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -29,75 +29,10 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
      
-
-
 #include "brw_state.h"
 #include "intel_batchbuffer.h"
 #include "main/imports.h"
 
-
-
-/* A facility similar to the data caching code above, which aims to
- * prevent identical commands being issued repeatedly.
- */
-GLboolean brw_cached_batch_struct( struct brw_context *brw,
-				   const void *data,
-				   GLuint sz )
-{
-   struct brw_cached_batch_item *item = brw->cached_batch_items;
-   struct header *newheader = (struct header *)data;
-
-   if (brw->emit_state_always) {
-      intel_batchbuffer_data(&brw->intel, data, sz, false);
-      return GL_TRUE;
-   }
-
-   while (item) {
-      if (item->header->opcode == newheader->opcode) {
-	 if (item->sz == sz && memcmp(item->header, newheader, sz) == 0)
-	    return GL_FALSE;
-	 if (item->sz != sz) {
-	    free(item->header);
-	    item->header = malloc(sz);
-	    item->sz = sz;
-	 }
-	 goto emit;
-      }
-      item = item->next;
-   }
-
-   assert(!item);
-   item = CALLOC_STRUCT(brw_cached_batch_item);
-   item->header = malloc(sz);
-   item->sz = sz;
-   item->next = brw->cached_batch_items;
-   brw->cached_batch_items = item;
-
- emit:
-   memcpy(item->header, newheader, sz);
-   intel_batchbuffer_data(&brw->intel, data, sz, false);
-   return GL_TRUE;
-}
-
-void brw_clear_batch_cache( struct brw_context *brw )
-{
-   struct brw_cached_batch_item *item = brw->cached_batch_items;
-
-   while (item) {
-      struct brw_cached_batch_item *next = item->next;
-      free((void *)item->header);
-      free(item);
-      item = next;
-   }
-
-   brw->cached_batch_items = NULL;
-}
-
-void brw_destroy_batch_cache( struct brw_context *brw )
-{
-   brw_clear_batch_cache(brw);
-}
-
 /**
  * Allocates a block of space in the batchbuffer for indirect state.
  *
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 08b7ac90796..8bb92fed5d1 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -176,7 +176,6 @@ void brw_init_state( struct brw_context *brw )
 void brw_destroy_state( struct brw_context *brw )
 {
    brw_destroy_caches(brw);
-   brw_destroy_batch_cache(brw);
 }
 
 /***********************************************************************
@@ -383,9 +382,6 @@ void brw_validate_state( struct brw_context *brw )
        state->brw == 0)
       return;
 
-   if (brw->state.dirty.brw & BRW_NEW_CONTEXT)
-      brw_clear_batch_cache(brw);
-
    brw->intel.Fallback = GL_FALSE; /* boolean, not bitfield */
 
    /* do prepare stage for all atoms */
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index 7d224ae535c..805f3a34e8b 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -33,6 +33,25 @@
 #include "intel_bufmgr.h"
 #include "intel_buffers.h"
 
+struct cached_batch_item {
+   struct cached_batch_item *next;
+   uint16_t header;
+   uint16_t size;
+};
+
+static void clear_cache( struct intel_context *intel )
+{
+   struct cached_batch_item *item = intel->batch.cached_items;
+
+   while (item) {
+      struct cached_batch_item *next = item->next;
+      free(item);
+      item = next;
+   }
+
+   intel->batch.cached_items = NULL;
+}
+
 void
 intel_batchbuffer_reset(struct intel_context *intel)
 {
@@ -40,6 +59,7 @@ intel_batchbuffer_reset(struct intel_context *intel)
       drm_intel_bo_unreference(intel->batch.bo);
       intel->batch.bo = NULL;
    }
+   clear_cache(intel);
 
    intel->batch.bo = drm_intel_bo_alloc(intel->bufmgr, "batchbuffer",
 					intel->maxBatchSize, 4096);
@@ -53,6 +73,7 @@ void
 intel_batchbuffer_free(struct intel_context *intel)
 {
    drm_intel_bo_unreference(intel->batch.bo);
+   clear_cache(intel);
 }
 
 
@@ -165,7 +186,8 @@ intel_batchbuffer_emit_reloc(struct intel_context *intel,
    ret = drm_intel_bo_emit_reloc(intel->batch.bo, 4*intel->batch.used,
 				 buffer, delta,
 				 read_domains, write_domain);
-   assert (ret == 0);
+   assert(ret == 0);
+   (void)ret;
 
    /*
     * Using the old buffer offset, write in what the right data would be, in case
@@ -191,7 +213,8 @@ intel_batchbuffer_emit_reloc_fenced(struct intel_context *intel,
    ret = drm_intel_bo_emit_reloc_fence(intel->batch.bo, 4*intel->batch.used,
 				       buffer, delta,
 				       read_domains, write_domain);
-   assert (ret == 0);
+   assert(ret == 0);
+   (void)ret;
 
    /*
     * Using the old buffer offset, write in what the right data would
@@ -213,6 +236,47 @@ intel_batchbuffer_data(struct intel_context *intel,
    intel->batch.used += bytes >> 2;
 }
 
+void
+intel_batchbuffer_cached_advance(struct intel_context *intel)
+{
+   struct cached_batch_item **prev = &intel->batch.cached_items, *item;
+   uint32_t sz = (intel->batch.used - intel->batch.emit) * sizeof(uint32_t);
+   uint32_t *start = intel->batch.map + intel->batch.emit;
+   uint16_t op = *start >> 16;
+
+   while (*prev) {
+      uint32_t *old;
+
+      item = *prev;
+      old = intel->batch.map + item->header;
+      if (op == *old >> 16) {
+	 if (item->size == sz && memcmp(old, start, sz) == 0) {
+	    if (prev != &intel->batch.cached_items) {
+	       *prev = item->next;
+	       item->next = intel->batch.cached_items;
+	       intel->batch.cached_items = item;
+	    }
+	    intel->batch.used = intel->batch.emit;
+	    return;
+	 }
+
+	 goto emit;
+      }
+      prev = &item->next;
+   }
+
+   item = malloc(sizeof(struct cached_batch_item));
+   if (item == NULL)
+      return;
+
+   item->next = intel->batch.cached_items;
+   intel->batch.cached_items = item;
+
+emit:
+   item->size = sz;
+   item->header = intel->batch.emit;
+}
+
 /* Emit a pipelined flush to either flush render and texture cache for
  * reading from a FBO-drawn texture, or flush so that frontbuffer
  * render appears on the screen in DRI1.
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.h b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
index 7699715c0ea..a0a5c9841c6 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
@@ -101,9 +101,9 @@ intel_batchbuffer_begin(struct intel_context *intel, int n, bool is_blit)
 {
    intel_batchbuffer_require_space(intel, n * 4, is_blit);
 
+   intel->batch.emit = intel->batch.used;
 #ifdef DEBUG
-   intel->batch.emit.total = n;
-   intel->batch.emit.start_ptr = intel->batch.used;
+   intel->batch.total = n;
 #endif
 }
 
@@ -123,6 +123,8 @@ intel_batchbuffer_advance(struct intel_context *intel)
 #endif
 }
 
+void intel_batchbuffer_cached_advance(struct intel_context *intel);
+
 /* Here are the crusty old macros, to be removed:
  */
 #define BATCH_LOCALS
@@ -141,5 +143,6 @@ intel_batchbuffer_advance(struct intel_context *intel)
 } while (0)
 
 #define ADVANCE_BATCH() intel_batchbuffer_advance(intel);
+#define CACHED_BATCH() intel_batchbuffer_cached_advance(intel);
 
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index bf2a0b4ead8..6116764ad65 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -171,22 +171,14 @@ struct intel_context
 
    struct intel_batchbuffer {
       drm_intel_bo *bo;
+      struct cached_batch_item *cached_items;
 
-      uint16_t used;
-      uint16_t reserved_space;
+      uint16_t emit, total;
+      uint16_t used, reserved_space;
       uint32_t map[8192];
 #define BATCH_SZ (8192*sizeof(uint32_t))
 
       uint32_t state_batch_offset;
-
-#ifdef DEBUG
-      /** Tracking of BEGIN_BATCH()/OUT_BATCH()/ADVANCE_BATCH() debugging */
-      struct {
-	 uint16_t total;
-	 uint16_t start_ptr;
-      } emit;
-#endif
-
       bool is_blit;
    } batch;
 
-- 
2.30.2