i965: Convert the binding table to streamed indirect state.
author Eric Anholt <eric@anholt.net>
Mon, 7 Jun 2010 16:25:10 +0000 (09:25 -0700)
committer Eric Anholt <eric@anholt.net>
Fri, 11 Jun 2010 07:15:56 +0000 (00:15 -0700)
This slightly reduces cairo-gl firefox-talos-gfx runtime on my
Ironlake:
before:
[ # ]  backend                         test   min(s) median(s) stddev. count
[  0]       gl            firefox-talos-gfx   38.236   38.383   0.43%    5/6
after:
[  0]       gl            firefox-talos-gfx   37.799   38.203   0.39%    6/6

It turns out the cost of caching these objects and looking them up in
the cache again is greater than the cost of just computing the object
again, particularly when the overhead of having a separate BO to pin
is removed.

(Those that are paying close attention will note that this is a
reversal of the path I was moving the driver in a couple of years ago.
The major thing that has changed is that back then all state was
recomputed when we wrapped the streaming state buffer, including
recompiling our precious programs.  Now, we're uncaching just the
objects that are cheap to compute, and retaining caching of expensive
objects.)

configure.ac
src/mesa/drivers/dri/i965/brw_context.h
src/mesa/drivers/dri/i965/brw_misc_state.c
src/mesa/drivers/dri/i965/brw_state.h
src/mesa/drivers/dri/i965/brw_state_cache.c
src/mesa/drivers/dri/i965/brw_state_upload.c
src/mesa/drivers/dri/i965/brw_vs_surface_state.c
src/mesa/drivers/dri/i965/brw_wm_surface_state.c

index 1056fa181005e564a1bcdd5696972976f8298cf3..7307d89ae2162914f0a62ff9162cab965b25cd00 100644 (file)
@@ -860,7 +860,7 @@ AC_SUBST([DRI_LIB_DEPS])
 
 case $DRI_DIRS in
 *i915*|*i965*)
-    PKG_CHECK_MODULES([INTEL], [libdrm_intel >= 2.4.19])
+    PKG_CHECK_MODULES([INTEL], [libdrm_intel >= 2.4.21])
     ;;
 esac
 
index a8290673838b8e62f40f595f292fd8b94b8e5303..86b86fde9a772af220010553a15bcbc97e72408d 100644 (file)
@@ -131,6 +131,7 @@ struct brw_context;
 #define BRW_NEW_WM_INPUT_DIMENSIONS     0x100
 #define BRW_NEW_PSP                     0x800
 #define BRW_NEW_WM_SURFACES            0x1000
+#define BRW_NEW_BINDING_TABLE          0x2000
 #define BRW_NEW_INDICES                        0x4000
 #define BRW_NEW_VERTICES               0x8000
 /**
@@ -302,7 +303,6 @@ enum brw_cache_id {
    BRW_CLIP_UNIT,
    BRW_CLIP_PROG,
    BRW_SS_SURFACE,
-   BRW_SS_SURF_BIND,
 
    BRW_MAX_CACHE
 };
@@ -377,7 +377,6 @@ struct brw_tracked_state {
 #define CACHE_NEW_CLIP_UNIT              (1<<BRW_CLIP_UNIT)
 #define CACHE_NEW_CLIP_PROG              (1<<BRW_CLIP_PROG)
 #define CACHE_NEW_SURFACE                (1<<BRW_SS_SURFACE)
-#define CACHE_NEW_SURF_BIND              (1<<BRW_SS_SURF_BIND)
 
 struct brw_cached_batch_item {
    struct header *header;
@@ -460,7 +459,7 @@ struct brw_context
        * consisting of the vertex buffers, pipelined state pointers,
        * the CURBE, the depth buffer, and a query BO.
        */
-      drm_intel_bo *validated_bos[VERT_ATTRIB_MAX + 16];
+      drm_intel_bo *validated_bos[VERT_ATTRIB_MAX + BRW_WM_MAX_SURF + 16];
       int validated_bo_count;
    } state;
 
@@ -598,6 +597,7 @@ struct brw_context
 
       /** Binding table of pointers to surf_bo entries */
       drm_intel_bo *bind_bo;
+      uint32_t bind_bo_offset;
       drm_intel_bo *surf_bo[BRW_VS_MAX_SURF];
       GLuint nr_surfaces;      
    } vs;
@@ -650,6 +650,7 @@ struct brw_context
 
       /** Binding table of pointers to surf_bo entries */
       drm_intel_bo *bind_bo;
+      uint32_t bind_bo_offset;
       drm_intel_bo *surf_bo[BRW_WM_MAX_SURF];
 
       drm_intel_bo *prog_bo;
index 35908ee7b694f9db7910413912f6fe3bb199220a..ab5d5240d0f214d1602979bf85ecaa08656436e6 100644 (file)
@@ -96,12 +96,6 @@ const struct brw_tracked_state brw_drawing_rect = {
    .emit = upload_drawing_rect
 };
 
-static void prepare_binding_table_pointers(struct brw_context *brw)
-{
-   brw_add_validated_bo(brw, brw->vs.bind_bo);
-   brw_add_validated_bo(brw, brw->wm.bind_bo);
-}
-
 /**
  * Upload the binding table pointers, which point each stage's array of surface
  * state pointers.
@@ -116,23 +110,24 @@ static void upload_binding_table_pointers(struct brw_context *brw)
    BEGIN_BATCH(6);
    OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
    if (brw->vs.bind_bo != NULL)
-      OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
+      OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0,
+               brw->vs.bind_bo_offset); /* vs */
    else
       OUT_BATCH(0);
    OUT_BATCH(0); /* gs */
    OUT_BATCH(0); /* clip */
    OUT_BATCH(0); /* sf */
-   OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */
+   OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0,
+            brw->wm.bind_bo_offset); /* wm/ps */
    ADVANCE_BATCH();
 }
 
 const struct brw_tracked_state brw_binding_table_pointers = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_BATCH,
-      .cache = CACHE_NEW_SURF_BIND,
+      .brw = BRW_NEW_BATCH | BRW_NEW_BINDING_TABLE,
+      .cache = 0,
    },
-   .prepare = prepare_binding_table_pointers,
    .emit = upload_binding_table_pointers,
 };
 
@@ -154,21 +149,22 @@ static void upload_gen6_binding_table_pointers(struct brw_context *brw)
             GEN6_BINDING_TABLE_MODIFY_PS |
             (4 - 2));
    if (brw->vs.bind_bo != NULL)
-      OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
+      OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0,
+               brw->vs.bind_bo_offset); /* vs */
    else
       OUT_BATCH(0);
    OUT_BATCH(0); /* gs */
-   OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */
+   OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0,
+            brw->wm.bind_bo_offset); /* wm/ps */
    ADVANCE_BATCH();
 }
 
 const struct brw_tracked_state gen6_binding_table_pointers = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_BATCH,
-      .cache = CACHE_NEW_SURF_BIND,
+      .brw = BRW_NEW_BATCH | BRW_NEW_BINDING_TABLE,
+      .cache = 0,
    },
-   .prepare = prepare_binding_table_pointers,
    .emit = upload_gen6_binding_table_pointers,
 };
 
index 364be9411712361bca8a76729a18a6a79dd00265..68fd7d4f807f95d182f43f50be59b8a32fd82bf2 100644 (file)
@@ -81,6 +81,7 @@ const struct brw_tracked_state brw_wm_prog;
 const struct brw_tracked_state brw_wm_samplers;
 const struct brw_tracked_state brw_wm_constant_surface;
 const struct brw_tracked_state brw_wm_surfaces;
+const struct brw_tracked_state brw_wm_binding_table;
 const struct brw_tracked_state brw_wm_unit;
 
 const struct brw_tracked_state brw_psp_urb_cbs;
index f1d89484fcef33ccf6907afc330dd20fa582f5a2..45f1088f4aad9f29c312086267e890b934c4fd1d 100644 (file)
@@ -410,7 +410,6 @@ brw_init_surface_cache(struct brw_context *brw)
       calloc(1, cache->size * sizeof(struct brw_cache_item));
 
    brw_init_cache_id(cache, "SS_SURFACE", BRW_SS_SURFACE);
-   brw_init_cache_id(cache, "SS_SURF_BIND", BRW_SS_SURF_BIND);
 }
 
 
index 08535bb59cc5b857b28052c9d9b034692e22b3c0..e345dbcf5b7cd88c2936cbaa15fec4387869a3a8 100644 (file)
@@ -69,6 +69,7 @@ static const struct brw_tracked_state *gen4_atoms[] =
    &brw_vs_surfaces,           /* must do before unit */
    &brw_wm_constant_surface,   /* must do before wm surfaces/bind bo */
    &brw_wm_surfaces,           /* must do before samplers and unit */
+   &brw_wm_binding_table,
    &brw_wm_samplers,
 
    &brw_wm_unit,
@@ -268,6 +269,8 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_CONTEXT),
    DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS),
    DEFINE_BIT(BRW_NEW_PSP),
+   DEFINE_BIT(BRW_NEW_WM_SURFACES),
+   DEFINE_BIT(BRW_NEW_BINDING_TABLE),
    DEFINE_BIT(BRW_NEW_INDICES),
    DEFINE_BIT(BRW_NEW_INDEX_BUFFER),
    DEFINE_BIT(BRW_NEW_VERTICES),
@@ -295,7 +298,6 @@ static struct dirty_bit_map cache_bits[] = {
    DEFINE_BIT(CACHE_NEW_CLIP_UNIT),
    DEFINE_BIT(CACHE_NEW_CLIP_PROG),
    DEFINE_BIT(CACHE_NEW_SURFACE),
-   DEFINE_BIT(CACHE_NEW_SURF_BIND),
    {0, 0, 0}
 };
 
index 26164e907f4fbf351e17a62386007ebaa1a36b21..d946756af70575a8dbd834a3dd335a3d9666095c 100644 (file)
@@ -151,49 +151,29 @@ brw_update_vs_constant_surface( GLcontext *ctx,
 }
 
 
-/**
- * Constructs the binding table for the VS surface state.
- */
-static drm_intel_bo *
-brw_vs_get_binding_table(struct brw_context *brw)
+static void
+prepare_vs_surfaces(struct brw_context *brw)
 {
-   drm_intel_bo *bind_bo;
-
-   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
-                             NULL, 0,
-                             brw->vs.surf_bo, BRW_VS_MAX_SURF,
-                             NULL);
-
-   if (bind_bo == NULL) {
-      GLuint data_size = BRW_VS_MAX_SURF * sizeof(GLuint);
-      uint32_t data[BRW_VS_MAX_SURF];
-      int i;
-
-      for (i = 0; i < BRW_VS_MAX_SURF; i++)
-         if (brw->vs.surf_bo[i])
-            data[i] = brw->vs.surf_bo[i]->offset;
-         else
-            data[i] = 0;
-
-      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
-                                 NULL, 0,
-                                 brw->vs.surf_bo, BRW_VS_MAX_SURF,
-                                 data, data_size);
-
-      /* Emit binding table relocations to surface state */
-      for (i = 0; i < BRW_VS_MAX_SURF; i++) {
-        if (brw->vs.surf_bo[i] != NULL) {
-           /* The presumed offsets were set in the data values for
-            * brw_upload_cache.
-            */
-           drm_intel_bo_emit_reloc(bind_bo, i * 4,
-                                   brw->vs.surf_bo[i], 0,
-                                   I915_GEM_DOMAIN_INSTRUCTION, 0);
-        }
+   GLcontext *ctx = &brw->intel.ctx;
+   int i;
+   int nr_surfaces = 0;
+
+   brw_update_vs_constant_surface(ctx, SURF_INDEX_VERT_CONST_BUFFER);
+
+   for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+      if (brw->vs.surf_bo[i] != NULL) {
+        nr_surfaces = i + 1;
       }
    }
 
-   return bind_bo;
+   if (brw->vs.nr_surfaces != nr_surfaces) {
+      brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
+      brw->vs.nr_surfaces = nr_surfaces;
+   }
+
+   for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+      brw_add_validated_bo(brw, brw->vs.surf_bo[i]);
+   }
 }
 
 /**
@@ -203,43 +183,51 @@ brw_vs_get_binding_table(struct brw_context *brw)
  * to be updated, and produces BRW_NEW_NR_VS_SURFACES for the VS unit and
  * CACHE_NEW_SURF_BIND for the binding table upload.
  */
-static void prepare_vs_surfaces(struct brw_context *brw )
+static void upload_vs_surfaces(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
+   uint32_t *bind;
    int i;
-   int nr_surfaces = 0;
-
-   brw_update_vs_constant_surface(ctx, SURF_INDEX_VERT_CONST_BUFFER);
 
-   for (i = 0; i < BRW_VS_MAX_SURF; i++) {
-      if (brw->vs.surf_bo[i] != NULL) {
-        nr_surfaces = i + 1;
+   /* BRW_NEW_NR_VS_SURFACES */
+   if (brw->vs.nr_surfaces == 0) {
+      if (brw->vs.bind_bo) {
+        drm_intel_bo_unreference(brw->vs.bind_bo);
+        brw->vs.bind_bo = NULL;
+        brw->state.dirty.brw |= BRW_NEW_BINDING_TABLE;
       }
+      return;
    }
 
-   if (brw->vs.nr_surfaces != nr_surfaces) {
-      brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
-      brw->vs.nr_surfaces = nr_surfaces;
-   }
-
-   /* Note that we don't end up updating the bind_bo if we don't have a
-    * surface to be pointing at.  This should be relatively harmless, as it
-    * just slightly increases our working set size.
+   /* Might want to calculate nr_surfaces first, to avoid taking up so much
+    * space for the binding table. (once we have vs samplers)
     */
-   if (brw->vs.nr_surfaces != 0) {
-      drm_intel_bo_unreference(brw->vs.bind_bo);
-      brw->vs.bind_bo = brw_vs_get_binding_table(brw);
+   bind = brw_state_batch(brw, sizeof(uint32_t) * BRW_VS_MAX_SURF,
+                         32, &brw->vs.bind_bo, &brw->vs.bind_bo_offset);
+
+   for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+      /* BRW_NEW_VS_CONSTBUF */
+      if (brw->vs.surf_bo[i]) {
+        drm_intel_bo_emit_reloc(brw->vs.bind_bo,
+                                brw->vs.bind_bo_offset + i * sizeof(uint32_t),
+                                brw->vs.surf_bo[i], 0,
+                                I915_GEM_DOMAIN_INSTRUCTION, 0);
+        bind[i] = brw->vs.surf_bo[i]->offset;
+      } else {
+        bind[i] = 0;
+      }
    }
+
+   brw->state.dirty.brw |= BRW_NEW_BINDING_TABLE;
 }
 
 const struct brw_tracked_state brw_vs_surfaces = {
    .dirty = {
       .mesa = 0,
-      .brw = (BRW_NEW_VS_CONSTBUF),
+      .brw = (BRW_NEW_VS_CONSTBUF |
+             BRW_NEW_NR_VS_SURFACES |
+             BRW_NEW_BATCH),
       .cache = 0
    },
    .prepare = prepare_vs_surfaces,
+   .emit = upload_vs_surfaces,
 };
-
-
-
index 2b216fddbb5369885116af1f24f1b35addff4012..ba6a6258f573ad3174a75287c3f0ce0831b3c830 100644 (file)
@@ -639,57 +639,10 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
    }
 }
 
-
-/**
- * Constructs the binding table for the WM surface state, which maps unit
- * numbers to surface state objects.
- */
-static drm_intel_bo *
-brw_wm_get_binding_table(struct brw_context *brw)
-{
-   drm_intel_bo *bind_bo;
-
-   assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF);
-
-   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
-                             NULL, 0,
-                             brw->wm.surf_bo, brw->wm.nr_surfaces,
-                             NULL);
-
-   if (bind_bo == NULL) {
-      GLuint data_size = brw->wm.nr_surfaces * sizeof(GLuint);
-      uint32_t data[BRW_WM_MAX_SURF];
-      int i;
-
-      for (i = 0; i < brw->wm.nr_surfaces; i++)
-         if (brw->wm.surf_bo[i])
-            data[i] = brw->wm.surf_bo[i]->offset;
-         else
-            data[i] = 0;
-
-      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
-                                 NULL, 0,
-                                 brw->wm.surf_bo, brw->wm.nr_surfaces,
-                                 data, data_size);
-
-      /* Emit binding table relocations to surface state */
-      for (i = 0; i < BRW_WM_MAX_SURF; i++) {
-        if (brw->wm.surf_bo[i] != NULL) {
-           drm_intel_bo_emit_reloc(bind_bo, i * sizeof(GLuint),
-                                   brw->wm.surf_bo[i], 0,
-                                   I915_GEM_DOMAIN_INSTRUCTION, 0);
-        }
-      }
-   }
-
-   return bind_bo;
-}
-
 static void prepare_wm_surfaces(struct brw_context *brw )
 {
    GLcontext *ctx = &brw->intel.ctx;
    GLuint i;
-   int old_nr_surfaces;
 
    /* _NEW_BUFFERS | _NEW_COLOR */
    /* Update surfaces for drawing buffers */
@@ -703,32 +656,21 @@ static void prepare_wm_surfaces(struct brw_context *brw )
       brw_update_renderbuffer_surface(brw, NULL, 0);
    }
 
-   old_nr_surfaces = brw->wm.nr_surfaces;
-   brw->wm.nr_surfaces = BRW_MAX_DRAW_BUFFERS;
-
-   if (brw->wm.surf_bo[SURF_INDEX_FRAG_CONST_BUFFER] != NULL)
-       brw->wm.nr_surfaces = SURF_INDEX_FRAG_CONST_BUFFER + 1;
-
    /* Update surfaces for textures */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
       const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
       const GLuint surf = SURF_INDEX_TEXTURE(i);
 
-      /* _NEW_TEXTURE, BRW_NEW_TEXDATA */
+      /* _NEW_TEXTURE */
       if (texUnit->_ReallyEnabled) {
         brw_update_texture_surface(ctx, i);
-        brw->wm.nr_surfaces = surf + 1;
       } else {
          drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
          brw->wm.surf_bo[surf] = NULL;
       }
    }
 
-   drm_intel_bo_unreference(brw->wm.bind_bo);
-   brw->wm.bind_bo = brw_wm_get_binding_table(brw);
-
-   if (brw->wm.nr_surfaces != old_nr_surfaces)
-      brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
+   brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
 }
 
 const struct brw_tracked_state brw_wm_surfaces = {
@@ -736,12 +678,69 @@ const struct brw_tracked_state brw_wm_surfaces = {
       .mesa = (_NEW_COLOR |
                _NEW_TEXTURE |
                _NEW_BUFFERS),
-      .brw = (BRW_NEW_CONTEXT |
-             BRW_NEW_WM_SURFACES),
+      .brw = (BRW_NEW_CONTEXT),
       .cache = 0
    },
    .prepare = prepare_wm_surfaces,
 };
 
+static void
+brw_wm_prepare_binding_table(struct brw_context *brw)
+{
+   int i;
 
+   for (i = 0; i < BRW_WM_MAX_SURF; i++) {
+      if (brw->wm.surf_bo[i]) {
+        brw_add_validated_bo(brw, brw->wm.surf_bo[i]);
+      }
+   }
+}
 
+/**
+ * Constructs the binding table for the WM surface state, which maps unit
+ * numbers to surface state objects.
+ */
+static void
+brw_wm_upload_binding_table(struct brw_context *brw)
+{
+   uint32_t *bind;
+   int i, nr_surfaces = 0;
+
+   /* Might want to calculate nr_surfaces first, to avoid taking up so much
+    * space for the binding table.
+    */
+   bind = brw_state_batch(brw, sizeof(uint32_t) * BRW_WM_MAX_SURF,
+                         32, &brw->wm.bind_bo, &brw->wm.bind_bo_offset);
+
+   for (i = 0; i < BRW_WM_MAX_SURF; i++) {
+      /* BRW_NEW_WM_SURFACES */
+      if (brw->wm.surf_bo[i]) {
+        drm_intel_bo_emit_reloc(brw->wm.bind_bo,
+                                brw->wm.bind_bo_offset + i * sizeof(uint32_t),
+                                brw->wm.surf_bo[i], 0,
+                                I915_GEM_DOMAIN_INSTRUCTION, 0);
+        bind[i] = brw->wm.surf_bo[i]->offset;
+        nr_surfaces = i + 1;
+      } else {
+        bind[i] = 0;
+      }
+   }
+
+   if (brw->wm.nr_surfaces != nr_surfaces) {
+      brw->wm.nr_surfaces = nr_surfaces;
+      brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
+   }
+
+   brw->state.dirty.brw |= BRW_NEW_BINDING_TABLE;
+}
+
+const struct brw_tracked_state brw_wm_binding_table = {
+   .dirty = {
+      .mesa = 0,
+      .brw = (BRW_NEW_BATCH |
+             BRW_NEW_WM_SURFACES),
+      .cache = 0
+   },
+   .prepare = brw_wm_prepare_binding_table,
+   .emit = brw_wm_upload_binding_table,
+};