i915: Emit a single relocation per vbo
authorChris Wilson <chris@chris-wilson.co.uk>
Thu, 25 Nov 2010 15:41:37 +0000 (15:41 +0000)
committerChris Wilson <chris@chris-wilson.co.uk>
Mon, 21 Feb 2011 13:04:46 +0000 (13:04 +0000)
Reducing the number of relocations has lots of nice knock-on effects,
not least including reducing batch buffer size, auxilliary array sizes
(vmalloced and copied into the kernel), processing of uncached
relocations etc.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
src/mesa/drivers/dri/i915/i915_context.h
src/mesa/drivers/dri/i915/i915_fragprog.c
src/mesa/drivers/dri/i915/i915_reg.h
src/mesa/drivers/dri/i915/i915_vtbl.c
src/mesa/drivers/dri/i915/intel_tris.c

index e38281ee03493bd71f41b7e2e69ea33571363c2b..601620275f46d0d19ad0ccd8ba7bcb098a3aa5d3 100644 (file)
@@ -29,7 +29,6 @@
 #define I915CONTEXT_INC
 
 #include "intel_context.h"
-#include "i915_reg.h"
 
 #define I915_FALLBACK_TEXTURE           0x1000
 #define I915_FALLBACK_COLORMASK                 0x2000
@@ -126,6 +125,12 @@ enum {
 #define I915_MAX_CONSTANT      32
 #define I915_CONSTANT_SIZE     (2+(4*I915_MAX_CONSTANT))
 
+#define I915_MAX_TEX_INDIRECT 4
+#define I915_MAX_TEX_INSN     32
+#define I915_MAX_ALU_INSN     64
+#define I915_MAX_DECL_INSN    27
+#define I915_MAX_TEMPORARY    16
+
 #define I915_MAX_INSN          (I915_MAX_DECL_INSN + \
                                I915_MAX_TEX_INSN + \
                                I915_MAX_ALU_INSN)
@@ -264,6 +269,9 @@ struct i915_context
 
    struct i915_fragment_program *current_program;
 
+   drm_intel_bo *current_vb_bo;
+   unsigned int current_vertex_size;
+
    struct i915_hw_state state;
    uint32_t last_draw_offset;
    GLuint last_sampler;
index 8bc88a8f4416d5c44a3406a8344134fa4521058c..25f4fc3c8b1da7ddf7e871695df8c2ef846c0995 100644 (file)
@@ -1422,6 +1422,10 @@ i915ValidateFragmentProgram(struct i915_context *i915)
                                               intel->vertex_attr_count,
                                               intel->ViewportMatrix.m, 0);
 
+      assert(intel->prim.current_offset == intel->prim.start_offset);
+      intel->prim.start_offset = (intel->prim.current_offset + intel->vertex_size-1) / intel->vertex_size * intel->vertex_size;
+      intel->prim.current_offset = intel->prim.start_offset;
+
       intel->vertex_size >>= 2;
 
       i915->state.Ctx[I915_CTXREG_LIS2] = s2;
index 7f31ff674f24c478357b38ba807e1de1027d3a79..766547a4c6a0eeaac2a607ad1afa2f596cd3defc 100644 (file)
 /* p222 */
 
 
-#define I915_MAX_TEX_INDIRECT 4
-#define I915_MAX_TEX_INSN     32
-#define I915_MAX_ALU_INSN     64
-#define I915_MAX_DECL_INSN    27
-#define I915_MAX_TEMPORARY    16
-
-
 /* Each instruction is 3 dwords long, though most don't require all
  * this space.  Maximum of 123 instructions.  Smaller maxes per insn
  * type.
index 4049c37fdb1a15e6672d838d120f7ddc4068bd0b..921183b81dff7db3ef8520fe716a2036fc9aad72 100644 (file)
@@ -678,6 +678,9 @@ i915_new_batch(struct intel_context *intel)
    i915->state.emitted = 0;
    i915->last_draw_offset = 0;
    i915->last_sampler = 0;
+
+   i915->current_vb_bo = NULL;
+   i915->current_vertex_size = 0;
 }
 
 static void 
index c6b5a01885fdcdd300e2cc2d632fff13040f2ab9..cf9291cdfca14775704d3e3c3db9ac15c7662be4 100644 (file)
@@ -54,6 +54,7 @@
 #include "intel_span.h"
 #include "i830_context.h"
 #include "i830_reg.h"
+#include "i915_context.h"
 
 static void intelRenderPrimitive(struct gl_context * ctx, GLenum prim);
 static void intelRasterPrimitive(struct gl_context * ctx, GLenum rprim,
@@ -215,7 +216,7 @@ void intel_flush_prim(struct intel_context *intel)
    offset = intel->prim.start_offset;
    intel->prim.start_offset = intel->prim.current_offset;
    if (intel->gen < 3)
-      intel->prim.start_offset = ALIGN(intel->prim.start_offset, 128);
+      intel->prim.current_offset = intel->prim.start_offset = ALIGN(intel->prim.start_offset, 128);
    intel->prim.flush = NULL;
 
    intel->vtbl.emit_state(intel);
@@ -240,20 +241,39 @@ void intel_flush_prim(struct intel_context *intel)
 #endif
 
    if (intel->gen >= 3) {
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
-               I1_LOAD_S(0) | I1_LOAD_S(1) | 1);
-      assert((offset & ~S0_VB_OFFSET_MASK) == 0);
-      OUT_RELOC(vb_bo, I915_GEM_DOMAIN_VERTEX, 0, offset);
-      OUT_BATCH((intel->vertex_size << S1_VERTEX_WIDTH_SHIFT) |
-               (intel->vertex_size << S1_VERTEX_PITCH_SHIFT));
+      struct i915_context *i915 = i915_context(&intel->ctx);
+      unsigned int cmd = 0, len = 0;
+
+      if (vb_bo != i915->current_vb_bo) {
+        cmd |= I1_LOAD_S(0);
+        len++;
+      }
 
+      if (intel->vertex_size != i915->current_vertex_size) {
+        cmd |= I1_LOAD_S(1);
+        len++;
+      }
+      if (len)
+        len++;
+
+      BEGIN_BATCH(2+len);
+      if (cmd)
+        OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | cmd | (len - 2));
+      if (vb_bo != i915->current_vb_bo) {
+        OUT_RELOC(vb_bo, I915_GEM_DOMAIN_VERTEX, 0, 0);
+        i915->current_vb_bo = vb_bo;
+      }
+      if (intel->vertex_size != i915->current_vertex_size) {
+        OUT_BATCH((intel->vertex_size << S1_VERTEX_WIDTH_SHIFT) |
+                  (intel->vertex_size << S1_VERTEX_PITCH_SHIFT));
+        i915->current_vertex_size = intel->vertex_size;
+      }
       OUT_BATCH(_3DPRIMITIVE |
                PRIM_INDIRECT |
                PRIM_INDIRECT_SEQUENTIAL |
                intel->prim.primitive |
                count);
-      OUT_BATCH(0); /* Beginning vertex index */
+      OUT_BATCH(offset / (intel->vertex_size * 4));
       ADVANCE_BATCH();
    } else {
       struct i830_context *i830 = i830_context(&intel->ctx);