From: Paul Berry <stereotype441@gmail.com>
Date: Wed, 7 Dec 2011 17:56:42 +0000 (-0800)
Subject: i965 gen6: Allocate URB space for GS
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=43e39b58c705714c01919e5b4b5566e82e803d58;p=mesa.git

i965 gen6: Allocate URB space for GS

When the GS is not in use, the entire URB space is available for the
VS.  When the GS is in use, we split the URB space 50/50.

The 50/50 split is probably not optimal--we'll probably want to tune this
for performance in a future patch.  For example, in most situations,
it's probably worth allocating more than 50% of the space to the VS,
since VS space is used for vertex caching.  But for now this is good
enough.

Based on previous work by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
---

diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index f97164991f7..faa02bf8a38 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -293,6 +293,7 @@ brwCreateContext(int api,
 	 brw->urb.max_vs_entries = 128; /* volume 2a (see 3DSTATE_URB) */
 	 brw->urb.max_gs_entries = 256;
       }
+      brw->urb.gen6_gs_previously_active = false;
    } else if (intel->gen == 5) {
       brw->urb.size = 1024;
       brw->max_vs_threads = 72;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 171f3eff8a0..70a45c77260 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -708,6 +708,11 @@ struct brw_context
       GLuint sf_start;
       GLuint cs_start;
       GLuint size; /* Hardware URB size, in KB. */
+
+      /* gen6: True if the most recently sent _3DSTATE_URB message allocated
+       * URB space for the GS.
+       */
+      bool gen6_gs_previously_active;
    } urb;
 
    
diff --git a/src/mesa/drivers/dri/i965/gen6_urb.c b/src/mesa/drivers/dri/i965/gen6_urb.c
index d045bf28ec2..2d69cbe0d39 100644
--- a/src/mesa/drivers/dri/i965/gen6_urb.c
+++ b/src/mesa/drivers/dri/i965/gen6_urb.c
@@ -31,35 +31,64 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
+/**
+ * When the GS is not in use, we assign the entire URB space to the VS.  When
+ * the GS is in use, we split the URB space evenly between the VS and the GS.
+ * This is not ideal, but it's simple.
+ *
+ *           URB size / 2                   URB size / 2
+ *   _____________-______________   _____________-______________
+ *  /                            \ /                            \
+ * +-------------------------------------------------------------+
+ * | Vertex Shader Entries        | Geometry Shader Entries      |
+ * +-------------------------------------------------------------+
+ *
+ * Sandybridge GT1 has 32kB of URB space, while GT2 has 64kB.
+ * (See the Sandybridge PRM, Volume 2, Part 1, Section 1.4.7: 3DSTATE_URB.)
+ */
 static void
 gen6_upload_urb( struct brw_context *brw )
 {
    struct intel_context *intel = &brw->intel;
-   int nr_vs_entries;
+   int nr_vs_entries, nr_gs_entries;
+   int total_urb_size = brw->urb.size * 1024; /* in bytes */
 
    /* CACHE_NEW_VS_PROG */
    brw->urb.vs_size = MAX2(brw->vs.prog_data->urb_entry_size, 1);
 
-   /* Calculate how many VS URB entries fit in the total URB size */
-   nr_vs_entries = (brw->urb.size * 1024) / (brw->urb.vs_size * 128);
+   /* We use the same VUE layout for VS outputs and GS outputs (as it's what
+    * the SF and Clipper expect), so we can simply make the GS URB entry size
+    * the same as for the VS.  This may technically be too large in cases
+    * where we have few vertex attributes and a lot of varyings, since the VS
+    * size is determined by the larger of the two.  For now, it's safe.
+    */
+   brw->urb.gs_size = brw->urb.vs_size;
+
+   /* Calculate how many entries fit in each stage's section of the URB */
+   if (brw->gs.prog_active) {
+      nr_vs_entries = (total_urb_size/2) / (brw->urb.vs_size * 128);
+      nr_gs_entries = (total_urb_size/2) / (brw->urb.gs_size * 128);
+   } else {
+      nr_vs_entries = total_urb_size / (brw->urb.vs_size * 128);
+      nr_gs_entries = 0;
+   }
 
+   /* Then clamp to the maximum allowed by the hardware */
    if (nr_vs_entries > brw->urb.max_vs_entries)
       nr_vs_entries = brw->urb.max_vs_entries;
 
-   /* According to volume 2a, nr_vs_entries must be a multiple of 4. */
-   brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
+   if (nr_gs_entries > brw->urb.max_gs_entries)
+      nr_gs_entries = brw->urb.max_gs_entries;
 
-   /* Since we currently don't support Geometry Shaders, we always put the
-    * GS unit in passthrough mode and don't allocate it any URB space.
-    */
-   brw->urb.nr_gs_entries = 0;
-   brw->urb.gs_size = 1; /* Incorrect, but with 0 GS entries it doesn't matter. */
+   /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
+   brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
+   brw->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);
 
    assert(brw->urb.nr_vs_entries >= 24);
    assert(brw->urb.nr_vs_entries % 4 == 0);
    assert(brw->urb.nr_gs_entries % 4 == 0);
-   /* GS requirement */
-   assert(!brw->gs.prog_active || brw->urb.vs_size < 5);
+   assert(brw->urb.vs_size < 5);
+   assert(brw->urb.gs_size < 5);
 
    BEGIN_BATCH(3);
    OUT_BATCH(_3DSTATE_URB << 16 | (3 - 2));
@@ -68,6 +97,22 @@ gen6_upload_urb( struct brw_context *brw )
    OUT_BATCH(((brw->urb.gs_size - 1) << GEN6_URB_GS_SIZE_SHIFT) |
 	     ((brw->urb.nr_gs_entries) << GEN6_URB_GS_ENTRIES_SHIFT));
    ADVANCE_BATCH();
+
+   /* From the PRM Volume 2 part 1, section 1.4.7:
+    *
+    *   Because of a urb corruption caused by allocating a previous gsunit's
+    *   urb entry to vsunit software is required to send a "GS NULL
+    *   Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
+    *   a dummy DRAW call before any case where VS will be taking over GS URB
+    *   space.
+    *
+    * It is not clear exactly what this means ("URB fence" is a command that
+    * doesn't exist on Gen6).  So for now we just do a full pipeline flush as
+    * a workaround.
+    */
+   if (brw->urb.gen6_gs_previously_active && !brw->gs.prog_active)
+      intel_batchbuffer_emit_mi_flush(intel);
+   brw->urb.gen6_gs_previously_active = brw->gs.prog_active;
 }
 
 const struct brw_tracked_state gen6_urb = {