From 3f2283172bcaf3db00a99baad0319bc7e0be5fc2 Mon Sep 17 00:00:00 2001
From: Paul Berry
Date: Tue, 29 Nov 2011 14:54:02 -0800
Subject: [PATCH] i965 gs: Clean up dodgy register re-use, at the cost of a few MOVs.

Prior to this patch, in the Gen4 and Gen5 GS, we used GRF 0 (called "R0"
in the code) as a staging area to prepare the message header for the
FF_SYNC and URB_WRITE messages.  This cleverly avoided an unnecessary
MOV (since the initial value of GRF 0 contains data that needs to be
included in the message header), but it made the code confusing, since
GRF 0 could no longer be relied upon to contain its initial value once
the GS started preparing its first message.  This patch avoids the
confusion by using a separate register ("header") as the staging area,
at the cost of one MOV instruction.

Worse yet, prior to this patch, the GS would completely overwrite the
contents of GRF 0 with the writeback data it received from a completed
FF_SYNC or URB_WRITE message.  It did this because DWORD 0 of the
writeback data contains the new URB handle, which needs to be included
in DWORD 0 of the next URB_WRITE message header.  However, doing so
corrupted the rest of the message header with either undefined data or
zeros.  Astonishingly, this did not produce any known failures (probably
by dumb luck), but it was really dodgy; corrupting FFTID in particular
seems likely to cause GPU hangs.  This patch avoids the corruption by
storing the writeback data in a temporary register and then copying just
DWORD 0 of it to the header for the next message.  This costs one extra
MOV instruction per message sent, except for the final message.

Also, this patch moves the logic that overrides DWORD 2 of the header
(which contains PrimType, PrimStart, PrimEnd, and some other data we
don't care about yet) out of brw_gs_emit_vue() and into a new function,
brw_gs_overwrite_header_dw2().  This saves one MOV instruction in
brw_gs_quads() and brw_gs_quad_strip(), and paves the way for the Gen6
GS, which will need more complex logic to override DWORD 2 of the
header.

Finally, the function brw_gs_alloc_regs() contained a benign bug: it
neglected to increment the register counter when allocating space for
the "temp" register.  This had no effect in practice because the temp
register wasn't used on Gen4 and Gen5, the only hardware models (so far)
to require a GS program.  Now all the registers allocated by
brw_gs_alloc_regs() are actually used and properly accounted for.
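
To illustrate the new flow, brw_gs_lines() now emits roughly the following
sequence of calls (a sketch assuming intel->needs_ff_sync is set; all names
come from this patch):

   brw_gs_alloc_regs(c, 2);
   brw_gs_initialize_header(c);   /* header <- R0 (handle info, FFTID, debug data) */

   brw_gs_ff_sync(c, 1);          /* writeback lands in temp; URB handle copied
                                   * into header DWORD 0 */

   brw_gs_overwrite_header_dw2(c, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START);
   brw_gs_emit_vue(c, c->reg.vertex[0], 0);   /* URB_WRITE; new handle -> header DWORD 0 */

   brw_gs_overwrite_header_dw2(c, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END);
   brw_gs_emit_vue(c, c->reg.vertex[1], 1);   /* last vertex: EOT, no new URB entry */
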
Reviewed-by: Kenneth Graunke
---
 src/mesa/drivers/dri/i965/brw_gs.h      |   1 +
 src/mesa/drivers/dri/i965/brw_gs_emit.c | 175 +++++++++++++++---------
 2 files changed, 111 insertions(+), 65 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_gs.h b/src/mesa/drivers/dri/i965/brw_gs.h
index 12889a62e6b..93448a77f08 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.h
+++ b/src/mesa/drivers/dri/i965/brw_gs.h
@@ -60,6 +60,7 @@ struct brw_gs_compile {
    struct {
       struct brw_reg R0;
       struct brw_reg vertex[MAX_GS_VERTS];
+      struct brw_reg header;
       struct brw_reg temp;
    } reg;
 
diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c b/src/mesa/drivers/dri/i965/brw_gs_emit.c
index e9875cdbf11..6d39df195f6 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c
@@ -58,35 +58,66 @@ static void brw_gs_alloc_regs( struct brw_gs_compile *c,
       i += c->nr_regs;
    }
 
-   c->reg.temp = brw_vec8_grf(i, 0);
+   c->reg.header = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
+   c->reg.temp = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
 
    c->prog_data.urb_read_length = c->nr_regs;
    c->prog_data.total_grf = i;
 }
 
+/**
+ * Set up the initial value of the c->reg.header register based on c->reg.R0.
+ *
+ * The following information is passed to the GS thread in R0, and needs to be
+ * included in the first URB_WRITE or FF_SYNC message sent by the GS:
+ *
+ * - DWORD 0 [31:0] handle info (Gen4 only)
+ * - DWORD 5 [7:0] FFTID
+ * - DWORD 6 [31:0] Debug info
+ * - DWORD 7 [31:0] Debug info
+ *
+ * This function sets up the above data by copying the contents of R0 to the
+ * header register.
+ */
+static void brw_gs_initialize_header(struct brw_gs_compile *c)
+{
+   struct brw_compile *p = &c->func;
+   brw_MOV(p, c->reg.header, c->reg.R0);
+}
+
+/**
+ * Overwrite DWORD 2 of c->reg.header with the given immediate unsigned value.
+ *
+ * In URB_WRITE messages, DWORD 2 contains the fields PrimType, PrimStart,
+ * PrimEnd, Increment CL_INVOCATIONS, and SONumPrimsWritten, many of which we
+ * need to be able to update on a per-vertex basis.
+ */
+static void brw_gs_overwrite_header_dw2(struct brw_gs_compile *c,
+                                        unsigned dw2)
+{
+   struct brw_compile *p = &c->func;
+   brw_MOV(p, get_element_ud(c->reg.header, 2), brw_imm_ud(dw2));
+}
+
+/**
+ * Emit a vertex using the URB_WRITE message.  Use the contents of
+ * c->reg.header for the message header, and the registers starting at \c vert
+ * for the vertex data.
+ *
+ * If \c last is true, then this is the last vertex, so no further URB space
+ * should be allocated, and this message should end the thread.
+ *
+ * If \c last is false, then a new URB entry will be allocated, and its handle
+ * will be stored in DWORD 0 of c->reg.header for use in the next URB_WRITE
+ * message.
+ */
 static void brw_gs_emit_vue(struct brw_gs_compile *c,
                             struct brw_reg vert,
-                            bool last,
-                            GLuint header)
+                            bool last)
 {
    struct brw_compile *p = &c->func;
-   struct intel_context *intel = &c->func.brw->intel;
    bool allocate = !last;
-   struct brw_reg temp;
-
-   if (intel->gen < 6)
-      temp = c->reg.R0;
-   else {
-      temp = c->reg.temp;
-      brw_MOV(p, retype(temp, BRW_REGISTER_TYPE_UD),
-              retype(c->reg.R0, BRW_REGISTER_TYPE_UD));
-   }
-
-   /* Overwrite PrimType and PrimStart in the message header, for
-    * each vertex in turn:
-    */
-   brw_MOV(p, get_element_ud(temp, 2), brw_imm_ud(header));
 
    /* Copy the vertex from vertn into m1..mN+1:
     */
@@ -99,9 +130,10 @@ static void brw_gs_emit_vue(struct brw_gs_compile *c,
     * allocated each time.
     */
    brw_urb_WRITE(p,
-                 allocate ? temp : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
+                 allocate ? c->reg.temp
+                 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
                  0,
-                 temp,
+                 c->reg.header,
                  allocate,
                  1,              /* used */
                  c->nr_regs + 1, /* msg length */
@@ -111,38 +143,37 @@ static void brw_gs_emit_vue(struct brw_gs_compile *c,
                  0,              /* urb offset */
                  BRW_URB_SWIZZLE_NONE);
 
-   if (intel->gen >= 6 && allocate)
-      brw_MOV(p, get_element_ud(c->reg.R0, 0), get_element_ud(temp, 0));
+   if (allocate) {
+      brw_MOV(p, get_element_ud(c->reg.header, 0),
+              get_element_ud(c->reg.temp, 0));
+   }
 }
 
+/**
+ * Send an FF_SYNC message to ensure that all previously spawned GS threads
+ * have finished sending primitives down the pipeline, and to allocate a URB
+ * entry for the first output vertex.  Only needed when intel->needs_ff_sync
+ * is true.
+ *
+ * This function modifies c->reg.header: in DWORD 1, it stores num_prim (which
+ * is needed by the FF_SYNC message), and in DWORD 0, it stores the handle to
+ * the allocated URB entry (which will be needed by the URB_WRITE message that
+ * follows).
+ */
 static void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim)
 {
    struct brw_compile *p = &c->func;
-   struct intel_context *intel = &c->func.brw->intel;
 
-   if (intel->gen < 6) {
-      brw_MOV(p, get_element_ud(c->reg.R0, 1), brw_imm_ud(num_prim));
-      brw_ff_sync(p,
-                  c->reg.R0,
-                  0,
-                  c->reg.R0,
-                  1, /* allocate */
-                  1, /* response length */
-                  0 /* eot */);
-   } else {
-      brw_MOV(p, retype(c->reg.temp, BRW_REGISTER_TYPE_UD),
-              retype(c->reg.R0, BRW_REGISTER_TYPE_UD));
-      brw_MOV(p, get_element_ud(c->reg.temp, 1), brw_imm_ud(num_prim));
-      brw_ff_sync(p,
-                  c->reg.temp,
-                  0,
-                  c->reg.temp,
-                  1, /* allocate */
-                  1, /* response length */
-                  0 /* eot */);
-      brw_MOV(p, get_element_ud(c->reg.R0, 0),
-              get_element_ud(c->reg.temp, 0));
-   }
+   brw_MOV(p, get_element_ud(c->reg.header, 1), brw_imm_ud(num_prim));
+   brw_ff_sync(p,
+               c->reg.temp,
+               0,
+               c->reg.header,
+               1, /* allocate */
+               1, /* response length */
+               0 /* eot */);
+   brw_MOV(p, get_element_ud(c->reg.header, 0),
+           get_element_ud(c->reg.temp, 0));
 }
 
@@ -151,23 +182,28 @@ void brw_gs_quads( struct brw_gs_compile *c, struct brw_gs_prog_key *key )
    struct intel_context *intel = &c->func.brw->intel;
 
    brw_gs_alloc_regs(c, 4);
-   
+   brw_gs_initialize_header(c);
    /* Use polygons for correct edgeflag behaviour. Note that vertex 3
    * is the PV for quads, but vertex 0 for polygons:
    */
    if (intel->needs_ff_sync)
       brw_gs_ff_sync(c, 1);
+   brw_gs_overwrite_header_dw2(c, (_3DPRIM_POLYGON << 2) | R02_PRIM_START);
    if (key->pv_first) {
-      brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
-      brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2));
-      brw_gs_emit_vue(c, c->reg.vertex[2], 0, (_3DPRIM_POLYGON << 2));
-      brw_gs_emit_vue(c, c->reg.vertex[3], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+      brw_gs_emit_vue(c, c->reg.vertex[0], 0);
+      brw_gs_overwrite_header_dw2(c, _3DPRIM_POLYGON << 2);
+      brw_gs_emit_vue(c, c->reg.vertex[1], 0);
+      brw_gs_emit_vue(c, c->reg.vertex[2], 0);
+      brw_gs_overwrite_header_dw2(c, (_3DPRIM_POLYGON << 2) | R02_PRIM_END);
+      brw_gs_emit_vue(c, c->reg.vertex[3], 1);
    }
    else {
-      brw_gs_emit_vue(c, c->reg.vertex[3], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
-      brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
-      brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2));
-      brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+      brw_gs_emit_vue(c, c->reg.vertex[3], 0);
+      brw_gs_overwrite_header_dw2(c, _3DPRIM_POLYGON << 2);
+      brw_gs_emit_vue(c, c->reg.vertex[0], 0);
+      brw_gs_emit_vue(c, c->reg.vertex[1], 0);
+      brw_gs_overwrite_header_dw2(c, (_3DPRIM_POLYGON << 2) | R02_PRIM_END);
+      brw_gs_emit_vue(c, c->reg.vertex[2], 1);
    }
 }
 
@@ -176,20 +212,26 @@ void brw_gs_quad_strip( struct brw_gs_compile *c, struct brw_gs_prog_key *key )
    struct intel_context *intel = &c->func.brw->intel;
 
    brw_gs_alloc_regs(c, 4);
+   brw_gs_initialize_header(c);
 
    if (intel->needs_ff_sync)
       brw_gs_ff_sync(c, 1);
+   brw_gs_overwrite_header_dw2(c, (_3DPRIM_POLYGON << 2) | R02_PRIM_START);
    if (key->pv_first) {
-      brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
-      brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2));
-      brw_gs_emit_vue(c, c->reg.vertex[2], 0, (_3DPRIM_POLYGON << 2));
-      brw_gs_emit_vue(c, c->reg.vertex[3], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+      brw_gs_emit_vue(c, c->reg.vertex[0], 0);
+      brw_gs_overwrite_header_dw2(c, _3DPRIM_POLYGON << 2);
+      brw_gs_emit_vue(c, c->reg.vertex[1], 0);
+      brw_gs_emit_vue(c, c->reg.vertex[2], 0);
+      brw_gs_overwrite_header_dw2(c, (_3DPRIM_POLYGON << 2) | R02_PRIM_END);
+      brw_gs_emit_vue(c, c->reg.vertex[3], 1);
    }
   else {
-      brw_gs_emit_vue(c, c->reg.vertex[2], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
-      brw_gs_emit_vue(c, c->reg.vertex[3], 0, (_3DPRIM_POLYGON << 2));
-      brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
-      brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+      brw_gs_emit_vue(c, c->reg.vertex[2], 0);
+      brw_gs_overwrite_header_dw2(c, _3DPRIM_POLYGON << 2);
+      brw_gs_emit_vue(c, c->reg.vertex[3], 0);
+      brw_gs_emit_vue(c, c->reg.vertex[0], 0);
+      brw_gs_overwrite_header_dw2(c, (_3DPRIM_POLYGON << 2) | R02_PRIM_END);
+      brw_gs_emit_vue(c, c->reg.vertex[1], 1);
    }
 }
 
@@ -198,9 +240,12 @@ void brw_gs_lines( struct brw_gs_compile *c )
    struct intel_context *intel = &c->func.brw->intel;
 
    brw_gs_alloc_regs(c, 2);
+   brw_gs_initialize_header(c);
 
    if (intel->needs_ff_sync)
      brw_gs_ff_sync(c, 1);
-   brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_START));
-   brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_END));
+   brw_gs_overwrite_header_dw2(c, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START);
+   brw_gs_emit_vue(c, c->reg.vertex[0], 0);
+   brw_gs_overwrite_header_dw2(c, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END);
+   brw_gs_emit_vue(c, c->reg.vertex[1], 1);
 }
-- 
2.30.2