i965: first attempt at handling URB overflow when there's too many vs outputs
authorBrian Paul <brianp@vmware.com>
Tue, 30 Jun 2009 23:12:34 +0000 (17:12 -0600)
committerBrian Paul <brianp@vmware.com>
Tue, 30 Jun 2009 23:12:44 +0000 (17:12 -0600)
If we can't fit all the VS outputs into the MRF, we need to overflow into
temporary GRF registers, then use some MOVs and a second brw_urb_WRITE()
instruction to place the overflow vertex results into the URB.

This is hit when a vertex/fragment shader pair has a large number of varying
variables (12 or more).

There's still something broken here, but it seems close...

src/mesa/drivers/dri/i965/brw_vs.h
src/mesa/drivers/dri/i965/brw_vs_emit.c

index 1e4f66091e34635fe921cf900bf7453fa36488ff..4a591365c986b29404c2a4913e303976f4f761d0 100644 (file)
@@ -58,6 +58,7 @@ struct brw_vs_compile {
 
    GLuint first_output;
    GLuint nr_outputs;
+   GLuint first_overflow_output; /**< VERT_ATTRIB_x */
 
    GLuint first_tmp;
    GLuint last_tmp;
index 01364232a42f364dcf5e4191ca37af35c201a9a6..9467295d345a829e564757afd97c7e14715adf41 100644 (file)
@@ -133,6 +133,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     */
    c->nr_outputs = 0;
    c->first_output = reg;
+   c->first_overflow_output = 0;
    mrf = 4;
    for (i = 0; i < VERT_RESULT_MAX; i++) {
       if (c->prog_data.outputs_written & (1 << i)) {
@@ -148,8 +149,17 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
            mrf++;              /* just a placeholder?  XXX fix later stages & remove this */
         }
         else {
-           c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
-           mrf++;
+            if (mrf < 16) {
+               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
+               mrf++;
+            }
+            else {
+               /* too many vertex results to fit in MRF, use GRF for overflow */
+               if (!c->first_overflow_output)
+                  c->first_overflow_output = i;
+               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
+               reg++;
+            }
         }
       }
    }     
@@ -1067,6 +1077,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    struct brw_reg m0 = brw_message_reg(0);
    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
    struct brw_reg ndc;
+   int eot;
 
    if (c->key.copy_edgeflag) {
       brw_MOV(p, 
@@ -1145,18 +1156,51 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    brw_MOV(p, offset(m0, 2), ndc);
    brw_MOV(p, offset(m0, 3), pos);
 
+   eot = (c->first_overflow_output == 0);
+
    brw_urb_WRITE(p, 
                 brw_null_reg(), /* dest */
                 0,             /* starting mrf reg nr */
                 c->r0,         /* src */
                 0,             /* allocate */
                 1,             /* used */
-                c->nr_outputs + 3, /* msg len */
+                MIN2(c->nr_outputs + 3, (BRW_MAX_MRF-1)), /* msg len */
                 0,             /* response len */
-                1,             /* eot */
+                eot,           /* eot */
                 1,             /* writes complete */
                 0,             /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
+
+   if (c->first_overflow_output > 0) {
+      /* Not all of the vertex outputs/results fit into the MRF.
+       * Move the overflowed attributes from the GRF to the MRF and
+       * issue another brw_urb_WRITE().
+       */
+      /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
+       * at mrf[4] atm...
+       */
+      GLuint i, mrf = 0;
+      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
+         if (c->prog_data.outputs_written & (1 << i)) {
+            /* move from GRF to MRF */
+            brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
+            mrf++;
+         }
+      }
+
+      brw_urb_WRITE(p,
+                    brw_null_reg(), /* dest */
+                    4,              /* starting mrf reg nr */
+                    c->r0,          /* src */
+                    0,              /* allocate */
+                    1,              /* used */
+                    mrf+1,          /* msg len */
+                    0,              /* response len */
+                    1,              /* eot */
+                    1,              /* writes complete */
+                    BRW_MAX_MRF-1,  /* urb destination offset */
+                    BRW_URB_SWIZZLE_INTERLEAVE);
+   }
 }