From b4824520ecf453cd8de90e57e839cb11a698d9c0 Mon Sep 17 00:00:00 2001
From: Jonathan Adamczewski <jadamcze@utas.edu.au>
Date: Thu, 21 May 2009 08:18:03 -0600
Subject: [PATCH] cell: unroll inner loop of spu_render.c:cmd_render()

It was taking approximately 50 cycles to extract the vertex indices,
calculate the vertex_header pointers and call tri_draw() for each
three vertices - .

Unrolled, it takes less than 100 cycles to extract, unpack,
calculate pointers and call tri_draw() eight times.  It does have a
nasty jump-tabled switch.  I'm sure that there's a better way...

Code size of spu_render.o gets larger due to the extra constants and
work in the inner loop, there are extra stack saves and loads
because there are more registers in use, and an assert.  spu_tri.o
gets a little smaller.
---
 src/gallium/drivers/cell/spu/spu_render.c | 79 ++++++++++++++++++++---
 src/gallium/drivers/cell/spu/spu_tri.c    | 40 ++++++------
 src/gallium/drivers/cell/spu/spu_tri.h    |  2 +-
 3 files changed, 89 insertions(+), 32 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 7c225e2f27c..5ffb7073abf 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -32,6 +32,7 @@
 
 #include "spu_main.h"
 #include "spu_render.h"
+#include "spu_shuffle.h"
 #include "spu_tri.h"
 #include "spu_tile.h"
 #include "cell/common.h"
@@ -267,15 +268,75 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 
       uint drawn = 0;
 
-      /* loop over tris */
-      for (j = 0; j < render->num_indexes; j += 3) {
-         const float *v0, *v1, *v2;
-
-         v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
-         v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
-         v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
-
-         drawn += tri_draw(v0, v1, v2, tx, ty);
+      const qword vertex_sizes = (qword)spu_splats(vertex_size);
+      const qword verticess = (qword)spu_splats((uint)vertices);
+
+      ASSERT_ALIGN16(&indexes[0]);
+
+      const uint num_indexes = render->num_indexes;
+
+      /* loop over tris
+	   * &indexes[0] will be 16 byte aligned.  This loop is heavily unrolled
+	   * avoiding variable rotates when extracting vertex indices.
+	   */
+      for (j = 0; j < num_indexes; j += 24) {
+         /* Load three vectors, containing 24 ushort indices */
+         const qword* lower_qword = (qword*)&indexes[j];
+         const qword indices0 = lower_qword[0];
+         const qword indices1 = lower_qword[1];
+         const qword indices2 = lower_qword[2];
+
+         /* stores three indices for each tri n in slots 0, 1 and 2 of vsn */
+		 /* Straightforward rotates for these */
+         qword vs0 = indices0;
+         qword vs1 = si_shlqbyi(indices0, 6);
+         qword vs3 = si_shlqbyi(indices1, 2);
+         qword vs4 = si_shlqbyi(indices1, 8);
+         qword vs6 = si_shlqbyi(indices2, 4);
+         qword vs7 = si_shlqbyi(indices2, 10);
+
+         /* For tri 2 and 5, the three indices are split across two machine
+		  * words - rotate and combine */
+         const qword tmp2a = si_shlqbyi(indices0, 12);
+         const qword tmp2b = si_rotqmbyi(indices1, 12|16);
+         qword vs2 = si_selb(tmp2a, tmp2b, si_fsmh(si_from_uint(0x20)));
+
+         const qword tmp5a = si_shlqbyi(indices1, 14);
+         const qword tmp5b = si_rotqmbyi(indices2, 14|16);
+         qword vs5 = si_selb(tmp5a, tmp5b, si_fsmh(si_from_uint(0x60)));
+
+         /* unpack indices from halfword slots to word slots */
+         vs0 = si_shufb(vs0, vs0, SHUFB8(0,A,0,B,0,C,0,0));
+         vs1 = si_shufb(vs1, vs1, SHUFB8(0,A,0,B,0,C,0,0));
+         vs2 = si_shufb(vs2, vs2, SHUFB8(0,A,0,B,0,C,0,0));
+         vs3 = si_shufb(vs3, vs3, SHUFB8(0,A,0,B,0,C,0,0));
+         vs4 = si_shufb(vs4, vs4, SHUFB8(0,A,0,B,0,C,0,0));
+         vs5 = si_shufb(vs5, vs5, SHUFB8(0,A,0,B,0,C,0,0));
+         vs6 = si_shufb(vs6, vs6, SHUFB8(0,A,0,B,0,C,0,0));
+         vs7 = si_shufb(vs7, vs7, SHUFB8(0,A,0,B,0,C,0,0));
+
+         /* Calculate address of vertex in vertices[] */
+         vs0 = si_mpya(vs0, vertex_sizes, verticess);
+         vs1 = si_mpya(vs1, vertex_sizes, verticess);
+         vs2 = si_mpya(vs2, vertex_sizes, verticess);
+         vs3 = si_mpya(vs3, vertex_sizes, verticess);
+         vs4 = si_mpya(vs4, vertex_sizes, verticess);
+         vs5 = si_mpya(vs5, vertex_sizes, verticess);
+         vs6 = si_mpya(vs6, vertex_sizes, verticess);
+         vs7 = si_mpya(vs7, vertex_sizes, verticess);
+
+         /* Select the appropriate call based on the number of vertices 
+		  * remaining */
+         switch(num_indexes - j) {
+            default: drawn += tri_draw(vs7, tx, ty);
+            case 21: drawn += tri_draw(vs6, tx, ty);
+            case 18: drawn += tri_draw(vs5, tx, ty);
+            case 15: drawn += tri_draw(vs4, tx, ty);
+            case 12: drawn += tri_draw(vs3, tx, ty);
+            case 9:  drawn += tri_draw(vs2, tx, ty);
+            case 6:  drawn += tri_draw(vs1, tx, ty);
+            case 3:  drawn += tri_draw(vs0, tx, ty);
+         }
       }
 
       //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index d727268475e..d9f5a466720 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -440,9 +440,7 @@ print_vertex(const struct vertex_header *v)
  * \return  FALSE if tri is totally outside tile, TRUE otherwise
  */
 static boolean
-setup_sort_vertices(const struct vertex_header *v0,
-                    const struct vertex_header *v1,
-                    const struct vertex_header *v2)
+setup_sort_vertices(const qword vs)
 {
    float area, sign;
 
@@ -459,23 +457,23 @@ setup_sort_vertices(const struct vertex_header *v0,
    {
       /* A table of shuffle patterns for putting vertex_header pointers into
          correct order.  Quite magical. */
-      const vec_uchar16 sort_order_patterns[] = {
-         SHUFFLE4(A,B,C,C),
-         SHUFFLE4(C,A,B,C),
-         SHUFFLE4(A,C,B,C),
-         SHUFFLE4(B,C,A,C),
-         SHUFFLE4(B,A,C,C),
-         SHUFFLE4(C,B,A,C) };
-
-      /* The vertex_header pointers, packed for easy shuffling later */
-      const vec_uint4 vs = {(unsigned)v0, (unsigned)v1, (unsigned)v2};
+      const qword sort_order_patterns[] = {
+         SHUFB4(A,B,C,C),
+         SHUFB4(C,A,B,C),
+         SHUFB4(A,C,B,C),
+         SHUFB4(B,C,A,C),
+         SHUFB4(B,A,C,C),
+         SHUFB4(C,B,A,C) };
 
       /* Collate y values into two vectors for comparison.
          Using only one shuffle constant! ;) */
-      const vec_float4 y_02_ = spu_shuffle(v0->data[0], v2->data[0], SHUFFLE4(0,B,b,C));
-      const vec_float4 y_10_ = spu_shuffle(v1->data[0], v0->data[0], SHUFFLE4(0,B,b,C));
-      const vec_float4 y_012 = spu_shuffle(y_02_, v1->data[0], SHUFFLE4(0,B,b,C));
-      const vec_float4 y_120 = spu_shuffle(y_10_, v2->data[0], SHUFFLE4(0,B,b,C));
+      const vector float f0 = ((const struct vertex_header*)si_to_ptr(vs))->data[0];
+      const vector float f1 = ((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 4)))->data[0];
+      const vector float f2 = ((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 8)))->data[0];
+      const vec_float4 y_02_ = spu_shuffle(f0, f2, SHUFFLE4(0,B,b,C));
+      const vec_float4 y_10_ = spu_shuffle(f1, f0, SHUFFLE4(0,B,b,C));
+      const vec_float4 y_012 = spu_shuffle(y_02_, f1, SHUFFLE4(0,B,b,C));
+      const vec_float4 y_120 = spu_shuffle(y_10_, f2, SHUFFLE4(0,B,b,C));
 
       /* Perform comparison: {y0,y1,y2} > {y1,y2,y0} */
       const vec_uint4 compare = spu_cmpgt(y_012, y_120);
@@ -485,7 +483,7 @@ setup_sort_vertices(const struct vertex_header *v0,
       const unsigned int index = spu_extract(gather, 0) - 1;
 
       /* Load the appropriate pattern and construct the desired vector. */
-      setup.vertex_headers = (qword)spu_shuffle(vs, vs, sort_order_patterns[index]);
+      setup.vertex_headers = si_shufb(vs, vs, sort_order_patterns[index]);
 
       /* Using the result of the comparison, set sign.
          Very magical. */
@@ -761,7 +759,7 @@ subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
  * The tile data should have already been fetched.
  */
 boolean
-tri_draw(const float *v0, const float *v1, const float *v2,
+tri_draw(const qword vs,
          uint tx, uint ty)
 {
    setup.tx = tx;
@@ -773,9 +771,7 @@ tri_draw(const float *v0, const float *v1, const float *v2,
    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 
-   if (!setup_sort_vertices((struct vertex_header *) v0,
-                            (struct vertex_header *) v1,
-                            (struct vertex_header *) v2)) {
+   if(!setup_sort_vertices(vs)) {
       return FALSE; /* totally clipped */
    }
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index aa694dd7c93..82e3b19ad7e 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
 
 
 extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
+tri_draw(const qword vs, uint tx, uint ty);
 
 
 #endif /* SPU_TRI_H */
-- 
2.30.2