cell: perform triangle cull a little earlier

[mesa.git] / src / gallium / drivers / cell / spu / spu_render.c
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c

index 305dc9888103a6781a871d57ebf84828535429a7..5ffb7073abf155c80ab87219cbfd99846f112a37 100644 (file)
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -32,6 +32,7 @@
  
  #include "spu_main.h"
  #include "spu_render.h"
+#include "spu_shuffle.h"
  #include "spu_tri.h"
  #include "spu_tile.h"
  #include "cell/common.h"
@@ -98,7 +99,7 @@ my_tile(uint tx, uint ty)
  static INLINE void
  get_cz_tiles(uint tx, uint ty)
  {
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
        if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
           //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
           get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
@@ -153,7 +154,7 @@ static INLINE void
  wait_put_cz_tiles(void)
  {
     wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
        wait_on_mask(1 << TAG_WRITE_TILE_Z);
     }
  }
@@ -175,22 +176,14 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
     const ubyte *vertices;
     const ushort *indexes;
     uint i, j;
+   uint num_tiles;
  
-
-   if (Debug) {
-      printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
-             "inline_vert=%u\n",
-             spu.init.id,
-             render->prim_type,
-             render->num_verts,
-             render->num_indexes,
-             render->inline_verts);
-
-      /*
-      printf("       bound: %g, %g .. %g, %g\n",
-             render->xmin, render->ymin, render->xmax, render->ymax);
-      */
-   }
+   D_PRINTF(CELL_DEBUG_CMD,
+            "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n",
+            render->prim_type,
+            render->num_verts,
+            render->num_indexes,
+            render->inline_verts);
  
     ASSERT(sizeof(*render) % 4 == 0);
     ASSERT(total_vertex_bytes % 16 == 0);
@@ -251,6 +244,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
     wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
  
  
+   num_tiles = 0;
+
     /**
      ** loop over tiles, rendering tris
      **/
@@ -264,6 +259,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
        if (!my_tile(tx, ty))
           continue;
  
+      num_tiles++;
+
        spu.cur_ctile_status = spu.ctile_status[ty][tx];
        spu.cur_ztile_status = spu.ztile_status[ty][tx];
  
@@ -271,15 +268,75 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
  
        uint drawn = 0;
  
-      /* loop over tris */
-      for (j = 0; j < render->num_indexes; j += 3) {
-         const float *v0, *v1, *v2;
-
-         v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
-         v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
-         v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
-
-         drawn += tri_draw(v0, v1, v2, tx, ty);
+      const qword vertex_sizes = (qword)spu_splats(vertex_size);
+      const qword verticess = (qword)spu_splats((uint)vertices);
+
+      ASSERT_ALIGN16(&indexes[0]);
+
+      const uint num_indexes = render->num_indexes;
+
+      /* loop over tris
+          * &indexes[0] will be 16 byte aligned.  This loop is heavily unrolled
+          * avoiding variable rotates when extracting vertex indices.
+          */
+      for (j = 0; j < num_indexes; j += 24) {
+         /* Load three vectors, containing 24 ushort indices */
+         const qword* lower_qword = (qword*)&indexes[j];
+         const qword indices0 = lower_qword[0];
+         const qword indices1 = lower_qword[1];
+         const qword indices2 = lower_qword[2];
+
+         /* vsN holds the three indices of tri N in halfword slots 0, 1 and 2 */
+         /* Tris 0, 1, 3, 4, 6 and 7 each lie within one quadword: a byte shift suffices */
+         qword vs0 = indices0;
+         qword vs1 = si_shlqbyi(indices0, 6);
+         qword vs3 = si_shlqbyi(indices1, 2);
+         qword vs4 = si_shlqbyi(indices1, 8);
+         qword vs6 = si_shlqbyi(indices2, 4);
+         qword vs7 = si_shlqbyi(indices2, 10);
+
+         /* For tris 2 and 5, the three indices are split across two
+          * quadwords - shift each part into place and combine with a select */
+         const qword tmp2a = si_shlqbyi(indices0, 12);
+         const qword tmp2b = si_rotqmbyi(indices1, 12|16);
+         qword vs2 = si_selb(tmp2a, tmp2b, si_fsmh(si_from_uint(0x20)));
+
+         const qword tmp5a = si_shlqbyi(indices1, 14);
+         const qword tmp5b = si_rotqmbyi(indices2, 14|16);
+         qword vs5 = si_selb(tmp5a, tmp5b, si_fsmh(si_from_uint(0x60)));
+
+         /* unpack indices from halfword slots to word slots */
+         vs0 = si_shufb(vs0, vs0, SHUFB8(0,A,0,B,0,C,0,0));
+         vs1 = si_shufb(vs1, vs1, SHUFB8(0,A,0,B,0,C,0,0));
+         vs2 = si_shufb(vs2, vs2, SHUFB8(0,A,0,B,0,C,0,0));
+         vs3 = si_shufb(vs3, vs3, SHUFB8(0,A,0,B,0,C,0,0));
+         vs4 = si_shufb(vs4, vs4, SHUFB8(0,A,0,B,0,C,0,0));
+         vs5 = si_shufb(vs5, vs5, SHUFB8(0,A,0,B,0,C,0,0));
+         vs6 = si_shufb(vs6, vs6, SHUFB8(0,A,0,B,0,C,0,0));
+         vs7 = si_shufb(vs7, vs7, SHUFB8(0,A,0,B,0,C,0,0));
+
+         /* Calculate address of vertex in vertices[] */
+         vs0 = si_mpya(vs0, vertex_sizes, verticess);
+         vs1 = si_mpya(vs1, vertex_sizes, verticess);
+         vs2 = si_mpya(vs2, vertex_sizes, verticess);
+         vs3 = si_mpya(vs3, vertex_sizes, verticess);
+         vs4 = si_mpya(vs4, vertex_sizes, verticess);
+         vs5 = si_mpya(vs5, vertex_sizes, verticess);
+         vs6 = si_mpya(vs6, vertex_sizes, verticess);
+         vs7 = si_mpya(vs7, vertex_sizes, verticess);
+
+         /* Enter the switch according to the number of indices remaining;
+          * the cases fall through, so each remaining tri is drawn */
+         switch(num_indexes - j) {
+            default: drawn += tri_draw(vs7, tx, ty);
+            case 21: drawn += tri_draw(vs6, tx, ty);
+            case 18: drawn += tri_draw(vs5, tx, ty);
+            case 15: drawn += tri_draw(vs4, tx, ty);
+            case 12: drawn += tri_draw(vs3, tx, ty);
+            case 9:  drawn += tri_draw(vs2, tx, ty);
+            case 6:  drawn += tri_draw(vs1, tx, ty);
+            case 3:  drawn += tri_draw(vs0, tx, ty);
+         }
        }
  
        //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
@@ -293,9 +350,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
        spu.ztile_status[ty][tx] = spu.cur_ztile_status;
     }
  
-   if (Debug)
-      printf("SPU %u: RENDER done\n",
-             spu.init.id);
+   D_PRINTF(CELL_DEBUG_CMD,
+            "RENDER done (%u tiles hit)\n",
+            num_tiles);
  }
-
-