From 2c007517b541ab470ad251365bd95c83002dfb16 Mon Sep 17 00:00:00 2001 From: Jonathan Adamczewski Date: Thu, 21 May 2009 08:19:00 -0600 Subject: [PATCH] cell: perform triangle cull a little earlier In spu_tri.c:setup_sort_vertices() triangles are culled after the vertices are sorted. This patch moves the check a little earlier and performs the actual check a little faster through intrinsics and a little trickery. Reduced code size and less work is done before a triangle is deemed OK to skip. --- src/gallium/drivers/cell/spu/spu_tri.c | 105 +++++++++++++++++-------- 1 file changed, 74 insertions(+), 31 deletions(-) diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index d9f5a466720..58be001be4c 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -133,7 +133,15 @@ struct setup_stage { uint tx, ty; /**< position of current tile (x, y) */ - int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy; + union { + struct { + int cliprect_minx; + int cliprect_miny; + int cliprect_maxx; + int cliprect_maxy; + }; + qword cliprect; + }; struct interp_coef coef[PIPE_MAX_SHADER_INPUTS]; @@ -432,6 +440,41 @@ print_vertex(const struct vertex_header *v) } #endif +/* Returns the minimum of each slot of two vec_float4s as qwords. + * i.e. return[n] = min(q0[n],q1[n]); + */ +static qword +minfq(qword q0, qword q1) +{ + const qword q0q1m = si_fcgt(q0, q1); + return si_selb(q0, q1, q0q1m); +} + +/* Returns the minimum of each slot of three vec_float4s as qwords. + * i.e. return[n] = min(q0[n],q1[n],q2[n]); + */ +static qword +min3fq(qword q0, qword q1, qword q2) +{ + return minfq(minfq(q0, q1), q2); +} + +/* Returns the maximum of each slot of two vec_float4s as qwords. + * i.e. return[n] = min(q0[n],q1[n],q2[n]); + */ +static qword +maxfq(qword q0, qword q1) { + const qword q0q1m = si_fcgt(q0, q1); + return si_selb(q1, q0, q0q1m); +} + +/* Returns the maximum of each slot of three vec_float4s as qwords. + * i.e. return[n] = min(q0[n],q1[n],q2[n]); + */ +static qword +max3fq(qword q0, qword q1, qword q2) { + return maxfq(maxfq(q0, q1), q2); +} /** * Sort vertices from top to bottom. @@ -453,8 +496,25 @@ setup_sort_vertices(const qword vs) } #endif - /* determine bottom to top order of vertices */ { + /* Load the float values for various processing... */ + const qword f0 = (qword)(((const struct vertex_header*)si_to_ptr(vs))->data[0]); + const qword f1 = (qword)(((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 4)))->data[0]); + const qword f2 = (qword)(((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 8)))->data[0]); + + /* Check if triangle is completely outside the tile bounds + * Find the min and max x and y positions of the three poits */ + const qword minf = min3fq(f0, f1, f2); + const qword maxf = max3fq(f0, f1, f2); + + /* Compare min and max against cliprect vals */ + const qword maxsmins = si_shufb(maxf, minf, SHUFB4(A,B,a,b)); + const qword outside = si_fcgt(maxsmins, si_csflt(setup.cliprect, 0)); + + /* Use a little magic to work out of the tri is visible or not */ + if(si_to_uint(si_xori(si_gb(outside), 0xc))) return FALSE; + + /* determine bottom to top order of vertices */ /* A table of shuffle patterns for putting vertex_header pointers into correct order. Quite magical. */ const qword sort_order_patterns[] = { @@ -467,43 +527,26 @@ setup_sort_vertices(const qword vs) /* Collate y values into two vectors for comparison. Using only one shuffle constant! ;) */ - const vector float f0 = ((const struct vertex_header*)si_to_ptr(vs))->data[0]; - const vector float f1 = ((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 4)))->data[0]; - const vector float f2 = ((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 8)))->data[0]; - const vec_float4 y_02_ = spu_shuffle(f0, f2, SHUFFLE4(0,B,b,C)); - const vec_float4 y_10_ = spu_shuffle(f1, f0, SHUFFLE4(0,B,b,C)); - const vec_float4 y_012 = spu_shuffle(y_02_, f1, SHUFFLE4(0,B,b,C)); - const vec_float4 y_120 = spu_shuffle(y_10_, f2, SHUFFLE4(0,B,b,C)); + const qword y_02_ = si_shufb(f0, f2, SHUFB4(0,B,b,C)); + const qword y_10_ = si_shufb(f1, f0, SHUFB4(0,B,b,C)); + const qword y_012 = si_shufb(y_02_, f1, SHUFB4(0,B,b,C)); + const qword y_120 = si_shufb(y_10_, f2, SHUFB4(0,B,b,C)); /* Perform comparison: {y0,y1,y2} > {y1,y2,y0} */ - const vec_uint4 compare = spu_cmpgt(y_012, y_120); + const qword compare = si_fcgt(y_012, y_120); /* Compress the result of the comparison into 4 bits */ - const vec_uint4 gather = spu_gather(compare); + const qword gather = si_gb(compare); /* Subtract one to attain the index into the LUT. Magical. */ - const unsigned int index = spu_extract(gather, 0) - 1; + const unsigned int index = si_to_uint(gather) - 1; /* Load the appropriate pattern and construct the desired vector. */ setup.vertex_headers = si_shufb(vs, vs, sort_order_patterns[index]); /* Using the result of the comparison, set sign. Very magical. */ - sign = ((si_to_uint(si_cntb((qword)gather)) == 2) ? 1.0f : -1.0f); + sign = ((si_to_uint(si_cntb(gather)) == 2) ? 1.0f : -1.0f); } - /* Check if triangle is completely outside the tile bounds */ - if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy) - return FALSE; - if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny) - return FALSE; - if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx && - spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx && - spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx) - return FALSE; - if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx && - spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx && - spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx) - return FALSE; - setup.ebot.ds = spu_sub(setup.vmid->data[0], setup.vmin->data[0]); setup.emaj.ds = spu_sub(setup.vmax->data[0], setup.vmin->data[0]); setup.etop.ds = spu_sub(setup.vmax->data[0], setup.vmid->data[0]); @@ -766,10 +809,10 @@ tri_draw(const qword vs, setup.ty = ty; /* set clipping bounds to tile bounds */ - setup.cliprect_minx = tx * TILE_SIZE; - setup.cliprect_miny = ty * TILE_SIZE; - setup.cliprect_maxx = (tx + 1) * TILE_SIZE; - setup.cliprect_maxy = (ty + 1) * TILE_SIZE; + const qword clipbase = (qword)((vec_uint4){tx, ty}); + const qword clipmin = si_mpyui(clipbase, TILE_SIZE); + const qword clipmax = si_ai(clipmin, TILE_SIZE); + setup.cliprect = si_shufb(clipmin, clipmax, SHUFB4(A,B,a,b)); if(!setup_sort_vertices(vs)) { return FALSE; /* totally clipped */ -- 2.30.2