src/gallium/drivers/cell/spu/spu_tri.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * Triangle rendering within a tile.
  30  */
  31
  32 #include <transpose_matrix4x4.h>
  33 #include "pipe/p_compiler.h"
  34 #include "pipe/p_format.h"
  35 #include "util/u_math.h"
  36 #include "spu_colorpack.h"
  37 #include "spu_main.h"
  38 #include "spu_shuffle.h"
  39 #include "spu_texture.h"
  40 #include "spu_tile.h"
  41 #include "spu_tri.h"
  42
  43
  44 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
  45 typedef vector unsigned int mask_t;
  46
  47
  48
  49 /**
  50  * Simplified types taken from other parts of Gallium
  51  */
  52 struct vertex_header {
  53    vector float data[1];
  54 };
  55
  56
  57
  58 /* XXX fix this */
  59 #undef CEILF
  60 #define CEILF(X) ((float) (int) ((X) + 0.99999))
  61
  62
  63 #define QUAD_TOP_LEFT     0
  64 #define QUAD_TOP_RIGHT    1
  65 #define QUAD_BOTTOM_LEFT  2
  66 #define QUAD_BOTTOM_RIGHT 3
  67 #define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT)
  68 #define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT)
  69 #define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT)
  70 #define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT)
  71 #define MASK_ALL          0xf
  72
  73
  74 #define DEBUG_VERTS 0
  75
  76 /**
  77  * Triangle edge info
  78  */
  79 struct edge {
  80    union {
  81       struct {
  82          float dx;      /**< X(v1) - X(v0), used only during setup */
  83          float dy;      /**< Y(v1) - Y(v0), used only during setup */
  84       };
  85       vec_float4 ds;    /**< vector accessor for dx and dy */
  86    };
  87    float dxdy;          /**< dx/dy */
  88    float sx, sy;        /**< first sample point coord */
  89    int lines;           /**< number of lines on this edge */
  90 };
  91
  92
  93 struct interp_coef
  94 {
  95    vector float a0;
  96    vector float dadx;
  97    vector float dady;
  98 };
  99
 100
 101 /**
 102  * Triangle setup info (derived from draw_stage).
 103  * Also used for line drawing (taking some liberties).
 104  */
 105 struct setup_stage {
 106
 107    /* Vertices are just an array of floats making up each attribute in
 108     * turn.  Currently fixed at 4 floats, but should change in time.
 109     * Codegen will help cope with this.
 110     */
 111    union {
 112       struct {
 113          const struct vertex_header *vmin;
 114          const struct vertex_header *vmid;
 115          const struct vertex_header *vmax;
 116          const struct vertex_header *vprovoke;
 117       };
 118       qword vertex_headers;
 119    };
 120
 121    struct edge ebot;
 122    struct edge etop;
 123    struct edge emaj;
 124
 125    float oneOverArea;  /* XXX maybe make into vector? */
 126
 127    uint facing;
 128
 129    uint tx, ty;  /**< position of current tile (x, y) */
 130
 131    int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
 132
 133    struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
 134
 135    struct {
 136       vec_int4 quad; /**< [0] = row0, [1] = row1; {left[0],left[1],right[0],right[1]} */
 137       int y;
 138       unsigned y_flags;
 139       unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
 140    } span;
 141 };
 142
 143
 144 static struct setup_stage setup;
 145
 146
 147 /**
 148  * Evaluate attribute coefficients (plane equations) to compute
 149  * attribute values for the four fragments in a quad.
 150  * Eg: four colors will be computed (in AoS format).
 151  */
 152 static INLINE void
 153 eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
 154 {
 155    switch (spu.vertex_info.attrib[slot].interp_mode) {
 156    case INTERP_CONSTANT:
 157       result[QUAD_TOP_LEFT] =
 158       result[QUAD_TOP_RIGHT] =
 159       result[QUAD_BOTTOM_LEFT] =
 160       result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0;
 161       break;
 162    case INTERP_LINEAR:
 163       {
 164          vector float dadx = setup.coef[slot].dadx;
 165          vector float dady = setup.coef[slot].dady;
 166          vector float topLeft =
 167             spu_add(setup.coef[slot].a0,
 168                     spu_add(spu_mul(spu_splats(x), dadx),
 169                             spu_mul(spu_splats(y), dady)));
 170
 171          result[QUAD_TOP_LEFT] = topLeft;
 172          result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
 173          result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
 174          result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
 175       }
 176       break;
 177    case INTERP_PERSPECTIVE:
 178       {
 179          vector float dadx = setup.coef[slot].dadx;
 180          vector float dady = setup.coef[slot].dady;
 181          vector float topLeft =
 182             spu_add(setup.coef[slot].a0,
 183                     spu_add(spu_mul(spu_splats(x), dadx),
 184                             spu_mul(spu_splats(y), dady)));
 185
 186          vector float wInv = spu_re(w);  /* 1.0 / w */
 187
 188          result[QUAD_TOP_LEFT] = spu_mul(topLeft, wInv);
 189          result[QUAD_TOP_RIGHT] = spu_mul(spu_add(topLeft, dadx), wInv);
 190          result[QUAD_BOTTOM_LEFT] = spu_mul(spu_add(topLeft, dady), wInv);
 191          result[QUAD_BOTTOM_RIGHT] = spu_mul(spu_add(spu_add(topLeft, dadx), dady), wInv);
 192       }
 193       break;
 194    case INTERP_POS:
 195    case INTERP_NONE:
 196       break;
 197    default:
 198       ASSERT(0);
 199    }
 200 }
 201
 202
 203 /**
 204  * As above, but return 4 vectors in SOA format.
 205  * XXX this will all be re-written someday.
 206  */
 207 static INLINE void
 208 eval_coeff_soa(uint slot, float x, float y, vector float w, vector float result[4])
 209 {
 210    eval_coeff(slot, x, y, w, result);
 211    _transpose_matrix4x4(result, result);
 212 }
 213
 214
 215 /** Evalute coefficients to get Z for four pixels in a quad */
 216 static INLINE vector float
 217 eval_z(float x, float y)
 218 {
 219    const uint slot = 0;
 220    const float dzdx = spu_extract(setup.coef[slot].dadx, 2);
 221    const float dzdy = spu_extract(setup.coef[slot].dady, 2);
 222    const float topLeft = spu_extract(setup.coef[slot].a0, 2) + x * dzdx + y * dzdy;
 223    const vector float topLeftv = spu_splats(topLeft);
 224    const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
 225    return spu_add(topLeftv, derivs);
 226 }
 227
 228
 229 /** Evalute coefficients to get W for four pixels in a quad */
 230 static INLINE vector float
 231 eval_w(float x, float y)
 232 {
 233    const uint slot = 0;
 234    const float dwdx = spu_extract(setup.coef[slot].dadx, 3);
 235    const float dwdy = spu_extract(setup.coef[slot].dady, 3);
 236    const float topLeft = spu_extract(setup.coef[slot].a0, 3) + x * dwdx + y * dwdy;
 237    const vector float topLeftv = spu_splats(topLeft);
 238    const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy };
 239    return spu_add(topLeftv, derivs);
 240 }
 241
 242
 243 /**
 244  * Emit a quad (pass to next stage).  No clipping is done.
 245  * Note: about 1/5 to 1/7 of the time, mask is zero and this function
 246  * should be skipped.  But adding the test for that slows things down
 247  * overall.
 248  */
 249 static INLINE void
 250 emit_quad( int x, int y, mask_t mask)
 251 {
 252    /* If any bits in mask are set... */
 253    if (spu_extract(spu_orx(mask), 0)) {
 254       const int ix = x - setup.cliprect_minx;
 255       const int iy = y - setup.cliprect_miny;
 256
 257       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 258       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 259
 260       {
 261          /*
 262           * Run fragment shader, execute per-fragment ops, update fb/tile.
 263           */
 264          vector float inputs[4*4], outputs[2*4];
 265          vector float fragZ = eval_z((float) x, (float) y);
 266          vector float fragW = eval_w((float) x, (float) y);
 267          vector unsigned int kill_mask;
 268
 269          /* setup inputs */
 270 #if 0
 271          eval_coeff_soa(1, (float) x, (float) y, fragW, inputs);
 272 #else
 273          uint i;
 274          for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 275             eval_coeff_soa(i+1, (float) x, (float) y, fragW, inputs + i * 4);
 276          }
 277 #endif
 278          ASSERT(spu.fragment_program);
 279          ASSERT(spu.fragment_ops);
 280
 281          /* Execute the current fragment program */
 282          kill_mask = spu.fragment_program(inputs, outputs, spu.constants);
 283
 284          mask = spu_andc(mask, kill_mask);
 285
 286          /* Execute per-fragment/quad operations, including:
 287           * alpha test, z test, stencil test, blend and framebuffer writing.
 288           * Note that there are two different fragment operations functions
 289           * that can be called, one for front-facing fragments, and one
 290           * for back-facing fragments.  (Often the two are the same;
 291           * but in some cases, like two-sided stenciling, they can be
 292           * very different.)  So choose the correct function depending
 293           * on the calculated facing.
 294           */
 295          spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
 296                           fragZ,
 297                           outputs[0*4+0],
 298                           outputs[0*4+1],
 299                           outputs[0*4+2],
 300                           outputs[0*4+3],
 301                           mask);
 302       }
 303    }
 304 }
 305
 306
 307 /**
 308  * Given an X or Y coordinate, return the block/quad coordinate that it
 309  * belongs to.
 310  */
 311 static INLINE int
 312 block(int x)
 313 {
 314    return x & ~1;
 315 }
 316
 317
 318 /**
 319  * Render a horizontal span of quads
 320  */
 321 static void
 322 flush_spans(void)
 323 {
 324    int minleft, maxright;
 325
 326    const int l0 = spu_extract(setup.span.quad, 0);
 327    const int l1 = spu_extract(setup.span.quad, 1);
 328    const int r0 = spu_extract(setup.span.quad, 2);
 329    const int r1 = spu_extract(setup.span.quad, 3);
 330
 331    switch (setup.span.y_flags) {
 332    case 0x3:
 333       /* both odd and even lines written (both quad rows) */
 334       minleft = MIN2(l0, l1);
 335       maxright = MAX2(r0, r1);
 336       break;
 337
 338    case 0x1:
 339       /* only even line written (quad top row) */
 340       minleft = l0;
 341       maxright = r0;
 342       break;
 343
 344    case 0x2:
 345       /* only odd line written (quad bottom row) */
 346       minleft = l1;
 347       maxright = r1;
 348       break;
 349
 350    default:
 351       return;
 352    }
 353
 354    /* OK, we're very likely to need the tile data now.
 355     * clear or finish waiting if needed.
 356     */
 357    if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
 358       /* wait for mfc_get() to complete */
 359       //printf("SPU: %u: waiting for ctile\n", spu.init.id);
 360       wait_on_mask(1 << TAG_READ_TILE_COLOR);
 361       spu.cur_ctile_status = TILE_STATUS_CLEAN;
 362    }
 363    else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
 364       //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
 365       clear_c_tile(&spu.ctile);
 366       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 367    }
 368    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 369
 370    if (spu.read_depth_stencil) {
 371       if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
 372          /* wait for mfc_get() to complete */
 373          //printf("SPU: %u: waiting for ztile\n", spu.init.id);
 374          wait_on_mask(1 << TAG_READ_TILE_Z);
 375          spu.cur_ztile_status = TILE_STATUS_CLEAN;
 376       }
 377       else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
 378          //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
 379          clear_z_tile(&spu.ztile);
 380          spu.cur_ztile_status = TILE_STATUS_DIRTY;
 381       }
 382       ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
 383    }
 384
 385    /* XXX this loop could be moved into the above switch cases... */
 386
 387    /* Setup for mask calculation */
 388    const vec_int4 quad_LlRr = setup.span.quad;
 389    const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8);
 390    const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B));
 391    const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B));
 392
 393    const vec_int4 twos = spu_splats(2);
 394
 395    const int x = block(minleft);
 396    vec_int4 xs = {x, x+1, x, x+1};
 397
 398    for (; spu_extract(xs, 0) <= block(maxright); xs += twos) {
 399       /**
 400        * Computes mask to indicate which pixels in the 2x2 quad are actually
 401        * inside the triangle's bounds.
 402        */
 403
 404       /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */
 405       const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs);
 406       const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs);
 407
 408       /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */
 409       const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs);
 410
 411       /* Combine results to create mask */
 412       const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs);
 413
 414       emit_quad(spu_extract(xs, 0), setup.span.y, mask);
 415    }
 416
 417    setup.span.y = 0;
 418    setup.span.y_flags = 0;
 419    /* Zero right elements */
 420    setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
 421 }
 422
 423
 424 #if DEBUG_VERTS
 425 static void
 426 print_vertex(const struct vertex_header *v)
 427 {
 428    uint i;
 429    fprintf(stderr, "  Vertex: (%p)\n", v);
 430    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 431       fprintf(stderr, "    %d: %f %f %f %f\n",  i,
 432               spu_extract(v->data[i], 0),
 433               spu_extract(v->data[i], 1),
 434               spu_extract(v->data[i], 2),
 435               spu_extract(v->data[i], 3));
 436    }
 437 }
 438 #endif
 439
 440
 441 /**
 442  * Sort vertices from top to bottom.
 443  * Compute area and determine front vs. back facing.
 444  * Do coarse clip test against tile bounds
 445  * \return  FALSE if tri is totally outside tile, TRUE otherwise
 446  */
 447 static boolean
 448 setup_sort_vertices(const struct vertex_header *v0,
 449                     const struct vertex_header *v1,
 450                     const struct vertex_header *v2)
 451 {
 452    float area, sign;
 453
 454 #if DEBUG_VERTS
 455    if (spu.init.id==0) {
 456       fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id);
 457       print_vertex(v0);
 458       print_vertex(v1);
 459       print_vertex(v2);
 460    }
 461 #endif
 462
 463    /* determine bottom to top order of vertices */
 464    {
 465       /* A table of shuffle patterns for putting vertex_header pointers into
 466          correct order.  Quite magical. */
 467       const vec_uchar16 sort_order_patterns[] = {
 468          SHUFFLE4(A,B,C,C),
 469          SHUFFLE4(C,A,B,C),
 470          SHUFFLE4(A,C,B,C),
 471          SHUFFLE4(B,C,A,C),
 472          SHUFFLE4(B,A,C,C),
 473          SHUFFLE4(C,B,A,C) };
 474
 475       /* The vertex_header pointers, packed for easy shuffling later */
 476       const vec_uint4 vs = {(unsigned)v0, (unsigned)v1, (unsigned)v2};
 477
 478       /* Collate y values into two vectors for comparison.
 479          Using only one shuffle constant! ;) */
 480       const vec_float4 y_02_ = spu_shuffle(v0->data[0], v2->data[0], SHUFFLE4(0,B,b,C));
 481       const vec_float4 y_10_ = spu_shuffle(v1->data[0], v0->data[0], SHUFFLE4(0,B,b,C));
 482       const vec_float4 y_012 = spu_shuffle(y_02_, v1->data[0], SHUFFLE4(0,B,b,C));
 483       const vec_float4 y_120 = spu_shuffle(y_10_, v2->data[0], SHUFFLE4(0,B,b,C));
 484
 485       /* Perform comparison: {y0,y1,y2} > {y1,y2,y0} */
 486       const vec_uint4 compare = spu_cmpgt(y_012, y_120);
 487       /* Compress the result of the comparison into 4 bits */
 488       const vec_uint4 gather = spu_gather(compare);
 489       /* Subtract one to attain the index into the LUT.  Magical. */
 490       const unsigned int index = spu_extract(gather, 0) - 1;
 491
 492       /* Load the appropriate pattern and construct the desired vector. */
 493       setup.vertex_headers = (qword)spu_shuffle(vs, vs, sort_order_patterns[index]);
 494
 495       /* Using the result of the comparison, set sign.
 496          Very magical. */
 497       sign = ((si_to_uint(si_cntb((qword)gather)) == 2) ? 1.0f : -1.0f);
 498    }
 499
 500    /* Check if triangle is completely outside the tile bounds */
 501    if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
 502       return FALSE;
 503    if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
 504       return FALSE;
 505    if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
 506        spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
 507        spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
 508       return FALSE;
 509    if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
 510        spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
 511        spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
 512       return FALSE;
 513
 514    setup.ebot.ds = spu_sub(setup.vmid->data[0], setup.vmin->data[0]);
 515    setup.emaj.ds = spu_sub(setup.vmax->data[0], setup.vmin->data[0]);
 516    setup.etop.ds = spu_sub(setup.vmax->data[0], setup.vmid->data[0]);
 517
 518    /*
 519     * Compute triangle's area.  Use 1/area to compute partial
 520     * derivatives of attributes later.
 521     */
 522    area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy;
 523
 524    setup.oneOverArea = 1.0f / area;
 525
 526    /* The product of area * sign indicates front/back orientation (0/1).
 527     * Just in case someone gets the bright idea of switching the front
 528     * and back constants without noticing that we're assuming their
 529     * values in this operation, also assert that the values are
 530     * what we think they are.
 531     */
 532    ASSERT(CELL_FACING_FRONT == 0);
 533    ASSERT(CELL_FACING_BACK == 1);
 534    setup.facing = (area * sign > 0.0f)
 535       ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
 536
 537    return TRUE;
 538 }
 539
 540
 541 /**
 542  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
 543  * The value value comes from vertex->data[slot].
 544  * The result will be put into setup.coef[slot].a0.
 545  * \param slot  which attribute slot
 546  */
 547 static INLINE void
 548 const_coeff4(uint slot)
 549 {
 550    setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0};
 551    setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0};
 552    setup.coef[slot].a0 = setup.vprovoke->data[slot];
 553 }
 554
 555
 556 /**
 557  * As above, but interp setup all four vector components.
 558  */
 559 static INLINE void
 560 tri_linear_coeff4(uint slot)
 561 {
 562    const vector float vmin_d = setup.vmin->data[slot];
 563    const vector float vmid_d = setup.vmid->data[slot];
 564    const vector float vmax_d = setup.vmax->data[slot];
 565    const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
 566    const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
 567
 568    vector float botda = vmid_d - vmin_d;
 569    vector float majda = vmax_d - vmin_d;
 570
 571    vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
 572                             spu_mul(botda, spu_splats(setup.emaj.dy)));
 573    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
 574                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 575
 576    setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
 577    setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
 578
 579    vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
 580    vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
 581
 582    setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 583 }
 584
 585
 586 /**
 587  * Compute a0, dadx and dady for a perspective-corrected interpolant,
 588  * for a triangle.
 589  * We basically multiply the vertex value by 1/w before computing
 590  * the plane coefficients (a0, dadx, dady).
 591  * Later, when we compute the value at a particular fragment position we'll
 592  * divide the interpolated value by the interpolated W at that fragment.
 593  */
 594 static void
 595 tri_persp_coeff4(uint slot)
 596 {
 597    const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
 598    const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
 599
 600    const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3));
 601    const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3));
 602    const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3));
 603
 604    vector float vmin_d = setup.vmin->data[slot];
 605    vector float vmid_d = setup.vmid->data[slot];
 606    vector float vmax_d = setup.vmax->data[slot];
 607
 608    vmin_d = spu_mul(vmin_d, vmin_w);
 609    vmid_d = spu_mul(vmid_d, vmid_w);
 610    vmax_d = spu_mul(vmax_d, vmax_w);
 611
 612    vector float botda = vmid_d - vmin_d;
 613    vector float majda = vmax_d - vmin_d;
 614
 615    vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
 616                             spu_mul(botda, spu_splats(setup.emaj.dy)));
 617    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
 618                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 619
 620    setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
 621    setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
 622
 623    vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
 624    vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
 625
 626    setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 627 }
 628
 629
 630
 631 /**
 632  * Compute the setup.coef[] array dadx, dady, a0 values.
 633  * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
 634  */
 635 static void
 636 setup_tri_coefficients(void)
 637 {
 638    uint i;
 639
 640    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 641       switch (spu.vertex_info.attrib[i].interp_mode) {
 642       case INTERP_NONE:
 643          break;
 644       case INTERP_CONSTANT:
 645          const_coeff4(i);
 646          break;
 647       case INTERP_POS:
 648          /* fall-through */
 649       case INTERP_LINEAR:
 650          tri_linear_coeff4(i);
 651          break;
 652       case INTERP_PERSPECTIVE:
 653          tri_persp_coeff4(i);
 654          break;
 655       default:
 656          ASSERT(0);
 657       }
 658    }
 659 }
 660
 661
 662 static void
 663 setup_tri_edges(void)
 664 {
 665    float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
 666    float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
 667
 668    float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
 669    float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
 670    float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
 671
 672    setup.emaj.sy = CEILF(vmin_y);
 673    setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
 674    setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
 675    setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
 676
 677    setup.etop.sy = CEILF(vmid_y);
 678    setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
 679    setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
 680    setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
 681
 682    setup.ebot.sy = CEILF(vmin_y);
 683    setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
 684    setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
 685    setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
 686 }
 687
 688
 689 /**
 690  * Render the upper or lower half of a triangle.
 691  * Scissoring/cliprect is applied here too.
 692  */
 693 static void
 694 subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
 695 {
 696    const int minx = setup.cliprect_minx;
 697    const int maxx = setup.cliprect_maxx;
 698    const int miny = setup.cliprect_miny;
 699    const int maxy = setup.cliprect_maxy;
 700    int y, start_y, finish_y;
 701    int sy = (int)eleft->sy;
 702
 703    ASSERT((int)eleft->sy == (int) eright->sy);
 704
 705    /* clip top/bottom */
 706    start_y = sy;
 707    finish_y = sy + lines;
 708
 709    if (start_y < miny)
 710       start_y = miny;
 711
 712    if (finish_y > maxy)
 713       finish_y = maxy;
 714
 715    start_y -= sy;
 716    finish_y -= sy;
 717
 718    /*
 719    _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);
 720    */
 721
 722    for (y = start_y; y < finish_y; y++) {
 723
 724       /* avoid accumulating adds as floats don't have the precision to
 725        * accurately iterate large triangle edges that way.  luckily we
 726        * can just multiply these days.
 727        *
 728        * this is all drowned out by the attribute interpolation anyway.
 729        */
 730       int left = (int)(eleft->sx + y * eleft->dxdy);
 731       int right = (int)(eright->sx + y * eright->dxdy);
 732
 733       /* clip left/right */
 734       if (left < minx)
 735          left = minx;
 736       if (right > maxx)
 737          right = maxx;
 738
 739       if (left < right) {
 740          int _y = sy + y;
 741          if (block(_y) != setup.span.y) {
 742             flush_spans();
 743             setup.span.y = block(_y);
 744          }
 745
 746          int offset = _y&1;
 747          vec_int4 quad_LlRr = {left, left, right, right};
 748          /* Store left and right in 0 or 1 row of quad based on offset */
 749          setup.span.quad = spu_sel(quad_LlRr, setup.span.quad, spu_maskw(5<<offset));
 750          setup.span.y_flags |= 1<<offset;
 751       }
 752    }
 753
 754
 755    /* save the values so that emaj can be restarted:
 756     */
 757    eleft->sx += lines * eleft->dxdy;
 758    eright->sx += lines * eright->dxdy;
 759    eleft->sy += lines;
 760    eright->sy += lines;
 761 }
 762
 763
 764 /**
 765  * Draw triangle into tile at (tx, ty) (tile coords)
 766  * The tile data should have already been fetched.
 767  */
 768 boolean
 769 tri_draw(const float *v0, const float *v1, const float *v2,
 770          uint tx, uint ty)
 771 {
 772    setup.tx = tx;
 773    setup.ty = ty;
 774
 775    /* set clipping bounds to tile bounds */
 776    setup.cliprect_minx = tx * TILE_SIZE;
 777    setup.cliprect_miny = ty * TILE_SIZE;
 778    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
 779    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 780
 781    if (!setup_sort_vertices((struct vertex_header *) v0,
 782                             (struct vertex_header *) v1,
 783                             (struct vertex_header *) v2)) {
 784       return FALSE; /* totally clipped */
 785    }
 786
 787    setup_tri_coefficients();
 788    setup_tri_edges();
 789
 790    setup.span.y = 0;
 791    setup.span.y_flags = 0;
 792    /* Zero right elements */
 793    setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
 794
 795    if (setup.oneOverArea < 0.0) {
 796       /* emaj on left */
 797       subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
 798       subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
 799    }
 800    else {
 801       /* emaj on right */
 802       subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
 803       subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
 804    }
 805
 806    flush_spans();
 807
 808    return TRUE;
 809 }