src/gallium/drivers/cell/spu/spu_tri.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * Triangle rendering within a tile.
  30  */
  31
  32 #include <transpose_matrix4x4.h>
  33 #include "pipe/p_compiler.h"
  34 #include "pipe/p_format.h"
  35 #include "util/u_math.h"
  36 #include "spu_colorpack.h"
  37 #include "spu_main.h"
  38 #include "spu_texture.h"
  39 #include "spu_tile.h"
  40 #include "spu_tri.h"
  41
  42
  43 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
  44 typedef vector unsigned int mask_t;
  45
  46 typedef union
  47 {
  48    vector float v;
  49    float f[4];
  50 } float4;
  51
  52
  53 /**
  54  * Simplified types taken from other parts of Gallium
  55  */
  56 struct vertex_header {
  57    vector float data[1];
  58 };
  59
  60
  61
  62 /* XXX fix this */
  63 #undef CEILF
  64 #define CEILF(X) ((float) (int) ((X) + 0.99999))
  65
  66
  67 #define QUAD_TOP_LEFT     0
  68 #define QUAD_TOP_RIGHT    1
  69 #define QUAD_BOTTOM_LEFT  2
  70 #define QUAD_BOTTOM_RIGHT 3
  71 #define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT)
  72 #define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT)
  73 #define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT)
  74 #define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT)
  75 #define MASK_ALL          0xf
  76
  77
  78 #define DEBUG_VERTS 0
  79
  80 /**
  81  * Triangle edge info
  82  */
  83 struct edge {
  84    float dx;            /**< X(v1) - X(v0), used only during setup */
  85    float dy;            /**< Y(v1) - Y(v0), used only during setup */
  86    float dxdy;          /**< dx/dy */
  87    float sx, sy;        /**< first sample point coord */
  88    int lines;           /**< number of lines on this edge */
  89 };
  90
  91
  92 struct interp_coef
  93 {
  94    float4 a0;
  95    float4 dadx;
  96    float4 dady;
  97 };
  98
  99
 100 /**
 101  * Triangle setup info (derived from draw_stage).
 102  * Also used for line drawing (taking some liberties).
 103  */
 104 struct setup_stage {
 105
 106    /* Vertices are just an array of floats making up each attribute in
 107     * turn.  Currently fixed at 4 floats, but should change in time.
 108     * Codegen will help cope with this.
 109     */
 110    const struct vertex_header *vmax;
 111    const struct vertex_header *vmid;
 112    const struct vertex_header *vmin;
 113    const struct vertex_header *vprovoke;
 114
 115    struct edge ebot;
 116    struct edge etop;
 117    struct edge emaj;
 118
 119    float oneOverArea;
 120
 121    uint facing;
 122
 123    uint tx, ty;  /**< position of current tile (x, y) */
 124
 125    int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
 126
 127    struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
 128
 129    struct {
 130       int left[2];   /**< [0] = row0, [1] = row1 */
 131       int right[2];
 132       int y;
 133       unsigned y_flags;
 134       unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
 135    } span;
 136 };
 137
 138
 139 static struct setup_stage setup;
 140
 141
 142 /**
 143  * Evaluate attribute coefficients (plane equations) to compute
 144  * attribute values for the four fragments in a quad.
 145  * Eg: four colors will be computed (in AoS format).
 146  */
 147 static INLINE void
 148 eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
 149 {
 150    switch (spu.vertex_info.attrib[slot].interp_mode) {
 151    case INTERP_CONSTANT:
 152       result[QUAD_TOP_LEFT] =
 153       result[QUAD_TOP_RIGHT] =
 154       result[QUAD_BOTTOM_LEFT] =
 155       result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
 156       break;
 157    case INTERP_LINEAR:
 158       {
 159          vector float dadx = setup.coef[slot].dadx.v;
 160          vector float dady = setup.coef[slot].dady.v;
 161          vector float topLeft =
 162             spu_add(setup.coef[slot].a0.v,
 163                     spu_add(spu_mul(spu_splats(x), dadx),
 164                             spu_mul(spu_splats(y), dady)));
 165
 166          result[QUAD_TOP_LEFT] = topLeft;
 167          result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
 168          result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
 169          result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
 170       }
 171       break;
 172    case INTERP_PERSPECTIVE:
 173       {
 174          vector float dadx = setup.coef[slot].dadx.v;
 175          vector float dady = setup.coef[slot].dady.v;
 176          vector float topLeft =
 177             spu_add(setup.coef[slot].a0.v,
 178                     spu_add(spu_mul(spu_splats(x), dadx),
 179                             spu_mul(spu_splats(y), dady)));
 180
 181          vector float wInv = spu_re(w);  /* 1.0 / w */
 182
 183          result[QUAD_TOP_LEFT] = spu_mul(topLeft, wInv);
 184          result[QUAD_TOP_RIGHT] = spu_mul(spu_add(topLeft, dadx), wInv);
 185          result[QUAD_BOTTOM_LEFT] = spu_mul(spu_add(topLeft, dady), wInv);
 186          result[QUAD_BOTTOM_RIGHT] = spu_mul(spu_add(spu_add(topLeft, dadx), dady), wInv);
 187       }
 188       break;
 189    case INTERP_POS:
 190    case INTERP_NONE:
 191       break;
 192    default:
 193       ASSERT(0);
 194    }
 195 }
 196
 197
 198 /**
 199  * As above, but return 4 vectors in SOA format.
 200  * XXX this will all be re-written someday.
 201  */
 202 static INLINE void
 203 eval_coeff_soa(uint slot, float x, float y, vector float w, vector float result[4])
 204 {
 205    eval_coeff(slot, x, y, w, result);
 206    _transpose_matrix4x4(result, result);
 207 }
 208
 209
 210 /** Evalute coefficients to get Z for four pixels in a quad */
 211 static INLINE vector float
 212 eval_z(float x, float y)
 213 {
 214    const uint slot = 0;
 215    const float dzdx = setup.coef[slot].dadx.f[2];
 216    const float dzdy = setup.coef[slot].dady.f[2];
 217    const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
 218    const vector float topLeftv = spu_splats(topLeft);
 219    const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
 220    return spu_add(topLeftv, derivs);
 221 }
 222
 223
 224 /** Evalute coefficients to get W for four pixels in a quad */
 225 static INLINE vector float
 226 eval_w(float x, float y)
 227 {
 228    const uint slot = 0;
 229    const float dwdx = setup.coef[slot].dadx.f[3];
 230    const float dwdy = setup.coef[slot].dady.f[3];
 231    const float topLeft = setup.coef[slot].a0.f[3] + x * dwdx + y * dwdy;
 232    const vector float topLeftv = spu_splats(topLeft);
 233    const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy };
 234    return spu_add(topLeftv, derivs);
 235 }
 236
 237
 238 /**
 239  * Emit a quad (pass to next stage).  No clipping is done.
 240  * Note: about 1/5 to 1/7 of the time, mask is zero and this function
 241  * should be skipped.  But adding the test for that slows things down
 242  * overall.
 243  */
 244 static INLINE void
 245 emit_quad( int x, int y, mask_t mask)
 246 {
 247    /* If any bits in mask are set... */
 248    if (spu_extract(spu_orx(mask), 0)) {
 249       const int ix = x - setup.cliprect_minx;
 250       const int iy = y - setup.cliprect_miny;
 251
 252       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 253       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 254
 255       {
 256          /*
 257           * Run fragment shader, execute per-fragment ops, update fb/tile.
 258           */
 259          vector float inputs[4*4], outputs[2*4];
 260          vector float fragZ = eval_z((float) x, (float) y);
 261          vector float fragW = eval_w((float) x, (float) y);
 262
 263          /* setup inputs */
 264 #if 0
 265          eval_coeff_soa(1, (float) x, (float) y, fragW, inputs);
 266 #else
 267          uint i;
 268          for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 269             eval_coeff_soa(i+1, (float) x, (float) y, fragW, inputs + i * 4);
 270          }
 271 #endif
 272          ASSERT(spu.fragment_program);
 273          ASSERT(spu.fragment_ops);
 274
 275          /* Execute the current fragment program */
 276          spu.fragment_program(inputs, outputs, spu.constants);
 277
 278          /* Execute per-fragment/quad operations, including:
 279           * alpha test, z test, stencil test, blend and framebuffer writing.
 280           */
 281          spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
 282                           fragZ,
 283                           outputs[0*4+0],
 284                           outputs[0*4+1],
 285                           outputs[0*4+2],
 286                           outputs[0*4+3],
 287                           mask,
 288                           setup.facing);
 289       }
 290    }
 291 }
 292
 293
 294 /**
 295  * Given an X or Y coordinate, return the block/quad coordinate that it
 296  * belongs to.
 297  */
 298 static INLINE int
 299 block(int x)
 300 {
 301    return x & ~1;
 302 }
 303
 304
 305 /**
 306  * Compute mask which indicates which pixels in the 2x2 quad are actually inside
 307  * the triangle's bounds.
 308  * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
 309  */
 310 static INLINE mask_t
 311 calculate_mask(int x)
 312 {
 313    /* This is a little tricky.
 314     * Use & instead of && to avoid branches.
 315     * Use negation to convert true/false to ~0/0 values.
 316     */
 317    mask_t mask;
 318    mask = spu_insert(-((x   >= setup.span.left[0]) & (x   < setup.span.right[0])), mask, 0);
 319    mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
 320    mask = spu_insert(-((x   >= setup.span.left[1]) & (x   < setup.span.right[1])), mask, 2);
 321    mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
 322    return mask;
 323 }
 324
 325
 326 /**
 327  * Render a horizontal span of quads
 328  */
 329 static void
 330 flush_spans(void)
 331 {
 332    int minleft, maxright;
 333    int x;
 334
 335    switch (setup.span.y_flags) {
 336    case 0x3:
 337       /* both odd and even lines written (both quad rows) */
 338       minleft = MIN2(setup.span.left[0], setup.span.left[1]);
 339       maxright = MAX2(setup.span.right[0], setup.span.right[1]);
 340       break;
 341
 342    case 0x1:
 343       /* only even line written (quad top row) */
 344       minleft = setup.span.left[0];
 345       maxright = setup.span.right[0];
 346       break;
 347
 348    case 0x2:
 349       /* only odd line written (quad bottom row) */
 350       minleft = setup.span.left[1];
 351       maxright = setup.span.right[1];
 352       break;
 353
 354    default:
 355       return;
 356    }
 357
 358    /* OK, we're very likely to need the tile data now.
 359     * clear or finish waiting if needed.
 360     */
 361    if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
 362       /* wait for mfc_get() to complete */
 363       //printf("SPU: %u: waiting for ctile\n", spu.init.id);
 364       wait_on_mask(1 << TAG_READ_TILE_COLOR);
 365       spu.cur_ctile_status = TILE_STATUS_CLEAN;
 366    }
 367    else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
 368       //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
 369       clear_c_tile(&spu.ctile);
 370       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 371    }
 372    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 373
 374    if (spu.read_depth) {
 375       if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
 376          /* wait for mfc_get() to complete */
 377          //printf("SPU: %u: waiting for ztile\n", spu.init.id);
 378          wait_on_mask(1 << TAG_READ_TILE_Z);
 379          spu.cur_ztile_status = TILE_STATUS_CLEAN;
 380       }
 381       else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
 382          //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
 383          clear_z_tile(&spu.ztile);
 384          spu.cur_ztile_status = TILE_STATUS_DIRTY;
 385       }
 386       ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
 387    }
 388
 389    /* XXX this loop could be moved into the above switch cases and
 390     * calculate_mask() could be simplified a bit...
 391     */
 392    for (x = block(minleft); x <= block(maxright); x += 2) {
 393       emit_quad( x, setup.span.y, calculate_mask( x ));
 394    }
 395
 396    setup.span.y = 0;
 397    setup.span.y_flags = 0;
 398    setup.span.right[0] = 0;
 399    setup.span.right[1] = 0;
 400 }
 401
 402
 403 #if DEBUG_VERTS
 404 static void
 405 print_vertex(const struct vertex_header *v)
 406 {
 407    uint i;
 408    fprintf(stderr, "  Vertex: (%p)\n", v);
 409    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 410       fprintf(stderr, "    %d: %f %f %f %f\n",  i,
 411               spu_extract(v->data[i], 0),
 412               spu_extract(v->data[i], 1),
 413               spu_extract(v->data[i], 2),
 414               spu_extract(v->data[i], 3));
 415    }
 416 }
 417 #endif
 418
 419
 420 static boolean
 421 setup_sort_vertices(const struct vertex_header *v0,
 422                     const struct vertex_header *v1,
 423                     const struct vertex_header *v2)
 424 {
 425 #if DEBUG_VERTS
 426    if (spu.init.id==0) {
 427       fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id);
 428       print_vertex(v0);
 429       print_vertex(v1);
 430       print_vertex(v2);
 431    }
 432 #endif
 433
 434    setup.vprovoke = v2;
 435
 436    /* determine bottom to top order of vertices */
 437    {
 438       float y0 = spu_extract(v0->data[0], 1);
 439       float y1 = spu_extract(v1->data[0], 1);
 440       float y2 = spu_extract(v2->data[0], 1);
 441       if (y0 <= y1) {
 442          if (y1 <= y2) {
 443             /* y0<=y1<=y2 */
 444             setup.vmin = v0;
 445             setup.vmid = v1;
 446             setup.vmax = v2;
 447          }
 448          else if (y2 <= y0) {
 449             /* y2<=y0<=y1 */
 450             setup.vmin = v2;
 451             setup.vmid = v0;
 452             setup.vmax = v1;
 453          }
 454          else {
 455             /* y0<=y2<=y1 */
 456             setup.vmin = v0;
 457             setup.vmid = v2;
 458             setup.vmax = v1;
 459          }
 460       }
 461       else {
 462          if (y0 <= y2) {
 463             /* y1<=y0<=y2 */
 464             setup.vmin = v1;
 465             setup.vmid = v0;
 466             setup.vmax = v2;
 467          }
 468          else if (y2 <= y1) {
 469             /* y2<=y1<=y0 */
 470             setup.vmin = v2;
 471             setup.vmid = v1;
 472             setup.vmax = v0;
 473          }
 474          else {
 475             /* y1<=y2<=y0 */
 476             setup.vmin = v1;
 477             setup.vmid = v2;
 478             setup.vmax = v0;
 479          }
 480       }
 481    }
 482
 483    /* Check if triangle is completely outside the tile bounds */
 484    if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
 485       return FALSE;
 486    if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
 487       return FALSE;
 488    if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
 489        spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
 490        spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
 491       return FALSE;
 492    if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
 493        spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
 494        spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
 495       return FALSE;
 496
 497    setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
 498    setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
 499    setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
 500    setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
 501    setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0);
 502    setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1);
 503
 504    /*
 505     * Compute triangle's area.  Use 1/area to compute partial
 506     * derivatives of attributes later.
 507     *
 508     * The area will be the same as prim->det, but the sign may be
 509     * different depending on how the vertices get sorted above.
 510     *
 511     * To determine whether the primitive is front or back facing we
 512     * use the prim->det value because its sign is correct.
 513     */
 514    {
 515       const float area = (setup.emaj.dx * setup.ebot.dy -
 516                           setup.ebot.dx * setup.emaj.dy);
 517
 518       setup.oneOverArea = 1.0f / area;
 519       /*
 520       _mesa_printf("%s one-over-area %f  area %f  det %f\n",
 521                    __FUNCTION__, setup.oneOverArea, area, prim->det );
 522       */
 523    }
 524
 525 #if 0
 526    /* We need to know if this is a front or back-facing triangle for:
 527     *  - the GLSL gl_FrontFacing fragment attribute (bool)
 528     *  - two-sided stencil test
 529     */
 530    setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
 531 #endif
 532
 533    return TRUE;
 534 }
 535
 536
 537 /**
 538  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
 539  * The value value comes from vertex->data[slot].
 540  * The result will be put into setup.coef[slot].a0.
 541  * \param slot  which attribute slot
 542  */
 543 static INLINE void
 544 const_coeff4(uint slot)
 545 {
 546    setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
 547    setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
 548    setup.coef[slot].a0.v = setup.vprovoke->data[slot];
 549 }
 550
 551
 552 /**
 553  * As above, but interp setup all four vector components.
 554  */
 555 static INLINE void
 556 tri_linear_coeff4(uint slot)
 557 {
 558    const vector float vmin_d = setup.vmin->data[slot];
 559    const vector float vmid_d = setup.vmid->data[slot];
 560    const vector float vmax_d = setup.vmax->data[slot];
 561    const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
 562    const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
 563
 564    vector float botda = vmid_d - vmin_d;
 565    vector float majda = vmax_d - vmin_d;
 566
 567    vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
 568                             spu_mul(botda, spu_splats(setup.emaj.dy)));
 569    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
 570                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 571
 572    setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
 573    setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
 574
 575    vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
 576    vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
 577
 578    setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
 579 }
 580
 581
 582 /**
 583  * Compute a0, dadx and dady for a perspective-corrected interpolant,
 584  * for a triangle.
 585  * We basically multiply the vertex value by 1/w before computing
 586  * the plane coefficients (a0, dadx, dady).
 587  * Later, when we compute the value at a particular fragment position we'll
 588  * divide the interpolated value by the interpolated W at that fragment.
 589  */
 590 static void
 591 tri_persp_coeff4(uint slot)
 592 {
 593    const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
 594    const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
 595
 596    const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3));
 597    const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3));
 598    const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3));
 599
 600    vector float vmin_d = setup.vmin->data[slot];
 601    vector float vmid_d = setup.vmid->data[slot];
 602    vector float vmax_d = setup.vmax->data[slot];
 603
 604    vmin_d = spu_mul(vmin_d, vmin_w);
 605    vmid_d = spu_mul(vmid_d, vmid_w);
 606    vmax_d = spu_mul(vmax_d, vmax_w);
 607
 608    vector float botda = vmid_d - vmin_d;
 609    vector float majda = vmax_d - vmin_d;
 610
 611    vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
 612                             spu_mul(botda, spu_splats(setup.emaj.dy)));
 613    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
 614                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 615
 616    setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
 617    setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
 618
 619    vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
 620    vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
 621
 622    setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
 623 }
 624
 625
 626
 627 /**
 628  * Compute the setup.coef[] array dadx, dady, a0 values.
 629  * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
 630  */
 631 static void
 632 setup_tri_coefficients(void)
 633 {
 634    uint i;
 635
 636    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 637       switch (spu.vertex_info.attrib[i].interp_mode) {
 638       case INTERP_NONE:
 639          break;
 640       case INTERP_CONSTANT:
 641          const_coeff4(i);
 642          break;
 643       case INTERP_POS:
 644          /* fall-through */
 645       case INTERP_LINEAR:
 646          tri_linear_coeff4(i);
 647          break;
 648       case INTERP_PERSPECTIVE:
 649          tri_persp_coeff4(i);
 650          break;
 651       default:
 652          ASSERT(0);
 653       }
 654    }
 655 }
 656
 657
 658 static void
 659 setup_tri_edges(void)
 660 {
 661    float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
 662    float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
 663
 664    float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
 665    float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
 666    float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
 667
 668    setup.emaj.sy = CEILF(vmin_y);
 669    setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
 670    setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
 671    setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
 672
 673    setup.etop.sy = CEILF(vmid_y);
 674    setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
 675    setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
 676    setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
 677
 678    setup.ebot.sy = CEILF(vmin_y);
 679    setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
 680    setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
 681    setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
 682 }
 683
 684
 685 /**
 686  * Render the upper or lower half of a triangle.
 687  * Scissoring/cliprect is applied here too.
 688  */
 689 static void
 690 subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
 691 {
 692    const int minx = setup.cliprect_minx;
 693    const int maxx = setup.cliprect_maxx;
 694    const int miny = setup.cliprect_miny;
 695    const int maxy = setup.cliprect_maxy;
 696    int y, start_y, finish_y;
 697    int sy = (int)eleft->sy;
 698
 699    ASSERT((int)eleft->sy == (int) eright->sy);
 700
 701    /* clip top/bottom */
 702    start_y = sy;
 703    finish_y = sy + lines;
 704
 705    if (start_y < miny)
 706       start_y = miny;
 707
 708    if (finish_y > maxy)
 709       finish_y = maxy;
 710
 711    start_y -= sy;
 712    finish_y -= sy;
 713
 714    /*
 715    _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);
 716    */
 717
 718    for (y = start_y; y < finish_y; y++) {
 719
 720       /* avoid accumulating adds as floats don't have the precision to
 721        * accurately iterate large triangle edges that way.  luckily we
 722        * can just multiply these days.
 723        *
 724        * this is all drowned out by the attribute interpolation anyway.
 725        */
 726       int left = (int)(eleft->sx + y * eleft->dxdy);
 727       int right = (int)(eright->sx + y * eright->dxdy);
 728
 729       /* clip left/right */
 730       if (left < minx)
 731          left = minx;
 732       if (right > maxx)
 733          right = maxx;
 734
 735       if (left < right) {
 736          int _y = sy + y;
 737          if (block(_y) != setup.span.y) {
 738             flush_spans();
 739             setup.span.y = block(_y);
 740          }
 741
 742          setup.span.left[_y&1] = left;
 743          setup.span.right[_y&1] = right;
 744          setup.span.y_flags |= 1<<(_y&1);
 745       }
 746    }
 747
 748
 749    /* save the values so that emaj can be restarted:
 750     */
 751    eleft->sx += lines * eleft->dxdy;
 752    eright->sx += lines * eright->dxdy;
 753    eleft->sy += lines;
 754    eright->sy += lines;
 755 }
 756
 757
 758 static float
 759 determinant(const float *v0, const float *v1, const float *v2)
 760 {
 761    /* edge vectors e = v0 - v2, f = v1 - v2 */
 762    const float ex = v0[0] - v2[0];
 763    const float ey = v0[1] - v2[1];
 764    const float fx = v1[0] - v2[0];
 765    const float fy = v1[1] - v2[1];
 766
 767    /* det = cross(e,f).z */
 768    return ex * fy - ey * fx;
 769 }
 770
 771
 772 /**
 773  * Draw triangle into tile at (tx, ty) (tile coords)
 774  * The tile data should have already been fetched.
 775  */
 776 boolean
 777 tri_draw(const float *v0, const float *v1, const float *v2,
 778          uint tx, uint ty)
 779 {
 780    setup.tx = tx;
 781    setup.ty = ty;
 782
 783    /* set clipping bounds to tile bounds */
 784    setup.cliprect_minx = tx * TILE_SIZE;
 785    setup.cliprect_miny = ty * TILE_SIZE;
 786    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
 787    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 788
 789    /* Before we sort vertices, determine the facing of the triangle,
 790     * which will be needed for front/back-face stencil application
 791     */
 792    float det = determinant(v0, v1, v2);
 793    setup.facing = (det > 0.0) ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
 794
 795    if (!setup_sort_vertices((struct vertex_header *) v0,
 796                             (struct vertex_header *) v1,
 797                             (struct vertex_header *) v2)) {
 798       return FALSE; /* totally clipped */
 799    }
 800
 801    setup_tri_coefficients();
 802    setup_tri_edges();
 803
 804    setup.span.y = 0;
 805    setup.span.y_flags = 0;
 806    setup.span.right[0] = 0;
 807    setup.span.right[1] = 0;
 808
 809    if (setup.oneOverArea < 0.0) {
 810       /* emaj on left */
 811       subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
 812       subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
 813    }
 814    else {
 815       /* emaj on right */
 816       subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
 817       subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
 818    }
 819
 820    flush_spans();
 821
 822    return TRUE;
 823 }