src/gallium/drivers/cell/spu/spu_tri.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * Triangle rendering within a tile.
  30  */
  31
  32 #include <transpose_matrix4x4.h>
  33 #include "pipe/p_compiler.h"
  34 #include "pipe/p_format.h"
  35 #include "util/u_math.h"
  36 #include "spu_colorpack.h"
  37 #include "spu_main.h"
  38 #include "spu_texture.h"
  39 #include "spu_tile.h"
  40 #include "spu_tri.h"
  41
  42
  43 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
  44 typedef vector unsigned int mask_t;
  45
  46
  47
  48 /**
  49  * Simplified types taken from other parts of Gallium
  50  */
  51 struct vertex_header {
  52    vector float data[1];
  53 };
  54
  55
  56
  /**
   * CEILF(X): smallest integral value >= X, as a float.
   *
   * Any earlier definition of CEILF is replaced.  The ceiling is computed by
   * truncating to int and correcting upwards, so it is exact for negative
   * values and for tiny fractional parts, and it stays in single precision.
   * X must be representable as an int.
   */
  #undef CEILF
  static inline float
  spu_tri_ceilf(float x)
  {
     const float truncated = (float) (int) x;   /* rounds toward zero */
     return (truncated < x) ? truncated + 1.0f : truncated;
  }
  #define CEILF(X) spu_tri_ceilf(X)
  60
  61
  /* Index of each fragment within a 2x2 quad */
  #define QUAD_TOP_LEFT     0
  #define QUAD_TOP_RIGHT    1
  #define QUAD_BOTTOM_LEFT  2
  #define QUAD_BOTTOM_RIGHT 3

  /* One bit per fragment, at the position given by its quad index */
  #define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT)
  #define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT)
  #define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT)
  #define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT)
  #define MASK_ALL          (MASK_TOP_LEFT | MASK_TOP_RIGHT | \
                             MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT)


  /* Set to 1 to compile in the code that prints each triangle's vertices */
  #define DEBUG_VERTS 0
  74
  /**
   * Per-edge state for one of the three triangle edges.
   * dx and dy are only needed while the triangle is being set up; the
   * remaining fields drive the scan conversion.
   */
  struct edge {
     float dx;            /**< X(v1) - X(v0) */
     float dy;            /**< Y(v1) - Y(v0) */
     float dxdy;          /**< dx/dy: change in x per scanline */
     float sx;            /**< x of the first sample point */
     float sy;            /**< y of the first sample point */
     int lines;           /**< number of scanlines covered by this edge */
  };
  85
  86
  87 struct interp_coef
  88 {
  89    vector float a0;
  90    vector float dadx;
  91    vector float dady;
  92 };
  93
  94
  95 /**
  96  * Triangle setup info (derived from draw_stage).
  97  * Also used for line drawing (taking some liberties).
  98  */
  99 struct setup_stage {
 100
 101    /* Vertices are just an array of floats making up each attribute in
 102     * turn.  Currently fixed at 4 floats, but should change in time.
 103     * Codegen will help cope with this.
 104     */
 105    const struct vertex_header *vmax;
 106    const struct vertex_header *vmid;
 107    const struct vertex_header *vmin;
 108    const struct vertex_header *vprovoke;
 109
 110    struct edge ebot;
 111    struct edge etop;
 112    struct edge emaj;
 113
 114    float oneOverArea;  /* XXX maybe make into vector? */
 115
 116    uint facing;
 117
 118    uint tx, ty;  /**< position of current tile (x, y) */
 119
 120    int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
 121
 122    struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
 123
 124    struct {
 125       int left[2];   /**< [0] = row0, [1] = row1 */
 126       int right[2];
 127       int y;
 128       unsigned y_flags;
 129       unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
 130    } span;
 131 };
 132
 133
 134 static struct setup_stage setup;
 135
 136
 137 /**
 138  * Evaluate attribute coefficients (plane equations) to compute
 139  * attribute values for the four fragments in a quad.
 140  * Eg: four colors will be computed (in AoS format).
 141  */
 142 static INLINE void
 143 eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
 144 {
 145    switch (spu.vertex_info.attrib[slot].interp_mode) {
 146    case INTERP_CONSTANT:
 147       result[QUAD_TOP_LEFT] =
 148       result[QUAD_TOP_RIGHT] =
 149       result[QUAD_BOTTOM_LEFT] =
 150       result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0;
 151       break;
 152    case INTERP_LINEAR:
 153       {
 154          vector float dadx = setup.coef[slot].dadx;
 155          vector float dady = setup.coef[slot].dady;
 156          vector float topLeft =
 157             spu_add(setup.coef[slot].a0,
 158                     spu_add(spu_mul(spu_splats(x), dadx),
 159                             spu_mul(spu_splats(y), dady)));
 160
 161          result[QUAD_TOP_LEFT] = topLeft;
 162          result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
 163          result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
 164          result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
 165       }
 166       break;
 167    case INTERP_PERSPECTIVE:
 168       {
 169          vector float dadx = setup.coef[slot].dadx;
 170          vector float dady = setup.coef[slot].dady;
 171          vector float topLeft =
 172             spu_add(setup.coef[slot].a0,
 173                     spu_add(spu_mul(spu_splats(x), dadx),
 174                             spu_mul(spu_splats(y), dady)));
 175
 176          vector float wInv = spu_re(w);  /* 1.0 / w */
 177
 178          result[QUAD_TOP_LEFT] = spu_mul(topLeft, wInv);
 179          result[QUAD_TOP_RIGHT] = spu_mul(spu_add(topLeft, dadx), wInv);
 180          result[QUAD_BOTTOM_LEFT] = spu_mul(spu_add(topLeft, dady), wInv);
 181          result[QUAD_BOTTOM_RIGHT] = spu_mul(spu_add(spu_add(topLeft, dadx), dady), wInv);
 182       }
 183       break;
 184    case INTERP_POS:
 185    case INTERP_NONE:
 186       break;
 187    default:
 188       ASSERT(0);
 189    }
 190 }
 191
 192
 193 /**
 194  * As above, but return 4 vectors in SOA format.
 195  * XXX this will all be re-written someday.
 196  */
 197 static INLINE void
 198 eval_coeff_soa(uint slot, float x, float y, vector float w, vector float result[4])
 199 {
 200    eval_coeff(slot, x, y, w, result);
 201    _transpose_matrix4x4(result, result);
 202 }
 203
 204
 205 /** Evalute coefficients to get Z for four pixels in a quad */
 206 static INLINE vector float
 207 eval_z(float x, float y)
 208 {
 209    const uint slot = 0;
 210    const float dzdx = spu_extract(setup.coef[slot].dadx, 2);
 211    const float dzdy = spu_extract(setup.coef[slot].dady, 2);
 212    const float topLeft = spu_extract(setup.coef[slot].a0, 2) + x * dzdx + y * dzdy;
 213    const vector float topLeftv = spu_splats(topLeft);
 214    const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
 215    return spu_add(topLeftv, derivs);
 216 }
 217
 218
 219 /** Evalute coefficients to get W for four pixels in a quad */
 220 static INLINE vector float
 221 eval_w(float x, float y)
 222 {
 223    const uint slot = 0;
 224    const float dwdx = spu_extract(setup.coef[slot].dadx, 3);
 225    const float dwdy = spu_extract(setup.coef[slot].dady, 3);
 226    const float topLeft = spu_extract(setup.coef[slot].a0, 3) + x * dwdx + y * dwdy;
 227    const vector float topLeftv = spu_splats(topLeft);
 228    const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy };
 229    return spu_add(topLeftv, derivs);
 230 }
 231
 232
 233 /**
 234  * Emit a quad (pass to next stage).  No clipping is done.
 235  * Note: about 1/5 to 1/7 of the time, mask is zero and this function
 236  * should be skipped.  But adding the test for that slows things down
 237  * overall.
 238  */
 239 static INLINE void
 240 emit_quad( int x, int y, mask_t mask)
 241 {
 242    /* If any bits in mask are set... */
 243    if (spu_extract(spu_orx(mask), 0)) {
 244       const int ix = x - setup.cliprect_minx;
 245       const int iy = y - setup.cliprect_miny;
 246
 247       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 248       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 249
 250       {
 251          /*
 252           * Run fragment shader, execute per-fragment ops, update fb/tile.
 253           */
 254          vector float inputs[4*4], outputs[2*4];
 255          vector float fragZ = eval_z((float) x, (float) y);
 256          vector float fragW = eval_w((float) x, (float) y);
 257          vector unsigned int kill_mask;
 258
 259          /* setup inputs */
 260 #if 0
 261          eval_coeff_soa(1, (float) x, (float) y, fragW, inputs);
 262 #else
 263          uint i;
 264          for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 265             eval_coeff_soa(i+1, (float) x, (float) y, fragW, inputs + i * 4);
 266          }
 267 #endif
 268          ASSERT(spu.fragment_program);
 269          ASSERT(spu.fragment_ops);
 270
 271          /* Execute the current fragment program */
 272          kill_mask = spu.fragment_program(inputs, outputs, spu.constants);
 273
 274          mask = spu_andc(mask, kill_mask);
 275
 276          /* Execute per-fragment/quad operations, including:
 277           * alpha test, z test, stencil test, blend and framebuffer writing.
 278           * Note that there are two different fragment operations functions
 279           * that can be called, one for front-facing fragments, and one
 280           * for back-facing fragments.  (Often the two are the same;
 281           * but in some cases, like two-sided stenciling, they can be
 282           * very different.)  So choose the correct function depending
 283           * on the calculated facing.
 284           */
 285          spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
 286                           fragZ,
 287                           outputs[0*4+0],
 288                           outputs[0*4+1],
 289                           outputs[0*4+2],
 290                           outputs[0*4+3],
 291                           mask);
 292       }
 293    }
 294 }
 295
 296
 /**
  * Map an X or Y coordinate to the coordinate of the 2x2 block/quad that
  * contains it, i.e. the same value with its lowest bit cleared.
  */
 static INLINE int
 block(int x)
 {
    const int low_bit = x & 1;

    return x ^ low_bit;
 }
 306
 307
 308 /**
 309  * Compute mask which indicates which pixels in the 2x2 quad are actually inside
 310  * the triangle's bounds.
 311  * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
 312  */
 313 static INLINE mask_t
 314 calculate_mask(int x)
 315 {
 316    /* This is a little tricky.
 317     * Use & instead of && to avoid branches.
 318     * Use negation to convert true/false to ~0/0 values.
 319     */
 320    mask_t mask;
 321    mask = spu_insert(-((x   >= setup.span.left[0]) & (x   < setup.span.right[0])), mask, 0);
 322    mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
 323    mask = spu_insert(-((x   >= setup.span.left[1]) & (x   < setup.span.right[1])), mask, 2);
 324    mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
 325    return mask;
 326 }
 327
 328
 329 /**
 330  * Render a horizontal span of quads
 331  */
 332 static void
 333 flush_spans(void)
 334 {
 335    int minleft, maxright;
 336    int x;
 337
 338    switch (setup.span.y_flags) {
 339    case 0x3:
 340       /* both odd and even lines written (both quad rows) */
 341       minleft = MIN2(setup.span.left[0], setup.span.left[1]);
 342       maxright = MAX2(setup.span.right[0], setup.span.right[1]);
 343       break;
 344
 345    case 0x1:
 346       /* only even line written (quad top row) */
 347       minleft = setup.span.left[0];
 348       maxright = setup.span.right[0];
 349       break;
 350
 351    case 0x2:
 352       /* only odd line written (quad bottom row) */
 353       minleft = setup.span.left[1];
 354       maxright = setup.span.right[1];
 355       break;
 356
 357    default:
 358       return;
 359    }
 360
 361    /* OK, we're very likely to need the tile data now.
 362     * clear or finish waiting if needed.
 363     */
 364    if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
 365       /* wait for mfc_get() to complete */
 366       //printf("SPU: %u: waiting for ctile\n", spu.init.id);
 367       wait_on_mask(1 << TAG_READ_TILE_COLOR);
 368       spu.cur_ctile_status = TILE_STATUS_CLEAN;
 369    }
 370    else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
 371       //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
 372       clear_c_tile(&spu.ctile);
 373       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 374    }
 375    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 376
 377    if (spu.read_depth_stencil) {
 378       if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
 379          /* wait for mfc_get() to complete */
 380          //printf("SPU: %u: waiting for ztile\n", spu.init.id);
 381          wait_on_mask(1 << TAG_READ_TILE_Z);
 382          spu.cur_ztile_status = TILE_STATUS_CLEAN;
 383       }
 384       else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
 385          //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
 386          clear_z_tile(&spu.ztile);
 387          spu.cur_ztile_status = TILE_STATUS_DIRTY;
 388       }
 389       ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
 390    }
 391
 392    /* XXX this loop could be moved into the above switch cases and
 393     * calculate_mask() could be simplified a bit...
 394     */
 395    for (x = block(minleft); x <= block(maxright); x += 2) {
 396       emit_quad( x, setup.span.y, calculate_mask( x ));
 397    }
 398
 399    setup.span.y = 0;
 400    setup.span.y_flags = 0;
 401    setup.span.right[0] = 0;
 402    setup.span.right[1] = 0;
 403 }
 404
 405
 #if DEBUG_VERTS
 /**
  * Debug helper: print the address of a vertex and the four floats of
  * each of its attributes to stderr.
  */
 static void
 print_vertex(const struct vertex_header *v)
 {
    uint i;
    /* %p requires a void pointer */
    fprintf(stderr, "  Vertex: (%p)\n", (const void *) v);
    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
       /* i is unsigned, so print it with %u */
       fprintf(stderr, "    %u: %f %f %f %f\n",  i,
               spu_extract(v->data[i], 0),
               spu_extract(v->data[i], 1),
               spu_extract(v->data[i], 2),
               spu_extract(v->data[i], 3));
    }
 }
 #endif
 421
 422
 423 /**
 424  * Sort vertices from top to bottom.
 425  * Compute area and determine front vs. back facing.
 426  * Do coarse clip test against tile bounds
 427  * \return  FALSE if tri is totally outside tile, TRUE otherwise
 428  */
 429 static boolean
 430 setup_sort_vertices(const struct vertex_header *v0,
 431                     const struct vertex_header *v1,
 432                     const struct vertex_header *v2)
 433 {
 434    float area, sign;
 435
 436 #if DEBUG_VERTS
 437    if (spu.init.id==0) {
 438       fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id);
 439       print_vertex(v0);
 440       print_vertex(v1);
 441       print_vertex(v2);
 442    }
 443 #endif
 444
 445    /* determine bottom to top order of vertices */
 446    {
 447       float y0 = spu_extract(v0->data[0], 1);
 448       float y1 = spu_extract(v1->data[0], 1);
 449       float y2 = spu_extract(v2->data[0], 1);
 450       if (y0 <= y1) {
 451          if (y1 <= y2) {
 452             /* y0<=y1<=y2 */
 453             setup.vmin = v0;
 454             setup.vmid = v1;
 455             setup.vmax = v2;
 456             sign = -1.0f;
 457          }
 458          else if (y2 <= y0) {
 459             /* y2<=y0<=y1 */
 460             setup.vmin = v2;
 461             setup.vmid = v0;
 462             setup.vmax = v1;
 463             sign = -1.0f;
 464          }
 465          else {
 466             /* y0<=y2<=y1 */
 467             setup.vmin = v0;
 468             setup.vmid = v2;
 469             setup.vmax = v1;
 470             sign = 1.0f;
 471          }
 472       }
 473       else {
 474          if (y0 <= y2) {
 475             /* y1<=y0<=y2 */
 476             setup.vmin = v1;
 477             setup.vmid = v0;
 478             setup.vmax = v2;
 479             sign = 1.0f;
 480          }
 481          else if (y2 <= y1) {
 482             /* y2<=y1<=y0 */
 483             setup.vmin = v2;
 484             setup.vmid = v1;
 485             setup.vmax = v0;
 486             sign = 1.0f;
 487          }
 488          else {
 489             /* y1<=y2<=y0 */
 490             setup.vmin = v1;
 491             setup.vmid = v2;
 492             setup.vmax = v0;
 493             sign = -1.0f;
 494          }
 495       }
 496    }
 497
 498    /* Check if triangle is completely outside the tile bounds */
 499    if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
 500       return FALSE;
 501    if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
 502       return FALSE;
 503    if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
 504        spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
 505        spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
 506       return FALSE;
 507    if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
 508        spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
 509        spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
 510       return FALSE;
 511
 512    setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
 513    setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
 514    setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
 515    setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
 516    setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0);
 517    setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1);
 518
 519    /*
 520     * Compute triangle's area.  Use 1/area to compute partial
 521     * derivatives of attributes later.
 522     */
 523    area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy;
 524
 525    setup.oneOverArea = 1.0f / area;
 526
 527    /* The product of area * sign indicates front/back orientation (0/1).
 528     * Just in case someone gets the bright idea of switching the front
 529     * and back constants without noticing that we're assuming their
 530     * values in this operation, also assert that the values are
 531     * what we think they are.
 532     */
 533    ASSERT(CELL_FACING_FRONT == 0);
 534    ASSERT(CELL_FACING_BACK == 1);
 535    setup.facing = (area * sign > 0.0f)
 536       ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
 537
 538    setup.vprovoke = v2;
 539
 540    return TRUE;
 541 }
 542
 543
 544 /**
 545  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
 546  * The value value comes from vertex->data[slot].
 547  * The result will be put into setup.coef[slot].a0.
 548  * \param slot  which attribute slot
 549  */
 550 static INLINE void
 551 const_coeff4(uint slot)
 552 {
 553    setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0};
 554    setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0};
 555    setup.coef[slot].a0 = setup.vprovoke->data[slot];
 556 }
 557
 558
 559 /**
 560  * As above, but interp setup all four vector components.
 561  */
 562 static INLINE void
 563 tri_linear_coeff4(uint slot)
 564 {
 565    const vector float vmin_d = setup.vmin->data[slot];
 566    const vector float vmid_d = setup.vmid->data[slot];
 567    const vector float vmax_d = setup.vmax->data[slot];
 568    const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
 569    const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
 570
 571    vector float botda = vmid_d - vmin_d;
 572    vector float majda = vmax_d - vmin_d;
 573
 574    vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
 575                             spu_mul(botda, spu_splats(setup.emaj.dy)));
 576    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
 577                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 578
 579    setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
 580    setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
 581
 582    vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
 583    vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
 584
 585    setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 586 }
 587
 588
 589 /**
 590  * Compute a0, dadx and dady for a perspective-corrected interpolant,
 591  * for a triangle.
 592  * We basically multiply the vertex value by 1/w before computing
 593  * the plane coefficients (a0, dadx, dady).
 594  * Later, when we compute the value at a particular fragment position we'll
 595  * divide the interpolated value by the interpolated W at that fragment.
 596  */
 597 static void
 598 tri_persp_coeff4(uint slot)
 599 {
 600    const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
 601    const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
 602
 603    const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3));
 604    const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3));
 605    const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3));
 606
 607    vector float vmin_d = setup.vmin->data[slot];
 608    vector float vmid_d = setup.vmid->data[slot];
 609    vector float vmax_d = setup.vmax->data[slot];
 610
 611    vmin_d = spu_mul(vmin_d, vmin_w);
 612    vmid_d = spu_mul(vmid_d, vmid_w);
 613    vmax_d = spu_mul(vmax_d, vmax_w);
 614
 615    vector float botda = vmid_d - vmin_d;
 616    vector float majda = vmax_d - vmin_d;
 617
 618    vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
 619                             spu_mul(botda, spu_splats(setup.emaj.dy)));
 620    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
 621                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 622
 623    setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
 624    setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
 625
 626    vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
 627    vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
 628
 629    setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 630 }
 631
 632
 633
 634 /**
 635  * Compute the setup.coef[] array dadx, dady, a0 values.
 636  * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
 637  */
 638 static void
 639 setup_tri_coefficients(void)
 640 {
 641    uint i;
 642
 643    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 644       switch (spu.vertex_info.attrib[i].interp_mode) {
 645       case INTERP_NONE:
 646          break;
 647       case INTERP_CONSTANT:
 648          const_coeff4(i);
 649          break;
 650       case INTERP_POS:
 651          /* fall-through */
 652       case INTERP_LINEAR:
 653          tri_linear_coeff4(i);
 654          break;
 655       case INTERP_PERSPECTIVE:
 656          tri_persp_coeff4(i);
 657          break;
 658       default:
 659          ASSERT(0);
 660       }
 661    }
 662 }
 663
 664
 665 static void
 666 setup_tri_edges(void)
 667 {
 668    float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
 669    float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
 670
 671    float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
 672    float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
 673    float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
 674
 675    setup.emaj.sy = CEILF(vmin_y);
 676    setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
 677    setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
 678    setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
 679
 680    setup.etop.sy = CEILF(vmid_y);
 681    setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
 682    setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
 683    setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
 684
 685    setup.ebot.sy = CEILF(vmin_y);
 686    setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
 687    setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
 688    setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
 689 }
 690
 691
 692 /**
 693  * Render the upper or lower half of a triangle.
 694  * Scissoring/cliprect is applied here too.
 695  */
 696 static void
 697 subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
 698 {
 699    const int minx = setup.cliprect_minx;
 700    const int maxx = setup.cliprect_maxx;
 701    const int miny = setup.cliprect_miny;
 702    const int maxy = setup.cliprect_maxy;
 703    int y, start_y, finish_y;
 704    int sy = (int)eleft->sy;
 705
 706    ASSERT((int)eleft->sy == (int) eright->sy);
 707
 708    /* clip top/bottom */
 709    start_y = sy;
 710    finish_y = sy + lines;
 711
 712    if (start_y < miny)
 713       start_y = miny;
 714
 715    if (finish_y > maxy)
 716       finish_y = maxy;
 717
 718    start_y -= sy;
 719    finish_y -= sy;
 720
 721    /*
 722    _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);
 723    */
 724
 725    for (y = start_y; y < finish_y; y++) {
 726
 727       /* avoid accumulating adds as floats don't have the precision to
 728        * accurately iterate large triangle edges that way.  luckily we
 729        * can just multiply these days.
 730        *
 731        * this is all drowned out by the attribute interpolation anyway.
 732        */
 733       int left = (int)(eleft->sx + y * eleft->dxdy);
 734       int right = (int)(eright->sx + y * eright->dxdy);
 735
 736       /* clip left/right */
 737       if (left < minx)
 738          left = minx;
 739       if (right > maxx)
 740          right = maxx;
 741
 742       if (left < right) {
 743          int _y = sy + y;
 744          if (block(_y) != setup.span.y) {
 745             flush_spans();
 746             setup.span.y = block(_y);
 747          }
 748
 749          setup.span.left[_y&1] = left;
 750          setup.span.right[_y&1] = right;
 751          setup.span.y_flags |= 1<<(_y&1);
 752       }
 753    }
 754
 755
 756    /* save the values so that emaj can be restarted:
 757     */
 758    eleft->sx += lines * eleft->dxdy;
 759    eright->sx += lines * eright->dxdy;
 760    eleft->sy += lines;
 761    eright->sy += lines;
 762 }
 763
 764
 765 /**
 766  * Draw triangle into tile at (tx, ty) (tile coords)
 767  * The tile data should have already been fetched.
 768  */
 769 boolean
 770 tri_draw(const float *v0, const float *v1, const float *v2,
 771          uint tx, uint ty)
 772 {
 773    setup.tx = tx;
 774    setup.ty = ty;
 775
 776    /* set clipping bounds to tile bounds */
 777    setup.cliprect_minx = tx * TILE_SIZE;
 778    setup.cliprect_miny = ty * TILE_SIZE;
 779    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
 780    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 781
 782    if (!setup_sort_vertices((struct vertex_header *) v0,
 783                             (struct vertex_header *) v1,
 784                             (struct vertex_header *) v2)) {
 785       return FALSE; /* totally clipped */
 786    }
 787
 788    setup_tri_coefficients();
 789    setup_tri_edges();
 790
 791    setup.span.y = 0;
 792    setup.span.y_flags = 0;
 793    setup.span.right[0] = 0;
 794    setup.span.right[1] = 0;
 795
 796    if (setup.oneOverArea < 0.0) {
 797       /* emaj on left */
 798       subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
 799       subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
 800    }
 801    else {
 802       /* emaj on right */
 803       subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
 804       subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
 805    }
 806
 807    flush_spans();
 808
 809    return TRUE;
 810 }