src/gallium/drivers/cell/spu/spu_tri.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * Triangle rendering within a tile.
  30  */
  31
  32 #include "pipe/p_compiler.h"
  33 #include "pipe/p_format.h"
  34 #include "util/u_math.h"
  35 #include "spu_colorpack.h"
  36 #include "spu_main.h"
  37 #include "spu_shuffle.h"
  38 #include "spu_texture.h"
  39 #include "spu_tile.h"
  40 #include "spu_tri.h"
  41
  42
  43 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
  44 typedef vector unsigned int mask_t;
  45
  46
  47
  48 /**
  49  * Simplified types taken from other parts of Gallium
  50  */
  51 struct vertex_header {
  52    vector float data[1];
  53 };
  54
  55
  56
  57 /* XXX fix this */
  58 #undef CEILF
  59 #define CEILF(X) ((float) (int) ((X) + 0.99999f))
  60
  61
  62 #define QUAD_TOP_LEFT     0
  63 #define QUAD_TOP_RIGHT    1
  64 #define QUAD_BOTTOM_LEFT  2
  65 #define QUAD_BOTTOM_RIGHT 3
  66 #define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT)
  67 #define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT)
  68 #define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT)
  69 #define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT)
  70 #define MASK_ALL          0xf
  71
  72
  73 #define CHAN0 0
  74 #define CHAN1 1
  75 #define CHAN2 2
  76 #define CHAN3 3
  77
  78
  79 #define DEBUG_VERTS 0
  80
  81 /**
  82  * Triangle edge info
  83  */
  84 struct edge {
  85    union {
  86       struct {
  87          float dx;      /**< X(v1) - X(v0), used only during setup */
  88          float dy;      /**< Y(v1) - Y(v0), used only during setup */
  89       };
  90       vec_float4 ds;    /**< vector accessor for dx and dy */
  91    };
  92    float dxdy;          /**< dx/dy */
  93    float sx, sy;        /**< first sample point coord */
  94    int lines;           /**< number of lines on this edge */
  95 };
  96
  97
  98 struct interp_coef
  99 {
 100    vector float a0;
 101    vector float dadx;
 102    vector float dady;
 103 };
 104
 105
 106 /**
 107  * Triangle setup info (derived from draw_stage).
 108  * Also used for line drawing (taking some liberties).
 109  */
 110 struct setup_stage {
 111
 112    /* Vertices are just an array of floats making up each attribute in
 113     * turn.  Currently fixed at 4 floats, but should change in time.
 114     * Codegen will help cope with this.
 115     */
 116    union {
 117       struct {
 118          const struct vertex_header *vmin;
 119          const struct vertex_header *vmid;
 120          const struct vertex_header *vmax;
 121          const struct vertex_header *vprovoke;
 122       };
 123       qword vertex_headers;
 124    };
 125
 126    struct edge ebot;
 127    struct edge etop;
 128    struct edge emaj;
 129
 130    float oneOverArea;  /* XXX maybe make into vector? */
 131
 132    uint facing;
 133
 134    uint tx, ty;  /**< position of current tile (x, y) */
 135
 136    int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
 137
 138    struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
 139
 140    struct {
 141       vec_int4 quad; /**< [0] = row0, [1] = row1; {left[0],left[1],right[0],right[1]} */
 142       int y;
 143       unsigned y_flags;
 144       unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
 145    } span;
 146 };
 147
 148
 149 static struct setup_stage setup;
 150
 151
 152 static INLINE vector float
 153 splatx(vector float v)
 154 {
 155    return spu_splats(spu_extract(v, CHAN0));
 156 }
 157
 158 static INLINE vector float
 159 splaty(vector float v)
 160 {
 161    return spu_splats(spu_extract(v, CHAN1));
 162 }
 163
 164 static INLINE vector float
 165 splatz(vector float v)
 166 {
 167    return spu_splats(spu_extract(v, CHAN2));
 168 }
 169
 170 static INLINE vector float
 171 splatw(vector float v)
 172 {
 173    return spu_splats(spu_extract(v, CHAN3));
 174 }
 175
 176
 177 /**
 178  * Setup fragment shader inputs by evaluating triangle's vertex
 179  * attribute coefficient info.
 180  * \param x  quad x pos
 181  * \param y  quad y pos
 182  * \param fragZ  returns quad Z values
 183  * \param fragInputs  returns fragment program inputs
 184  * Note: this code could be incorporated into the fragment program
 185  * itself to avoid the loop and switch.
 186  */
 187 static void
 188 eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[])
 189 {
 190    static const vector float deltaX = (const vector float) {0, 1, 0, 1};
 191    static const vector float deltaY = (const vector float) {0, 0, 1, 1};
 192
 193    const uint posSlot = 0;
 194    const vector float pos = setup.coef[posSlot].a0;
 195    const vector float dposdx = setup.coef[posSlot].dadx;
 196    const vector float dposdy = setup.coef[posSlot].dady;
 197    const vector float fragX = spu_splats(x) + deltaX;
 198    const vector float fragY = spu_splats(y) + deltaY;
 199    vector float fragW, wInv;
 200    uint i;
 201
 202    *fragZ = splatz(pos) + fragX * splatz(dposdx) + fragY * splatz(dposdy);
 203    fragW =  splatw(pos) + fragX * splatw(dposdx) + fragY * splatw(dposdy);
 204    wInv = spu_re(fragW);  /* 1 / w */
 205
 206    /* loop over fragment program inputs */
 207    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 208       uint attr = i + 1;
 209       enum interp_mode interp = spu.vertex_info.attrib[attr].interp_mode;
 210
 211       /* constant term */
 212       vector float a0 = setup.coef[attr].a0;
 213       vector float r0 = splatx(a0);
 214       vector float r1 = splaty(a0);
 215       vector float r2 = splatz(a0);
 216       vector float r3 = splatw(a0);
 217
 218       if (interp == INTERP_LINEAR || interp == INTERP_PERSPECTIVE) {
 219          /* linear term */
 220          vector float dadx = setup.coef[attr].dadx;
 221          vector float dady = setup.coef[attr].dady;
 222          /* Use SPU intrinsics here to get slightly better code.
 223           * originally: r0 += fragX * splatx(dadx) + fragY * splatx(dady);
 224           */
 225          r0 = spu_madd(fragX, splatx(dadx), spu_madd(fragY, splatx(dady), r0));
 226          r1 = spu_madd(fragX, splaty(dadx), spu_madd(fragY, splaty(dady), r1));
 227          r2 = spu_madd(fragX, splatz(dadx), spu_madd(fragY, splatz(dady), r2));
 228          r3 = spu_madd(fragX, splatw(dadx), spu_madd(fragY, splatw(dady), r3));
 229          if (interp == INTERP_PERSPECTIVE) {
 230             /* perspective term */
 231             r0 *= wInv;
 232             r1 *= wInv;
 233             r2 *= wInv;
 234             r3 *= wInv;
 235          }
 236       }
 237       fragInputs[CHAN0] = r0;
 238       fragInputs[CHAN1] = r1;
 239       fragInputs[CHAN2] = r2;
 240       fragInputs[CHAN3] = r3;
 241       fragInputs += 4;
 242    }
 243 }
 244
 245
 246 /**
 247  * Emit a quad (pass to next stage).  No clipping is done.
 248  * Note: about 1/5 to 1/7 of the time, mask is zero and this function
 249  * should be skipped.  But adding the test for that slows things down
 250  * overall.
 251  */
 252 static INLINE void
 253 emit_quad( int x, int y, mask_t mask)
 254 {
 255    /* If any bits in mask are set... */
 256    if (spu_extract(spu_orx(mask), 0)) {
 257       const int ix = x - setup.cliprect_minx;
 258       const int iy = y - setup.cliprect_miny;
 259
 260       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 261       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 262
 263       {
 264          /*
 265           * Run fragment shader, execute per-fragment ops, update fb/tile.
 266           */
 267          vector float inputs[4*4], outputs[2*4];
 268          vector unsigned int kill_mask;
 269          vector float fragZ;
 270
 271          eval_inputs((float) x, (float) y, &fragZ, inputs);
 272
 273          ASSERT(spu.fragment_program);
 274          ASSERT(spu.fragment_ops);
 275
 276          /* Execute the current fragment program */
 277          kill_mask = spu.fragment_program(inputs, outputs, spu.constants);
 278
 279          mask = spu_andc(mask, kill_mask);
 280
 281          /* Execute per-fragment/quad operations, including:
 282           * alpha test, z test, stencil test, blend and framebuffer writing.
 283           * Note that there are two different fragment operations functions
 284           * that can be called, one for front-facing fragments, and one
 285           * for back-facing fragments.  (Often the two are the same;
 286           * but in some cases, like two-sided stenciling, they can be
 287           * very different.)  So choose the correct function depending
 288           * on the calculated facing.
 289           */
 290          spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
 291                           fragZ,
 292                           outputs[0*4+0],
 293                           outputs[0*4+1],
 294                           outputs[0*4+2],
 295                           outputs[0*4+3],
 296                           mask);
 297       }
 298    }
 299 }
 300
 301
 302 /**
 303  * Given an X or Y coordinate, return the block/quad coordinate that it
 304  * belongs to.
 305  */
 306 static INLINE int
 307 block(int x)
 308 {
 309    return x & ~1;
 310 }
 311
 312
 313 /**
 314  * Render a horizontal span of quads
 315  */
 316 static void
 317 flush_spans(void)
 318 {
 319    int minleft, maxright;
 320
 321    const int l0 = spu_extract(setup.span.quad, 0);
 322    const int l1 = spu_extract(setup.span.quad, 1);
 323    const int r0 = spu_extract(setup.span.quad, 2);
 324    const int r1 = spu_extract(setup.span.quad, 3);
 325
 326    switch (setup.span.y_flags) {
 327    case 0x3:
 328       /* both odd and even lines written (both quad rows) */
 329       minleft = MIN2(l0, l1);
 330       maxright = MAX2(r0, r1);
 331       break;
 332
 333    case 0x1:
 334       /* only even line written (quad top row) */
 335       minleft = l0;
 336       maxright = r0;
 337       break;
 338
 339    case 0x2:
 340       /* only odd line written (quad bottom row) */
 341       minleft = l1;
 342       maxright = r1;
 343       break;
 344
 345    default:
 346       return;
 347    }
 348
 349    /* OK, we're very likely to need the tile data now.
 350     * clear or finish waiting if needed.
 351     */
 352    if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
 353       /* wait for mfc_get() to complete */
 354       //printf("SPU: %u: waiting for ctile\n", spu.init.id);
 355       wait_on_mask(1 << TAG_READ_TILE_COLOR);
 356       spu.cur_ctile_status = TILE_STATUS_CLEAN;
 357    }
 358    else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
 359       //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
 360       clear_c_tile(&spu.ctile);
 361       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 362    }
 363    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 364
 365    if (spu.read_depth_stencil) {
 366       if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
 367          /* wait for mfc_get() to complete */
 368          //printf("SPU: %u: waiting for ztile\n", spu.init.id);
 369          wait_on_mask(1 << TAG_READ_TILE_Z);
 370          spu.cur_ztile_status = TILE_STATUS_CLEAN;
 371       }
 372       else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
 373          //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
 374          clear_z_tile(&spu.ztile);
 375          spu.cur_ztile_status = TILE_STATUS_DIRTY;
 376       }
 377       ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
 378    }
 379
 380    /* XXX this loop could be moved into the above switch cases... */
 381
 382    /* Setup for mask calculation */
 383    const vec_int4 quad_LlRr = setup.span.quad;
 384    const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8);
 385    const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B));
 386    const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B));
 387
 388    const vec_int4 twos = spu_splats(2);
 389
 390    const int x = block(minleft);
 391    vec_int4 xs = {x, x+1, x, x+1};
 392
 393    for (; spu_extract(xs, 0) <= block(maxright); xs += twos) {
 394       /**
 395        * Computes mask to indicate which pixels in the 2x2 quad are actually
 396        * inside the triangle's bounds.
 397        */
 398
 399       /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */
 400       const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs);
 401       const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs);
 402
 403       /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */
 404       const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs);
 405
 406       /* Combine results to create mask */
 407       const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs);
 408
 409       emit_quad(spu_extract(xs, 0), setup.span.y, mask);
 410    }
 411
 412    setup.span.y = 0;
 413    setup.span.y_flags = 0;
 414    /* Zero right elements */
 415    setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
 416 }
 417
 418
 419 #if DEBUG_VERTS
 420 static void
 421 print_vertex(const struct vertex_header *v)
 422 {
 423    uint i;
 424    fprintf(stderr, "  Vertex: (%p)\n", v);
 425    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 426       fprintf(stderr, "    %d: %f %f %f %f\n",  i,
 427               spu_extract(v->data[i], 0),
 428               spu_extract(v->data[i], 1),
 429               spu_extract(v->data[i], 2),
 430               spu_extract(v->data[i], 3));
 431    }
 432 }
 433 #endif
 434
 435
 436 /**
 437  * Sort vertices from top to bottom.
 438  * Compute area and determine front vs. back facing.
 439  * Do coarse clip test against tile bounds
 440  * \return  FALSE if tri is totally outside tile, TRUE otherwise
 441  */
 442 static boolean
 443 setup_sort_vertices(const qword vs)
 444 {
 445    float area, sign;
 446
 447 #if DEBUG_VERTS
 448    if (spu.init.id==0) {
 449       fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id);
 450       print_vertex(v0);
 451       print_vertex(v1);
 452       print_vertex(v2);
 453    }
 454 #endif
 455
 456    /* determine bottom to top order of vertices */
 457    {
 458       /* A table of shuffle patterns for putting vertex_header pointers into
 459          correct order.  Quite magical. */
 460       const qword sort_order_patterns[] = {
 461          SHUFB4(A,B,C,C),
 462          SHUFB4(C,A,B,C),
 463          SHUFB4(A,C,B,C),
 464          SHUFB4(B,C,A,C),
 465          SHUFB4(B,A,C,C),
 466          SHUFB4(C,B,A,C) };
 467
 468       /* Collate y values into two vectors for comparison.
 469          Using only one shuffle constant! ;) */
 470       const vector float f0 = ((const struct vertex_header*)si_to_ptr(vs))->data[0];
 471       const vector float f1 = ((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 4)))->data[0];
 472       const vector float f2 = ((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 8)))->data[0];
 473       const vec_float4 y_02_ = spu_shuffle(f0, f2, SHUFFLE4(0,B,b,C));
 474       const vec_float4 y_10_ = spu_shuffle(f1, f0, SHUFFLE4(0,B,b,C));
 475       const vec_float4 y_012 = spu_shuffle(y_02_, f1, SHUFFLE4(0,B,b,C));
 476       const vec_float4 y_120 = spu_shuffle(y_10_, f2, SHUFFLE4(0,B,b,C));
 477
 478       /* Perform comparison: {y0,y1,y2} > {y1,y2,y0} */
 479       const vec_uint4 compare = spu_cmpgt(y_012, y_120);
 480       /* Compress the result of the comparison into 4 bits */
 481       const vec_uint4 gather = spu_gather(compare);
 482       /* Subtract one to attain the index into the LUT.  Magical. */
 483       const unsigned int index = spu_extract(gather, 0) - 1;
 484
 485       /* Load the appropriate pattern and construct the desired vector. */
 486       setup.vertex_headers = si_shufb(vs, vs, sort_order_patterns[index]);
 487
 488       /* Using the result of the comparison, set sign.
 489          Very magical. */
 490       sign = ((si_to_uint(si_cntb((qword)gather)) == 2) ? 1.0f : -1.0f);
 491    }
 492
 493    /* Check if triangle is completely outside the tile bounds */
 494    if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
 495       return FALSE;
 496    if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
 497       return FALSE;
 498    if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
 499        spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
 500        spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
 501       return FALSE;
 502    if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
 503        spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
 504        spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
 505       return FALSE;
 506
 507    setup.ebot.ds = spu_sub(setup.vmid->data[0], setup.vmin->data[0]);
 508    setup.emaj.ds = spu_sub(setup.vmax->data[0], setup.vmin->data[0]);
 509    setup.etop.ds = spu_sub(setup.vmax->data[0], setup.vmid->data[0]);
 510
 511    /*
 512     * Compute triangle's area.  Use 1/area to compute partial
 513     * derivatives of attributes later.
 514     */
 515    area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy;
 516
 517    setup.oneOverArea = 1.0f / area;
 518
 519    /* The product of area * sign indicates front/back orientation (0/1).
 520     * Just in case someone gets the bright idea of switching the front
 521     * and back constants without noticing that we're assuming their
 522     * values in this operation, also assert that the values are
 523     * what we think they are.
 524     */
 525    ASSERT(CELL_FACING_FRONT == 0);
 526    ASSERT(CELL_FACING_BACK == 1);
 527    setup.facing = (area * sign > 0.0f)
 528       ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
 529
 530    return TRUE;
 531 }
 532
 533
 534 /**
 535  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
 536  * The value value comes from vertex->data[slot].
 537  * The result will be put into setup.coef[slot].a0.
 538  * \param slot  which attribute slot
 539  */
 540 static INLINE void
 541 const_coeff4(uint slot)
 542 {
 543    setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0};
 544    setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0};
 545    setup.coef[slot].a0 = setup.vprovoke->data[slot];
 546 }
 547
 548
 549 /**
 550  * As above, but interp setup all four vector components.
 551  */
 552 static INLINE void
 553 tri_linear_coeff4(uint slot)
 554 {
 555    const vector float vmin_d = setup.vmin->data[slot];
 556    const vector float vmid_d = setup.vmid->data[slot];
 557    const vector float vmax_d = setup.vmax->data[slot];
 558    const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
 559    const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
 560
 561    vector float botda = vmid_d - vmin_d;
 562    vector float majda = vmax_d - vmin_d;
 563
 564    vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
 565                             spu_mul(botda, spu_splats(setup.emaj.dy)));
 566    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
 567                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 568
 569    setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
 570    setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
 571
 572    vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
 573    vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
 574
 575    setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 576 }
 577
 578
 579 /**
 580  * Compute a0, dadx and dady for a perspective-corrected interpolant,
 581  * for a triangle.
 582  * We basically multiply the vertex value by 1/w before computing
 583  * the plane coefficients (a0, dadx, dady).
 584  * Later, when we compute the value at a particular fragment position we'll
 585  * divide the interpolated value by the interpolated W at that fragment.
 586  */
 587 static void
 588 tri_persp_coeff4(uint slot)
 589 {
 590    const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
 591    const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
 592
 593    const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3));
 594    const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3));
 595    const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3));
 596
 597    vector float vmin_d = setup.vmin->data[slot];
 598    vector float vmid_d = setup.vmid->data[slot];
 599    vector float vmax_d = setup.vmax->data[slot];
 600
 601    vmin_d = spu_mul(vmin_d, vmin_w);
 602    vmid_d = spu_mul(vmid_d, vmid_w);
 603    vmax_d = spu_mul(vmax_d, vmax_w);
 604
 605    vector float botda = vmid_d - vmin_d;
 606    vector float majda = vmax_d - vmin_d;
 607
 608    vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
 609                             spu_mul(botda, spu_splats(setup.emaj.dy)));
 610    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
 611                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 612
 613    setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
 614    setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
 615
 616    vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
 617    vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
 618
 619    setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 620 }
 621
 622
 623
 624 /**
 625  * Compute the setup.coef[] array dadx, dady, a0 values.
 626  * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
 627  */
 628 static void
 629 setup_tri_coefficients(void)
 630 {
 631    uint i;
 632
 633    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 634       switch (spu.vertex_info.attrib[i].interp_mode) {
 635       case INTERP_NONE:
 636          break;
 637       case INTERP_CONSTANT:
 638          const_coeff4(i);
 639          break;
 640       case INTERP_POS:
 641          /* fall-through */
 642       case INTERP_LINEAR:
 643          tri_linear_coeff4(i);
 644          break;
 645       case INTERP_PERSPECTIVE:
 646          tri_persp_coeff4(i);
 647          break;
 648       default:
 649          ASSERT(0);
 650       }
 651    }
 652 }
 653
 654
 655 static void
 656 setup_tri_edges(void)
 657 {
 658    float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
 659    float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
 660
 661    float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
 662    float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
 663    float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
 664
 665    setup.emaj.sy = CEILF(vmin_y);
 666    setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
 667    setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
 668    setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
 669
 670    setup.etop.sy = CEILF(vmid_y);
 671    setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
 672    setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
 673    setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
 674
 675    setup.ebot.sy = CEILF(vmin_y);
 676    setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
 677    setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
 678    setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
 679 }
 680
 681
 682 /**
 683  * Render the upper or lower half of a triangle.
 684  * Scissoring/cliprect is applied here too.
 685  */
 686 static void
 687 subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
 688 {
 689    const int minx = setup.cliprect_minx;
 690    const int maxx = setup.cliprect_maxx;
 691    const int miny = setup.cliprect_miny;
 692    const int maxy = setup.cliprect_maxy;
 693    int y, start_y, finish_y;
 694    int sy = (int)eleft->sy;
 695
 696    ASSERT((int)eleft->sy == (int) eright->sy);
 697
 698    /* clip top/bottom */
 699    start_y = sy;
 700    finish_y = sy + lines;
 701
 702    if (start_y < miny)
 703       start_y = miny;
 704
 705    if (finish_y > maxy)
 706       finish_y = maxy;
 707
 708    start_y -= sy;
 709    finish_y -= sy;
 710
 711    /*
 712    _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);
 713    */
 714
 715    for (y = start_y; y < finish_y; y++) {
 716
 717       /* avoid accumulating adds as floats don't have the precision to
 718        * accurately iterate large triangle edges that way.  luckily we
 719        * can just multiply these days.
 720        *
 721        * this is all drowned out by the attribute interpolation anyway.
 722        */
 723       int left = (int)(eleft->sx + y * eleft->dxdy);
 724       int right = (int)(eright->sx + y * eright->dxdy);
 725
 726       /* clip left/right */
 727       if (left < minx)
 728          left = minx;
 729       if (right > maxx)
 730          right = maxx;
 731
 732       if (left < right) {
 733          int _y = sy + y;
 734          if (block(_y) != setup.span.y) {
 735             flush_spans();
 736             setup.span.y = block(_y);
 737          }
 738
 739          int offset = _y&1;
 740          vec_int4 quad_LlRr = {left, left, right, right};
 741          /* Store left and right in 0 or 1 row of quad based on offset */
 742          setup.span.quad = spu_sel(quad_LlRr, setup.span.quad, spu_maskw(5<<offset));
 743          setup.span.y_flags |= 1<<offset;
 744       }
 745    }
 746
 747
 748    /* save the values so that emaj can be restarted:
 749     */
 750    eleft->sx += lines * eleft->dxdy;
 751    eright->sx += lines * eright->dxdy;
 752    eleft->sy += lines;
 753    eright->sy += lines;
 754 }
 755
 756
 757 /**
 758  * Draw triangle into tile at (tx, ty) (tile coords)
 759  * The tile data should have already been fetched.
 760  */
 761 boolean
 762 tri_draw(const qword vs,
 763          uint tx, uint ty)
 764 {
 765    setup.tx = tx;
 766    setup.ty = ty;
 767
 768    /* set clipping bounds to tile bounds */
 769    setup.cliprect_minx = tx * TILE_SIZE;
 770    setup.cliprect_miny = ty * TILE_SIZE;
 771    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
 772    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 773
 774    if(!setup_sort_vertices(vs)) {
 775       return FALSE; /* totally clipped */
 776    }
 777
 778    setup_tri_coefficients();
 779    setup_tri_edges();
 780
 781    setup.span.y = 0;
 782    setup.span.y_flags = 0;
 783    /* Zero right elements */
 784    setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
 785
 786    if (setup.oneOverArea < 0.0) {
 787       /* emaj on left */
 788       subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
 789       subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
 790    }
 791    else {
 792       /* emaj on right */
 793       subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
 794       subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
 795    }
 796
 797    flush_spans();
 798
 799    return TRUE;
 800 }