src/gallium/drivers/cell/spu/spu_tri.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * Triangle rendering within a tile.
  30  */
  31
  32 #include "pipe/p_compiler.h"
  33 #include "pipe/p_format.h"
  34 #include "util/u_math.h"
  35 #include "spu_colorpack.h"
  36 #include "spu_main.h"
  37 #include "spu_shuffle.h"
  38 #include "spu_texture.h"
  39 #include "spu_tile.h"
  40 #include "spu_tri.h"
  41
  42
/**
 * Masks are uint[4] vectors with each element being 0 (false) or
 * 0xffffffff (true).  In this file one element corresponds to one pixel
 * of a 2x2 quad.
 */
typedef vector unsigned int mask_t;
  45
  46
  47
/**
 * Simplified types taken from other parts of Gallium
 */
struct vertex_header {
   /* One 4-float vector per attribute slot.  Declared with a single
    * element, but this file indexes it as data[slot].  data[0] is read
    * as the vertex position: elements 0 and 1 are used as X and Y, and
    * element 3 is used as the W factor for perspective interpolation.
    */
   vector float data[1];
};
  54
  55
  56
/* XXX fix this
 * Cheap stand-in for a real ceiling function: adds just under 1.0 and
 * truncates through an int cast.  Because the cast truncates toward zero,
 * the result is only a ceiling for non-negative inputs, and inputs whose
 * fractional part is below 0.00001 are rounded down instead of up.
 */
#undef CEILF
#define CEILF(X) ((float) (int) ((X) + 0.99999f))
  60
  61
/* Index of each pixel within a 2x2 quad, and the matching one-bit masks. */
#define QUAD_TOP_LEFT     0
#define QUAD_TOP_RIGHT    1
#define QUAD_BOTTOM_LEFT  2
#define QUAD_BOTTOM_RIGHT 3
#define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT)
#define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT)
#define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT)
#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT)
#define MASK_ALL          0xf   /* all four pixel bits set */


/* Element indices within a 4-component vector. */
#define CHAN0 0
#define CHAN1 1
#define CHAN2 2
#define CHAN3 3


/* Set to 1 to compile in the vertex debug-printing code. */
#define DEBUG_VERTS 0
  80
/**
 * Triangle edge info
 *
 * dx/dy and dxdy are computed during setup; sx, sy and lines describe
 * where scan conversion of the edge starts and how many rows it covers.
 */
struct edge {
   union {
      struct {
         float dx;      /**< X(v1) - X(v0), used only during setup */
         float dy;      /**< Y(v1) - Y(v0), used only during setup */
      };
      /* Assigned the difference of two position vectors, so dx and dy
       * alias its first two elements.
       */
      vec_float4 ds;    /**< vector accessor for dx and dy */
   };
   float dxdy;          /**< dx/dy, i.e. change in X per one-row step in Y */
   float sx, sy;        /**< first sample point coord */
   int lines;           /**< number of lines on this edge */
};
  96
  97
/**
 * Plane equation coefficients for one 4-component attribute.
 * The attribute value at window position (x, y) is evaluated as
 * a0 + x * dadx + y * dady, per component.
 */
struct interp_coef
{
   vector float a0;     /**< constant term */
   vector float dadx;   /**< change in attribute per unit step in X */
   vector float dady;   /**< change in attribute per unit step in Y */
};
 104
 105
/**
 * Triangle setup info (derived from draw_stage).
 * Also used for line drawing (taking some liberties).
 */
struct setup_stage {

   /* Vertices are just an array of floats making up each attribute in
    * turn.  Currently fixed at 4 floats, but should change in time.
    * Codegen will help cope with this.
    *
    * vmin/vmid/vmax are the triangle's vertices ordered by Y; vprovoke is
    * the vertex whose attributes are used for constant interpolation.
    * The qword alias lets all four pointers be written with one vector
    * store.
    */
   union {
      struct {
         const struct vertex_header *vmin;
         const struct vertex_header *vmid;
         const struct vertex_header *vmax;
         const struct vertex_header *vprovoke;
      };
      qword vertex_headers;
   };

   struct edge ebot;   /**< edge from vmin to vmid */
   struct edge etop;   /**< edge from vmid to vmax */
   struct edge emaj;   /**< edge from vmin to vmax */

   float oneOverArea;  /* XXX maybe make into vector? */

   uint facing;        /**< CELL_FACING_FRONT (0) or CELL_FACING_BACK (1) */

   uint tx, ty;  /**< position of current tile (x, y) */

   /* Clip rectangle in pixels; tri_draw() sets it to the tile's bounds. */
   int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;

   /* Interpolation coefficients, one entry per attribute slot. */
   struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];

   /* Pending span data for one row of 2x2 quads (two scanlines). */
   struct {
      /* {left[0], left[1], right[0], right[1]}, where index 0 is the
       * even (top) scanline of the quad row and index 1 the odd (bottom)
       * scanline.
       */
      vec_int4 quad;
      int y;             /**< even-aligned Y of the quad row being built */
      unsigned y_flags;  /**< bit 0: even scanline written, bit 1: odd scanline written */
      unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
   } span;
};


/* The single setup state shared by all functions in this file. */
static struct setup_stage setup;
 150
 151
 152 static INLINE vector float
 153 splatx(vector float v)
 154 {
 155    return spu_splats(spu_extract(v, CHAN0));
 156 }
 157
 158 static INLINE vector float
 159 splaty(vector float v)
 160 {
 161    return spu_splats(spu_extract(v, CHAN1));
 162 }
 163
 164 static INLINE vector float
 165 splatz(vector float v)
 166 {
 167    return spu_splats(spu_extract(v, CHAN2));
 168 }
 169
 170 static INLINE vector float
 171 splatw(vector float v)
 172 {
 173    return spu_splats(spu_extract(v, CHAN3));
 174 }
 175
 176
 177 /**
 178  * Setup fragment shader inputs by evaluating triangle's vertex
 179  * attribute coefficient info.
 180  * \param x  quad x pos
 181  * \param y  quad y pos
 182  * \param fragZ  returns quad Z values
 183  * \param fragInputs  returns fragment program inputs
 184  * Note: this code could be incorporated into the fragment program
 185  * itself to avoid the loop and switch.
 186  */
 187 static void
 188 eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[])
 189 {
 190    static const vector float deltaX = (const vector float) {0, 1, 0, 1};
 191    static const vector float deltaY = (const vector float) {0, 0, 1, 1};
 192
 193    const uint posSlot = 0;
 194    const vector float pos = setup.coef[posSlot].a0;
 195    const vector float dposdx = setup.coef[posSlot].dadx;
 196    const vector float dposdy = setup.coef[posSlot].dady;
 197    const vector float fragX = spu_splats(x) + deltaX;
 198    const vector float fragY = spu_splats(y) + deltaY;
 199    vector float fragW, wInv;
 200    uint i;
 201
 202    *fragZ = splatz(pos) + fragX * splatz(dposdx) + fragY * splatz(dposdy);
 203    fragW =  splatw(pos) + fragX * splatw(dposdx) + fragY * splatw(dposdy);
 204    wInv = spu_re(fragW);  /* 1 / w */
 205
 206    /* loop over fragment program inputs */
 207    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 208       uint attr = i + 1;
 209       enum interp_mode interp = spu.vertex_info.attrib[attr].interp_mode;
 210
 211       /* constant term */
 212       vector float a0 = setup.coef[attr].a0;
 213       vector float r0 = splatx(a0);
 214       vector float r1 = splaty(a0);
 215       vector float r2 = splatz(a0);
 216       vector float r3 = splatw(a0);
 217
 218       if (interp == INTERP_LINEAR || interp == INTERP_PERSPECTIVE) {
 219          /* linear term */
 220          vector float dadx = setup.coef[attr].dadx;
 221          vector float dady = setup.coef[attr].dady;
 222          /* Use SPU intrinsics here to get slightly better code.
 223           * originally: r0 += fragX * splatx(dadx) + fragY * splatx(dady);
 224           */
 225          r0 = spu_madd(fragX, splatx(dadx), spu_madd(fragY, splatx(dady), r0));
 226          r1 = spu_madd(fragX, splaty(dadx), spu_madd(fragY, splaty(dady), r1));
 227          r2 = spu_madd(fragX, splatz(dadx), spu_madd(fragY, splatz(dady), r2));
 228          r3 = spu_madd(fragX, splatw(dadx), spu_madd(fragY, splatw(dady), r3));
 229          if (interp == INTERP_PERSPECTIVE) {
 230             /* perspective term */
 231             r0 *= wInv;
 232             r1 *= wInv;
 233             r2 *= wInv;
 234             r3 *= wInv;
 235          }
 236       }
 237       fragInputs[CHAN0] = r0;
 238       fragInputs[CHAN1] = r1;
 239       fragInputs[CHAN2] = r2;
 240       fragInputs[CHAN3] = r3;
 241       fragInputs += 4;
 242    }
 243 }
 244
 245
 246 /**
 247  * Emit a quad (pass to next stage).  No clipping is done.
 248  * Note: about 1/5 to 1/7 of the time, mask is zero and this function
 249  * should be skipped.  But adding the test for that slows things down
 250  * overall.
 251  */
 252 static INLINE void
 253 emit_quad( int x, int y, mask_t mask)
 254 {
 255    /* If any bits in mask are set... */
 256    if (spu_extract(spu_orx(mask), 0)) {
 257       const int ix = x - setup.cliprect_minx;
 258       const int iy = y - setup.cliprect_miny;
 259
 260       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 261       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 262
 263       {
 264          /*
 265           * Run fragment shader, execute per-fragment ops, update fb/tile.
 266           */
 267          vector float inputs[4*4], outputs[2*4];
 268          vector unsigned int kill_mask;
 269          vector float fragZ;
 270
 271          eval_inputs((float) x, (float) y, &fragZ, inputs);
 272
 273          ASSERT(spu.fragment_program);
 274          ASSERT(spu.fragment_ops);
 275
 276          /* Execute the current fragment program */
 277          kill_mask = spu.fragment_program(inputs, outputs, spu.constants);
 278
 279          mask = spu_andc(mask, kill_mask);
 280
 281          /* Execute per-fragment/quad operations, including:
 282           * alpha test, z test, stencil test, blend and framebuffer writing.
 283           * Note that there are two different fragment operations functions
 284           * that can be called, one for front-facing fragments, and one
 285           * for back-facing fragments.  (Often the two are the same;
 286           * but in some cases, like two-sided stenciling, they can be
 287           * very different.)  So choose the correct function depending
 288           * on the calculated facing.
 289           */
 290          spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
 291                           fragZ,
 292                           outputs[0*4+0],
 293                           outputs[0*4+1],
 294                           outputs[0*4+2],
 295                           outputs[0*4+3],
 296                           mask);
 297       }
 298    }
 299 }
 300
 301
 302 /**
 303  * Given an X or Y coordinate, return the block/quad coordinate that it
 304  * belongs to.
 305  */
 306 static INLINE int
 307 block(int x)
 308 {
 309    return x & ~1;
 310 }
 311
 312
 313 /**
 314  * Render a horizontal span of quads
 315  */
 316 static void
 317 flush_spans(void)
 318 {
 319    int minleft, maxright;
 320
 321    const int l0 = spu_extract(setup.span.quad, 0);
 322    const int l1 = spu_extract(setup.span.quad, 1);
 323    const int r0 = spu_extract(setup.span.quad, 2);
 324    const int r1 = spu_extract(setup.span.quad, 3);
 325
 326    switch (setup.span.y_flags) {
 327    case 0x3:
 328       /* both odd and even lines written (both quad rows) */
 329       minleft = MIN2(l0, l1);
 330       maxright = MAX2(r0, r1);
 331       break;
 332
 333    case 0x1:
 334       /* only even line written (quad top row) */
 335       minleft = l0;
 336       maxright = r0;
 337       break;
 338
 339    case 0x2:
 340       /* only odd line written (quad bottom row) */
 341       minleft = l1;
 342       maxright = r1;
 343       break;
 344
 345    default:
 346       return;
 347    }
 348
 349    /* OK, we're very likely to need the tile data now.
 350     * clear or finish waiting if needed.
 351     */
 352    if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
 353       /* wait for mfc_get() to complete */
 354       //printf("SPU: %u: waiting for ctile\n", spu.init.id);
 355       wait_on_mask(1 << TAG_READ_TILE_COLOR);
 356       spu.cur_ctile_status = TILE_STATUS_CLEAN;
 357    }
 358    else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
 359       //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
 360       clear_c_tile(&spu.ctile);
 361       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 362    }
 363    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 364
 365    if (spu.read_depth_stencil) {
 366       if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
 367          /* wait for mfc_get() to complete */
 368          //printf("SPU: %u: waiting for ztile\n", spu.init.id);
 369          wait_on_mask(1 << TAG_READ_TILE_Z);
 370          spu.cur_ztile_status = TILE_STATUS_CLEAN;
 371       }
 372       else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
 373          //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
 374          clear_z_tile(&spu.ztile);
 375          spu.cur_ztile_status = TILE_STATUS_DIRTY;
 376       }
 377       ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
 378    }
 379
 380    /* XXX this loop could be moved into the above switch cases... */
 381
 382    /* Setup for mask calculation */
 383    const vec_int4 quad_LlRr = setup.span.quad;
 384    const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8);
 385    const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B));
 386    const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B));
 387
 388    const vec_int4 twos = spu_splats(2);
 389
 390    const int x = block(minleft);
 391    vec_int4 xs = {x, x+1, x, x+1};
 392
 393    for (; spu_extract(xs, 0) <= block(maxright); xs += twos) {
 394       /**
 395        * Computes mask to indicate which pixels in the 2x2 quad are actually
 396        * inside the triangle's bounds.
 397        */
 398
 399       /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */
 400       const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs);
 401       const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs);
 402
 403       /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */
 404       const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs);
 405
 406       /* Combine results to create mask */
 407       const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs);
 408
 409       emit_quad(spu_extract(xs, 0), setup.span.y, mask);
 410    }
 411
 412    setup.span.y = 0;
 413    setup.span.y_flags = 0;
 414    /* Zero right elements */
 415    setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
 416 }
 417
 418
 419 #if DEBUG_VERTS
 420 static void
 421 print_vertex(const struct vertex_header *v)
 422 {
 423    uint i;
 424    fprintf(stderr, "  Vertex: (%p)\n", v);
 425    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 426       fprintf(stderr, "    %d: %f %f %f %f\n",  i,
 427               spu_extract(v->data[i], 0),
 428               spu_extract(v->data[i], 1),
 429               spu_extract(v->data[i], 2),
 430               spu_extract(v->data[i], 3));
 431    }
 432 }
 433 #endif
 434
 435
/**
 * Sort vertices by Y into setup.vmin, setup.vmid and setup.vmax.
 * Compute area and determine front vs. back facing.
 * Do coarse clip test against tile bounds
 *
 * On success this also fills in the dx/dy of the three edges,
 * setup.oneOverArea and setup.facing.
 *
 * \param v0  first vertex of the triangle
 * \param v1  second vertex of the triangle
 * \param v2  third vertex of the triangle
 * \return  FALSE if tri is totally outside tile, TRUE otherwise
 */
static boolean
setup_sort_vertices(const struct vertex_header *v0,
                    const struct vertex_header *v1,
                    const struct vertex_header *v2)
{
   float area, sign;

#if DEBUG_VERTS
   if (spu.init.id==0) {
      fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id);
      print_vertex(v0);
      print_vertex(v1);
      print_vertex(v2);
   }
#endif

   /* order the vertices by Y */
   {
      /* A table of shuffle patterns for putting vertex_header pointers into
         correct order.  Quite magical.
         The fourth element of every pattern is C; presumably this selects
         v2 as the provoking vertex (vprovoke) -- confirm against the
         SHUFFLE4 definition. */
      const vec_uchar16 sort_order_patterns[] = {
         SHUFFLE4(A,B,C,C),
         SHUFFLE4(C,A,B,C),
         SHUFFLE4(A,C,B,C),
         SHUFFLE4(B,C,A,C),
         SHUFFLE4(B,A,C,C),
         SHUFFLE4(C,B,A,C) };

      /* The vertex_header pointers, packed for easy shuffling later */
      const vec_uint4 vs = {(unsigned)v0, (unsigned)v1, (unsigned)v2};

      /* Collate y values into two vectors for comparison.
         Using only one shuffle constant! ;) */
      const vec_float4 y_02_ = spu_shuffle(v0->data[0], v2->data[0], SHUFFLE4(0,B,b,C));
      const vec_float4 y_10_ = spu_shuffle(v1->data[0], v0->data[0], SHUFFLE4(0,B,b,C));
      const vec_float4 y_012 = spu_shuffle(y_02_, v1->data[0], SHUFFLE4(0,B,b,C));
      const vec_float4 y_120 = spu_shuffle(y_10_, v2->data[0], SHUFFLE4(0,B,b,C));

      /* Perform comparison: {y0,y1,y2} > {y1,y2,y0} */
      const vec_uint4 compare = spu_cmpgt(y_012, y_120);
      /* Compress the result of the comparison into 4 bits */
      const vec_uint4 gather = spu_gather(compare);
      /* Subtract one to attain the index into the LUT.  Magical.
         NOTE(review): if the gathered value is 0 (presumably when none of
         the comparisons is true, e.g. all three Y values equal), index
         wraps around and the table lookup below reads out of bounds --
         confirm such triangles never reach this function. */
      const unsigned int index = spu_extract(gather, 0) - 1;

      /* Load the appropriate pattern and construct the desired vector. */
      setup.vertex_headers = (qword)spu_shuffle(vs, vs, sort_order_patterns[index]);

      /* Using the result of the comparison, set sign.
         Very magical.
         sign is +1 when the bit count computed from the gathered
         comparison result is 2, and -1 otherwise. */
      sign = ((si_to_uint(si_cntb((qword)gather)) == 2) ? 1.0f : -1.0f);
   }

   /* Check if triangle is completely outside the tile bounds */
   if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
      return FALSE;
   if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
      return FALSE;
   if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
       spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
       spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
      return FALSE;
   if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
       spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
       spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
      return FALSE;

   /* Edge deltas: whole position vectors are subtracted, and the dx/dy
    * fields alias the first two elements of each result.
    */
   setup.ebot.ds = spu_sub(setup.vmid->data[0], setup.vmin->data[0]);
   setup.emaj.ds = spu_sub(setup.vmax->data[0], setup.vmin->data[0]);
   setup.etop.ds = spu_sub(setup.vmax->data[0], setup.vmid->data[0]);

   /*
    * Compute triangle's area.  Use 1/area to compute partial
    * derivatives of attributes later.
    * Note: area is not checked for zero before the division.
    */
   area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy;

   setup.oneOverArea = 1.0f / area;

   /* The product of area * sign indicates front/back orientation (0/1).
    * Just in case someone gets the bright idea of switching the front
    * and back constants without noticing that we're assuming their
    * values in this operation, also assert that the values are
    * what we think they are.
    */
   ASSERT(CELL_FACING_FRONT == 0);
   ASSERT(CELL_FACING_BACK == 1);
   setup.facing = (area * sign > 0.0f)
      ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);

   return TRUE;
}
 534
 535
 536 /**
 537  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
 538  * The value value comes from vertex->data[slot].
 539  * The result will be put into setup.coef[slot].a0.
 540  * \param slot  which attribute slot
 541  */
 542 static INLINE void
 543 const_coeff4(uint slot)
 544 {
 545    setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0};
 546    setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0};
 547    setup.coef[slot].a0 = setup.vprovoke->data[slot];
 548 }
 549
 550
 551 /**
 552  * As above, but interp setup all four vector components.
 553  */
 554 static INLINE void
 555 tri_linear_coeff4(uint slot)
 556 {
 557    const vector float vmin_d = setup.vmin->data[slot];
 558    const vector float vmid_d = setup.vmid->data[slot];
 559    const vector float vmax_d = setup.vmax->data[slot];
 560    const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
 561    const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
 562
 563    vector float botda = vmid_d - vmin_d;
 564    vector float majda = vmax_d - vmin_d;
 565
 566    vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
 567                             spu_mul(botda, spu_splats(setup.emaj.dy)));
 568    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
 569                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 570
 571    setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
 572    setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
 573
 574    vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
 575    vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
 576
 577    setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 578 }
 579
 580
 581 /**
 582  * Compute a0, dadx and dady for a perspective-corrected interpolant,
 583  * for a triangle.
 584  * We basically multiply the vertex value by 1/w before computing
 585  * the plane coefficients (a0, dadx, dady).
 586  * Later, when we compute the value at a particular fragment position we'll
 587  * divide the interpolated value by the interpolated W at that fragment.
 588  */
 589 static void
 590 tri_persp_coeff4(uint slot)
 591 {
 592    const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
 593    const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
 594
 595    const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3));
 596    const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3));
 597    const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3));
 598
 599    vector float vmin_d = setup.vmin->data[slot];
 600    vector float vmid_d = setup.vmid->data[slot];
 601    vector float vmax_d = setup.vmax->data[slot];
 602
 603    vmin_d = spu_mul(vmin_d, vmin_w);
 604    vmid_d = spu_mul(vmid_d, vmid_w);
 605    vmax_d = spu_mul(vmax_d, vmax_w);
 606
 607    vector float botda = vmid_d - vmin_d;
 608    vector float majda = vmax_d - vmin_d;
 609
 610    vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
 611                             spu_mul(botda, spu_splats(setup.emaj.dy)));
 612    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
 613                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 614
 615    setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
 616    setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
 617
 618    vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
 619    vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
 620
 621    setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 622 }
 623
 624
 625
 626 /**
 627  * Compute the setup.coef[] array dadx, dady, a0 values.
 628  * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
 629  */
 630 static void
 631 setup_tri_coefficients(void)
 632 {
 633    uint i;
 634
 635    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 636       switch (spu.vertex_info.attrib[i].interp_mode) {
 637       case INTERP_NONE:
 638          break;
 639       case INTERP_CONSTANT:
 640          const_coeff4(i);
 641          break;
 642       case INTERP_POS:
 643          /* fall-through */
 644       case INTERP_LINEAR:
 645          tri_linear_coeff4(i);
 646          break;
 647       case INTERP_PERSPECTIVE:
 648          tri_persp_coeff4(i);
 649          break;
 650       default:
 651          ASSERT(0);
 652       }
 653    }
 654 }
 655
 656
 657 static void
 658 setup_tri_edges(void)
 659 {
 660    float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
 661    float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
 662
 663    float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
 664    float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
 665    float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
 666
 667    setup.emaj.sy = CEILF(vmin_y);
 668    setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
 669    setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
 670    setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
 671
 672    setup.etop.sy = CEILF(vmid_y);
 673    setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
 674    setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
 675    setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
 676
 677    setup.ebot.sy = CEILF(vmin_y);
 678    setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
 679    setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
 680    setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
 681 }
 682
 683
 684 /**
 685  * Render the upper or lower half of a triangle.
 686  * Scissoring/cliprect is applied here too.
 687  */
 688 static void
 689 subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
 690 {
 691    const int minx = setup.cliprect_minx;
 692    const int maxx = setup.cliprect_maxx;
 693    const int miny = setup.cliprect_miny;
 694    const int maxy = setup.cliprect_maxy;
 695    int y, start_y, finish_y;
 696    int sy = (int)eleft->sy;
 697
 698    ASSERT((int)eleft->sy == (int) eright->sy);
 699
 700    /* clip top/bottom */
 701    start_y = sy;
 702    finish_y = sy + lines;
 703
 704    if (start_y < miny)
 705       start_y = miny;
 706
 707    if (finish_y > maxy)
 708       finish_y = maxy;
 709
 710    start_y -= sy;
 711    finish_y -= sy;
 712
 713    /*
 714    _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);
 715    */
 716
 717    for (y = start_y; y < finish_y; y++) {
 718
 719       /* avoid accumulating adds as floats don't have the precision to
 720        * accurately iterate large triangle edges that way.  luckily we
 721        * can just multiply these days.
 722        *
 723        * this is all drowned out by the attribute interpolation anyway.
 724        */
 725       int left = (int)(eleft->sx + y * eleft->dxdy);
 726       int right = (int)(eright->sx + y * eright->dxdy);
 727
 728       /* clip left/right */
 729       if (left < minx)
 730          left = minx;
 731       if (right > maxx)
 732          right = maxx;
 733
 734       if (left < right) {
 735          int _y = sy + y;
 736          if (block(_y) != setup.span.y) {
 737             flush_spans();
 738             setup.span.y = block(_y);
 739          }
 740
 741          int offset = _y&1;
 742          vec_int4 quad_LlRr = {left, left, right, right};
 743          /* Store left and right in 0 or 1 row of quad based on offset */
 744          setup.span.quad = spu_sel(quad_LlRr, setup.span.quad, spu_maskw(5<<offset));
 745          setup.span.y_flags |= 1<<offset;
 746       }
 747    }
 748
 749
 750    /* save the values so that emaj can be restarted:
 751     */
 752    eleft->sx += lines * eleft->dxdy;
 753    eright->sx += lines * eright->dxdy;
 754    eleft->sy += lines;
 755    eright->sy += lines;
 756 }
 757
 758
 759 /**
 760  * Draw triangle into tile at (tx, ty) (tile coords)
 761  * The tile data should have already been fetched.
 762  */
 763 boolean
 764 tri_draw(const float *v0, const float *v1, const float *v2,
 765          uint tx, uint ty)
 766 {
 767    setup.tx = tx;
 768    setup.ty = ty;
 769
 770    /* set clipping bounds to tile bounds */
 771    setup.cliprect_minx = tx * TILE_SIZE;
 772    setup.cliprect_miny = ty * TILE_SIZE;
 773    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
 774    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 775
 776    if (!setup_sort_vertices((struct vertex_header *) v0,
 777                             (struct vertex_header *) v1,
 778                             (struct vertex_header *) v2)) {
 779       return FALSE; /* totally clipped */
 780    }
 781
 782    setup_tri_coefficients();
 783    setup_tri_edges();
 784
 785    setup.span.y = 0;
 786    setup.span.y_flags = 0;
 787    /* Zero right elements */
 788    setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
 789
 790    if (setup.oneOverArea < 0.0) {
 791       /* emaj on left */
 792       subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
 793       subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
 794    }
 795    else {
 796       /* emaj on right */
 797       subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
 798       subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
 799    }
 800
 801    flush_spans();
 802
 803    return TRUE;
 804 }