src/gallium/drivers/cell/spu/spu_tri.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * Triangle rendering within a tile.
  30  */
  31
  32 #include <transpose_matrix4x4.h>
  33 #include "pipe/p_compiler.h"
  34 #include "pipe/p_format.h"
  35 #include "util/u_math.h"
  36 #include "spu_colorpack.h"
  37 #include "spu_main.h"
  38 #include "spu_texture.h"
  39 #include "spu_tile.h"
  40 #include "spu_tri.h"
  41
  42
/**
 * Per-quad coverage mask: a vector of four unsigned ints, one per fragment
 * of a 2x2 quad (indexed by the QUAD_* values), with each element being
 * either 0 or 0xffffffff.
 */
typedef vector unsigned int mask_t;
  45
/**
 * Four floats that can be accessed either as one SPU vector (v) or
 * one component at a time (f[0] .. f[3]).
 */
typedef union
{
   vector float v;
   float f[4];
} float4;
  51
  52
/**
 * Simplified types taken from other parts of Gallium
 *
 * A vertex is viewed as an array of 4-float attribute vectors.  The array
 * is declared with a single element, but the code in this file indexes
 * data[slot] for every attribute slot.  Slot 0 is the position: its
 * elements 0 and 1 are read as the X and Y coordinates.
 */
struct vertex_header {
   vector float data[1];
};
  59
  60
  61
/* XXX fix this */
/*
 * Cheap stand-in for ceilf(): add 0.99999 and truncate through an int cast.
 * The #undef drops any definition of CEILF made earlier.
 *
 * Known limits of the approximation:
 *  - a value whose fractional part is smaller than 0.00001 is rounded down
 *    instead of up;
 *  - the int cast truncates toward zero, so inputs at or below -1 give a
 *    result that is one too large;
 *  - the argument is expanded once but must fit in an int after the add.
 */
#undef CEILF
#define CEILF(X) ((float) (int) ((X) + 0.99999))
  65
  66
/*
 * Index of each fragment within a 2x2 quad (QUAD_*) and the matching
 * single-bit values (MASK_*).  The QUAD_* values index the per-fragment
 * result arrays and the elements of a mask_t.
 */
#define QUAD_TOP_LEFT     0
#define QUAD_TOP_RIGHT    1
#define QUAD_BOTTOM_LEFT  2
#define QUAD_BOTTOM_RIGHT 3
#define MASK_TOP_LEFT     (1 << QUAD_TOP_LEFT)
#define MASK_TOP_RIGHT    (1 << QUAD_TOP_RIGHT)
#define MASK_BOTTOM_LEFT  (1 << QUAD_BOTTOM_LEFT)
#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT)
#define MASK_ALL          0xf
  76
  77
/* Set to 1 to compile print_vertex() and have setup_sort_vertices() dump
 * the three vertices of every triangle to stderr.
 */
#define DEBUG_VERTS 0
  79
/**
 * Triangle edge info
 *
 * dx and dy are filled in by setup_sort_vertices(); the remaining fields
 * are filled in by setup_tri_edges().  subtriangle() advances sx and sy by
 * the number of lines it was asked to walk so that an edge shared by both
 * halves of the triangle can be continued.
 */
struct edge {
   float dx;            /**< X(v1) - X(v0), used only during setup */
   float dy;            /**< Y(v1) - Y(v0), used only during setup */
   float dxdy;          /**< dx/dy */
   float sx, sy;        /**< first sample point coord */
   int lines;           /**< number of lines on this edge */
};
  90
  91
/**
 * Plane equation of one 4-component attribute:
 * value(x, y) = a0 + x * dadx + y * dady, evaluated per component.
 */
struct interp_coef
{
   float4 a0;     /**< attribute value at (0, 0) */
   float4 dadx;   /**< change of the attribute per unit step in X */
   float4 dady;   /**< change of the attribute per unit step in Y */
};
  98
  99
/**
 * Triangle setup info (derived from draw_stage).
 * Also used for line drawing (taking some liberties).
 */
struct setup_stage {

   /* Vertices are just an array of floats making up each attribute in
    * turn.  Currently fixed at 4 floats, but should change in time.
    * Codegen will help cope with this.
    *
    * vmin/vmid/vmax are the triangle's vertices ordered by increasing Y
    * (see setup_sort_vertices()); vprovoke supplies the value of
    * constant-interpolated attributes (see const_coeff()).
    */
   const struct vertex_header *vmax;
   const struct vertex_header *vmid;
   const struct vertex_header *vmin;
   const struct vertex_header *vprovoke;

   struct edge ebot;   /**< edge from vmin to vmid */
   struct edge etop;   /**< edge from vmid to vmax */
   struct edge emaj;   /**< edge from vmin to vmax */

   float oneoverarea;  /**< 1 / area computed from emaj and ebot; sign depends on vertex sorting */

   uint facing;        /**< computed in tri_draw() from the determinant's sign and the front winding */

   uint tx, ty;        /**< tile coordinates passed to tri_draw() */

   /* tile bounds in pixels, set by tri_draw() from tx/ty and TILE_SIZE */
   int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;

#if 0
   struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
#else
   struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];  /**< one plane equation per attribute slot */
#endif

#if 0
   struct quad_header quad;
#endif

   /* The pair of scanlines (one row of quads) currently being collected;
    * written by subtriangle() and consumed/reset by flush_spans().
    */
   struct {
      int left[2];   /**< [0] = row0, [1] = row1 */
      int right[2];  /**< exclusive right end, same indexing as left[] */
      int y;         /**< even Y coordinate of the quad row */
      unsigned y_flags;  /**< bit 0: even line written, bit 1: odd line written */
      unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
   } span;
};
 145
 146
 147
/* The single, file-wide setup state used by every function below. */
static struct setup_stage setup;
 149
 150
 151
 152
#if 0
/**
 * Basically a cast wrapper.
 * Disabled: this file works on the file-scope 'setup' object instead of a
 * draw_stage.
 */
static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
{
   return (struct setup_stage *)stage;
}
#endif
 162
#if 0
/**
 * Clip setup.quad against the scissor/surface bounds.
 *
 * Disabled code.  It refers to setup.quad and setup.softpipe; struct
 * setup_stage has no softpipe member and its quad member is itself
 * compiled out, so this would not build if enabled as-is.
 *
 * A quad that lies entirely outside the cliprect gets an empty mask;
 * otherwise the mask bits of fragments outside the cliprect are cleared.
 */
static INLINE void
quad_clip(struct setup_stage *setup)
{
   const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect;
   const int minx = (int) cliprect->minx;
   const int maxx = (int) cliprect->maxx;
   const int miny = (int) cliprect->miny;
   const int maxy = (int) cliprect->maxy;

   if (setup.quad.x0 >= maxx ||
       setup.quad.y0 >= maxy ||
       setup.quad.x0 + 1 < minx ||
       setup.quad.y0 + 1 < miny) {
      /* totally clipped */
      setup.quad.mask = 0x0;
      return;
   }
   /* quad straddles the left edge: keep only the right column */
   if (setup.quad.x0 < minx)
      setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
   /* quad straddles the minimum-Y edge: keep only the bottom row */
   if (setup.quad.y0 < miny)
      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
   /* quad straddles the right edge: keep only the left column */
   if (setup.quad.x0 == maxx - 1)
      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
   /* quad straddles the maximum-Y edge: keep only the top row */
   if (setup.quad.y0 == maxy - 1)
      setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
}
#endif
 194
#if 0
/**
 * Emit a quad (pass to next stage) with clipping.
 *
 * Disabled code: clips the quad with quad_clip() and, if any fragment
 * survives, hands it to the first softpipe quad stage.
 */
static INLINE void
clip_emit_quad(struct setup_stage *setup)
{
   quad_clip(setup);
   if (setup.quad.mask) {
      struct softpipe_context *sp = setup.softpipe;
      sp->quad.first->run(sp->quad.first, &setup.quad);
   }
}
#endif
 209
 210 /**
 211  * Evaluate attribute coefficients (plane equations) to compute
 212  * attribute values for the four fragments in a quad.
 213  * Eg: four colors will be computed (in AoS format).
 214  */
 215 static INLINE void
 216 eval_coeff(uint slot, float x, float y, vector float result[4])
 217 {
 218    switch (spu.vertex_info.interp_mode[slot]) {
 219    case INTERP_CONSTANT:
 220       result[QUAD_TOP_LEFT] =
 221       result[QUAD_TOP_RIGHT] =
 222       result[QUAD_BOTTOM_LEFT] =
 223       result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
 224       break;
 225
 226    case INTERP_LINEAR:
 227       /* fall-through, for now */
 228    default:
 229       {
 230          register vector float dadx = setup.coef[slot].dadx.v;
 231          register vector float dady = setup.coef[slot].dady.v;
 232          register vector float topLeft
 233             = spu_add(setup.coef[slot].a0.v,
 234                       spu_add(spu_mul(spu_splats(x), dadx),
 235                               spu_mul(spu_splats(y), dady)));
 236
 237          result[QUAD_TOP_LEFT] = topLeft;
 238          result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
 239          result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
 240          result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
 241       }
 242    }
 243 }
 244
 245
/**
 * As above, but return 4 vectors in SOA format.
 * The four per-fragment vectors produced by eval_coeff() are transposed
 * in place, so each result vector then holds one component of all four
 * fragments.
 * XXX this will all be re-written someday.
 */
static INLINE void
eval_coeff_soa(uint slot, float x, float y, vector float result[4])
{
   eval_coeff(slot, x, y, result);
   /* AoS -> SoA; source and destination are the same array */
   _transpose_matrix4x4(result, result);
}
 256
 257
 258
 259 static INLINE vector float
 260 eval_z(float x, float y)
 261 {
 262    const uint slot = 0;
 263    const float dzdx = setup.coef[slot].dadx.f[2];
 264    const float dzdy = setup.coef[slot].dady.f[2];
 265    const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
 266    const vector float topLeftv = spu_splats(topLeft);
 267    const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
 268    return spu_add(topLeftv, derivs);
 269 }
 270
 271
 272 /**
 273  * Emit a quad (pass to next stage).  No clipping is done.
 274  * Note: about 1/5 to 1/7 of the time, mask is zero and this function
 275  * should be skipped.  But adding the test for that slows things down
 276  * overall.
 277  */
 278 static INLINE void
 279 emit_quad( int x, int y, mask_t mask)
 280 {
 281    /* If any bits in mask are set... */
 282    if (spu_extract(spu_orx(mask), 0)) {
 283       const int ix = x - setup.cliprect_minx;
 284       const int iy = y - setup.cliprect_miny;
 285
 286       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 287       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 288
 289       if (0/*spu.texture[0].start*/) {
 290          /*
 291           * Temporary texture mapping path
 292           * This will go away when fragment programs support TEX inst.
 293           */
 294          const uint unit = 0;
 295          vector float colors[4];
 296          vector float texcoords[4];
 297          eval_coeff(2, (float) x, (float) y, texcoords);
 298
 299          if (spu_extract(mask, 0))
 300             colors[0] = spu.sample_texture[unit](unit, texcoords[0]);
 301          if (spu_extract(mask, 1))
 302             colors[1] = spu.sample_texture[unit](unit, texcoords[1]);
 303          if (spu_extract(mask, 2))
 304             colors[2] = spu.sample_texture[unit](unit, texcoords[2]);
 305          if (spu_extract(mask, 3))
 306             colors[3] = spu.sample_texture[unit](unit, texcoords[3]);
 307
 308
 309          if (spu.texture[1].start) {
 310             /* multi-texture mapping */
 311             const uint unit = 1;
 312             vector float colors1[4];
 313
 314             eval_coeff(2, (float) x, (float) y, texcoords);
 315
 316             if (spu_extract(mask, 0))
 317                colors1[0] = spu.sample_texture[unit](unit, texcoords[0]);
 318             if (spu_extract(mask, 1))
 319                colors1[1] = spu.sample_texture[unit](unit, texcoords[1]);
 320             if (spu_extract(mask, 2))
 321                colors1[2] = spu.sample_texture[unit](unit, texcoords[2]);
 322             if (spu_extract(mask, 3))
 323                colors1[3] = spu.sample_texture[unit](unit, texcoords[3]);
 324
 325             /* hack: modulate first texture by second */
 326             colors[0] = spu_mul(colors[0], colors1[0]);
 327             colors[1] = spu_mul(colors[1], colors1[1]);
 328             colors[2] = spu_mul(colors[2], colors1[2]);
 329             colors[3] = spu_mul(colors[3], colors1[3]);
 330          }
 331
 332          {
 333             /* Convert fragment data from AoS to SoA format.
 334              * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
 335              * This is temporary!
 336              */
 337             vector float soa_frag[4];
 338             _transpose_matrix4x4(soa_frag, colors);
 339
 340             vector float fragZ = eval_z((float) x, (float) y);
 341
 342             /* Do all per-fragment/quad operations here, including:
 343              * alpha test, z test, stencil test, blend and framebuffer writing.
 344              */
 345             spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
 346                              fragZ,
 347                              soa_frag[0], soa_frag[1],
 348                              soa_frag[2], soa_frag[3],
 349                              mask,
 350                              setup.facing);
 351          }
 352
 353       }
 354       else {
 355          /*
 356           * Run fragment shader, execute per-fragment ops, update fb/tile.
 357           */
 358          vector float inputs[4*4], outputs[2*4];
 359          vector float fragZ = eval_z((float) x, (float) y);
 360
 361          /* setup inputs */
 362 #if 0
 363          eval_coeff_soa(1, (float) x, (float) y, inputs);
 364 #else
 365          uint i;
 366          for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 367             eval_coeff_soa(i+1, (float) x, (float) y, inputs + i * 4);
 368          }
 369 #endif
 370          ASSERT(spu.fragment_program);
 371          ASSERT(spu.fragment_ops);
 372
 373          /* Execute the current fragment program */
 374          spu.fragment_program(inputs, outputs, spu.constants);
 375
 376          /* Execute per-fragment/quad operations, including:
 377           * alpha test, z test, stencil test, blend and framebuffer writing.
 378           */
 379          spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
 380                           fragZ,
 381                           outputs[0*4+0],
 382                           outputs[0*4+1],
 383                           outputs[0*4+2],
 384                           outputs[0*4+3],
 385                           mask,
 386                           setup.facing);
 387       }
 388    }
 389 }
 390
 391
 392 /**
 393  * Given an X or Y coordinate, return the block/quad coordinate that it
 394  * belongs to.
 395  */
 396 static INLINE int block( int x )
 397 {
 398    return x & ~1;
 399 }
 400
 401
 402 /**
 403  * Compute mask which indicates which pixels in the 2x2 quad are actually inside
 404  * the triangle's bounds.
 405  * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
 406  */
 407 static INLINE mask_t calculate_mask( int x )
 408 {
 409    /* This is a little tricky.
 410     * Use & instead of && to avoid branches.
 411     * Use negation to convert true/false to ~0/0 values.
 412     */
 413    mask_t mask;
 414    mask = spu_insert(-((x   >= setup.span.left[0]) & (x   < setup.span.right[0])), mask, 0);
 415    mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
 416    mask = spu_insert(-((x   >= setup.span.left[1]) & (x   < setup.span.right[1])), mask, 2);
 417    mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
 418    return mask;
 419 }
 420
 421
 422 /**
 423  * Render a horizontal span of quads
 424  */
 425 static void flush_spans( void )
 426 {
 427    int minleft, maxright;
 428    int x;
 429
 430    switch (setup.span.y_flags) {
 431    case 0x3:
 432       /* both odd and even lines written (both quad rows) */
 433       minleft = MIN2(setup.span.left[0], setup.span.left[1]);
 434       maxright = MAX2(setup.span.right[0], setup.span.right[1]);
 435       break;
 436
 437    case 0x1:
 438       /* only even line written (quad top row) */
 439       minleft = setup.span.left[0];
 440       maxright = setup.span.right[0];
 441       break;
 442
 443    case 0x2:
 444       /* only odd line written (quad bottom row) */
 445       minleft = setup.span.left[1];
 446       maxright = setup.span.right[1];
 447       break;
 448
 449    default:
 450       return;
 451    }
 452
 453
 454    /* OK, we're very likely to need the tile data now.
 455     * clear or finish waiting if needed.
 456     */
 457    if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
 458       /* wait for mfc_get() to complete */
 459       //printf("SPU: %u: waiting for ctile\n", spu.init.id);
 460       wait_on_mask(1 << TAG_READ_TILE_COLOR);
 461       spu.cur_ctile_status = TILE_STATUS_CLEAN;
 462    }
 463    else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
 464       //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
 465       clear_c_tile(&spu.ctile);
 466       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 467    }
 468    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 469
 470    if (spu.read_depth) {
 471       if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
 472          /* wait for mfc_get() to complete */
 473          //printf("SPU: %u: waiting for ztile\n", spu.init.id);
 474          wait_on_mask(1 << TAG_READ_TILE_Z);
 475          spu.cur_ztile_status = TILE_STATUS_CLEAN;
 476       }
 477       else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
 478          //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
 479          clear_z_tile(&spu.ztile);
 480          spu.cur_ztile_status = TILE_STATUS_DIRTY;
 481       }
 482       ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
 483    }
 484
 485    /* XXX this loop could be moved into the above switch cases and
 486     * calculate_mask() could be simplified a bit...
 487     */
 488    for (x = block(minleft); x <= block(maxright); x += 2) {
 489 #if 1
 490       emit_quad( x, setup.span.y, calculate_mask( x ));
 491 #endif
 492    }
 493
 494    setup.span.y = 0;
 495    setup.span.y_flags = 0;
 496    setup.span.right[0] = 0;
 497    setup.span.right[1] = 0;
 498 }
 499
 500 #if DEBUG_VERTS
 501 static void print_vertex(const struct vertex_header *v)
 502 {
 503    int i;
 504    fprintf(stderr, "Vertex: (%p)\n", v);
 505    for (i = 0; i < setup.quad.nr_attrs; i++) {
 506       fprintf(stderr, "  %d: %f %f %f %f\n",  i,
 507               v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]);
 508    }
 509 }
 510 #endif
 511
 512
 513 static boolean setup_sort_vertices(const struct vertex_header *v0,
 514                                    const struct vertex_header *v1,
 515                                    const struct vertex_header *v2)
 516 {
 517
 518 #if DEBUG_VERTS
 519    fprintf(stderr, "Triangle:\n");
 520    print_vertex(v0);
 521    print_vertex(v1);
 522    print_vertex(v2);
 523 #endif
 524
 525    setup.vprovoke = v2;
 526
 527    /* determine bottom to top order of vertices */
 528    {
 529       float y0 = spu_extract(v0->data[0], 1);
 530       float y1 = spu_extract(v1->data[0], 1);
 531       float y2 = spu_extract(v2->data[0], 1);
 532       if (y0 <= y1) {
 533          if (y1 <= y2) {
 534             /* y0<=y1<=y2 */
 535             setup.vmin = v0;
 536             setup.vmid = v1;
 537             setup.vmax = v2;
 538          }
 539          else if (y2 <= y0) {
 540             /* y2<=y0<=y1 */
 541             setup.vmin = v2;
 542             setup.vmid = v0;
 543             setup.vmax = v1;
 544          }
 545          else {
 546             /* y0<=y2<=y1 */
 547             setup.vmin = v0;
 548             setup.vmid = v2;
 549             setup.vmax = v1;
 550          }
 551       }
 552       else {
 553          if (y0 <= y2) {
 554             /* y1<=y0<=y2 */
 555             setup.vmin = v1;
 556             setup.vmid = v0;
 557             setup.vmax = v2;
 558          }
 559          else if (y2 <= y1) {
 560             /* y2<=y1<=y0 */
 561             setup.vmin = v2;
 562             setup.vmid = v1;
 563             setup.vmax = v0;
 564          }
 565          else {
 566             /* y1<=y2<=y0 */
 567             setup.vmin = v1;
 568             setup.vmid = v2;
 569             setup.vmax = v0;
 570          }
 571       }
 572    }
 573
 574    /* Check if triangle is completely outside the tile bounds */
 575    if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
 576       return FALSE;
 577    if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
 578       return FALSE;
 579    if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
 580        spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
 581        spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
 582       return FALSE;
 583    if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
 584        spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
 585        spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
 586       return FALSE;
 587
 588    setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
 589    setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
 590    setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
 591    setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
 592    setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0);
 593    setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1);
 594
 595    /*
 596     * Compute triangle's area.  Use 1/area to compute partial
 597     * derivatives of attributes later.
 598     *
 599     * The area will be the same as prim->det, but the sign may be
 600     * different depending on how the vertices get sorted above.
 601     *
 602     * To determine whether the primitive is front or back facing we
 603     * use the prim->det value because its sign is correct.
 604     */
 605    {
 606       const float area = (setup.emaj.dx * setup.ebot.dy -
 607                             setup.ebot.dx * setup.emaj.dy);
 608
 609       setup.oneoverarea = 1.0f / area;
 610       /*
 611       _mesa_printf("%s one-over-area %f  area %f  det %f\n",
 612                    __FUNCTION__, setup.oneoverarea, area, prim->det );
 613       */
 614    }
 615
 616 #if 0
 617    /* We need to know if this is a front or back-facing triangle for:
 618     *  - the GLSL gl_FrontFacing fragment attribute (bool)
 619     *  - two-sided stencil test
 620     */
 621    setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
 622 #endif
 623
 624    return TRUE;
 625 }
 626
 627
 628 /**
 629  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
 630  * The value value comes from vertex->data[slot].
 631  * The result will be put into setup.coef[slot].a0.
 632  * \param slot  which attribute slot
 633  */
 634 static INLINE void
 635 const_coeff(uint slot)
 636 {
 637    setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
 638    setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
 639    setup.coef[slot].a0.v = setup.vprovoke->data[slot];
 640 }
 641
 642
 643 /**
 644  * Compute a0, dadx and dady for a linearly interpolated coefficient,
 645  * for a triangle.
 646  */
 647 static INLINE void
 648 tri_linear_coeff(uint slot, uint firstComp, uint lastComp)
 649 {
 650    uint i;
 651    const float *vmin_d = (float *) &setup.vmin->data[slot];
 652    const float *vmid_d = (float *) &setup.vmid->data[slot];
 653    const float *vmax_d = (float *) &setup.vmax->data[slot];
 654    const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f;
 655    const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
 656
 657    for (i = firstComp; i < lastComp; i++) {
 658       float botda = vmid_d[i] - vmin_d[i];
 659       float majda = vmax_d[i] - vmin_d[i];
 660       float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
 661       float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
 662
 663       ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 664
 665       setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
 666       setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
 667
 668       /* calculate a0 as the value which would be sampled for the
 669        * fragment at (0,0), taking into account that we want to sample at
 670        * pixel centers, in other words (0.5, 0.5).
 671        *
 672        * this is neat but unfortunately not a good way to do things for
 673        * triangles with very large values of dadx or dady as it will
 674        * result in the subtraction and re-addition from a0 of a very
 675        * large number, which means we'll end up loosing a lot of the
 676        * fractional bits and precision from a0.  the way to fix this is
 677        * to define a0 as the sample at a pixel center somewhere near vmin
 678        * instead - i'll switch to this later.
 679        */
 680       setup.coef[slot].a0.f[i] = (vmin_d[i] -
 681                                  (setup.coef[slot].dadx.f[i] * x +
 682                                   setup.coef[slot].dady.f[i] * y));
 683    }
 684
 685    /*
 686    _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
 687                 slot, "xyzw"[i],
 688                 setup.coef[slot].a0[i],
 689                 setup.coef[slot].dadx.f[i],
 690                 setup.coef[slot].dady.f[i]);
 691    */
 692 }
 693
 694
 695 /**
 696  * As above, but interp setup all four vector components.
 697  */
 698 static INLINE void
 699 tri_linear_coeff4(uint slot)
 700 {
 701    const vector float vmin_d = setup.vmin->data[slot];
 702    const vector float vmid_d = setup.vmid->data[slot];
 703    const vector float vmax_d = setup.vmax->data[slot];
 704    const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
 705    const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
 706
 707    vector float botda = vmid_d - vmin_d;
 708    vector float majda = vmax_d - vmin_d;
 709
 710    vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
 711                             spu_mul(botda, spu_splats(setup.emaj.dy)));
 712    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
 713                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 714
 715    setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
 716    setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
 717
 718    vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
 719    vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
 720
 721    setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
 722 }
 723
 724
 725
#if 0
/**
 * Compute a0, dadx and dady for a perspective-corrected interpolant,
 * for a triangle.
 * We basically multiply the vertex value by 1/w before computing
 * the plane coefficients (a0, dadx, dady).
 * Later, when we compute the value at a particular fragment position we'll
 * divide the interpolated value by the interpolated W at that fragment.
 *
 * Disabled code: it subscripts the vertex vectors directly
 * (data[slot][i]) rather than using spu_extract() as the live code does.
 *
 * \param slot  which attribute slot
 * \param i     which component (0..3) of the attribute
 */
static void tri_persp_coeff( unsigned slot,
                             unsigned i )
{
   /* premultiply by 1/w:
    */
   float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3];
   float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3];
   float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3];

   float botda = mida - mina;
   float majda = maxa - mina;
   float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
   float b = setup.emaj.dx * botda - majda * setup.ebot.dx;

   /*
   printf("tri persp %d,%d: %f %f %f\n", slot, i,
          setup.vmin->data[slot][i],
          setup.vmid->data[slot][i],
          setup.vmax->data[slot][i]
          );
   */

   assert(slot < PIPE_MAX_SHADER_INPUTS);
   assert(i <= 3);

   setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
   setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
   /* a0 is the pre-multiplied value extrapolated from vmin to the origin */
   setup.coef[slot].a0.f[i] = (mina -
                            (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) +
                             setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
}
#endif
 767
 768
 769 /**
 770  * Compute the setup.coef[] array dadx, dady, a0 values.
 771  * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
 772  */
 773 static void setup_tri_coefficients(void)
 774 {
 775 #if 1
 776    uint i;
 777
 778    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
 779       switch (spu.vertex_info.interp_mode[i]) {
 780       case INTERP_NONE:
 781          break;
 782       case INTERP_POS:
 783          /*tri_linear_coeff(i, 2, 3);*/
 784          /* XXX interp W if PERSPECTIVE... */
 785          tri_linear_coeff4(i);
 786          break;
 787       case INTERP_CONSTANT:
 788          const_coeff(i);
 789          break;
 790       case INTERP_LINEAR:
 791          tri_linear_coeff4(i);
 792          break;
 793       case INTERP_PERSPECTIVE:
 794          tri_linear_coeff4(i);  /* temporary */
 795          break;
 796       default:
 797          ASSERT(0);
 798       }
 799    }
 800 #else
 801    ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS);
 802    ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR ||
 803           spu.vertex_info.interp_mode[1] == INTERP_CONSTANT);
 804    tri_linear_coeff(0, 2, 3);  /* slot 0, z */
 805    tri_linear_coeff(1, 0, 4);  /* slot 1, color */
 806 #endif
 807 }
 808
 809
 810 static void setup_tri_edges(void)
 811 {
 812    float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
 813    float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
 814
 815    float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
 816    float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
 817    float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
 818
 819    setup.emaj.sy = CEILF(vmin_y);
 820    setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
 821    setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
 822    setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
 823
 824    setup.etop.sy = CEILF(vmid_y);
 825    setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
 826    setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
 827    setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
 828
 829    setup.ebot.sy = CEILF(vmin_y);
 830    setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
 831    setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
 832    setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
 833 }
 834
 835
 836 /**
 837  * Render the upper or lower half of a triangle.
 838  * Scissoring/cliprect is applied here too.
 839  */
 840 static void subtriangle( struct edge *eleft,
 841                          struct edge *eright,
 842                          unsigned lines )
 843 {
 844    const int minx = setup.cliprect_minx;
 845    const int maxx = setup.cliprect_maxx;
 846    const int miny = setup.cliprect_miny;
 847    const int maxy = setup.cliprect_maxy;
 848    int y, start_y, finish_y;
 849    int sy = (int)eleft->sy;
 850
 851    ASSERT((int)eleft->sy == (int) eright->sy);
 852
 853    /* clip top/bottom */
 854    start_y = sy;
 855    finish_y = sy + lines;
 856
 857    if (start_y < miny)
 858       start_y = miny;
 859
 860    if (finish_y > maxy)
 861       finish_y = maxy;
 862
 863    start_y -= sy;
 864    finish_y -= sy;
 865
 866    /*
 867    _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);
 868    */
 869
 870    for (y = start_y; y < finish_y; y++) {
 871
 872       /* avoid accumulating adds as floats don't have the precision to
 873        * accurately iterate large triangle edges that way.  luckily we
 874        * can just multiply these days.
 875        *
 876        * this is all drowned out by the attribute interpolation anyway.
 877        */
 878       int left = (int)(eleft->sx + y * eleft->dxdy);
 879       int right = (int)(eright->sx + y * eright->dxdy);
 880
 881       /* clip left/right */
 882       if (left < minx)
 883          left = minx;
 884       if (right > maxx)
 885          right = maxx;
 886
 887       if (left < right) {
 888          int _y = sy + y;
 889          if (block(_y) != setup.span.y) {
 890             flush_spans();
 891             setup.span.y = block(_y);
 892          }
 893
 894          setup.span.left[_y&1] = left;
 895          setup.span.right[_y&1] = right;
 896          setup.span.y_flags |= 1<<(_y&1);
 897       }
 898    }
 899
 900
 901    /* save the values so that emaj can be restarted:
 902     */
 903    eleft->sx += lines * eleft->dxdy;
 904    eright->sx += lines * eright->dxdy;
 905    eleft->sy += lines;
 906    eright->sy += lines;
 907 }
 908
 909 static float
 910 determinant( const float *v0,
 911              const float *v1,
 912              const float *v2 )
 913 {
 914    /* edge vectors e = v0 - v2, f = v1 - v2 */
 915    const float ex = v0[0] - v2[0];
 916    const float ey = v0[1] - v2[1];
 917    const float fx = v1[0] - v2[0];
 918    const float fy = v1[1] - v2[1];
 919
 920    /* det = cross(e,f).z */
 921    return ex * fy - ey * fx;
 922 }
 923
 924
 925 /**
 926  * Draw triangle into tile at (tx, ty) (tile coords)
 927  * The tile data should have already been fetched.
 928  */
 929 boolean
 930 tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding)
 931 {
 932    setup.tx = tx;
 933    setup.ty = ty;
 934
 935    /* set clipping bounds to tile bounds */
 936    setup.cliprect_minx = tx * TILE_SIZE;
 937    setup.cliprect_miny = ty * TILE_SIZE;
 938    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
 939    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 940
 941    /* Before we sort vertices, determine the facing of the triangle,
 942     * which will be needed for front/back-face stencil application
 943     */
 944    float det = determinant(v0, v1, v2);
 945    setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
 946
 947    if (!setup_sort_vertices((struct vertex_header *) v0,
 948                             (struct vertex_header *) v1,
 949                             (struct vertex_header *) v2)) {
 950       return FALSE; /* totally clipped */
 951    }
 952
 953    setup_tri_coefficients();
 954    setup_tri_edges();
 955
 956    setup.span.y = 0;
 957    setup.span.y_flags = 0;
 958    setup.span.right[0] = 0;
 959    setup.span.right[1] = 0;
 960    /*   setup.span.z_mode = tri_z_mode( setup.ctx ); */
 961
 962    /*   init_constant_attribs( setup ); */
 963
 964    if (setup.oneoverarea < 0.0) {
 965       /* emaj on left:
 966        */
 967       subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
 968       subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
 969    }
 970    else {
 971       /* emaj on right:
 972        */
 973       subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
 974       subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
 975    }
 976
 977    flush_spans();
 978
 979    return TRUE;
 980 }