src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2010 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /*
  29  * Rasterization for binned triangles within a tile
  30  */
  31
  32
  33
  34 /**
  35  * Prototype for a 8 plane rasterizer function.  Will codegenerate
  36  * several of these.
  37  *
  38  * XXX: Varients for more/fewer planes.
  39  * XXX: Need ways of dropping planes as we descend.
  40  * XXX: SIMD
  41  */
  42 static void
  43 TAG(do_block_4)(struct lp_rasterizer_task *task,
  44                 const struct lp_rast_triangle *tri,
  45                 const struct lp_rast_plane *plane,
  46                 int x, int y,
  47                 const int64_t *c)
  48 {
  49    unsigned mask = 0xffff;
  50    int j;
  51
  52    for (j = 0; j < NR_PLANES; j++) {
  53 #ifdef RASTER_64
  54       mask &= ~BUILD_MASK_LINEAR(((c[j] - 1) >> (int64_t)FIXED_ORDER),
  55                                  -plane[j].dcdx >> FIXED_ORDER,
  56                                  plane[j].dcdy >> FIXED_ORDER);
  57 #else
  58       mask &= ~BUILD_MASK_LINEAR((c[j] - 1),
  59                                  -plane[j].dcdx,
  60                                  plane[j].dcdy);
  61 #endif
  62    }
  63
  64    /* Now pass to the shader:
  65     */
  66    if (mask)
  67       lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask);
  68 }
  69
  70 /**
  71  * Evaluate a 16x16 block of pixels to determine which 4x4 subblocks are in/out
  72  * of the triangle's bounds.
  73  */
  74 static void
  75 TAG(do_block_16)(struct lp_rasterizer_task *task,
  76                  const struct lp_rast_triangle *tri,
  77                  const struct lp_rast_plane *plane,
  78                  int x, int y,
  79                  const int64_t *c)
  80 {
  81    unsigned outmask, inmask, partmask, partial_mask;
  82    unsigned j;
  83
  84    outmask = 0;                 /* outside one or more trivial reject planes */
  85    partmask = 0;                /* outside one or more trivial accept planes */
  86
  87    for (j = 0; j < NR_PLANES; j++) {
  88 #ifdef RASTER_64
  89       int32_t dcdx = -plane[j].dcdx >> FIXED_ORDER;
  90       int32_t dcdy = plane[j].dcdy >> FIXED_ORDER;
  91       const int32_t cox = plane[j].eo >> FIXED_ORDER;
  92       const int32_t ei = (dcdy + dcdx - cox) << 2;
  93       const int32_t cox_s = cox << 2;
  94       const int32_t co = (int32_t)(c[j] >> (int64_t)FIXED_ORDER) + cox_s;
  95       int32_t cdiff;
  96       cdiff = ei - cox_s + ((int32_t)((c[j] - 1) >> (int64_t)FIXED_ORDER) -
  97                             (int32_t)(c[j] >> (int64_t)FIXED_ORDER));
  98       dcdx <<= 2;
  99       dcdy <<= 2;
 100 #else
 101       const int64_t dcdx = -IMUL64(plane[j].dcdx, 4);
 102       const int64_t dcdy = IMUL64(plane[j].dcdy, 4);
 103       const int64_t cox = IMUL64(plane[j].eo, 4);
 104       const int32_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo;
 105       const int64_t cio = IMUL64(ei, 4) - 1;
 106       int32_t co, cdiff;
 107       co = c[j] + cox;
 108       cdiff = cio - cox;
 109 #endif
 110
 111       BUILD_MASKS(co, cdiff,
 112                   dcdx, dcdy,
 113                   &outmask,   /* sign bits from c[i][0..15] + cox */
 114                   &partmask); /* sign bits from c[i][0..15] + cio */
 115    }
 116
 117    if (outmask == 0xffff)
 118       return;
 119
 120    /* Mask of sub-blocks which are inside all trivial accept planes:
 121     */
 122    inmask = ~partmask & 0xffff;
 123
 124    /* Mask of sub-blocks which are inside all trivial reject planes,
 125     * but outside at least one trivial accept plane:
 126     */
 127    partial_mask = partmask & ~outmask;
 128
 129    assert((partial_mask & inmask) == 0);
 130
 131    LP_COUNT_ADD(nr_empty_4, util_bitcount(0xffff & ~(partial_mask | inmask)));
 132
 133    /* Iterate over partials:
 134     */
 135    while (partial_mask) {
 136       int i = ffs(partial_mask) - 1;
 137       int ix = (i & 3) * 4;
 138       int iy = (i >> 2) * 4;
 139       int px = x + ix;
 140       int py = y + iy;
 141       int64_t cx[NR_PLANES];
 142
 143       partial_mask &= ~(1 << i);
 144
 145       LP_COUNT(nr_partially_covered_4);
 146
 147       for (j = 0; j < NR_PLANES; j++)
 148          cx[j] = (c[j]
 149                   - IMUL64(plane[j].dcdx, ix)
 150                   + IMUL64(plane[j].dcdy, iy));
 151
 152       TAG(do_block_4)(task, tri, plane, px, py, cx);
 153    }
 154
 155    /* Iterate over fulls:
 156     */
 157    while (inmask) {
 158       int i = ffs(inmask) - 1;
 159       int ix = (i & 3) * 4;
 160       int iy = (i >> 2) * 4;
 161       int px = x + ix;
 162       int py = y + iy;
 163
 164       inmask &= ~(1 << i);
 165
 166       LP_COUNT(nr_fully_covered_4);
 167       block_full_4(task, tri, px, py);
 168    }
 169 }
 170
 171
 172 /**
 173  * Scan the tile in chunks and figure out which pixels to rasterize
 174  * for this triangle.
 175  */
 176 void
 177 TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
 178                       const union lp_rast_cmd_arg arg)
 179 {
 180    const struct lp_rast_triangle *tri = arg.triangle.tri;
 181    unsigned plane_mask = arg.triangle.plane_mask;
 182    const struct lp_rast_plane *tri_plane = GET_PLANES(tri);
 183    const int x = task->x, y = task->y;
 184    struct lp_rast_plane plane[NR_PLANES];
 185    int64_t c[NR_PLANES];
 186    unsigned outmask, inmask, partmask, partial_mask;
 187    unsigned j = 0;
 188
 189    if (tri->inputs.disable) {
 190       /* This triangle was partially binned and has been disabled */
 191       return;
 192    }
 193
 194    outmask = 0;                 /* outside one or more trivial reject planes */
 195    partmask = 0;                /* outside one or more trivial accept planes */
 196
 197    while (plane_mask) {
 198       int i = ffs(plane_mask) - 1;
 199       plane[j] = tri_plane[i];
 200       plane_mask &= ~(1 << i);
 201       c[j] = plane[j].c + IMUL64(plane[j].dcdy, y) - IMUL64(plane[j].dcdx, x);
 202
 203       {
 204 #ifdef RASTER_64
 205          /*
 206           * Strip off lower FIXED_ORDER bits. Note that those bits from
 207           * dcdx, dcdy, eo are always 0 (by definition).
 208           * c values, however, are not. This means that for every
 209           * addition of the form c + n*dcdx the lower FIXED_ORDER bits will
 210           * NOT change. And those bits are not relevant to the sign bit (which
 211           * is only what we need!) that is,
 212           * sign(c + n*dcdx) == sign((c >> FIXED_ORDER) + n*(dcdx >> FIXED_ORDER))
 213           * This means we can get away with using 32bit math for the most part.
 214           * Only tricky part is the -1 adjustment for cdiff.
 215           */
 216          int32_t dcdx = -plane[j].dcdx >> FIXED_ORDER;
 217          int32_t dcdy = plane[j].dcdy >> FIXED_ORDER;
 218          const int32_t cox = plane[j].eo >> FIXED_ORDER;
 219          const int32_t ei = (dcdy + dcdx - cox) << 4;
 220          const int32_t cox_s = cox << 4;
 221          const int32_t co = (int32_t)(c[j] >> (int64_t)FIXED_ORDER) + cox_s;
 222          int32_t cdiff;
 223          /*
 224           * Plausibility check to ensure the 32bit math works.
 225           * Note that within a tile, the max we can move the edge function
 226           * is essentially dcdx * TILE_SIZE + dcdy * TILE_SIZE.
 227           * TILE_SIZE is 64, dcdx/dcdy are nominally 21 bit (for 8192 max size
 228           * and 8 subpixel bits), I'd be happy with 2 bits more too (1 for
 229           * increasing fb size to 16384, the required d3d11 value, another one
 230           * because I'm not quite sure we can't be _just_ above the max value
 231           * here). This gives us 30 bits max - hence if c would exceed that here
 232           * that means the plane is either trivial reject for the whole tile
 233           * (in which case the tri will not get binned), or trivial accept for
 234           * the whole tile (in which case plane_mask will not include it).
 235           */
 236          assert((c[j] >> (int64_t)FIXED_ORDER) > (int32_t)0xb0000000 &&
 237                 (c[j] >> (int64_t)FIXED_ORDER) < (int32_t)0x3fffffff);
 238          /*
 239           * Note the fixup part is constant throughout the tile - thus could
 240           * just calculate this and avoid _all_ 64bit math in rasterization
 241           * (except exactly this fixup calc).
 242           * In fact theoretically could move that even to setup, albeit that
 243           * seems tricky (pre-bin certainly can have values larger than 32bit,
 244           * and would need to communicate that fixup value through).
 245           * And if we want to support msaa, we'd probably don't want to do the
 246           * downscaling in setup in any case...
 247           */
 248          cdiff = ei - cox_s + ((int32_t)((c[j] - 1) >> (int64_t)FIXED_ORDER) -
 249                                (int32_t)(c[j] >> (int64_t)FIXED_ORDER));
 250          dcdx <<= 4;
 251          dcdy <<= 4;
 252 #else
 253          const int32_t dcdx = -plane[j].dcdx << 4;
 254          const int32_t dcdy = plane[j].dcdy << 4;
 255          const int32_t cox = plane[j].eo << 4;
 256          const int32_t ei = plane[j].dcdy - plane[j].dcdx - (int32_t)plane[j].eo;
 257          const int32_t cio = (ei << 4) - 1;
 258          int32_t co, cdiff;
 259          co = c[j] + cox;
 260          cdiff = cio - cox;
 261 #endif
 262          BUILD_MASKS(co, cdiff,
 263                      dcdx, dcdy,
 264                      &outmask,   /* sign bits from c[i][0..15] + cox */
 265                      &partmask); /* sign bits from c[i][0..15] + cio */
 266       }
 267
 268       j++;
 269    }
 270
 271    if (outmask == 0xffff)
 272       return;
 273
 274    /* Mask of sub-blocks which are inside all trivial accept planes:
 275     */
 276    inmask = ~partmask & 0xffff;
 277
 278    /* Mask of sub-blocks which are inside all trivial reject planes,
 279     * but outside at least one trivial accept plane:
 280     */
 281    partial_mask = partmask & ~outmask;
 282
 283    assert((partial_mask & inmask) == 0);
 284
 285    LP_COUNT_ADD(nr_empty_16, util_bitcount(0xffff & ~(partial_mask | inmask)));
 286
 287    /* Iterate over partials:
 288     */
 289    while (partial_mask) {
 290       int i = ffs(partial_mask) - 1;
 291       int ix = (i & 3) * 16;
 292       int iy = (i >> 2) * 16;
 293       int px = x + ix;
 294       int py = y + iy;
 295       int64_t cx[NR_PLANES];
 296
 297       for (j = 0; j < NR_PLANES; j++)
 298          cx[j] = (c[j]
 299                   - IMUL64(plane[j].dcdx, ix)
 300                   + IMUL64(plane[j].dcdy, iy));
 301
 302       partial_mask &= ~(1 << i);
 303
 304       LP_COUNT(nr_partially_covered_16);
 305       TAG(do_block_16)(task, tri, plane, px, py, cx);
 306    }
 307
 308    /* Iterate over fulls:
 309     */
 310    while (inmask) {
 311       int i = ffs(inmask) - 1;
 312       int ix = (i & 3) * 16;
 313       int iy = (i >> 2) * 16;
 314       int px = x + ix;
 315       int py = y + iy;
 316
 317       inmask &= ~(1 << i);
 318
 319       LP_COUNT(nr_fully_covered_16);
 320       block_full_16(task, tri, px, py);
 321    }
 322 }
 323
 324 #if defined(PIPE_ARCH_SSE) && defined(TRI_16)
 325 /* XXX: special case this when intersection is not required.
 326  *      - tile completely within bbox,
 327  *      - bbox completely within tile.
 328  */
 329 void
 330 TRI_16(struct lp_rasterizer_task *task,
 331        const union lp_rast_cmd_arg arg)
 332 {
 333    const struct lp_rast_triangle *tri = arg.triangle.tri;
 334    const struct lp_rast_plane *plane = GET_PLANES(tri);
 335    unsigned mask = arg.triangle.plane_mask;
 336    unsigned outmask, partial_mask;
 337    unsigned j;
 338    __m128i cstep4[NR_PLANES][4];
 339
 340    int x = (mask & 0xff);
 341    int y = (mask >> 8);
 342
 343    outmask = 0;                 /* outside one or more trivial reject planes */
 344
 345    x += task->x;
 346    y += task->y;
 347
 348    for (j = 0; j < NR_PLANES; j++) {
 349       const int dcdx = -plane[j].dcdx * 4;
 350       const int dcdy = plane[j].dcdy * 4;
 351       __m128i xdcdy = _mm_set1_epi32(dcdy);
 352
 353       cstep4[j][0] = _mm_setr_epi32(0, dcdx, dcdx*2, dcdx*3);
 354       cstep4[j][1] = _mm_add_epi32(cstep4[j][0], xdcdy);
 355       cstep4[j][2] = _mm_add_epi32(cstep4[j][1], xdcdy);
 356       cstep4[j][3] = _mm_add_epi32(cstep4[j][2], xdcdy);
 357
 358       {
 359          const int c = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
 360          const int cox = plane[j].eo * 4;
 361
 362          outmask |= sign_bits4(cstep4[j], c + cox);
 363       }
 364    }
 365
 366    if (outmask == 0xffff)
 367       return;
 368
 369
 370    /* Mask of sub-blocks which are inside all trivial reject planes,
 371     * but outside at least one trivial accept plane:
 372     */
 373    partial_mask = 0xffff & ~outmask;
 374
 375    /* Iterate over partials:
 376     */
 377    while (partial_mask) {
 378       int i = ffs(partial_mask) - 1;
 379       int ix = (i & 3) * 4;
 380       int iy = (i >> 2) * 4;
 381       int px = x + ix;
 382       int py = y + iy;
 383       unsigned mask = 0xffff;
 384
 385       partial_mask &= ~(1 << i);
 386
 387       for (j = 0; j < NR_PLANES; j++) {
 388          const int cx = (plane[j].c - 1
 389                          - plane[j].dcdx * px
 390                          + plane[j].dcdy * py) * 4;
 391
 392          mask &= ~sign_bits4(cstep4[j], cx);
 393       }
 394
 395       if (mask)
 396          lp_rast_shade_quads_mask(task, &tri->inputs, px, py, mask);
 397    }
 398 }
 399 #endif
 400
 401 #if defined(PIPE_ARCH_SSE) && defined(TRI_4)
 402 void
 403 TRI_4(struct lp_rasterizer_task *task,
 404       const union lp_rast_cmd_arg arg)
 405 {
 406    const struct lp_rast_triangle *tri = arg.triangle.tri;
 407    const struct lp_rast_plane *plane = GET_PLANES(tri);
 408    unsigned mask = arg.triangle.plane_mask;
 409    const int x = task->x + (mask & 0xff);
 410    const int y = task->y + (mask >> 8);
 411    unsigned j;
 412
 413    /* Iterate over partials:
 414     */
 415    {
 416       unsigned mask = 0xffff;
 417
 418       for (j = 0; j < NR_PLANES; j++) {
 419          const int cx = (plane[j].c
 420                          - plane[j].dcdx * x
 421                          + plane[j].dcdy * y);
 422
 423          const int dcdx = -plane[j].dcdx;
 424          const int dcdy = plane[j].dcdy;
 425          __m128i xdcdy = _mm_set1_epi32(dcdy);
 426
 427          __m128i cstep0 = _mm_setr_epi32(cx, cx + dcdx, cx + dcdx*2, cx + dcdx*3);
 428          __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
 429          __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
 430          __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
 431
 432          __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
 433          __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
 434          __m128i result = _mm_packs_epi16(cstep01, cstep23);
 435
 436          /* Extract the sign bits
 437           */
 438          mask &= ~_mm_movemask_epi8(result);
 439       }
 440
 441       if (mask)
 442          lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask);
 443    }
 444 }
 445 #endif
 446
 447
 448
 449 #undef TAG
 450 #undef TRI_4
 451 #undef TRI_16
 452 #undef NR_PLANES
 453