/**************************************************************************
 *
 * Copyright 2007-2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
/*
 * Rasterization for binned triangles within a tile
 */

#include "util/u_math.h"
#include "lp_debug.h"
#include "lp_rast_priv.h"
#include "lp_tile_soa.h"
/**
 * Shade all pixels in a 4x4 block.
 */
static void
block_full_4(struct lp_rasterizer_task *task,
             const struct lp_rast_triangle *tri,
             int x, int y)
{
   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
}
/**
 * Shade all pixels in a 16x16 block.
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   unsigned ix, iy;

   for (iy = 0; iy < 16; iy += 4)
      for (ix = 0; ix < 16; ix += 4)
         block_full_4(task, tri, x + ix, y + iy);
}
#if !defined(PIPE_ARCH_SSE)

static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   int mask = 0;

   int c0 = c;
   int c1 = c0 + dcdy;
   int c2 = c1 + dcdy;
   int c3 = c2 + dcdy;

   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);

   return mask;
}
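/* Worked example (illustrative values, not from the original source):
 * with c0 = -5 and dcdx = 3 the first row evaluates -5, -2, 1, 4; the
 * arithmetic ">> 31" smears each sign bit across the whole word, so the
 * two negative samples set bits 0 and 1 of the mask.  build_masks()
 * below reuses this helper twice, once with c and once with c + cdiff,
 * to accumulate what appear to be the trivially-outside and
 * partially-covered block masks.
 */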
static INLINE void
build_masks(int c,
            int cdiff,
            int dcdx,
            int dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   *outmask |= build_mask_linear(c, dcdx, dcdy);
   *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
}
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_3(task, arg2);
}
void
lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_4(task, arg2);
}
void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_3_16(task, arg);
}

#else
#include <emmintrin.h>
#include "util/u_sse.h"
static INLINE void
build_masks(int c,
            int cdiff,
            int dcdx,
            int dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }

   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}
static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}
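/* A compiled-out sketch (not part of the original file; example_sign_mask
 * is a hypothetical name) of why the double pack above is safe: signed
 * saturation maps negative lanes to negative lanes and non-negative
 * lanes to non-negative ones, so after packing 32 -> 16 -> 8 bits the
 * epi8 sign bits still encode "c < 0" for every sample position, and
 * _mm_movemask_epi8() gathers all 16 of them in one instruction.
 */
#if 0
static unsigned
example_sign_mask(void)
{
   __m128i v = _mm_setr_epi32(-3, 5, -1, 7);    /* signs: 1,0,1,0 */
   __m128i v16 = _mm_packs_epi32(v, v);         /* 8 x epi16, signs kept */
   __m128i v8 = _mm_packs_epi16(v16, v16);      /* 16 x epi8, signs kept */
   return _mm_movemask_epi8(v8) & 0xf;          /* 0x5 for the lanes above */
}
#endif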
static INLINE unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{
   /* Adjust the step values
    */
   __m128i cio4 = _mm_set1_epi32(cdiff);
   __m128i cstep0 = _mm_add_epi32(cstep[0], cio4);
   __m128i cstep1 = _mm_add_epi32(cstep[1], cio4);
   __m128i cstep2 = _mm_add_epi32(cstep[2], cio4);
   __m128i cstep3 = _mm_add_epi32(cstep[3], cio4);

   /* Pack down to epi8
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* Extract the sign bits
    */
   return _mm_movemask_epi8(result);
}

#endif
#define TAG(x) x##_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_4
#define NR_PLANES 4
#define TRI_16 lp_rast_triangle_4_16
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"
#ifdef PIPE_ARCH_SSE

static INLINE void
transpose4_epi32(const __m128i * restrict a,
                 const __m128i * restrict b,
                 const __m128i * restrict c,
                 const __m128i * restrict d,
                 __m128i * restrict o,
                 __m128i * restrict p,
                 __m128i * restrict q,
                 __m128i * restrict r)
{
   __m128i t0 = _mm_unpacklo_epi32(*a, *b);
   __m128i t1 = _mm_unpacklo_epi32(*c, *d);
   __m128i t2 = _mm_unpackhi_epi32(*a, *b);
   __m128i t3 = _mm_unpackhi_epi32(*c, *d);

   *o = _mm_unpacklo_epi64(t0, t1);
   *p = _mm_unpackhi_epi64(t0, t1);
   *q = _mm_unpacklo_epi64(t2, t3);
   *r = _mm_unpackhi_epi64(t2, t3);
}
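/* Compiled-out usage sketch (illustrative values; example_transpose is
 * a hypothetical name, not part of the original file): transposing rows
 * {0,1,2,3}, {4,5,6,7}, {8,9,10,11}, {12,13,14,15} yields columns
 * {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15}.  This is how the
 * plane loads below are turned into per-field vectors (all c's, all
 * dcdx's, ...).
 */
#if 0
static void
example_transpose(void)
{
   __m128i a = _mm_setr_epi32(0, 1, 2, 3);
   __m128i b = _mm_setr_epi32(4, 5, 6, 7);
   __m128i c = _mm_setr_epi32(8, 9, 10, 11);
   __m128i d = _mm_setr_epi32(12, 13, 14, 15);
   __m128i o, p, q, r;

   transpose4_epi32(&a, &b, &c, &d, &o, &p, &q, &r);
   /* o = {0,4,8,12}, p = {1,5,9,13}, q = {2,6,10,14}, r = {3,7,11,15} */
}
#endif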
#define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
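/* SCALAR_EPI32(v, 2), for example, broadcasts lane 2 of v into all four
 * lanes -- the vector equivalent of reading one plane's scalar back out
 * of the transposed registers below.
 */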
/* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
 * _mm_mul_epu32().
 *
 * I suspect this works fine for us because one of our operands is
 * always positive, but not sure that this can be used for general
 * signed integer multiplication.
 *
 * This seems close enough to the speed of SSE4 and the real
 * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
 * dependency at this point.
 */
static INLINE __m128i
mm_mullo_epi32(const __m128i a, const __m128i b)
{
   __m128i a4   = _mm_srli_si128(a, 4);  /* shift by one dword */
   __m128i b4   = _mm_srli_si128(b, 4);  /* shift by one dword */
   __m128i ba   = _mm_mul_epu32(b, a);   /* multiply dwords 0, 2 */
   __m128i b4a4 = _mm_mul_epu32(b4, a4); /* multiply dwords 1, 3 */

   /* Interleave the results, either with shuffles or (slightly
    * faster) direct bit operations:
    */
#if 0
   __m128i ba8             = _mm_shuffle_epi32(ba, 8);
   __m128i b4a48           = _mm_shuffle_epi32(b4a4, 8);
   __m128i result          = _mm_unpacklo_epi32(ba8, b4a48);
#else
   __m128i mask            = _mm_setr_epi32(~0,0,~0,0);
   __m128i ba_mask         = _mm_and_si128(ba, mask);
   __m128i b4a4_mask       = _mm_and_si128(b4a4, mask);
   __m128i b4a4_mask_shift = _mm_slli_si128(b4a4_mask, 4);
   __m128i result          = _mm_or_si128(ba_mask, b4a4_mask_shift);
#endif

   return result;
}
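/* A compiled-out, hypothetical self-check sketch for mm_mullo_epi32()
 * (test_mm_mullo_epi32 is not part of the original file and assumes
 * <assert.h> is available).  One operand is kept non-negative per lane,
 * matching the usage caveat in the comment above:
 */
#if 0
static void
test_mm_mullo_epi32(void)
{
   int va[4] = { 3, -7, 100000, 1 };
   int vb[4] = { 2, 5, 3, 9 };
   int vr[4];
   int i;

   _mm_storeu_si128((__m128i *)vr,
                    mm_mullo_epi32(_mm_loadu_si128((__m128i *)va),
                                   _mm_loadu_si128((__m128i *)vb)));
   for (i = 0; i < 4; i++)
      assert(vr[i] == va[i] * vb[i]);
}
#endif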
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = tri->plane;
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = _mm_loadu_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_loadu_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   rej4 = _mm_slli_epi32(rej4, 2);

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);
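   /* Added note: the loop below walks the 16x16 tile as a 4x4 grid of
    * 4x4-pixel blocks.  Adding rej4 (the early-out corner offsets,
    * scaled by 4 above) to each plane's c and testing the sign bits is
    * a cheap trivial-reject test; only blocks that no plane rejects
    * get the full per-pixel evaluation.
    */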
   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }

         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}
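/* Added note: single 4x4 block variant of the above -- the same plane
 * setup, but one span evaluation suffices and no reject loop is needed.
 */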
void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = tri->plane;
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;

   __m128i p0 = _mm_loadu_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_loadu_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &unused);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);

      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x, y,
                                  0xffff & ~mask);
   }
}

#endif