src/gallium/drivers/swr/rasterizer/core/clip.h

   1 /****************************************************************************
   2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * @file clip.h
  24  *
  25  * @brief Definitions for clipping
  26  *
  27  ******************************************************************************/
  28 #pragma once
  29
  30 #include "common/simdintrin.h"
  31 #include "core/context.h"
  32 #include "core/pa.h"
  33 #include "rdtsc_core.h"
  34
// Temp storage used by the clipper.
// One array per SIMD width; ClipHelper<SIMD_T>::GetTempVertices() hands these out.
// NOTE(review): THREAD is presumably a thread-local storage qualifier (the arrays carry a
// "tls" prefix) -- confirm against its definition, which is not part of this header.
// NOTE(review): the extent of 7 appears to match the "maximum 7 verts generated per
// triangle" bound used by Clipper::ClipSimd -- confirm before changing either.
extern THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7];
#if USE_SIMD16_FRONTEND
extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7];
#endif
  40
  41 enum SWR_CLIPCODES
  42 {
  43 // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
  44 // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union,
  45 // rather than intersection, of clipcodes.
  46 #define CLIPCODE_SHIFT 23
  47     FRUSTUM_LEFT   = (0x01 << CLIPCODE_SHIFT),
  48     FRUSTUM_TOP    = (0x02 << CLIPCODE_SHIFT),
  49     FRUSTUM_RIGHT  = (0x04 << CLIPCODE_SHIFT),
  50     FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT),
  51
  52     FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT),
  53     FRUSTUM_FAR  = (0x20 << CLIPCODE_SHIFT),
  54
  55     NEGW = (0x40 << CLIPCODE_SHIFT),
  56
  57     GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
  58     GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
  59     GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
  60     GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
  61 };
  62
  63 #define GUARDBAND_CLIP_MASK                                                          \
  64     (FRUSTUM_NEAR | FRUSTUM_FAR | GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | \
  65      GUARDBAND_BOTTOM | NEGW)
  66 #define FRUSTUM_CLIP_MASK \
  67     (FRUSTUM_NEAR | FRUSTUM_FAR | FRUSTUM_LEFT | FRUSTUM_RIGHT | FRUSTUM_TOP | FRUSTUM_BOTTOM)
  68
  69 template <typename SIMD_T>
  70 void ComputeClipCodes(const API_STATE&       state,
  71                       const Vec4<SIMD_T>&    vertex,
  72                       Float<SIMD_T>&         clipCodes,
  73                       Integer<SIMD_T> const& viewportIndexes)
  74 {
  75     clipCodes = SIMD_T::setzero_ps();
  76
  77     // -w
  78     Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w, SIMD_T::set1_ps(-1.0f));
  79
  80     // FRUSTUM_LEFT
  81     Float<SIMD_T> vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
  82     clipCodes          = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
  83
  84     // FRUSTUM_TOP
  85     vRes      = SIMD_T::cmplt_ps(vertex.y, vNegW);
  86     clipCodes = SIMD_T::or_ps(
  87         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
  88
  89     // FRUSTUM_RIGHT
  90     vRes      = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
  91     clipCodes = SIMD_T::or_ps(
  92         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
  93
  94     // FRUSTUM_BOTTOM
  95     vRes      = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
  96     clipCodes = SIMD_T::or_ps(
  97         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
  98
  99     if (state.rastState.depthClipEnable)
 100     {
 101         // FRUSTUM_NEAR
 102         // DX clips depth [0..w], GL clips [-w..w]
 103         if (state.rastState.clipHalfZ)
 104         {
 105             vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
 106         }
 107         else
 108         {
 109             vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
 110         }
 111         clipCodes = SIMD_T::or_ps(
 112             clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
 113
 114         // FRUSTUM_FAR
 115         vRes      = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
 116         clipCodes = SIMD_T::or_ps(
 117             clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
 118     }
 119
 120     // NEGW
 121     vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
 122     clipCodes =
 123         SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
 124
 125     // GUARDBAND_LEFT
 126     Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW,
 127                                           SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
 128                                               &state.gbState.left[0], viewportIndexes));
 129     vRes                 = SIMD_T::cmplt_ps(vertex.x, gbMult);
 130     clipCodes            = SIMD_T::or_ps(
 131         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
 132
 133     // GUARDBAND_TOP
 134     gbMult    = SIMD_T::mul_ps(vNegW,
 135                             SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
 136                                 &state.gbState.top[0], viewportIndexes));
 137     vRes      = SIMD_T::cmplt_ps(vertex.y, gbMult);
 138     clipCodes = SIMD_T::or_ps(
 139         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
 140
 141     // GUARDBAND_RIGHT
 142     gbMult    = SIMD_T::mul_ps(vertex.w,
 143                             SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
 144                                 &state.gbState.right[0], viewportIndexes));
 145     vRes      = SIMD_T::cmpgt_ps(vertex.x, gbMult);
 146     clipCodes = SIMD_T::or_ps(
 147         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
 148
 149     // GUARDBAND_BOTTOM
 150     gbMult    = SIMD_T::mul_ps(vertex.w,
 151                             SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
 152                                 &state.gbState.bottom[0], viewportIndexes));
 153     vRes      = SIMD_T::cmpgt_ps(vertex.y, gbMult);
 154     clipCodes = SIMD_T::or_ps(
 155         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
 156 }
 157
 158 template <typename SIMD_T>
 159 struct BinnerChooser
 160 {
 161 };
 162
 163 template <>
 164 struct BinnerChooser<SIMD256>
 165 {
 166     PFN_PROCESS_PRIMS pfnBinFunc;
 167
 168     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
 169         :
 170         pfnBinFunc(nullptr)
 171     {
 172         if (numVertsPerPrim == 3)
 173         {
 174             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
 175
 176         }
 177         else if (numVertsPerPrim == 2)
 178         {
 179             pfnBinFunc = BinLines;
 180         }
 181         else
 182         {
 183             SWR_ASSERT(0 && "Unexpected points in clipper.");
 184         }
 185     }
 186
 187     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
 188         :
 189         pfnBinFunc(nullptr)
 190     {
 191         switch (topology)
 192         {
 193         case TOP_POINT_LIST:
 194             pfnBinFunc = BinPoints;
 195             break;
 196         case TOP_LINE_LIST:
 197         case TOP_LINE_STRIP:
 198         case TOP_LINE_LOOP:
 199         case TOP_LINE_LIST_ADJ:
 200         case TOP_LISTSTRIP_ADJ:
 201             pfnBinFunc = BinLines;
 202             break;
 203         default:
 204             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
 205             break;
 206         };
 207     }
 208
 209     void BinFunc(DRAW_CONTEXT*           pDC,
 210                  PA_STATE&               pa,
 211                  uint32_t                workerId,
 212                  SIMD256::Vec4           prims[],
 213                  uint32_t                primMask,
 214                  SIMD256::Integer const& primID,
 215                  SIMD256::Integer&       viewportIdx,
 216                  SIMD256::Integer&       rtIdx)
 217     {
 218         SWR_ASSERT(pfnBinFunc != nullptr);
 219
 220         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
 221     }
 222 };
 223
 224 #if USE_SIMD16_FRONTEND
 225 template <>
 226 struct BinnerChooser<SIMD512>
 227 {
 228     PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
 229
 230     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
 231         :
 232         pfnBinFunc(nullptr)
 233     {
 234         if (numVertsPerPrim == 3)
 235         {
 236             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
 237
 238         }
 239         else if (numVertsPerPrim == 2)
 240         {
 241             pfnBinFunc = BinLines_simd16;
 242         }
 243         else
 244         {
 245             SWR_ASSERT(0 && "Unexpected points in clipper.");
 246         }
 247     }
 248
 249     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
 250         :
 251         pfnBinFunc(nullptr)
 252     {
 253         switch (topology)
 254         {
 255         case TOP_POINT_LIST:
 256             pfnBinFunc = BinPoints_simd16;
 257             break;
 258         case TOP_LINE_LIST:
 259         case TOP_LINE_STRIP:
 260         case TOP_LINE_LOOP:
 261         case TOP_LINE_LIST_ADJ:
 262         case TOP_LISTSTRIP_ADJ:
 263             pfnBinFunc = BinLines_simd16;
 264             break;
 265         default:
 266             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
 267             break;
 268         };
 269     }
 270
 271     void BinFunc(DRAW_CONTEXT*           pDC,
 272                  PA_STATE&               pa,
 273                  uint32_t                workerId,
 274                  SIMD512::Vec4           prims[],
 275                  uint32_t                primMask,
 276                  SIMD512::Integer const& primID,
 277                  SIMD512::Integer&       viewportIdx,
 278                  SIMD512::Integer&       rtIdx)
 279     {
 280         SWR_ASSERT(pfnBinFunc != nullptr);
 281
 282         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
 283     }
 284 };
 285
 286 #endif
 287 template <typename SIMD_T>
 288 struct SimdHelper
 289 {
 290 };
 291
 292 template <>
 293 struct SimdHelper<SIMD256>
 294 {
 295     static SIMD256::Float insert_lo_ps(SIMD256::Float a) { return a; }
 296
 297     static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
 298     {
 299         return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
 300     }
 301 };
 302
 303 #if USE_SIMD16_FRONTEND
 304 template <>
 305 struct SimdHelper<SIMD512>
 306 {
 307     static SIMD512::Float insert_lo_ps(SIMD256::Float a)
 308     {
 309         return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
 310     }
 311
 312     static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
 313     {
 314         return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
 315     }
 316 };
 317
 318 #endif
 319 // Temp storage used by the clipper
 320 template <typename SIMD_T>
 321 struct ClipHelper
 322 {
 323 };
 324
 325 template <>
 326 struct ClipHelper<SIMD256>
 327 {
 328     static SIMDVERTEX_T<SIMD256>* GetTempVertices() { return tlsTempVertices; }
 329 };
 330
 331 #if USE_SIMD16_FRONTEND
 332 template <>
 333 struct ClipHelper<SIMD512>
 334 {
 335     static SIMDVERTEX_T<SIMD512>* GetTempVertices() { return tlsTempVertices_simd16; }
 336 };
 337
 338 #endif
 339 template <typename SIMD_T, uint32_t NumVertsPerPrim>
 340 class Clipper
 341 {
 342 public:
 343     INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
 344         workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
 345     {
 346         static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
 347     }
 348
 349     void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T>& viewportIndexes)
 350     {
 351         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 352         {
 353             ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
 354         }
 355     }
 356
 357     Float<SIMD_T> ComputeClipCodeIntersection()
 358     {
 359         Float<SIMD_T> result = clipCodes[0];
 360
 361         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
 362         {
 363             result = SIMD_T::and_ps(result, clipCodes[i]);
 364         }
 365
 366         return result;
 367     }
 368
 369     Float<SIMD_T> ComputeClipCodeUnion()
 370     {
 371         Float<SIMD_T> result = clipCodes[0];
 372
 373         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
 374         {
 375             result = SIMD_T::or_ps(result, clipCodes[i]);
 376         }
 377
 378         return result;
 379     }
 380
 381     int ComputeClipMask()
 382     {
 383         Float<SIMD_T> clipUnion = ComputeClipCodeUnion();
 384
 385         clipUnion =
 386             SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
 387
 388         return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
 389     }
 390
 391     // clipper is responsible for culling any prims with NAN coordinates
 392     int ComputeNaNMask(Vec4<SIMD_T> prim[])
 393     {
 394         Float<SIMD_T> vNanMask = SIMD_T::setzero_ps();
 395
 396         for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
 397         {
 398             Float<SIMD_T> vNan01 =
 399                 SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
 400             vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
 401
 402             Float<SIMD_T> vNan23 =
 403                 SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
 404             vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
 405         }
 406
 407         return SIMD_T::movemask_ps(vNanMask);
 408     }
 409
 410     int ComputeUserClipCullMask(PA_STATE& pa, Vec4<SIMD_T> prim[])
 411     {
 412         uint8_t  cullMask             = state.backendState.cullDistanceMask;
 413         uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
 414
 415         Float<SIMD_T> vClipCullMask = SIMD_T::setzero_ps();
 416
 417         Vec4<SIMD_T> vClipCullDistLo[3];
 418         Vec4<SIMD_T> vClipCullDistHi[3];
 419
 420         pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
 421         pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
 422
 423         DWORD index;
 424         while (_BitScanForward(&index, cullMask))
 425         {
 426             cullMask &= ~(1 << index);
 427             uint32_t slot      = index >> 2;
 428             uint32_t component = index & 0x3;
 429
 430             Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
 431             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
 432             {
 433                 Float<SIMD_T> vCullComp;
 434                 if (slot == 0)
 435                 {
 436                     vCullComp = vClipCullDistLo[e][component];
 437                 }
 438                 else
 439                 {
 440                     vCullComp = vClipCullDistHi[e][component];
 441                 }
 442
 443                 // cull if cull distance < 0 || NAN
 444                 Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(
 445                     SIMD_T::setzero_ps(), vCullComp);
 446                 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
 447             }
 448             vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
 449         }
 450
 451         // clipper should also discard any primitive with NAN clip distance
 452         uint8_t clipMask = state.backendState.clipDistanceMask;
 453         while (_BitScanForward(&index, clipMask))
 454         {
 455             clipMask &= ~(1 << index);
 456             uint32_t slot      = index >> 2;
 457             uint32_t component = index & 0x3;
 458
 459             Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
 460             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
 461             {
 462                 Float<SIMD_T> vClipComp;
 463                 if (slot == 0)
 464                 {
 465                     vClipComp = vClipCullDistLo[e][component];
 466                 }
 467                 else
 468                 {
 469                     vClipComp = vClipCullDistHi[e][component];
 470                 }
 471
 472                 Float<SIMD_T> vClip =
 473                     SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
 474                 Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(
 475                     SIMD_T::setzero_ps(), vClipComp);
 476                 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
 477                 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
 478             }
 479             vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
 480         }
 481
 482         return SIMD_T::movemask_ps(vClipCullMask);
 483     }
 484
 485     void ClipSimd(const Vec4<SIMD_T>     prim[],
 486                   const Float<SIMD_T>&   vPrimMask,
 487                   const Float<SIMD_T>&   vClipMask,
 488                   PA_STATE&              pa,
 489                   const Integer<SIMD_T>& vPrimId,
 490                   const Integer<SIMD_T>& vViewportIdx,
 491                   const Integer<SIMD_T>& vRtIdx)
 492     {
 493         // input/output vertex store for clipper
 494         SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle
 495
 496         uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
 497         uint32_t provokingVertex    = 0;
 498         if (pa.binTopology == TOP_TRIANGLE_FAN)
 499         {
 500             provokingVertex = state.frontendState.provokingVertex.triFan;
 501         }
 502         ///@todo: line topology for wireframe?
 503
 504         // assemble pos
 505         Vec4<SIMD_T> tmpVector[NumVertsPerPrim];
 506         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 507         {
 508             vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
 509         }
 510
 511         // assemble attribs
 512         const SWR_BACKEND_STATE& backendState = state.backendState;
 513
 514         int32_t maxSlot = -1;
 515         for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
 516         {
 517             // Compute absolute attrib slot in vertex array
 518             uint32_t mapSlot =
 519                 backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
 520             maxSlot            = std::max<int32_t>(maxSlot, mapSlot);
 521             uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
 522
 523             pa.Assemble(inputSlot, tmpVector);
 524
 525             // if constant interpolation enabled for this attribute, assign the provoking
 526             // vertex values to all edges
 527             if (CheckBit(constantInterpMask, slot))
 528             {
 529                 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 530                 {
 531                     vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
 532                 }
 533             }
 534             else
 535             {
 536                 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 537                 {
 538                     vertices[i].attrib[inputSlot] = tmpVector[i];
 539                 }
 540             }
 541         }
 542
 543         // assemble user clip distances if enabled
 544         uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
 545         if (state.backendState.clipDistanceMask & 0xf)
 546         {
 547             pa.Assemble(vertexClipCullSlot, tmpVector);
 548             for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 549             {
 550                 vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
 551             }
 552         }
 553
 554         if (state.backendState.clipDistanceMask & 0xf0)
 555         {
 556             pa.Assemble(vertexClipCullSlot + 1, tmpVector);
 557             for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 558             {
 559                 vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
 560             }
 561         }
 562
 563         uint32_t numAttribs = maxSlot + 1;
 564
 565         Integer<SIMD_T> vNumClippedVerts =
 566             ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
 567
 568         BinnerChooser<SIMD_T> binner(NumVertsPerPrim,
 569                                      pa.pDC->pState->state.rastState.conservativeRast);
 570
 571         // set up new PA for binning clipped primitives
 572         PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
 573         if (NumVertsPerPrim == 3)
 574         {
 575             clipTopology = TOP_TRIANGLE_FAN;
 576
 577             // so that the binner knows to bloat wide points later
 578             if (pa.binTopology == TOP_POINT_LIST)
 579             {
 580                 clipTopology = TOP_POINT_LIST;
 581             }
 582             else if (pa.binTopology == TOP_RECT_LIST)
 583             {
 584                 clipTopology = TOP_RECT_LIST;
 585             }
 586         }
 587         else if (NumVertsPerPrim == 2)
 588         {
 589             clipTopology = TOP_LINE_LIST;
 590         }
 591         else
 592         {
 593             SWR_ASSERT(0 && "Unexpected points in clipper.");
 594         }
 595
 596         const uint32_t* pVertexCount = reinterpret_cast<const uint32_t*>(&vNumClippedVerts);
 597         const uint32_t* pPrimitiveId = reinterpret_cast<const uint32_t*>(&vPrimId);
 598         const uint32_t* pViewportIdx = reinterpret_cast<const uint32_t*>(&vViewportIdx);
 599         const uint32_t* pRtIdx       = reinterpret_cast<const uint32_t*>(&vRtIdx);
 600
 601         const SIMD256::Integer vOffsets =
 602             SIMD256::set_epi32(0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
 603                                6 * sizeof(SIMDVERTEX_T<SIMD_T>),
 604                                5 * sizeof(SIMDVERTEX_T<SIMD_T>),
 605                                4 * sizeof(SIMDVERTEX_T<SIMD_T>),
 606                                3 * sizeof(SIMDVERTEX_T<SIMD_T>),
 607                                2 * sizeof(SIMDVERTEX_T<SIMD_T>),
 608                                1 * sizeof(SIMDVERTEX_T<SIMD_T>),
 609                                0 * sizeof(SIMDVERTEX_T<SIMD_T>));
 610
 611         // only need to gather 7 verts
 612         // @todo dynamic mask based on actual # of verts generated per lane
 613         const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
 614
 615         uint32_t numClippedPrims = 0;
 616
 617         // tranpose clipper output so that each lane's vertices are in SIMD order
 618         // set aside space for 2 vertices, as the PA will try to read up to 16 verts
 619         // for triangle fan
 620
 621 #if defined(_DEBUG)
 622         // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack
 623         // overflow in debug builds
 624         SIMDVERTEX_T<SIMD_T>* transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T>*>(
 625             AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64));
 626
 627 #else
 628         SIMDVERTEX_T<SIMD_T>  transposedPrims[2];
 629
 630 #endif
 631         uint32_t              numInputPrims = pa.NumPrims();
 632         for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
 633         {
 634             uint32_t numEmittedVerts = pVertexCount[inputPrim];
 635             if (numEmittedVerts < NumVertsPerPrim)
 636             {
 637                 continue;
 638             }
 639             SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
 640
 641             uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
 642             SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
 643
 644             numClippedPrims += numEmittedPrims;
 645
 646             // tranpose clipper output so that each lane's vertices are in SIMD order
 647             // set aside space for 2 vertices, as the PA will try to read up to 16 verts
 648             // for triangle fan
 649
 650             // transpose pos
 651             uint8_t* pBase = reinterpret_cast<uint8_t*>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) +
 652                              sizeof(float) * inputPrim;
 653
 654 #if 0
 655             // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
 656             static const float *dummy = reinterpret_cast<const float *>(pBase);
 657
 658 #endif
 659             for (uint32_t c = 0; c < 4; ++c)
 660             {
 661                 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
 662                     SIMD256::setzero_ps(), reinterpret_cast<const float*>(pBase), vOffsets, vMask);
 663                 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] =
 664                     SimdHelper<SIMD_T>::insert_lo_ps(temp);
 665                 pBase += sizeof(Float<SIMD_T>);
 666             }
 667
 668             // transpose attribs
 669             pBase =
 670                 reinterpret_cast<uint8_t*>(&vertices[0].attrib[backendState.vertexAttribOffset]) +
 671                 sizeof(float) * inputPrim;
 672
 673             for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
 674             {
 675                 uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
 676
 677                 for (uint32_t c = 0; c < 4; ++c)
 678                 {
 679                     SIMD256::Float temp =
 680                         SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
 681                             SIMD256::setzero_ps(),
 682                             reinterpret_cast<const float*>(pBase),
 683                             vOffsets,
 684                             vMask);
 685                     transposedPrims[0].attrib[attribSlot][c] =
 686                         SimdHelper<SIMD_T>::insert_lo_ps(temp);
 687                     pBase += sizeof(Float<SIMD_T>);
 688                 }
 689             }
 690
 691             // transpose user clip distances if enabled
 692             uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
 693             if (state.backendState.clipDistanceMask & 0x0f)
 694             {
 695                 pBase = reinterpret_cast<uint8_t*>(&vertices[0].attrib[vertexClipCullSlot]) +
 696                         sizeof(float) * inputPrim;
 697
 698                 for (uint32_t c = 0; c < 4; ++c)
 699                 {
 700                     SIMD256::Float temp =
 701                         SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
 702                             SIMD256::setzero_ps(),
 703                             reinterpret_cast<const float*>(pBase),
 704                             vOffsets,
 705                             vMask);
 706                     transposedPrims[0].attrib[vertexClipCullSlot][c] =
 707                         SimdHelper<SIMD_T>::insert_lo_ps(temp);
 708                     pBase += sizeof(Float<SIMD_T>);
 709                 }
 710             }
 711
 712             if (state.backendState.clipDistanceMask & 0xf0)
 713             {
 714                 pBase = reinterpret_cast<uint8_t*>(&vertices[0].attrib[vertexClipCullSlot + 1]) +
 715                         sizeof(float) * inputPrim;
 716
 717                 for (uint32_t c = 0; c < 4; ++c)
 718                 {
 719                     SIMD256::Float temp =
 720                         SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
 721                             SIMD256::setzero_ps(),
 722                             reinterpret_cast<const float*>(pBase),
 723                             vOffsets,
 724                             vMask);
 725                     transposedPrims[0].attrib[vertexClipCullSlot + 1][c] =
 726                         SimdHelper<SIMD_T>::insert_lo_ps(temp);
 727                     pBase += sizeof(Float<SIMD_T>);
 728                 }
 729             }
 730
 731             PA_STATE_OPT clipPA(pDC,
 732                                 numEmittedPrims,
 733                                 reinterpret_cast<uint8_t*>(&transposedPrims[0]),
 734                                 numEmittedVerts,
 735                                 SWR_VTX_NUM_SLOTS,
 736                                 true,
 737                                 NumVertsPerPrim,
 738                                 clipTopology);
 739             clipPA.viewportArrayActive = pa.viewportArrayActive;
 740             clipPA.rtArrayActive       = pa.rtArrayActive;
 741
 742             static const uint32_t primMaskMap[] = {0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f};
 743
 744             const uint32_t primMask = primMaskMap[numEmittedPrims];
 745
 746             const Integer<SIMD_T> primID      = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
 747             const Integer<SIMD_T> viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
 748             const Integer<SIMD_T> rtIdx       = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
 749
 750             while (clipPA.GetNextStreamOutput())
 751             {
 752                 do
 753                 {
 754                     Vec4<SIMD_T> attrib[NumVertsPerPrim];
 755
 756                     bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
 757
 758                     if (assemble)
 759                     {
 760                         binner.pfnBinFunc(
 761                             pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
 762                     }
 763
 764                 } while (clipPA.NextPrim());
 765             }
 766         }
 767
 768 #if defined(_DEBUG)
 769         AlignedFree(transposedPrims);
 770
 771 #endif
 772         // update global pipeline stat
 773         UPDATE_STAT_FE(CPrimitives, numClippedPrims);
 774     }
 775
 776     void ExecuteStage(PA_STATE&              pa,
 777                       Vec4<SIMD_T>           prim[],
 778                       uint32_t               primMask,
 779                       Integer<SIMD_T> const& primId,
 780                       Integer<SIMD_T> const& viewportIdx,
 781                       Integer<SIMD_T> const& rtIdx)
 782     {
 783         SWR_ASSERT(pa.pDC != nullptr);
 784
 785         BinnerChooser<SIMD_T> binner(pa.binTopology,
 786                                      pa.pDC->pState->state.rastState.conservativeRast);
 787
 788         // update clipper invocations pipeline stat
 789         uint32_t numInvoc = _mm_popcnt_u32(primMask);
 790         UPDATE_STAT_FE(CInvocations, numInvoc);
 791
 792         ComputeClipCodes(prim, viewportIdx);
 793
 794         // cull prims with NAN coords
 795         primMask &= ~ComputeNaNMask(prim);
 796
 797         // user cull distance cull
 798         if (state.backendState.cullDistanceMask | state.backendState.clipDistanceMask)
 799         {
 800             primMask &= ~ComputeUserClipCullMask(pa, prim);
 801         }
 802
 803         Float<SIMD_T> clipIntersection = ComputeClipCodeIntersection();
 804         // Mask out non-frustum codes
 805         clipIntersection = SIMD_T::and_ps(clipIntersection,
 806                                           SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK)));
 807
 808         // cull prims outside view frustum
 809         int validMask =
 810             primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
 811
 812         // skip clipping for points
 813         uint32_t clipMask = 0;
 814         if (NumVertsPerPrim != 1)
 815         {
 816             clipMask = validMask & ComputeClipMask();
 817         }
 818
 819         AR_EVENT(ClipInfoEvent(numInvoc, validMask, clipMask));
 820
 821         if (clipMask)
 822         {
 823             RDTSC_BEGIN(FEGuardbandClip, pa.pDC->drawId);
 824             // we have to clip tris, execute the clipper, which will also
 825             // call the binner
 826             ClipSimd(prim,
 827                      SIMD_T::vmask_ps(validMask),
 828                      SIMD_T::vmask_ps(clipMask),
 829                      pa,
 830                      primId,
 831                      viewportIdx,
 832                      rtIdx);
 833             RDTSC_END(FEGuardbandClip, 1);
 834         }
 835         else if (validMask)
 836         {
 837             // update CPrimitives pipeline state
 838             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
 839
 840             // forward valid prims directly to binner
 841             binner.pfnBinFunc(
 842                 this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
 843         }
 844     }
 845
 846 private:
 847     Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const& boundaryCoord0,
 848                                       Float<SIMD_T> const& boundaryCoord1)
 849     {
 850         return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
 851     }
 852
 853     Integer<SIMD_T>
 854     ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const& vIndices, uint32_t component)
 855     {
 856         const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
 857         const uint32_t componentStride  = sizeof(Float<SIMD_T>);
 858         const uint32_t attribStride     = sizeof(Vec4<SIMD_T>);
 859
 860         static const OSALIGNSIMD16(uint32_t) elemOffset[16] = {
 861             0 * sizeof(float),
 862             1 * sizeof(float),
 863             2 * sizeof(float),
 864             3 * sizeof(float),
 865             4 * sizeof(float),
 866             5 * sizeof(float),
 867             6 * sizeof(float),
 868             7 * sizeof(float),
 869             8 * sizeof(float),
 870             9 * sizeof(float),
 871             10 * sizeof(float),
 872             11 * sizeof(float),
 873             12 * sizeof(float),
 874             13 * sizeof(float),
 875             14 * sizeof(float),
 876             15 * sizeof(float),
 877         };
 878
 879         static_assert(sizeof(Integer<SIMD_T>) <= sizeof(elemOffset),
 880                       "Clipper::ComputeOffsets, Increase number of element offsets.");
 881
 882         Integer<SIMD_T> vElemOffset =
 883             SIMD_T::loadu_si(reinterpret_cast<const Integer<SIMD_T>*>(elemOffset));
 884
 885         // step to the simdvertex
 886         Integer<SIMD_T> vOffsets =
 887             SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
 888
 889         // step to the attribute and component
 890         vOffsets = SIMD_T::add_epi32(
 891             vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
 892
 893         // step to the lane
 894         vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
 895
 896         return vOffsets;
 897     }
 898
 899     Float<SIMD_T> GatherComponent(const float*           pBuffer,
 900                                   uint32_t               attrib,
 901                                   Float<SIMD_T> const&   vMask,
 902                                   Integer<SIMD_T> const& vIndices,
 903                                   uint32_t               component)
 904     {
 905         Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
 906         Float<SIMD_T>   vSrc     = SIMD_T::setzero_ps();
 907
 908         return SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
 909             vSrc, pBuffer, vOffsets, vMask);
 910     }
 911
 912     void ScatterComponent(const float*           pBuffer,
 913                           uint32_t               attrib,
 914                           Float<SIMD_T> const&   vMask,
 915                           Integer<SIMD_T> const& vIndices,
 916                           uint32_t               component,
 917                           Float<SIMD_T> const&   vSrc)
 918     {
 919         Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
 920
 921         const uint32_t* pOffsets = reinterpret_cast<const uint32_t*>(&vOffsets);
 922         const float*    pSrc     = reinterpret_cast<const float*>(&vSrc);
 923         uint32_t        mask     = SIMD_T::movemask_ps(vMask);
 924         DWORD           lane;
 925         while (_BitScanForward(&lane, mask))
 926         {
 927             mask &= ~(1 << lane);
 928             const uint8_t* pBuf = reinterpret_cast<const uint8_t*>(pBuffer) + pOffsets[lane];
 929             *(float*)pBuf       = pSrc[lane];
 930         }
 931     }
 932
 933     template <SWR_CLIPCODES ClippingPlane>
 934     void intersect(const Float<SIMD_T>&   vActiveMask,  // active lanes to operate on
 935                    const Integer<SIMD_T>& s,            // index to first edge vertex v0 in pInPts.
 936                    const Integer<SIMD_T>& p,            // index to second edge vertex v1 in pInPts.
 937                    const Vec4<SIMD_T>&    v1,           // vertex 0 position
 938                    const Vec4<SIMD_T>&    v2,           // vertex 1 position
 939                    Integer<SIMD_T>&       outIndex,     // output index.
 940                    const float*           pInVerts,     // array of all the input positions.
 941                    uint32_t               numInAttribs, // number of attributes per vertex.
 942                    float* pOutVerts) // array of output positions. We'll write our new intersection
 943                                      // point at i*4.
 944     {
 945         uint32_t vertexAttribOffset   = this->state.backendState.vertexAttribOffset;
 946         uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
 947
 948         // compute interpolation factor
 949         Float<SIMD_T> t;
 950         switch (ClippingPlane)
 951         {
 952         case FRUSTUM_LEFT:
 953             t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0]));
 954             break;
 955         case FRUSTUM_RIGHT:
 956             t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0]));
 957             break;
 958         case FRUSTUM_TOP:
 959             t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1]));
 960             break;
 961         case FRUSTUM_BOTTOM:
 962             t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1]));
 963             break;
 964         case FRUSTUM_NEAR:
 965             // DX Znear plane is 0, GL is -w
 966             if (this->state.rastState.clipHalfZ)
 967             {
 968                 t = ComputeInterpFactor(v1[2], v2[2]);
 969             }
 970             else
 971             {
 972                 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
 973             }
 974             break;
 975         case FRUSTUM_FAR:
 976             t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2]));
 977             break;
 978         default:
 979             SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
 980         };
 981
 982         // interpolate position and store
 983         for (uint32_t c = 0; c < 4; ++c)
 984         {
 985             Float<SIMD_T> vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
 986             ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
 987         }
 988
 989         // interpolate attributes and store
 990         for (uint32_t a = 0; a < numInAttribs; ++a)
 991         {
 992             uint32_t attribSlot = vertexAttribOffset + a;
 993             for (uint32_t c = 0; c < 4; ++c)
 994             {
 995                 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 996                 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 997                 Float<SIMD_T> vOutAttrib =
 998                     SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 999                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
1000             }
1001         }
1002
1003         // interpolate clip distance if enabled
1004         if (this->state.backendState.clipDistanceMask & 0xf)
1005         {
1006             uint32_t attribSlot = vertexClipCullOffset;
1007             for (uint32_t c = 0; c < 4; ++c)
1008             {
1009                 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
1010                 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
1011                 Float<SIMD_T> vOutAttrib =
1012                     SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
1013                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
1014             }
1015         }
1016
1017         if (this->state.backendState.clipDistanceMask & 0xf0)
1018         {
1019             uint32_t attribSlot = vertexClipCullOffset + 1;
1020             for (uint32_t c = 0; c < 4; ++c)
1021             {
1022                 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
1023                 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
1024                 Float<SIMD_T> vOutAttrib =
1025                     SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
1026                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
1027             }
1028         }
1029     }
1030
1031     template <SWR_CLIPCODES ClippingPlane>
1032     Float<SIMD_T> inside(const Vec4<SIMD_T>& v)
1033     {
1034         switch (ClippingPlane)
1035         {
1036         case FRUSTUM_LEFT:
1037             return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
1038         case FRUSTUM_RIGHT:
1039             return SIMD_T::cmple_ps(v[0], v[3]);
1040         case FRUSTUM_TOP:
1041             return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
1042         case FRUSTUM_BOTTOM:
1043             return SIMD_T::cmple_ps(v[1], v[3]);
1044         case FRUSTUM_NEAR:
1045             return SIMD_T::cmpge_ps(v[2],
1046                                     this->state.rastState.clipHalfZ
1047                                         ? SIMD_T::setzero_ps()
1048                                         : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
1049         case FRUSTUM_FAR:
1050             return SIMD_T::cmple_ps(v[2], v[3]);
1051         default:
1052             SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
1053             return SIMD_T::setzero_ps();
1054         }
1055     }
1056
1057     template <SWR_CLIPCODES ClippingPlane>
1058     Integer<SIMD_T> ClipTriToPlane(const float*           pInVerts,
1059                                    const Integer<SIMD_T>& vNumInPts,
1060                                    uint32_t               numInAttribs,
1061                                    float*                 pOutVerts)
1062     {
1063         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1064
1065         Integer<SIMD_T> vCurIndex   = SIMD_T::setzero_si();
1066         Integer<SIMD_T> vOutIndex   = SIMD_T::setzero_si();
1067         Float<SIMD_T>   vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1068
1069         while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
1070         {
1071             Integer<SIMD_T> s             = vCurIndex;
1072             Integer<SIMD_T> p             = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1073             Integer<SIMD_T> underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
1074             p                             = SIMD_T::castps_si(SIMD_T::blendv_ps(
1075                 SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
1076
1077             // gather position
1078             Vec4<SIMD_T> vInPos0, vInPos1;
1079             for (uint32_t c = 0; c < 4; ++c)
1080             {
1081                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1082                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1083             }
1084
1085             // compute inside mask
1086             Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
1087             Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
1088
1089             // compute intersection mask (s_in != p_in)
1090             Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
1091             intersectMask               = SIMD_T::and_ps(intersectMask, vActiveMask);
1092
1093             // store s if inside
1094             s_in = SIMD_T::and_ps(s_in, vActiveMask);
1095             if (!SIMD_T::testz_ps(s_in, s_in))
1096             {
1097                 // store position
1098                 for (uint32_t c = 0; c < 4; ++c)
1099                 {
1100                     ScatterComponent(
1101                         pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1102                 }
1103
1104                 // store attribs
1105                 for (uint32_t a = 0; a < numInAttribs; ++a)
1106                 {
1107                     uint32_t attribSlot = vertexAttribOffset + a;
1108                     for (uint32_t c = 0; c < 4; ++c)
1109                     {
1110                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1111                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1112                     }
1113                 }
1114
1115                 // store clip distance if enabled
1116                 uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
1117                 if (this->state.backendState.clipDistanceMask & 0xf)
1118                 {
1119                     uint32_t attribSlot = vertexClipCullSlot;
1120                     for (uint32_t c = 0; c < 4; ++c)
1121                     {
1122                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1123                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1124                     }
1125                 }
1126
1127                 if (this->state.backendState.clipDistanceMask & 0xf0)
1128                 {
1129                     uint32_t attribSlot = vertexClipCullSlot + 1;
1130                     for (uint32_t c = 0; c < 4; ++c)
1131                     {
1132                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1133                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1134                     }
1135                 }
1136
1137                 // increment outIndex
1138                 vOutIndex = SIMD_T::blendv_epi32(
1139                     vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1140             }
1141
1142             // compute and store intersection
1143             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1144             {
1145                 intersect<ClippingPlane>(intersectMask,
1146                                          s,
1147                                          p,
1148                                          vInPos0,
1149                                          vInPos1,
1150                                          vOutIndex,
1151                                          pInVerts,
1152                                          numInAttribs,
1153                                          pOutVerts);
1154
1155                 // increment outIndex for active lanes
1156                 vOutIndex = SIMD_T::blendv_epi32(
1157                     vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1158             }
1159
1160             // increment loop index and update active mask
1161             vCurIndex   = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
1162             vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1163         }
1164
1165         return vOutIndex;
1166     }
1167
1168     template <SWR_CLIPCODES ClippingPlane>
1169     Integer<SIMD_T> ClipLineToPlane(const float*           pInVerts,
1170                                     const Integer<SIMD_T>& vNumInPts,
1171                                     uint32_t               numInAttribs,
1172                                     float*                 pOutVerts)
1173     {
1174         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1175
1176         Integer<SIMD_T> vCurIndex   = SIMD_T::setzero_si();
1177         Integer<SIMD_T> vOutIndex   = SIMD_T::setzero_si();
1178         Float<SIMD_T>   vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1179
1180         if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
1181         {
1182             Integer<SIMD_T> s = vCurIndex;
1183             Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1184
1185             // gather position
1186             Vec4<SIMD_T> vInPos0, vInPos1;
1187             for (uint32_t c = 0; c < 4; ++c)
1188             {
1189                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1190                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1191             }
1192
1193             // compute inside mask
1194             Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
1195             Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
1196
1197             // compute intersection mask (s_in != p_in)
1198             Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
1199             intersectMask               = SIMD_T::and_ps(intersectMask, vActiveMask);
1200
1201             // store s if inside
1202             s_in = SIMD_T::and_ps(s_in, vActiveMask);
1203             if (!SIMD_T::testz_ps(s_in, s_in))
1204             {
1205                 for (uint32_t c = 0; c < 4; ++c)
1206                 {
1207                     ScatterComponent(
1208                         pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1209                 }
1210
1211                 // interpolate attributes and store
1212                 for (uint32_t a = 0; a < numInAttribs; ++a)
1213                 {
1214                     uint32_t attribSlot = vertexAttribOffset + a;
1215                     for (uint32_t c = 0; c < 4; ++c)
1216                     {
1217                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1218                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1219                     }
1220                 }
1221
1222                 // increment outIndex
1223                 vOutIndex = SIMD_T::blendv_epi32(
1224                     vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1225             }
1226
1227             // compute and store intersection
1228             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1229             {
1230                 intersect<ClippingPlane>(intersectMask,
1231                                          s,
1232                                          p,
1233                                          vInPos0,
1234                                          vInPos1,
1235                                          vOutIndex,
1236                                          pInVerts,
1237                                          numInAttribs,
1238                                          pOutVerts);
1239
1240                 // increment outIndex for active lanes
1241                 vOutIndex = SIMD_T::blendv_epi32(
1242                     vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1243             }
1244
1245             // store p if inside
1246             p_in = SIMD_T::and_ps(p_in, vActiveMask);
1247             if (!SIMD_T::testz_ps(p_in, p_in))
1248             {
1249                 for (uint32_t c = 0; c < 4; ++c)
1250                 {
1251                     ScatterComponent(
1252                         pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
1253                 }
1254
1255                 // interpolate attributes and store
1256                 for (uint32_t a = 0; a < numInAttribs; ++a)
1257                 {
1258                     uint32_t attribSlot = vertexAttribOffset + a;
1259                     for (uint32_t c = 0; c < 4; ++c)
1260                     {
1261                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
1262                         ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
1263                     }
1264                 }
1265
1266                 // increment outIndex
1267                 vOutIndex = SIMD_T::blendv_epi32(
1268                     vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
1269             }
1270         }
1271
1272         return vOutIndex;
1273     }
1274
1275     Integer<SIMD_T> ClipPrims(float*               pVertices,
1276                               const Float<SIMD_T>& vPrimMask,
1277                               const Float<SIMD_T>& vClipMask,
1278                               int                  numAttribs)
1279     {
1280         // temp storage
1281         float* pTempVerts = reinterpret_cast<float*>(ClipHelper<SIMD_T>::GetTempVertices());
1282
1283         // zero out num input verts for non-active lanes
1284         Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
1285         vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
1286
1287         // clip prims to frustum
1288         Integer<SIMD_T> vNumOutPts;
1289         if (NumVertsPerPrim == 3)
1290         {
1291             vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1292             vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1293             vNumOutPts =
1294                 ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1295             vNumOutPts =
1296                 ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1297             vNumOutPts =
1298                 ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1299             vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1300         }
1301         else
1302         {
1303             SWR_ASSERT(NumVertsPerPrim == 2);
1304             vNumOutPts =
1305                 ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1306             vNumOutPts =
1307                 ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1308             vNumOutPts =
1309                 ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1310             vNumOutPts =
1311                 ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1312             vNumOutPts =
1313                 ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1314             vNumOutPts =
1315                 ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1316         }
1317
1318         // restore num verts for non-clipped, active lanes
1319         Float<SIMD_T> vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
1320         vNumOutPts =
1321             SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
1322
1323         return vNumOutPts;
1324     }
1325
1326     const uint32_t   workerId{0};
1327     DRAW_CONTEXT*    pDC{nullptr};
1328     const API_STATE& state;
1329     Float<SIMD_T>    clipCodes[NumVertsPerPrim];
1330 };
1331
1332 // pipeline stage functions
1333 void ClipRectangles(DRAW_CONTEXT*      pDC,
1334                     PA_STATE&          pa,
1335                     uint32_t           workerId,
1336                     simdvector         prims[],
1337                     uint32_t           primMask,
1338                     simdscalari const& primId,
1339                     simdscalari const& viewportIdx,
1340                     simdscalari const& rtIdx);
1341 void ClipTriangles(DRAW_CONTEXT*      pDC,
1342                    PA_STATE&          pa,
1343                    uint32_t           workerId,
1344                    simdvector         prims[],
1345                    uint32_t           primMask,
1346                    simdscalari const& primId,
1347                    simdscalari const& viewportIdx,
1348                    simdscalari const& rtIdx);
1349 void ClipLines(DRAW_CONTEXT*      pDC,
1350                PA_STATE&          pa,
1351                uint32_t           workerId,
1352                simdvector         prims[],
1353                uint32_t           primMask,
1354                simdscalari const& primId,
1355                simdscalari const& viewportIdx,
1356                simdscalari const& rtIdx);
1357 void ClipPoints(DRAW_CONTEXT*      pDC,
1358                 PA_STATE&          pa,
1359                 uint32_t           workerId,
1360                 simdvector         prims[],
1361                 uint32_t           primMask,
1362                 simdscalari const& primId,
1363                 simdscalari const& viewportIdx,
1364                 simdscalari const& rtIdx);
1365 #if USE_SIMD16_FRONTEND
1366 void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT*        pDC,
1367                                     PA_STATE&            pa,
1368                                     uint32_t             workerId,
1369                                     simd16vector         prims[],
1370                                     uint32_t             primMask,
1371                                     simd16scalari const& primId,
1372                                     simd16scalari const& viewportIdx,
1373                                     simd16scalari const& rtIdx);
1374 void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT*        pDC,
1375                                    PA_STATE&            pa,
1376                                    uint32_t             workerId,
1377                                    simd16vector         prims[],
1378                                    uint32_t             primMask,
1379                                    simd16scalari const& primId,
1380                                    simd16scalari const& viewportIdx,
1381                                    simd16scalari const& rtIdx);
1382 void SIMDCALL ClipLines_simd16(DRAW_CONTEXT*        pDC,
1383                                PA_STATE&            pa,
1384                                uint32_t             workerId,
1385                                simd16vector         prims[],
1386                                uint32_t             primMask,
1387                                simd16scalari const& primId,
1388                                simd16scalari const& viewportIdx,
1389                                simd16scalari const& rtIdx);
1390 void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT*        pDC,
1391                                 PA_STATE&            pa,
1392                                 uint32_t             workerId,
1393                                 simd16vector         prims[],
1394                                 uint32_t             primMask,
1395                                 simd16scalari const& primId,
1396                                 simd16scalari const& viewportIdx,
1397                                 simd16scalari const& rtIdx);
1398 #endif