src/gallium/drivers/swr/rasterizer/core/clip.h

   1 /****************************************************************************
   2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * @file clip.h
  24  *
  25  * @brief Definitions for clipping
  26  *
  27  ******************************************************************************/
  28 #pragma once
  29
  30 #include "common/simdintrin.h"
  31 #include "core/context.h"
  32 #include "core/pa.h"
  33 #include "rdtsc_core.h"
  34
  35 enum SWR_CLIPCODES
  36 {
  37 // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
  38 // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union,
  39 // rather than intersection, of clipcodes.
  40 #define CLIPCODE_SHIFT 23
  41     FRUSTUM_LEFT   = (0x01 << CLIPCODE_SHIFT),
  42     FRUSTUM_TOP    = (0x02 << CLIPCODE_SHIFT),
  43     FRUSTUM_RIGHT  = (0x04 << CLIPCODE_SHIFT),
  44     FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT),
  45
  46     FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT),
  47     FRUSTUM_FAR  = (0x20 << CLIPCODE_SHIFT),
  48
  49     NEGW = (0x40 << CLIPCODE_SHIFT),
  50
  51     GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
  52     GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
  53     GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
  54     GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
  55 };
  56
  57 #define GUARDBAND_CLIP_MASK                                                          \
  58     (FRUSTUM_NEAR | FRUSTUM_FAR | GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | \
  59      GUARDBAND_BOTTOM | NEGW)
  60 #define FRUSTUM_CLIP_MASK \
  61     (FRUSTUM_NEAR | FRUSTUM_FAR | FRUSTUM_LEFT | FRUSTUM_RIGHT | FRUSTUM_TOP | FRUSTUM_BOTTOM)
  62
  63 template <typename SIMD_T>
  64 void ComputeClipCodes(const API_STATE&       state,
  65                       const Vec4<SIMD_T>&    vertex,
  66                       Float<SIMD_T>&         clipCodes,
  67                       Integer<SIMD_T> const& viewportIndexes)
  68 {
  69     clipCodes = SIMD_T::setzero_ps();
  70
  71     // -w
  72     Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w, SIMD_T::set1_ps(-1.0f));
  73
  74     // FRUSTUM_LEFT
  75     Float<SIMD_T> vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
  76     clipCodes          = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
  77
  78     // FRUSTUM_TOP
  79     vRes      = SIMD_T::cmplt_ps(vertex.y, vNegW);
  80     clipCodes = SIMD_T::or_ps(
  81         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
  82
  83     // FRUSTUM_RIGHT
  84     vRes      = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
  85     clipCodes = SIMD_T::or_ps(
  86         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
  87
  88     // FRUSTUM_BOTTOM
  89     vRes      = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
  90     clipCodes = SIMD_T::or_ps(
  91         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
  92
  93     if (state.rastState.depthClipEnable)
  94     {
  95         // FRUSTUM_NEAR
  96         // DX clips depth [0..w], GL clips [-w..w]
  97         if (state.rastState.clipHalfZ)
  98         {
  99             vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
 100         }
 101         else
 102         {
 103             vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
 104         }
 105         clipCodes = SIMD_T::or_ps(
 106             clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
 107
 108         // FRUSTUM_FAR
 109         vRes      = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
 110         clipCodes = SIMD_T::or_ps(
 111             clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
 112     }
 113
 114     // NEGW
 115     vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
 116     clipCodes =
 117         SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
 118
 119     // GUARDBAND_LEFT
 120     Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW,
 121                                           SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
 122                                               &state.gbState.left[0], viewportIndexes));
 123     vRes                 = SIMD_T::cmplt_ps(vertex.x, gbMult);
 124     clipCodes            = SIMD_T::or_ps(
 125         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
 126
 127     // GUARDBAND_TOP
 128     gbMult    = SIMD_T::mul_ps(vNegW,
 129                             SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
 130                                 &state.gbState.top[0], viewportIndexes));
 131     vRes      = SIMD_T::cmplt_ps(vertex.y, gbMult);
 132     clipCodes = SIMD_T::or_ps(
 133         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
 134
 135     // GUARDBAND_RIGHT
 136     gbMult    = SIMD_T::mul_ps(vertex.w,
 137                             SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
 138                                 &state.gbState.right[0], viewportIndexes));
 139     vRes      = SIMD_T::cmpgt_ps(vertex.x, gbMult);
 140     clipCodes = SIMD_T::or_ps(
 141         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
 142
 143     // GUARDBAND_BOTTOM
 144     gbMult    = SIMD_T::mul_ps(vertex.w,
 145                             SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
 146                                 &state.gbState.bottom[0], viewportIndexes));
 147     vRes      = SIMD_T::cmpgt_ps(vertex.y, gbMult);
 148     clipCodes = SIMD_T::or_ps(
 149         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
 150 }
 151
 152 template <typename SIMD_T>
 153 struct BinnerChooser
 154 {
 155 };
 156
 157 template <>
 158 struct BinnerChooser<SIMD256>
 159 {
 160     PFN_PROCESS_PRIMS pfnBinFunc;
 161
 162     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
 163         :
 164         pfnBinFunc(nullptr)
 165     {
 166         if (numVertsPerPrim == 3)
 167         {
 168             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
 169
 170         }
 171         else if (numVertsPerPrim == 2)
 172         {
 173             pfnBinFunc = BinLines;
 174         }
 175         else
 176         {
 177             SWR_ASSERT(0 && "Unexpected points in clipper.");
 178         }
 179     }
 180
 181     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
 182         :
 183         pfnBinFunc(nullptr)
 184     {
 185         switch (topology)
 186         {
 187         case TOP_POINT_LIST:
 188             pfnBinFunc = BinPoints;
 189             break;
 190         case TOP_LINE_LIST:
 191         case TOP_LINE_STRIP:
 192         case TOP_LINE_LOOP:
 193         case TOP_LINE_LIST_ADJ:
 194         case TOP_LISTSTRIP_ADJ:
 195             pfnBinFunc = BinLines;
 196             break;
 197         default:
 198             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
 199             break;
 200         };
 201     }
 202
 203     void BinFunc(DRAW_CONTEXT*           pDC,
 204                  PA_STATE&               pa,
 205                  uint32_t                workerId,
 206                  SIMD256::Vec4           prims[],
 207                  uint32_t                primMask,
 208                  SIMD256::Integer const& primID,
 209                  SIMD256::Integer&       viewportIdx,
 210                  SIMD256::Integer&       rtIdx)
 211     {
 212         SWR_ASSERT(pfnBinFunc != nullptr);
 213
 214         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
 215     }
 216 };
 217
 218 #if USE_SIMD16_FRONTEND
 219 template <>
 220 struct BinnerChooser<SIMD512>
 221 {
 222     PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
 223
 224     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
 225         :
 226         pfnBinFunc(nullptr)
 227     {
 228         if (numVertsPerPrim == 3)
 229         {
 230             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
 231
 232         }
 233         else if (numVertsPerPrim == 2)
 234         {
 235             pfnBinFunc = BinLines_simd16;
 236         }
 237         else
 238         {
 239             SWR_ASSERT(0 && "Unexpected points in clipper.");
 240         }
 241     }
 242
 243     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
 244         :
 245         pfnBinFunc(nullptr)
 246     {
 247         switch (topology)
 248         {
 249         case TOP_POINT_LIST:
 250             pfnBinFunc = BinPoints_simd16;
 251             break;
 252         case TOP_LINE_LIST:
 253         case TOP_LINE_STRIP:
 254         case TOP_LINE_LOOP:
 255         case TOP_LINE_LIST_ADJ:
 256         case TOP_LISTSTRIP_ADJ:
 257             pfnBinFunc = BinLines_simd16;
 258             break;
 259         default:
 260             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
 261             break;
 262         };
 263     }
 264
 265     void BinFunc(DRAW_CONTEXT*           pDC,
 266                  PA_STATE&               pa,
 267                  uint32_t                workerId,
 268                  SIMD512::Vec4           prims[],
 269                  uint32_t                primMask,
 270                  SIMD512::Integer const& primID,
 271                  SIMD512::Integer&       viewportIdx,
 272                  SIMD512::Integer&       rtIdx)
 273     {
 274         SWR_ASSERT(pfnBinFunc != nullptr);
 275
 276         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
 277     }
 278 };
 279
 280 #endif
 281 template <typename SIMD_T>
 282 struct SimdHelper
 283 {
 284 };
 285
 286 template <>
 287 struct SimdHelper<SIMD256>
 288 {
 289     static SIMD256::Float insert_lo_ps(SIMD256::Float a) { return a; }
 290
 291     static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
 292     {
 293         return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
 294     }
 295 };
 296
 297 #if USE_SIMD16_FRONTEND
 298 template <>
 299 struct SimdHelper<SIMD512>
 300 {
 301     static SIMD512::Float insert_lo_ps(SIMD256::Float a)
 302     {
 303         return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
 304     }
 305
 306     static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
 307     {
 308         return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
 309     }
 310 };
 311 #endif
 312
 313 template <typename SIMD_T, uint32_t NumVertsPerPrimT>
 314 class Clipper
 315 {
 316 public:
 317     INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
 318         workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
 319     {
 320         static_assert(NumVertsPerPrimT >= 1 && NumVertsPerPrimT <= 3, "Invalid NumVertsPerPrim");
 321         THREAD_DATA &thread_data = in_pDC->pContext->threadPool.pThreadData[workerId];
 322
 323         if (thread_data.clipperData == nullptr)
 324         {
 325             // 7 vertex temp data
 326             // 7 post-clipped vertices
 327             // 2 transposed verts for binning
 328             size_t alloc_size = sizeof(SIMDVERTEX_T<SIMD_T>) * (7 + 7 + 2);
 329             thread_data.clipperData = AlignedMalloc(alloc_size, KNOB_SIMD16_BYTES);
 330         }
 331         SWR_ASSERT(thread_data.clipperData);
 332
 333         this->clippedVerts = (SIMDVERTEX_T<SIMD_T>*)thread_data.clipperData;
 334         this->tmpVerts = this->clippedVerts + 7;
 335         this->transposedVerts = this->tmpVerts + 7;
 336     }
 337
 338     void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T>& viewportIndexes)
 339     {
 340         for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
 341         {
 342             ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
 343         }
 344     }
 345
 346     Float<SIMD_T> ComputeClipCodeIntersection()
 347     {
 348         Float<SIMD_T> result = clipCodes[0];
 349
 350         for (uint32_t i = 1; i < NumVertsPerPrimT; ++i)
 351         {
 352             result = SIMD_T::and_ps(result, clipCodes[i]);
 353         }
 354
 355         return result;
 356     }
 357
 358     Float<SIMD_T> ComputeClipCodeUnion()
 359     {
 360         Float<SIMD_T> result = clipCodes[0];
 361
 362         for (uint32_t i = 1; i < NumVertsPerPrimT; ++i)
 363         {
 364             result = SIMD_T::or_ps(result, clipCodes[i]);
 365         }
 366
 367         return result;
 368     }
 369
 370     int ComputeClipMask()
 371     {
 372         Float<SIMD_T> clipUnion = ComputeClipCodeUnion();
 373
 374         clipUnion =
 375             SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
 376
 377         return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
 378     }
 379
 380     // clipper is responsible for culling any prims with NAN coordinates
 381     int ComputeNaNMask(Vec4<SIMD_T> prim[])
 382     {
 383         Float<SIMD_T> vNanMask = SIMD_T::setzero_ps();
 384
 385         for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
 386         {
 387             Float<SIMD_T> vNan01 =
 388                 SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
 389             vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
 390
 391             Float<SIMD_T> vNan23 =
 392                 SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
 393             vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
 394         }
 395
 396         return SIMD_T::movemask_ps(vNanMask);
 397     }
 398
 399     int ComputeUserClipCullMask(PA_STATE& pa, Vec4<SIMD_T> prim[])
 400     {
 401         uint8_t  cullMask             = state.backendState.cullDistanceMask;
 402         uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
 403
 404         Float<SIMD_T> vClipCullMask = SIMD_T::setzero_ps();
 405
 406         Vec4<SIMD_T> vClipCullDistLo[3];
 407         Vec4<SIMD_T> vClipCullDistHi[3];
 408
 409         pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
 410         pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
 411
 412         DWORD index;
 413         while (_BitScanForward(&index, cullMask))
 414         {
 415             cullMask &= ~(1 << index);
 416             uint32_t slot      = index >> 2;
 417             uint32_t component = index & 0x3;
 418
 419             Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
 420             for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
 421             {
 422                 Float<SIMD_T> vCullComp;
 423                 if (slot == 0)
 424                 {
 425                     vCullComp = vClipCullDistLo[e][component];
 426                 }
 427                 else
 428                 {
 429                     vCullComp = vClipCullDistHi[e][component];
 430                 }
 431
 432                 // cull if cull distance < 0 || NAN
 433                 Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(
 434                     SIMD_T::setzero_ps(), vCullComp);
 435                 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
 436             }
 437             vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
 438         }
 439
 440         // clipper should also discard any primitive with NAN clip distance
 441         uint8_t clipMask = state.backendState.clipDistanceMask;
 442         while (_BitScanForward(&index, clipMask))
 443         {
 444             clipMask &= ~(1 << index);
 445             uint32_t slot      = index >> 2;
 446             uint32_t component = index & 0x3;
 447
 448             Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
 449             for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
 450             {
 451                 Float<SIMD_T> vClipComp;
 452                 if (slot == 0)
 453                 {
 454                     vClipComp = vClipCullDistLo[e][component];
 455                 }
 456                 else
 457                 {
 458                     vClipComp = vClipCullDistHi[e][component];
 459                 }
 460
 461                 Float<SIMD_T> vClip =
 462                     SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
 463                 Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(
 464                     SIMD_T::setzero_ps(), vClipComp);
 465                 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
 466                 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
 467             }
 468             vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
 469         }
 470
 471         return SIMD_T::movemask_ps(vClipCullMask);
 472     }
 473
 474     void ClipSimd(const Vec4<SIMD_T>     prim[],
 475                   const Float<SIMD_T>&   vPrimMask,
 476                   const Float<SIMD_T>&   vClipMask,
 477                   PA_STATE&              pa,
 478                   const Integer<SIMD_T>& vPrimId,
 479                   const Integer<SIMD_T>& vViewportIdx,
 480                   const Integer<SIMD_T>& vRtIdx)
 481     {
 482         // input/output vertex store for clipper
 483         SIMDVERTEX_T<SIMD_T>* vertices = this->clippedVerts;
 484
 485         uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
 486         uint32_t provokingVertex    = 0;
 487         if (pa.binTopology == TOP_TRIANGLE_FAN)
 488         {
 489             provokingVertex = state.frontendState.provokingVertex.triFan;
 490         }
 491         ///@todo: line topology for wireframe?
 492
 493         // assemble pos
 494         Vec4<SIMD_T> tmpVector[NumVertsPerPrimT];
 495         for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
 496         {
 497             vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
 498         }
 499
 500         // assemble attribs
 501         const SWR_BACKEND_STATE& backendState = state.backendState;
 502
 503         int32_t maxSlot = -1;
 504         for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
 505         {
 506             // Compute absolute attrib slot in vertex array
 507             uint32_t mapSlot =
 508                 backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
 509             maxSlot            = std::max<int32_t>(maxSlot, mapSlot);
 510             uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
 511
 512             pa.Assemble(inputSlot, tmpVector);
 513
 514             // if constant interpolation enabled for this attribute, assign the provoking
 515             // vertex values to all edges
 516             if (CheckBit(constantInterpMask, slot))
 517             {
 518                 for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
 519                 {
 520                     vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
 521                 }
 522             }
 523             else
 524             {
 525                 for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
 526                 {
 527                     vertices[i].attrib[inputSlot] = tmpVector[i];
 528                 }
 529             }
 530         }
 531
 532         // assemble user clip distances if enabled
 533         uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
 534         if (state.backendState.clipDistanceMask & 0xf)
 535         {
 536             pa.Assemble(vertexClipCullSlot, tmpVector);
 537             for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
 538             {
 539                 vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
 540             }
 541         }
 542
 543         if (state.backendState.clipDistanceMask & 0xf0)
 544         {
 545             pa.Assemble(vertexClipCullSlot + 1, tmpVector);
 546             for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
 547             {
 548                 vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
 549             }
 550         }
 551
 552         uint32_t numAttribs = maxSlot + 1;
 553
 554         Integer<SIMD_T> vNumClippedVerts =
 555             ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
 556
 557         BinnerChooser<SIMD_T> binner(NumVertsPerPrimT,
 558                                      pa.pDC->pState->state.rastState.conservativeRast);
 559
 560         // set up new PA for binning clipped primitives
 561         PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
 562         if (NumVertsPerPrimT == 3)
 563         {
 564             clipTopology = TOP_TRIANGLE_FAN;
 565
 566             // so that the binner knows to bloat wide points later
 567             if (pa.binTopology == TOP_POINT_LIST)
 568             {
 569                 clipTopology = TOP_POINT_LIST;
 570             }
 571             else if (pa.binTopology == TOP_RECT_LIST)
 572             {
 573                 clipTopology = TOP_RECT_LIST;
 574             }
 575         }
 576         else if (NumVertsPerPrimT == 2)
 577         {
 578             clipTopology = TOP_LINE_LIST;
 579         }
 580         else
 581         {
 582             SWR_ASSERT(0 && "Unexpected points in clipper.");
 583         }
 584
 585         const uint32_t* pVertexCount = reinterpret_cast<const uint32_t*>(&vNumClippedVerts);
 586         const uint32_t* pPrimitiveId = reinterpret_cast<const uint32_t*>(&vPrimId);
 587         const uint32_t* pViewportIdx = reinterpret_cast<const uint32_t*>(&vViewportIdx);
 588         const uint32_t* pRtIdx       = reinterpret_cast<const uint32_t*>(&vRtIdx);
 589
 590         const SIMD256::Integer vOffsets =
 591             SIMD256::set_epi32(0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
 592                                6 * sizeof(SIMDVERTEX_T<SIMD_T>),
 593                                5 * sizeof(SIMDVERTEX_T<SIMD_T>),
 594                                4 * sizeof(SIMDVERTEX_T<SIMD_T>),
 595                                3 * sizeof(SIMDVERTEX_T<SIMD_T>),
 596                                2 * sizeof(SIMDVERTEX_T<SIMD_T>),
 597                                1 * sizeof(SIMDVERTEX_T<SIMD_T>),
 598                                0 * sizeof(SIMDVERTEX_T<SIMD_T>));
 599
 600         // only need to gather 7 verts
 601         // @todo dynamic mask based on actual # of verts generated per lane
 602         const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
 603
 604         uint32_t numClippedPrims = 0;
 605
 606         // transpose clipper output so that each lane's vertices are in SIMD order
 607         // set aside space for 2 vertices, as the PA will try to read up to 16 verts
 608         // for triangle fan
 609         SIMDVERTEX_T<SIMD_T>*  transposedPrims = this->transposedVerts;
 610
 611         uint32_t              numInputPrims = pa.NumPrims();
 612         for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
 613         {
 614             uint32_t numEmittedVerts = pVertexCount[inputPrim];
 615             if (numEmittedVerts < NumVertsPerPrimT)
 616             {
 617                 continue;
 618             }
 619             SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
 620
 621             uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
 622             SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
 623
 624             numClippedPrims += numEmittedPrims;
 625
 626             // tranpose clipper output so that each lane's vertices are in SIMD order
 627             // set aside space for 2 vertices, as the PA will try to read up to 16 verts
 628             // for triangle fan
 629
 630             // transpose pos
 631             float const* pBase =
 632                 reinterpret_cast<float const*>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) +
 633                 inputPrim;
 634
 635             for (uint32_t c = 0; c < 4; ++c)
 636             {
 637                 SIMD256::Float temp =
 638                     SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
 639                 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] =
 640                     SimdHelper<SIMD_T>::insert_lo_ps(temp);
 641                 pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
 642             }
 643
 644             // transpose attribs
 645             pBase = reinterpret_cast<float const*>(
 646                         &vertices[0].attrib[backendState.vertexAttribOffset]) +
 647                     inputPrim;
 648
 649             for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
 650             {
 651                 uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
 652
 653                 for (uint32_t c = 0; c < 4; ++c)
 654                 {
 655                     SIMD256::Float temp =
 656                         SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
 657                     transposedPrims[0].attrib[attribSlot][c] =
 658                         SimdHelper<SIMD_T>::insert_lo_ps(temp);
 659                     pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
 660                 }
 661             }
 662
 663             // transpose user clip distances if enabled
 664             uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
 665             if (state.backendState.clipDistanceMask & 0x0f)
 666             {
 667                 pBase = reinterpret_cast<float const*>(&vertices[0].attrib[vertexClipCullSlot]) +
 668                         inputPrim;
 669
 670                 for (uint32_t c = 0; c < 4; ++c)
 671                 {
 672                     SIMD256::Float temp =
 673                         SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
 674                     transposedPrims[0].attrib[vertexClipCullSlot][c] =
 675                         SimdHelper<SIMD_T>::insert_lo_ps(temp);
 676                     pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
 677                 }
 678             }
 679
 680             if (state.backendState.clipDistanceMask & 0xf0)
 681             {
 682                 pBase =
 683                     reinterpret_cast<float const*>(&vertices[0].attrib[vertexClipCullSlot + 1]) +
 684                     inputPrim;
 685
 686                 for (uint32_t c = 0; c < 4; ++c)
 687                 {
 688                     SIMD256::Float temp =
 689                         SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
 690                     transposedPrims[0].attrib[vertexClipCullSlot + 1][c] =
 691                         SimdHelper<SIMD_T>::insert_lo_ps(temp);
 692                     pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
 693                 }
 694             }
 695
 696             PA_STATE_OPT clipPA(pDC,
 697                                 numEmittedPrims,
 698                                 reinterpret_cast<uint8_t*>(&transposedPrims[0]),
 699                                 numEmittedVerts,
 700                                 SWR_VTX_NUM_SLOTS,
 701                                 true,
 702                                 NumVertsPerPrimT,
 703                                 clipTopology);
 704             clipPA.viewportArrayActive = pa.viewportArrayActive;
 705             clipPA.rtArrayActive       = pa.rtArrayActive;
 706
 707             static const uint32_t primMaskMap[] = {0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f};
 708
 709             const uint32_t primMask = primMaskMap[numEmittedPrims];
 710
 711             const Integer<SIMD_T> primID      = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
 712             const Integer<SIMD_T> viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
 713             const Integer<SIMD_T> rtIdx       = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
 714
 715             while (clipPA.GetNextStreamOutput())
 716             {
 717                 do
 718                 {
 719                     Vec4<SIMD_T> attrib[NumVertsPerPrimT];
 720
 721                     bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
 722
 723                     if (assemble)
 724                     {
 725                         binner.pfnBinFunc(
 726                             pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
 727                     }
 728
 729                 } while (clipPA.NextPrim());
 730             }
 731         }
 732
 733         // update global pipeline stat
 734         UPDATE_STAT_FE(CPrimitives, numClippedPrims);
 735     }
 736
 737     void ExecuteStage(PA_STATE&              pa,
 738                       Vec4<SIMD_T>           prim[],
 739                       uint32_t               primMask,
 740                       Integer<SIMD_T> const& primId,
 741                       Integer<SIMD_T> const& viewportIdx,
 742                       Integer<SIMD_T> const& rtIdx)
 743     {
 744         SWR_ASSERT(pa.pDC != nullptr);
 745
 746         BinnerChooser<SIMD_T> binner(pa.binTopology,
 747                                      pa.pDC->pState->state.rastState.conservativeRast);
 748
 749         // update clipper invocations pipeline stat
 750         uint32_t numInvoc = _mm_popcnt_u32(primMask);
 751         UPDATE_STAT_FE(CInvocations, numInvoc);
 752
 753         ComputeClipCodes(prim, viewportIdx);
 754
 755         // cull prims with NAN coords
 756         primMask &= ~ComputeNaNMask(prim);
 757
 758         // user cull distance cull
 759         if (state.backendState.cullDistanceMask | state.backendState.clipDistanceMask)
 760         {
 761             primMask &= ~ComputeUserClipCullMask(pa, prim);
 762         }
 763
 764         Float<SIMD_T> clipIntersection = ComputeClipCodeIntersection();
 765         // Mask out non-frustum codes
 766         clipIntersection = SIMD_T::and_ps(clipIntersection,
 767                                           SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK)));
 768
 769         // cull prims outside view frustum
 770         int validMask =
 771             primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
 772
 773         // skip clipping for points
 774         uint32_t clipMask = 0;
 775         if (NumVertsPerPrimT != 1)
 776         {
 777             clipMask = validMask & ComputeClipMask();
 778         }
 779
 780         AR_EVENT(ClipInfoEvent(numInvoc, validMask, clipMask));
 781
 782         if (clipMask)
 783         {
 784             RDTSC_BEGIN(FEGuardbandClip, pa.pDC->drawId);
 785             // we have to clip tris, execute the clipper, which will also
 786             // call the binner
 787             ClipSimd(prim,
 788                      SIMD_T::vmask_ps(validMask),
 789                      SIMD_T::vmask_ps(clipMask),
 790                      pa,
 791                      primId,
 792                      viewportIdx,
 793                      rtIdx);
 794             RDTSC_END(FEGuardbandClip, 1);
 795         }
 796         else if (validMask)
 797         {
 798             // update CPrimitives pipeline state
 799             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
 800
 801             // forward valid prims directly to binner
 802             binner.pfnBinFunc(
 803                 this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
 804         }
 805     }
 806
 807 private:
 808     Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const& boundaryCoord0,
 809                                       Float<SIMD_T> const& boundaryCoord1)
 810     {
 811         return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
 812     }
 813
 814     Integer<SIMD_T>
 815     ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const& vIndices, uint32_t component)
 816     {
 817         const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
 818         const uint32_t componentStride  = sizeof(Float<SIMD_T>);
 819         const uint32_t attribStride     = sizeof(Vec4<SIMD_T>);
 820
 821         static const OSALIGNSIMD16(uint32_t) elemOffset[16] = {
 822             0 * sizeof(float),
 823             1 * sizeof(float),
 824             2 * sizeof(float),
 825             3 * sizeof(float),
 826             4 * sizeof(float),
 827             5 * sizeof(float),
 828             6 * sizeof(float),
 829             7 * sizeof(float),
 830             8 * sizeof(float),
 831             9 * sizeof(float),
 832             10 * sizeof(float),
 833             11 * sizeof(float),
 834             12 * sizeof(float),
 835             13 * sizeof(float),
 836             14 * sizeof(float),
 837             15 * sizeof(float),
 838         };
 839
 840         static_assert(sizeof(Integer<SIMD_T>) <= sizeof(elemOffset),
 841                       "Clipper::ComputeOffsets, Increase number of element offsets.");
 842
 843         Integer<SIMD_T> vElemOffset =
 844             SIMD_T::loadu_si(reinterpret_cast<const Integer<SIMD_T>*>(elemOffset));
 845
 846         // step to the simdvertex
 847         Integer<SIMD_T> vOffsets =
 848             SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
 849
 850         // step to the attribute and component
 851         vOffsets = SIMD_T::add_epi32(
 852             vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
 853
 854         // step to the lane
 855         vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
 856
 857         return vOffsets;
 858     }
 859
 860     Float<SIMD_T> GatherComponent(const float*           pBuffer,
 861                                   uint32_t               attrib,
 862                                   Float<SIMD_T> const&   vMask,
 863                                   Integer<SIMD_T> const& vIndices,
 864                                   uint32_t               component)
 865     {
 866         Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
 867         Float<SIMD_T>   vSrc     = SIMD_T::setzero_ps();
 868
 869         return SIMD_T::mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask);
 870     }
 871
 872     void ScatterComponent(const float*           pBuffer,
 873                           uint32_t               attrib,
 874                           Float<SIMD_T> const&   vMask,
 875                           Integer<SIMD_T> const& vIndices,
 876                           uint32_t               component,
 877                           Float<SIMD_T> const&   vSrc)
 878     {
 879         Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
 880
 881         const uint32_t* pOffsets = reinterpret_cast<const uint32_t*>(&vOffsets);
 882         const float*    pSrc     = reinterpret_cast<const float*>(&vSrc);
 883         uint32_t        mask     = SIMD_T::movemask_ps(vMask);
 884         DWORD           lane;
 885         while (_BitScanForward(&lane, mask))
 886         {
 887             mask &= ~(1 << lane);
 888             const uint8_t* pBuf = reinterpret_cast<const uint8_t*>(pBuffer) + pOffsets[lane];
 889             *(float*)pBuf       = pSrc[lane];
 890         }
 891     }
 892
 893     template <SWR_CLIPCODES ClippingPlane>
 894     void intersect(const Float<SIMD_T>&   vActiveMask,  // active lanes to operate on
 895                    const Integer<SIMD_T>& s,            // index to first edge vertex v0 in pInPts.
 896                    const Integer<SIMD_T>& p,            // index to second edge vertex v1 in pInPts.
 897                    const Vec4<SIMD_T>&    v1,           // vertex 0 position
 898                    const Vec4<SIMD_T>&    v2,           // vertex 1 position
 899                    Integer<SIMD_T>&       outIndex,     // output index.
 900                    const float*           pInVerts,     // array of all the input positions.
 901                    uint32_t               numInAttribs, // number of attributes per vertex.
 902                    float* pOutVerts) // array of output positions. We'll write our new intersection
 903                                      // point at i*4.
 904     {
 905         uint32_t vertexAttribOffset   = this->state.backendState.vertexAttribOffset;
 906         uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
 907
 908         // compute interpolation factor
 909         Float<SIMD_T> t;
 910         switch (ClippingPlane)
 911         {
 912         case FRUSTUM_LEFT:
 913             t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0]));
 914             break;
 915         case FRUSTUM_RIGHT:
 916             t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0]));
 917             break;
 918         case FRUSTUM_TOP:
 919             t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1]));
 920             break;
 921         case FRUSTUM_BOTTOM:
 922             t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1]));
 923             break;
 924         case FRUSTUM_NEAR:
 925             // DX Znear plane is 0, GL is -w
 926             if (this->state.rastState.clipHalfZ)
 927             {
 928                 t = ComputeInterpFactor(v1[2], v2[2]);
 929             }
 930             else
 931             {
 932                 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
 933             }
 934             break;
 935         case FRUSTUM_FAR:
 936             t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2]));
 937             break;
 938         default:
 939             SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
 940         };
 941
 942         // interpolate position and store
 943         for (uint32_t c = 0; c < 4; ++c)
 944         {
 945             Float<SIMD_T> vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
 946             ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
 947         }
 948
 949         // interpolate attributes and store
 950         for (uint32_t a = 0; a < numInAttribs; ++a)
 951         {
 952             uint32_t attribSlot = vertexAttribOffset + a;
 953             for (uint32_t c = 0; c < 4; ++c)
 954             {
 955                 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 956                 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 957                 Float<SIMD_T> vOutAttrib =
 958                     SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 959                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
 960             }
 961         }
 962
 963         // interpolate clip distance if enabled
 964         if (this->state.backendState.clipDistanceMask & 0xf)
 965         {
 966             uint32_t attribSlot = vertexClipCullOffset;
 967             for (uint32_t c = 0; c < 4; ++c)
 968             {
 969                 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 970                 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 971                 Float<SIMD_T> vOutAttrib =
 972                     SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 973                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
 974             }
 975         }
 976
 977         if (this->state.backendState.clipDistanceMask & 0xf0)
 978         {
 979             uint32_t attribSlot = vertexClipCullOffset + 1;
 980             for (uint32_t c = 0; c < 4; ++c)
 981             {
 982                 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 983                 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 984                 Float<SIMD_T> vOutAttrib =
 985                     SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 986                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
 987             }
 988         }
 989     }
 990
 991     template <SWR_CLIPCODES ClippingPlane>
 992     Float<SIMD_T> inside(const Vec4<SIMD_T>& v)
 993     {
 994         switch (ClippingPlane)
 995         {
 996         case FRUSTUM_LEFT:
 997             return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
 998         case FRUSTUM_RIGHT:
 999             return SIMD_T::cmple_ps(v[0], v[3]);
1000         case FRUSTUM_TOP:
1001             return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
1002         case FRUSTUM_BOTTOM:
1003             return SIMD_T::cmple_ps(v[1], v[3]);
1004         case FRUSTUM_NEAR:
1005             return SIMD_T::cmpge_ps(v[2],
1006                                     this->state.rastState.clipHalfZ
1007                                         ? SIMD_T::setzero_ps()
1008                                         : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
1009         case FRUSTUM_FAR:
1010             return SIMD_T::cmple_ps(v[2], v[3]);
1011         default:
1012             SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
1013             return SIMD_T::setzero_ps();
1014         }
1015     }
1016
1017     template <SWR_CLIPCODES ClippingPlane>
1018     Integer<SIMD_T> ClipTriToPlane(const float*           pInVerts,
1019                                    const Integer<SIMD_T>& vNumInPts,
1020                                    uint32_t               numInAttribs,
1021                                    float*                 pOutVerts)
1022     {
1023         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1024
1025         Integer<SIMD_T> vCurIndex   = SIMD_T::setzero_si();
1026         Integer<SIMD_T> vOutIndex   = SIMD_T::setzero_si();
1027         Float<SIMD_T>   vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1028
1029         while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
1030         {
1031             Integer<SIMD_T> s             = vCurIndex;
1032             Integer<SIMD_T> p             = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1033             Integer<SIMD_T> underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
1034             p                             = SIMD_T::castps_si(SIMD_T::blendv_ps(
1035                 SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
1036
1037             // gather position
1038             Vec4<SIMD_T> vInPos0, vInPos1;
1039             for (uint32_t c = 0; c < 4; ++c)
1040             {
1041                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1042                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1043             }
1044
1045             // compute inside mask
1046             Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
1047             Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
1048
1049             // compute intersection mask (s_in != p_in)
1050             Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
1051             intersectMask               = SIMD_T::and_ps(intersectMask, vActiveMask);
1052
1053             // store s if inside
1054             s_in = SIMD_T::and_ps(s_in, vActiveMask);
1055             if (!SIMD_T::testz_ps(s_in, s_in))
1056             {
1057                 // store position
1058                 for (uint32_t c = 0; c < 4; ++c)
1059                 {
1060                     ScatterComponent(
1061                         pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1062                 }
1063
1064                 // store attribs
1065                 for (uint32_t a = 0; a < numInAttribs; ++a)
1066                 {
1067                     uint32_t attribSlot = vertexAttribOffset + a;
1068                     for (uint32_t c = 0; c < 4; ++c)
1069                     {
1070                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1071                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1072                     }
1073                 }
1074
1075                 // store clip distance if enabled
1076                 uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
1077                 if (this->state.backendState.clipDistanceMask & 0xf)
1078                 {
1079                     uint32_t attribSlot = vertexClipCullSlot;
1080                     for (uint32_t c = 0; c < 4; ++c)
1081                     {
1082                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1083                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1084                     }
1085                 }
1086
1087                 if (this->state.backendState.clipDistanceMask & 0xf0)
1088                 {
1089                     uint32_t attribSlot = vertexClipCullSlot + 1;
1090                     for (uint32_t c = 0; c < 4; ++c)
1091                     {
1092                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1093                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1094                     }
1095                 }
1096
1097                 // increment outIndex
1098                 vOutIndex = SIMD_T::blendv_epi32(
1099                     vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1100             }
1101
1102             // compute and store intersection
1103             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1104             {
1105                 intersect<ClippingPlane>(intersectMask,
1106                                          s,
1107                                          p,
1108                                          vInPos0,
1109                                          vInPos1,
1110                                          vOutIndex,
1111                                          pInVerts,
1112                                          numInAttribs,
1113                                          pOutVerts);
1114
1115                 // increment outIndex for active lanes
1116                 vOutIndex = SIMD_T::blendv_epi32(
1117                     vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1118             }
1119
1120             // increment loop index and update active mask
1121             vCurIndex   = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
1122             vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1123         }
1124
1125         return vOutIndex;
1126     }
1127
1128     template <SWR_CLIPCODES ClippingPlane>
1129     Integer<SIMD_T> ClipLineToPlane(const float*           pInVerts,
1130                                     const Integer<SIMD_T>& vNumInPts,
1131                                     uint32_t               numInAttribs,
1132                                     float*                 pOutVerts)
1133     {
1134         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1135
1136         Integer<SIMD_T> vCurIndex   = SIMD_T::setzero_si();
1137         Integer<SIMD_T> vOutIndex   = SIMD_T::setzero_si();
1138         Float<SIMD_T>   vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1139
1140         if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
1141         {
1142             Integer<SIMD_T> s = vCurIndex;
1143             Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1144
1145             // gather position
1146             Vec4<SIMD_T> vInPos0, vInPos1;
1147             for (uint32_t c = 0; c < 4; ++c)
1148             {
1149                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1150                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1151             }
1152
1153             // compute inside mask
1154             Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
1155             Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
1156
1157             // compute intersection mask (s_in != p_in)
1158             Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
1159             intersectMask               = SIMD_T::and_ps(intersectMask, vActiveMask);
1160
1161             // store s if inside
1162             s_in = SIMD_T::and_ps(s_in, vActiveMask);
1163             if (!SIMD_T::testz_ps(s_in, s_in))
1164             {
1165                 for (uint32_t c = 0; c < 4; ++c)
1166                 {
1167                     ScatterComponent(
1168                         pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1169                 }
1170
1171                 // interpolate attributes and store
1172                 for (uint32_t a = 0; a < numInAttribs; ++a)
1173                 {
1174                     uint32_t attribSlot = vertexAttribOffset + a;
1175                     for (uint32_t c = 0; c < 4; ++c)
1176                     {
1177                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1178                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1179                     }
1180                 }
1181
1182                 // increment outIndex
1183                 vOutIndex = SIMD_T::blendv_epi32(
1184                     vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1185             }
1186
1187             // compute and store intersection
1188             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1189             {
1190                 intersect<ClippingPlane>(intersectMask,
1191                                          s,
1192                                          p,
1193                                          vInPos0,
1194                                          vInPos1,
1195                                          vOutIndex,
1196                                          pInVerts,
1197                                          numInAttribs,
1198                                          pOutVerts);
1199
1200                 // increment outIndex for active lanes
1201                 vOutIndex = SIMD_T::blendv_epi32(
1202                     vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1203             }
1204
1205             // store p if inside
1206             p_in = SIMD_T::and_ps(p_in, vActiveMask);
1207             if (!SIMD_T::testz_ps(p_in, p_in))
1208             {
1209                 for (uint32_t c = 0; c < 4; ++c)
1210                 {
1211                     ScatterComponent(
1212                         pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
1213                 }
1214
1215                 // interpolate attributes and store
1216                 for (uint32_t a = 0; a < numInAttribs; ++a)
1217                 {
1218                     uint32_t attribSlot = vertexAttribOffset + a;
1219                     for (uint32_t c = 0; c < 4; ++c)
1220                     {
1221                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
1222                         ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
1223                     }
1224                 }
1225
1226                 // increment outIndex
1227                 vOutIndex = SIMD_T::blendv_epi32(
1228                     vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
1229             }
1230         }
1231
1232         return vOutIndex;
1233     }
1234
1235     Integer<SIMD_T> ClipPrims(float*               pVertices,
1236                               const Float<SIMD_T>& vPrimMask,
1237                               const Float<SIMD_T>& vClipMask,
1238                               int                  numAttribs)
1239     {
1240         // temp storage
1241         float* pTempVerts = reinterpret_cast<float*>(this->tmpVerts);
1242
1243         // zero out num input verts for non-active lanes
1244         Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrimT);
1245         vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
1246
1247         // clip prims to frustum
1248         Integer<SIMD_T> vNumOutPts;
1249         if (NumVertsPerPrimT == 3)
1250         {
1251             vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1252             vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1253             vNumOutPts =
1254                 ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1255             vNumOutPts =
1256                 ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1257             vNumOutPts =
1258                 ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1259             vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1260         }
1261         else
1262         {
1263             SWR_ASSERT(NumVertsPerPrimT == 2);
1264             vNumOutPts =
1265                 ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1266             vNumOutPts =
1267                 ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1268             vNumOutPts =
1269                 ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1270             vNumOutPts =
1271                 ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1272             vNumOutPts =
1273                 ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1274             vNumOutPts =
1275                 ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1276         }
1277
1278         // restore num verts for non-clipped, active lanes
1279         Float<SIMD_T> vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
1280         vNumOutPts =
1281             SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrimT), vNonClippedMask);
1282
1283         return vNumOutPts;
1284     }
1285
    // Worker id; ExecuteStage forwards it to the binner.
    const uint32_t   workerId{0};
    // Draw context; ExecuteStage forwards it to the binner.
    DRAW_CONTEXT*    pDC{nullptr};
    // API state read for rasterizer settings (clipHalfZ) and backend settings
    // (clip/cull distance masks, attribute and clip/cull slot offsets).
    const API_STATE& state;
    // One clip-code register per primitive vertex.
    // NOTE(review): presumably written by ComputeClipCodes — confirm.
    Float<SIMD_T>    clipCodes[NumVertsPerPrimT];
    // NOTE(review): not referenced by the methods visible here; presumably the
    // clipper's output vertex storage — confirm against ClipSimd.
    SIMDVERTEX_T<SIMD_T>* clippedVerts;
    // Scratch vertex storage; ClipPrims uses it as the second buffer when
    // ping-ponging between clip planes.
    SIMDVERTEX_T<SIMD_T>* tmpVerts;
    // NOTE(review): not referenced by the methods visible here; presumably
    // staging storage for transposed input vertices — confirm.
    SIMDVERTEX_T<SIMD_T>* transposedVerts;
1293 };
1294
// pipeline stage functions
//
// Clip-stage entry points, one per primitive kind (rectangles, triangles,
// lines, points). Only the declarations live here; the definitions are not
// part of this header chunk.
// NOTE(review): presumably each one builds a Clipper for its topology and
// calls ExecuteStage — confirm against the implementation file.
//
// Shared parameters:
//   pDC         - draw context
//   pa          - primitive assembly state
//   workerId    - id of the calling worker
//   prims       - per-vertex SIMD positions of the primitive batch
//   primMask    - lane bitmask of primitives in the batch
//   primId      - per-lane primitive ids
//   viewportIdx - per-lane viewport indices
//   rtIdx       - per-lane index forwarded with the batch
//                 (presumably render-target array index — TODO confirm)
void ClipRectangles(DRAW_CONTEXT*      pDC,
                    PA_STATE&          pa,
                    uint32_t           workerId,
                    simdvector         prims[],
                    uint32_t           primMask,
                    simdscalari const& primId,
                    simdscalari const& viewportIdx,
                    simdscalari const& rtIdx);
void ClipTriangles(DRAW_CONTEXT*      pDC,
                   PA_STATE&          pa,
                   uint32_t           workerId,
                   simdvector         prims[],
                   uint32_t           primMask,
                   simdscalari const& primId,
                   simdscalari const& viewportIdx,
                   simdscalari const& rtIdx);
void ClipLines(DRAW_CONTEXT*      pDC,
               PA_STATE&          pa,
               uint32_t           workerId,
               simdvector         prims[],
               uint32_t           primMask,
               simdscalari const& primId,
               simdscalari const& viewportIdx,
               simdscalari const& rtIdx);
void ClipPoints(DRAW_CONTEXT*      pDC,
                PA_STATE&          pa,
                uint32_t           workerId,
                simdvector         prims[],
                uint32_t           primMask,
                simdscalari const& primId,
                simdscalari const& viewportIdx,
                simdscalari const& rtIdx);
#if USE_SIMD16_FRONTEND
// SIMD16 counterparts of the clip-stage entry points; only compiled when
// USE_SIMD16_FRONTEND is enabled. They take the same parameter list using the
// simd16 vector/scalar types and are declared with the SIMDCALL calling
// convention. Definitions are not part of this header chunk.
void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT*        pDC,
                                    PA_STATE&            pa,
                                    uint32_t             workerId,
                                    simd16vector         prims[],
                                    uint32_t             primMask,
                                    simd16scalari const& primId,
                                    simd16scalari const& viewportIdx,
                                    simd16scalari const& rtIdx);
void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT*        pDC,
                                   PA_STATE&            pa,
                                   uint32_t             workerId,
                                   simd16vector         prims[],
                                   uint32_t             primMask,
                                   simd16scalari const& primId,
                                   simd16scalari const& viewportIdx,
                                   simd16scalari const& rtIdx);
void SIMDCALL ClipLines_simd16(DRAW_CONTEXT*        pDC,
                               PA_STATE&            pa,
                               uint32_t             workerId,
                               simd16vector         prims[],
                               uint32_t             primMask,
                               simd16scalari const& primId,
                               simd16scalari const& viewportIdx,
                               simd16scalari const& rtIdx);
void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT*        pDC,
                                PA_STATE&            pa,
                                uint32_t             workerId,
                                simd16vector         prims[],
                                uint32_t             primMask,
                                simd16scalari const& primId,
                                simd16scalari const& viewportIdx,
                                simd16scalari const& rtIdx);
#endif