src/gallium/drivers/swr/rasterizer/core/clip.h

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file clip.h
  24 *
  25 * @brief Definitions for clipping
  26 *
  27 ******************************************************************************/
  28 #pragma once
  29
  30 #include "common/simdintrin.h"
  31 #include "core/context.h"
  32 #include "core/pa.h"
  33 #include "rdtsc_core.h"
  34
// Temp storage used by the clipper.
// One scratch array per SIMD width, declared with the THREAD storage qualifier and
// defined elsewhere. Each array holds 7 SIMD vertices, the same count the clipper's
// own vertex store reserves for a clipped triangle. The arrays are handed out through
// ClipHelper<SIMD_T>::GetTempVertices().
extern THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7];
#if USE_SIMD16_FRONTEND
extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7];
#endif
  40
  41 enum SWR_CLIPCODES
  42 {
  43     // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
  44     // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
  45 #define CLIPCODE_SHIFT 23
  46     FRUSTUM_LEFT    = (0x01 << CLIPCODE_SHIFT),
  47     FRUSTUM_TOP     = (0x02 << CLIPCODE_SHIFT),
  48     FRUSTUM_RIGHT   = (0x04 << CLIPCODE_SHIFT),
  49     FRUSTUM_BOTTOM  = (0x08 << CLIPCODE_SHIFT),
  50
  51     FRUSTUM_NEAR    = (0x10 << CLIPCODE_SHIFT),
  52     FRUSTUM_FAR     = (0x20 << CLIPCODE_SHIFT),
  53
  54     NEGW            = (0x40 << CLIPCODE_SHIFT),
  55
  56     GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
  57     GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
  58     GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
  59     GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
  60 };
  61
  62 #define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
  63
  64 template<typename SIMD_T>
  65 void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &vertex, typename SIMD_T::Float &clipCodes, typename SIMD_T::Integer const &viewportIndexes)
  66 {
  67     clipCodes = SIMD_T::setzero_ps();
  68
  69     // -w
  70     typename SIMD_T::Float vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
  71
  72     // FRUSTUM_LEFT
  73     typename SIMD_T::Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
  74     clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
  75
  76     // FRUSTUM_TOP
  77     vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
  78     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
  79
  80     // FRUSTUM_RIGHT
  81     vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
  82     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
  83
  84     // FRUSTUM_BOTTOM
  85     vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
  86     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
  87
  88     if (state.rastState.depthClipEnable)
  89     {
  90         // FRUSTUM_NEAR
  91         // DX clips depth [0..w], GL clips [-w..w]
  92         if (state.rastState.clipHalfZ)
  93         {
  94             vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
  95         }
  96         else
  97         {
  98             vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
  99         }
 100         clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
 101
 102         // FRUSTUM_FAR
 103         vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
 104         clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
 105     }
 106
 107     // NEGW
 108     vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
 109     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
 110
 111     // GUARDBAND_LEFT
 112     typename SIMD_T::Float gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.left[0], viewportIndexes));
 113     vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
 114     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
 115
 116     // GUARDBAND_TOP
 117     gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.top[0], viewportIndexes));
 118     vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
 119     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
 120
 121     // GUARDBAND_RIGHT
 122     gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.right[0], viewportIndexes));
 123     vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
 124     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
 125
 126     // GUARDBAND_BOTTOM
 127     gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.bottom[0], viewportIndexes));
 128     vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
 129     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
 130 }
 131
 132 template<typename SIMD_T>
 133 struct BinnerChooser
 134 {
 135 };
 136
 137 template<>
 138 struct BinnerChooser<SIMD256>
 139 {
 140     PFN_PROCESS_PRIMS pfnBinFunc;
 141
 142     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
 143         :pfnBinFunc(nullptr)
 144     {
 145         if (numVertsPerPrim == 3)
 146         {
 147             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
 148
 149         }
 150         else if (numVertsPerPrim == 2)
 151         {
 152             pfnBinFunc = BinLines;
 153         }
 154         else
 155         {
 156             SWR_ASSERT(0 && "Unexpected points in clipper.");
 157         }
 158     }
 159
 160     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
 161         :pfnBinFunc(nullptr)
 162     {
 163         switch (topology)
 164         {
 165         case TOP_POINT_LIST:
 166             pfnBinFunc = BinPoints;
 167             break;
 168         case TOP_LINE_LIST:
 169         case TOP_LINE_STRIP:
 170         case TOP_LINE_LOOP:
 171         case TOP_LINE_LIST_ADJ:
 172         case TOP_LISTSTRIP_ADJ:
 173             pfnBinFunc = BinLines;
 174             break;
 175         default:
 176             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
 177             break;
 178         };
 179     }
 180
 181     void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD256::Vec4 prims[], uint32_t primMask, SIMD256::Integer const &primID)
 182     {
 183         SWR_ASSERT(pfnBinFunc != nullptr);
 184
 185         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID);
 186     }
 187 };
 188
 189 #if USE_SIMD16_FRONTEND
 190 template<>
 191 struct BinnerChooser<SIMD512>
 192 {
 193     PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
 194
 195     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
 196         :pfnBinFunc(nullptr)
 197     {
 198         if (numVertsPerPrim == 3)
 199         {
 200             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
 201
 202         }
 203         else if (numVertsPerPrim == 2)
 204         {
 205             pfnBinFunc = BinLines_simd16;
 206         }
 207         else
 208         {
 209             SWR_ASSERT(0 && "Unexpected points in clipper.");
 210         }
 211     }
 212
 213     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
 214         :pfnBinFunc(nullptr)
 215     {
 216         switch (topology)
 217         {
 218         case TOP_POINT_LIST:
 219             pfnBinFunc = BinPoints_simd16;
 220             break;
 221         case TOP_LINE_LIST:
 222         case TOP_LINE_STRIP:
 223         case TOP_LINE_LOOP:
 224         case TOP_LINE_LIST_ADJ:
 225         case TOP_LISTSTRIP_ADJ:
 226             pfnBinFunc = BinLines_simd16;
 227             break;
 228         default:
 229             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
 230             break;
 231         };
 232     }
 233
 234     void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD512::Vec4 prims[], uint32_t primMask, SIMD512::Integer const &primID)
 235     {
 236         SWR_ASSERT(pfnBinFunc != nullptr);
 237
 238         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID);
 239     }
 240 };
 241
 242 #endif
 243 template<typename SIMD_T>
 244 struct SimdHelper
 245 {
 246 };
 247
 248 template<>
 249 struct SimdHelper<SIMD256>
 250 {
 251     static SIMD256::Float insert_lo_ps(SIMD256::Float a)
 252     {
 253         return a;
 254     }
 255
 256     static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
 257     {
 258         return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
 259     }
 260 };
 261
 262 #if USE_SIMD16_FRONTEND
 263 template<>
 264 struct SimdHelper<SIMD512>
 265 {
 266     static SIMD512::Float insert_lo_ps(SIMD256::Float a)
 267     {
 268         return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
 269     }
 270
 271     static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
 272     {
 273         return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
 274     }
 275 };
 276
 277 #endif
 278 // Temp storage used by the clipper
 279 template<typename SIMD_T>
 280 struct ClipHelper
 281 {
 282 };
 283
 284 template<>
 285 struct ClipHelper<SIMD256>
 286 {
 287     static SIMDVERTEX_T<SIMD256> *GetTempVertices()
 288     {
 289         return tlsTempVertices;
 290     }
 291 };
 292
 293 #if USE_SIMD16_FRONTEND
 294 template<>
 295 struct ClipHelper<SIMD512>
 296 {
 297     static SIMDVERTEX_T<SIMD512> *GetTempVertices()
 298     {
 299         return tlsTempVertices_simd16;
 300     }
 301 };
 302
 303 #endif
 304 template<typename SIMD_T, uint32_t NumVertsPerPrim>
 305 class Clipper
 306 {
 307 public:
 308     INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
 309         workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
 310     {
 311         static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
 312     }
 313
 314     void ComputeClipCodes(typename SIMD_T::Vec4 vertex[], const typename SIMD_T::Integer &viewportIndexes)
 315     {
 316         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 317         {
 318             ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
 319         }
 320     }
 321
 322     typename SIMD_T::Float ComputeClipCodeIntersection()
 323     {
 324         typename SIMD_T::Float result = clipCodes[0];
 325
 326         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
 327         {
 328             result = SIMD_T::and_ps(result, clipCodes[i]);
 329         }
 330
 331         return result;
 332     }
 333
 334     typename SIMD_T::Float ComputeClipCodeUnion()
 335     {
 336         typename SIMD_T::Float result = clipCodes[0];
 337
 338         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
 339         {
 340             result = SIMD_T::or_ps(result, clipCodes[i]);
 341         }
 342
 343         return result;
 344     }
 345
 346     int ComputeClipMask()
 347     {
 348         typename SIMD_T::Float clipUnion = ComputeClipCodeUnion();
 349
 350         clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
 351
 352         return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
 353     }
 354
 355     // clipper is responsible for culling any prims with NAN coordinates
 356     int ComputeNaNMask(typename SIMD_T::Vec4 prim[])
 357     {
 358         typename SIMD_T::Float vNanMask = SIMD_T::setzero_ps();
 359
 360         for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
 361         {
 362             typename SIMD_T::Float vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
 363             vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
 364
 365             typename SIMD_T::Float vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
 366             vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
 367         }
 368
 369         return SIMD_T::movemask_ps(vNanMask);
 370     }
 371
 372     int ComputeUserClipCullMask(PA_STATE &pa, typename SIMD_T::Vec4 prim[])
 373     {
 374         uint8_t cullMask = state.backendState.cullDistanceMask;
 375         uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
 376
 377         typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps();
 378
 379         typename SIMD_T::Vec4 vClipCullDistLo[3];
 380         typename SIMD_T::Vec4 vClipCullDistHi[3];
 381
 382         pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
 383         pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
 384
 385         DWORD index;
 386         while (_BitScanForward(&index, cullMask))
 387         {
 388             cullMask &= ~(1 << index);
 389             uint32_t slot = index >> 2;
 390             uint32_t component = index & 0x3;
 391
 392             typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f);
 393             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
 394             {
 395                 typename SIMD_T::Float vCullComp;
 396                 if (slot == 0)
 397                 {
 398                     vCullComp = vClipCullDistLo[e][component];
 399                 }
 400                 else
 401                 {
 402                     vCullComp = vClipCullDistHi[e][component];
 403                 }
 404
 405                 // cull if cull distance < 0 || NAN
 406                 typename SIMD_T::Float vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
 407                 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
 408             }
 409             vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
 410         }
 411
 412         // clipper should also discard any primitive with NAN clip distance
 413         uint8_t clipMask = state.backendState.clipDistanceMask;
 414         while (_BitScanForward(&index, clipMask))
 415         {
 416             clipMask &= ~(1 << index);
 417             uint32_t slot = index >> 2;
 418             uint32_t component = index & 0x3;
 419
 420             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
 421             {
 422                 typename SIMD_T::Float vClipComp;
 423                 if (slot == 0)
 424                 {
 425                     vClipComp = vClipCullDistLo[e][component];
 426                 }
 427                 else
 428                 {
 429                     vClipComp = vClipCullDistHi[e][component];
 430                 }
 431
 432                 typename SIMD_T::Float vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
 433                 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
 434             }
 435         }
 436
 437         return SIMD_T::movemask_ps(vClipCullMask);
 438     }
 439
 440     void ClipSimd(const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, PA_STATE &pa, const typename SIMD_T::Integer &vPrimId)
 441     {
 442         // input/output vertex store for clipper
 443         SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle
 444
 445         uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
 446         uint32_t provokingVertex = 0;
 447         if (pa.binTopology == TOP_TRIANGLE_FAN)
 448         {
 449             provokingVertex = state.frontendState.provokingVertex.triFan;
 450         }
 451         ///@todo: line topology for wireframe?
 452
 453         // assemble pos
 454         typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim];
 455         pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
 456         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 457         {
 458             vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
 459         }
 460
 461         // assemble attribs
 462         const SWR_BACKEND_STATE& backendState = state.backendState;
 463
 464         int32_t maxSlot = -1;
 465         for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
 466         {
 467             // Compute absolute attrib slot in vertex array
 468             uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
 469             maxSlot = std::max<int32_t>(maxSlot, mapSlot);
 470             uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
 471
 472             pa.Assemble(inputSlot, tmpVector);
 473
 474             // if constant interpolation enabled for this attribute, assign the provoking
 475             // vertex values to all edges
 476             if (CheckBit(constantInterpMask, slot))
 477             {
 478                 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 479                 {
 480                     vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
 481                 }
 482             }
 483             else
 484             {
 485                 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 486                 {
 487                     vertices[i].attrib[inputSlot] = tmpVector[i];
 488                 }
 489             }
 490         }
 491
 492         // assemble user clip distances if enabled
 493         uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
 494         if (state.backendState.clipDistanceMask & 0xf)
 495         {
 496             pa.Assemble(vertexClipCullSlot, tmpVector);
 497             for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 498             {
 499                 vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
 500             }
 501         }
 502
 503         if (state.backendState.clipDistanceMask & 0xf0)
 504         {
 505             pa.Assemble(vertexClipCullSlot + 1, tmpVector);
 506             for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 507             {
 508                 vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
 509             }
 510         }
 511
 512         uint32_t numAttribs = maxSlot + 1;
 513
 514         typename SIMD_T::Integer vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
 515
 516         BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast);
 517
 518         // set up new PA for binning clipped primitives
 519         PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
 520         if (NumVertsPerPrim == 3)
 521         {
 522             clipTopology = TOP_TRIANGLE_FAN;
 523
 524             // so that the binner knows to bloat wide points later
 525             if (pa.binTopology == TOP_POINT_LIST)
 526             {
 527                 clipTopology = TOP_POINT_LIST;
 528             }
 529         }
 530         else if (NumVertsPerPrim == 2)
 531         {
 532             clipTopology = TOP_LINE_LIST;
 533         }
 534         else
 535         {
 536             SWR_ASSERT(0 && "Unexpected points in clipper.");
 537         }
 538
 539         const uint32_t *pVertexCount = reinterpret_cast<const uint32_t *>(&vNumClippedVerts);
 540         const uint32_t *pPrimitiveId = reinterpret_cast<const uint32_t *>(&vPrimId);
 541
 542         const SIMD256::Integer vOffsets = SIMD256::set_epi32(
 543             0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
 544             6 * sizeof(SIMDVERTEX_T<SIMD_T>),
 545             5 * sizeof(SIMDVERTEX_T<SIMD_T>),
 546             4 * sizeof(SIMDVERTEX_T<SIMD_T>),
 547             3 * sizeof(SIMDVERTEX_T<SIMD_T>),
 548             2 * sizeof(SIMDVERTEX_T<SIMD_T>),
 549             1 * sizeof(SIMDVERTEX_T<SIMD_T>),
 550             0 * sizeof(SIMDVERTEX_T<SIMD_T>));
 551
 552         // only need to gather 7 verts
 553         // @todo dynamic mask based on actual # of verts generated per lane
 554         const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
 555
 556         uint32_t numClippedPrims = 0;
 557
 558         // tranpose clipper output so that each lane's vertices are in SIMD order
 559         // set aside space for 2 vertices, as the PA will try to read up to 16 verts
 560         // for triangle fan
 561
 562 #if defined(_DEBUG)
 563         // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
 564         SIMDVERTEX_T<SIMD_T> *transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T> *>(malloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2));
 565
 566 #else
 567         SIMDVERTEX_T<SIMD_T> transposedPrims[2];
 568
 569 #endif
 570         for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
 571         {
 572             uint32_t numEmittedVerts = pVertexCount[inputPrim];
 573             if (numEmittedVerts < NumVertsPerPrim)
 574             {
 575                 continue;
 576             }
 577             SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
 578
 579             uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
 580             SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
 581
 582             numClippedPrims += numEmittedPrims;
 583
 584             // tranpose clipper output so that each lane's vertices are in SIMD order
 585             // set aside space for 2 vertices, as the PA will try to read up to 16 verts
 586             // for triangle fan
 587
 588             // transpose pos
 589             uint8_t *pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
 590
 591 #if 0
 592             // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
 593             static const float *dummy = reinterpret_cast<const float *>(pBase);
 594
 595 #endif
 596             for (uint32_t c = 0; c < 4; ++c)
 597             {
 598                 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
 599                 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
 600                 pBase += sizeof(typename SIMD_T::Float);
 601             }
 602
 603             // transpose attribs
 604             pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
 605
 606             for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
 607             {
 608                 uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
 609
 610                 for (uint32_t c = 0; c < 4; ++c)
 611                 {
 612                     SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
 613                     transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
 614                     pBase += sizeof(typename SIMD_T::Float);
 615                 }
 616             }
 617
 618             // transpose user clip distances if enabled
 619             uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
 620             if (state.backendState.clipDistanceMask & 0x0f)
 621             {
 622                 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim;
 623
 624                 for (uint32_t c = 0; c < 4; ++c)
 625                 {
 626                     SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
 627                     transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
 628                     pBase += sizeof(typename SIMD_T::Float);
 629                 }
 630             }
 631
 632             if (state.backendState.clipDistanceMask & 0xf0)
 633             {
 634                 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim;
 635
 636                 for (uint32_t c = 0; c < 4; ++c)
 637                 {
 638                     SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
 639                     transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
 640                     pBase += sizeof(typename SIMD_T::Float);
 641                 }
 642             }
 643
 644             PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, clipTopology);
 645
 646             static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f };
 647
 648             const uint32_t primMask = primMaskMap[numEmittedPrims];
 649
 650             const typename SIMD_T::Integer primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
 651
 652             while (clipPA.GetNextStreamOutput())
 653             {
 654                 do
 655                 {
 656                     typename SIMD_T::Vec4 attrib[NumVertsPerPrim];
 657
 658                     bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
 659
 660                     if (assemble)
 661                     {
 662                         binner.pfnBinFunc(pDC, clipPA, workerId, attrib, primMask, primID);
 663                     }
 664
 665                 } while (clipPA.NextPrim());
 666             }
 667         }
 668
 669 #if defined(_DEBUG)
 670         free(transposedPrims);
 671
 672 #endif
 673         // update global pipeline stat
 674         UPDATE_STAT_FE(CPrimitives, numClippedPrims);
 675     }
 676
 677     void ExecuteStage(PA_STATE &pa, typename SIMD_T::Vec4 prim[], uint32_t primMask, typename SIMD_T::Integer const &primId)
 678     {
 679         SWR_ASSERT(pa.pDC != nullptr);
 680
 681         SWR_CONTEXT *pContext = pa.pDC->pContext;
 682
 683         BinnerChooser<SIMD_T> binner(pa.binTopology, pa.pDC->pState->state.rastState.conservativeRast);
 684
 685         // update clipper invocations pipeline stat
 686         uint32_t numInvoc = _mm_popcnt_u32(primMask);
 687         UPDATE_STAT_FE(CInvocations, numInvoc);
 688
 689         // Read back viewport index if required
 690         typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
 691
 692         if (state.backendState.readViewportArrayIndex)
 693         {
 694             typename SIMD_T::Vec4 vpiAttrib[NumVertsPerPrim];
 695             pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 696
 697             // OOB indices => forced to zero.
 698             typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
 699             vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
 700             typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
 701             typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
 702             viewportIdx = SIMD_T::and_si(vClearMask, vpai);
 703         }
 704
 705         ComputeClipCodes(prim, viewportIdx);
 706
 707         // cull prims with NAN coords
 708         primMask &= ~ComputeNaNMask(prim);
 709
 710         // user cull distance cull
 711         if (state.backendState.cullDistanceMask)
 712         {
 713             primMask &= ~ComputeUserClipCullMask(pa, prim);
 714         }
 715
 716         // cull prims outside view frustum
 717         typename SIMD_T::Float clipIntersection = ComputeClipCodeIntersection();
 718         int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
 719
 720         // skip clipping for points
 721         uint32_t clipMask = 0;
 722         if (NumVertsPerPrim != 1)
 723         {
 724             clipMask = primMask & ComputeClipMask();
 725         }
 726
 727         if (clipMask)
 728         {
 729             AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
 730             // we have to clip tris, execute the clipper, which will also
 731             // call the binner
 732             ClipSimd(SIMD_T::vmask_ps(primMask), SIMD_T::vmask_ps(clipMask), pa, primId);
 733             AR_END(FEGuardbandClip, 1);
 734         }
 735         else if (validMask)
 736         {
 737             // update CPrimitives pipeline state
 738             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
 739
 740             // forward valid prims directly to binner
 741             binner.pfnBinFunc(this->pDC, pa, this->workerId, prim, validMask, primId);
 742         }
 743     }
 744
 745 private:
 746     typename SIMD_T::Float ComputeInterpFactor(typename SIMD_T::Float const &boundaryCoord0, typename SIMD_T::Float const &boundaryCoord1)
 747     {
 748         return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
 749     }
 750
 751     typename SIMD_T::Integer ComputeOffsets(uint32_t attrib, typename SIMD_T::Integer const &vIndices, uint32_t component)
 752     {
 753         const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
 754         const uint32_t componentStride  = sizeof(typename SIMD_T::Float);
 755         const uint32_t attribStride     = sizeof(typename SIMD_T::Vec4);
 756
 757         static const OSALIGNSIMD16(uint32_t) elemOffset[16] =
 758         {
 759             0 * sizeof(float),
 760             1 * sizeof(float),
 761             2 * sizeof(float),
 762             3 * sizeof(float),
 763             4 * sizeof(float),
 764             5 * sizeof(float),
 765             6 * sizeof(float),
 766             7 * sizeof(float),
 767             8 * sizeof(float),
 768             9 * sizeof(float),
 769             10 * sizeof(float),
 770             11 * sizeof(float),
 771             12 * sizeof(float),
 772             13 * sizeof(float),
 773             14 * sizeof(float),
 774             15 * sizeof(float),
 775         };
 776
 777         static_assert(sizeof(typename SIMD_T::Integer) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
 778
 779         typename SIMD_T::Integer vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const typename SIMD_T::Integer *>(elemOffset));
 780
 781         // step to the simdvertex
 782         typename SIMD_T::Integer vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
 783
 784         // step to the attribute and component
 785         vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
 786
 787         // step to the lane
 788         vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
 789
 790         return vOffsets;
 791     }
 792
 793     typename SIMD_T::Float GatherComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component)
 794     {
 795         typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
 796         typename SIMD_T::Float vSrc = SIMD_T::setzero_ps();
 797
 798         return SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(vSrc, pBuffer, vOffsets, vMask);
 799     }
 800
 801     void ScatterComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component, typename SIMD_T::Float const &vSrc)
 802     {
 803         typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
 804
 805         const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets);
 806         const float *pSrc = reinterpret_cast<const float *>(&vSrc);
 807         uint32_t mask = SIMD_T::movemask_ps(vMask);
 808         DWORD lane;
 809         while (_BitScanForward(&lane, mask))
 810         {
 811             mask &= ~(1 << lane);
 812             const uint8_t *pBuf = reinterpret_cast<const uint8_t *>(pBuffer) + pOffsets[lane];
 813             *(float *)pBuf = pSrc[lane];
 814         }
 815     }
 816
 817     template<SWR_CLIPCODES ClippingPlane>
 818     void intersect(
 819         const typename SIMD_T::Float &vActiveMask,  // active lanes to operate on
 820         const typename SIMD_T::Integer &s,          // index to first edge vertex v0 in pInPts.
 821         const typename SIMD_T::Integer &p,          // index to second edge vertex v1 in pInPts.
 822         const typename SIMD_T::Vec4 &v1,            // vertex 0 position
 823         const typename SIMD_T::Vec4 &v2,            // vertex 1 position
 824         typename SIMD_T::Integer &outIndex,         // output index.
 825         const float *pInVerts,                      // array of all the input positions.
 826         uint32_t numInAttribs,                      // number of attributes per vertex.
 827         float *pOutVerts)                           // array of output positions. We'll write our new intersection point at i*4.
 828     {
 829         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
 830         uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
 831
 832         // compute interpolation factor
 833         typename SIMD_T::Float t;
 834         switch (ClippingPlane)
 835         {
 836         case FRUSTUM_LEFT:      t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break;
 837         case FRUSTUM_RIGHT:     t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); break;
 838         case FRUSTUM_TOP:       t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); break;
 839         case FRUSTUM_BOTTOM:    t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); break;
 840         case FRUSTUM_NEAR:
 841             // DX Znear plane is 0, GL is -w
 842             if (this->state.rastState.clipHalfZ)
 843             {
 844                 t = ComputeInterpFactor(v1[2], v2[2]);
 845             }
 846             else
 847             {
 848                 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
 849             }
 850             break;
 851         case FRUSTUM_FAR:       t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); break;
 852         default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
 853         };
 854
 855         // interpolate position and store
 856         for (uint32_t c = 0; c < 4; ++c)
 857         {
 858             typename SIMD_T::Float vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
 859             ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
 860         }
 861
 862         // interpolate attributes and store
 863         for (uint32_t a = 0; a < numInAttribs; ++a)
 864         {
 865             uint32_t attribSlot = vertexAttribOffset + a;
 866             for (uint32_t c = 0; c < 4; ++c)
 867             {
 868                 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 869                 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 870                 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 871                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
 872             }
 873         }
 874
 875         // interpolate clip distance if enabled
 876         if (this->state.backendState.clipDistanceMask & 0xf)
 877         {
 878             uint32_t attribSlot = vertexClipCullOffset;
 879             for (uint32_t c = 0; c < 4; ++c)
 880             {
 881                 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 882                 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 883                 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 884                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
 885             }
 886         }
 887
 888         if (this->state.backendState.clipDistanceMask & 0xf0)
 889         {
 890             uint32_t attribSlot = vertexClipCullOffset + 1;
 891             for (uint32_t c = 0; c < 4; ++c)
 892             {
 893                 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 894                 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 895                 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 896                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
 897             }
 898         }
 899     }
 900
 901     template<SWR_CLIPCODES ClippingPlane>
 902     typename SIMD_T::Float inside(const typename SIMD_T::Vec4 &v)
 903     {
 904         switch (ClippingPlane)
 905         {
 906         case FRUSTUM_LEFT:      return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
 907         case FRUSTUM_RIGHT:     return SIMD_T::cmple_ps(v[0], v[3]);
 908         case FRUSTUM_TOP:       return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
 909         case FRUSTUM_BOTTOM:    return SIMD_T::cmple_ps(v[1], v[3]);
 910         case FRUSTUM_NEAR:      return SIMD_T::cmpge_ps(v[2], this->state.rastState.clipHalfZ ? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
 911         case FRUSTUM_FAR:       return SIMD_T::cmple_ps(v[2], v[3]);
 912         default:
 913             SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
 914             return SIMD_T::setzero_ps();
 915         }
 916     }
 917
 918     template<SWR_CLIPCODES ClippingPlane>
 919     typename SIMD_T::Integer ClipTriToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
 920     {
 921         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
 922
 923         typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
 924         typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
 925         typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
 926
 927         while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
 928         {
 929             typename SIMD_T::Integer s = vCurIndex;
 930             typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
 931             typename SIMD_T::Integer underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
 932             p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
 933
 934             // gather position
 935             typename SIMD_T::Vec4 vInPos0, vInPos1;
 936             for (uint32_t c = 0; c < 4; ++c)
 937             {
 938                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
 939                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
 940             }
 941
 942             // compute inside mask
 943             typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
 944             typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
 945
 946             // compute intersection mask (s_in != p_in)
 947             typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
 948             intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
 949
 950             // store s if inside
 951             s_in = SIMD_T::and_ps(s_in, vActiveMask);
 952             if (!SIMD_T::testz_ps(s_in, s_in))
 953             {
 954                 // store position
 955                 for (uint32_t c = 0; c < 4; ++c)
 956                 {
 957                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
 958                 }
 959
 960                 // store attribs
 961                 for (uint32_t a = 0; a < numInAttribs; ++a)
 962                 {
 963                     uint32_t attribSlot = vertexAttribOffset + a;
 964                     for (uint32_t c = 0; c < 4; ++c)
 965                     {
 966                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
 967                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
 968                     }
 969                 }
 970
 971                 // store clip distance if enabled
 972                 uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
 973                 if (this->state.backendState.clipDistanceMask & 0xf)
 974                 {
 975                     uint32_t attribSlot = vertexClipCullSlot;
 976                     for (uint32_t c = 0; c < 4; ++c)
 977                     {
 978                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
 979                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
 980                     }
 981                 }
 982
 983                 if (this->state.backendState.clipDistanceMask & 0xf0)
 984                 {
 985                     uint32_t attribSlot = vertexClipCullSlot + 1;
 986                     for (uint32_t c = 0; c < 4; ++c)
 987                     {
 988                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
 989                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
 990                     }
 991                 }
 992
 993                 // increment outIndex
 994                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
 995             }
 996
 997             // compute and store intersection
 998             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
 999             {
1000                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
1001
1002                 // increment outIndex for active lanes
1003                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1004             }
1005
1006             // increment loop index and update active mask
1007             vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
1008             vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1009         }
1010
1011         return vOutIndex;
1012     }
1013
1014     template<SWR_CLIPCODES ClippingPlane>
1015     typename SIMD_T::Integer ClipLineToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
1016     {
1017         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1018
1019         typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
1020         typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
1021         typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1022
1023         if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
1024         {
1025             typename SIMD_T::Integer s = vCurIndex;
1026             typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1027
1028             // gather position
1029             typename SIMD_T::Vec4 vInPos0, vInPos1;
1030             for (uint32_t c = 0; c < 4; ++c)
1031             {
1032                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1033                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1034             }
1035
1036             // compute inside mask
1037             typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
1038             typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
1039
1040             // compute intersection mask (s_in != p_in)
1041             typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
1042             intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
1043
1044             // store s if inside
1045             s_in = SIMD_T::and_ps(s_in, vActiveMask);
1046             if (!SIMD_T::testz_ps(s_in, s_in))
1047             {
1048                 for (uint32_t c = 0; c < 4; ++c)
1049                 {
1050                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1051                 }
1052
1053                 // interpolate attributes and store
1054                 for (uint32_t a = 0; a < numInAttribs; ++a)
1055                 {
1056                     uint32_t attribSlot = vertexAttribOffset + a;
1057                     for (uint32_t c = 0; c < 4; ++c)
1058                     {
1059                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1060                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1061                     }
1062                 }
1063
1064                 // increment outIndex
1065                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1066             }
1067
1068             // compute and store intersection
1069             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1070             {
1071                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
1072
1073                 // increment outIndex for active lanes
1074                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1075             }
1076
1077             // store p if inside
1078             p_in = SIMD_T::and_ps(p_in, vActiveMask);
1079             if (!SIMD_T::testz_ps(p_in, p_in))
1080             {
1081                 for (uint32_t c = 0; c < 4; ++c)
1082                 {
1083                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
1084                 }
1085
1086                 // interpolate attributes and store
1087                 for (uint32_t a = 0; a < numInAttribs; ++a)
1088                 {
1089                     uint32_t attribSlot = vertexAttribOffset + a;
1090                     for (uint32_t c = 0; c < 4; ++c)
1091                     {
1092                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
1093                         ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
1094                     }
1095                 }
1096
1097                 // increment outIndex
1098                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
1099             }
1100         }
1101
1102         return vOutIndex;
1103     }
1104
1105     typename SIMD_T::Integer ClipPrims(float *pVertices, const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, int numAttribs)
1106     {
1107         // temp storage
1108         float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices());
1109
1110         // zero out num input verts for non-active lanes
1111         typename SIMD_T::Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
1112         vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
1113
1114         // clip prims to frustum
1115         typename SIMD_T::Integer vNumOutPts;
1116         if (NumVertsPerPrim == 3)
1117         {
1118             vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1119             vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1120             vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1121             vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1122             vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1123             vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1124         }
1125         else
1126         {
1127             SWR_ASSERT(NumVertsPerPrim == 2);
1128             vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1129             vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1130             vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1131             vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1132             vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1133             vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1134         }
1135
1136         // restore num verts for non-clipped, active lanes
1137         typename SIMD_T::Float vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
1138         vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
1139
1140         return vNumOutPts;
1141     }
1142
    // worker id, forwarded to the binner callback along with pDC
    const uint32_t workerId{ 0 };
    // draw context, forwarded to the binner callback
    DRAW_CONTEXT *pDC{ nullptr };
    // API state; the clipping code reads rastState.clipHalfZ and the
    // backendState attribute / clip-cull offsets and clipDistanceMask from it
    const API_STATE &state;
    // one SIMD register per primitive vertex
    // NOTE(review): presumably the per-lane SWR_CLIPCODES bits of each vertex; it is filled in outside this part of the file — confirm
    typename SIMD_T::Float clipCodes[NumVertsPerPrim];
1147 };
1148
1149
// pipeline stage functions
// One clip entry point per primitive topology (triangles, lines, points), all
// sharing the same parameter list. The definitions live outside this header.
// NOTE(review): primMask looks like a bitmask of valid primitives in the SIMD
// batch and primId the per-lane primitive ids — confirm against the definitions.
void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId);
void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId);
void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId);
#if USE_SIMD16_FRONTEND
// simd16 counterparts, only declared when the SIMD16 front end is enabled
void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId);
void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId);
void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId);
#endif
1159