src/gallium/drivers/swr/rasterizer/core/clip.h

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file clip.h
  24 *
  25 * @brief Definitions for clipping
  26 *
  27 ******************************************************************************/
  28 #pragma once
  29
  30 #include "common/simdintrin.h"
  31 #include "core/context.h"
  32 #include "core/pa.h"
  33 #include "rdtsc_core.h"
  34
// Temp storage used by the clipper: one array of 7 SIMD vertices per frontend SIMD width.
// The size matches the clipper's own vertex store in Clipper::ClipSimd
// ("maximum 7 verts generated per triangle").
// NOTE(review): presumably per-thread storage via the THREAD macro -- confirm against its definition.
extern THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7];
#if USE_SIMD16_FRONTEND
// SIMD16 (SIMD512) counterpart, only declared when the SIMD16 frontend is compiled in.
extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7];
#endif
  40
  41 enum SWR_CLIPCODES
  42 {
  43     // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
  44     // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
  45 #define CLIPCODE_SHIFT 23
  46     FRUSTUM_LEFT    = (0x01 << CLIPCODE_SHIFT),
  47     FRUSTUM_TOP     = (0x02 << CLIPCODE_SHIFT),
  48     FRUSTUM_RIGHT   = (0x04 << CLIPCODE_SHIFT),
  49     FRUSTUM_BOTTOM  = (0x08 << CLIPCODE_SHIFT),
  50
  51     FRUSTUM_NEAR    = (0x10 << CLIPCODE_SHIFT),
  52     FRUSTUM_FAR     = (0x20 << CLIPCODE_SHIFT),
  53
  54     NEGW            = (0x40 << CLIPCODE_SHIFT),
  55
  56     GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
  57     GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
  58     GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
  59     GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
  60 };
  61
  62 #define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
  63
  64 template<typename SIMD_T>
  65 void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &vertex, typename SIMD_T::Float &clipCodes, typename SIMD_T::Integer const &viewportIndexes)
  66 {
  67     clipCodes = SIMD_T::setzero_ps();
  68
  69     // -w
  70     typename SIMD_T::Float vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
  71
  72     // FRUSTUM_LEFT
  73     typename SIMD_T::Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
  74     clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
  75
  76     // FRUSTUM_TOP
  77     vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
  78     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
  79
  80     // FRUSTUM_RIGHT
  81     vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
  82     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
  83
  84     // FRUSTUM_BOTTOM
  85     vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
  86     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
  87
  88     if (state.rastState.depthClipEnable)
  89     {
  90         // FRUSTUM_NEAR
  91         // DX clips depth [0..w], GL clips [-w..w]
  92         if (state.rastState.clipHalfZ)
  93         {
  94             vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
  95         }
  96         else
  97         {
  98             vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
  99         }
 100         clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
 101
 102         // FRUSTUM_FAR
 103         vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
 104         clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
 105     }
 106
 107     // NEGW
 108     vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
 109     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
 110
 111     // GUARDBAND_LEFT
 112     typename SIMD_T::Float gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.left[0], viewportIndexes));
 113     vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
 114     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
 115
 116     // GUARDBAND_TOP
 117     gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.top[0], viewportIndexes));
 118     vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
 119     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
 120
 121     // GUARDBAND_RIGHT
 122     gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.right[0], viewportIndexes));
 123     vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
 124     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
 125
 126     // GUARDBAND_BOTTOM
 127     gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.bottom[0], viewportIndexes));
 128     vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
 129     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
 130 }
 131
 132 template<typename SIMD_T>
 133 struct BinnerChooser
 134 {
 135 };
 136
 137 template<>
 138 struct BinnerChooser<SIMD256>
 139 {
 140     PFN_PROCESS_PRIMS pfnBinFunc;
 141
 142     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
 143         :pfnBinFunc(nullptr)
 144     {
 145         if (numVertsPerPrim == 3)
 146         {
 147             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
 148
 149         }
 150         else if (numVertsPerPrim == 2)
 151         {
 152             pfnBinFunc = BinLines;
 153         }
 154         else
 155         {
 156             SWR_ASSERT(0 && "Unexpected points in clipper.");
 157         }
 158     }
 159
 160     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
 161         :pfnBinFunc(nullptr)
 162     {
 163         switch (topology)
 164         {
 165         case TOP_POINT_LIST:
 166             pfnBinFunc = BinPoints;
 167             break;
 168         case TOP_LINE_LIST:
 169         case TOP_LINE_STRIP:
 170         case TOP_LINE_LOOP:
 171         case TOP_LINE_LIST_ADJ:
 172         case TOP_LISTSTRIP_ADJ:
 173             pfnBinFunc = BinLines;
 174             break;
 175         default:
 176             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
 177             break;
 178         };
 179     }
 180
 181     void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD256::Vec4 prims[], uint32_t primMask, SIMD256::Integer const &primID, SIMD256::Integer &viewportIdx, SIMD256::Integer &rtIdx)
 182     {
 183         SWR_ASSERT(pfnBinFunc != nullptr);
 184
 185         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
 186     }
 187 };
 188
 189 #if USE_SIMD16_FRONTEND
 190 template<>
 191 struct BinnerChooser<SIMD512>
 192 {
 193     PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
 194
 195     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
 196         :pfnBinFunc(nullptr)
 197     {
 198         if (numVertsPerPrim == 3)
 199         {
 200             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
 201
 202         }
 203         else if (numVertsPerPrim == 2)
 204         {
 205             pfnBinFunc = BinLines_simd16;
 206         }
 207         else
 208         {
 209             SWR_ASSERT(0 && "Unexpected points in clipper.");
 210         }
 211     }
 212
 213     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
 214         :pfnBinFunc(nullptr)
 215     {
 216         switch (topology)
 217         {
 218         case TOP_POINT_LIST:
 219             pfnBinFunc = BinPoints_simd16;
 220             break;
 221         case TOP_LINE_LIST:
 222         case TOP_LINE_STRIP:
 223         case TOP_LINE_LOOP:
 224         case TOP_LINE_LIST_ADJ:
 225         case TOP_LISTSTRIP_ADJ:
 226             pfnBinFunc = BinLines_simd16;
 227             break;
 228         default:
 229             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
 230             break;
 231         };
 232     }
 233
 234     void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD512::Vec4 prims[], uint32_t primMask, SIMD512::Integer const &primID, SIMD512::Integer &viewportIdx, SIMD512::Integer &rtIdx)
 235     {
 236         SWR_ASSERT(pfnBinFunc != nullptr);
 237
 238         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
 239     }
 240 };
 241
 242 #endif
 243 template<typename SIMD_T>
 244 struct SimdHelper
 245 {
 246 };
 247
 248 template<>
 249 struct SimdHelper<SIMD256>
 250 {
 251     static SIMD256::Float insert_lo_ps(SIMD256::Float a)
 252     {
 253         return a;
 254     }
 255
 256     static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
 257     {
 258         return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
 259     }
 260 };
 261
 262 #if USE_SIMD16_FRONTEND
 263 template<>
 264 struct SimdHelper<SIMD512>
 265 {
 266     static SIMD512::Float insert_lo_ps(SIMD256::Float a)
 267     {
 268         return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
 269     }
 270
 271     static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
 272     {
 273         return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
 274     }
 275 };
 276
 277 #endif
 278 // Temp storage used by the clipper
 279 template<typename SIMD_T>
 280 struct ClipHelper
 281 {
 282 };
 283
 284 template<>
 285 struct ClipHelper<SIMD256>
 286 {
 287     static SIMDVERTEX_T<SIMD256> *GetTempVertices()
 288     {
 289         return tlsTempVertices;
 290     }
 291 };
 292
 293 #if USE_SIMD16_FRONTEND
 294 template<>
 295 struct ClipHelper<SIMD512>
 296 {
 297     static SIMDVERTEX_T<SIMD512> *GetTempVertices()
 298     {
 299         return tlsTempVertices_simd16;
 300     }
 301 };
 302
 303 #endif
 304 template<typename SIMD_T, uint32_t NumVertsPerPrim>
 305 class Clipper
 306 {
 307 public:
 308     INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
 309         workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
 310     {
 311         static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
 312     }
 313
 314     void ComputeClipCodes(typename SIMD_T::Vec4 vertex[], const typename SIMD_T::Integer &viewportIndexes)
 315     {
 316         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 317         {
 318             ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
 319         }
 320     }
 321
 322     typename SIMD_T::Float ComputeClipCodeIntersection()
 323     {
 324         typename SIMD_T::Float result = clipCodes[0];
 325
 326         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
 327         {
 328             result = SIMD_T::and_ps(result, clipCodes[i]);
 329         }
 330
 331         return result;
 332     }
 333
 334     typename SIMD_T::Float ComputeClipCodeUnion()
 335     {
 336         typename SIMD_T::Float result = clipCodes[0];
 337
 338         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
 339         {
 340             result = SIMD_T::or_ps(result, clipCodes[i]);
 341         }
 342
 343         return result;
 344     }
 345
 346     int ComputeClipMask()
 347     {
 348         typename SIMD_T::Float clipUnion = ComputeClipCodeUnion();
 349
 350         clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
 351
 352         return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
 353     }
 354
 355     // clipper is responsible for culling any prims with NAN coordinates
 356     int ComputeNaNMask(typename SIMD_T::Vec4 prim[])
 357     {
 358         typename SIMD_T::Float vNanMask = SIMD_T::setzero_ps();
 359
 360         for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
 361         {
 362             typename SIMD_T::Float vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
 363             vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
 364
 365             typename SIMD_T::Float vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
 366             vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
 367         }
 368
 369         return SIMD_T::movemask_ps(vNanMask);
 370     }
 371
 372     int ComputeUserClipCullMask(PA_STATE &pa, typename SIMD_T::Vec4 prim[])
 373     {
 374         uint8_t cullMask = state.backendState.cullDistanceMask;
 375         uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
 376
 377         typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps();
 378
 379         typename SIMD_T::Vec4 vClipCullDistLo[3];
 380         typename SIMD_T::Vec4 vClipCullDistHi[3];
 381
 382         pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
 383         pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
 384
 385         DWORD index;
 386         while (_BitScanForward(&index, cullMask))
 387         {
 388             cullMask &= ~(1 << index);
 389             uint32_t slot = index >> 2;
 390             uint32_t component = index & 0x3;
 391
 392             typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f);
 393             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
 394             {
 395                 typename SIMD_T::Float vCullComp;
 396                 if (slot == 0)
 397                 {
 398                     vCullComp = vClipCullDistLo[e][component];
 399                 }
 400                 else
 401                 {
 402                     vCullComp = vClipCullDistHi[e][component];
 403                 }
 404
 405                 // cull if cull distance < 0 || NAN
 406                 typename SIMD_T::Float vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
 407                 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
 408             }
 409             vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
 410         }
 411
 412         // clipper should also discard any primitive with NAN clip distance
 413         uint8_t clipMask = state.backendState.clipDistanceMask;
 414         while (_BitScanForward(&index, clipMask))
 415         {
 416             clipMask &= ~(1 << index);
 417             uint32_t slot = index >> 2;
 418             uint32_t component = index & 0x3;
 419
 420             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
 421             {
 422                 typename SIMD_T::Float vClipComp;
 423                 if (slot == 0)
 424                 {
 425                     vClipComp = vClipCullDistLo[e][component];
 426                 }
 427                 else
 428                 {
 429                     vClipComp = vClipCullDistHi[e][component];
 430                 }
 431
 432                 typename SIMD_T::Float vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
 433                 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
 434             }
 435         }
 436
 437         return SIMD_T::movemask_ps(vClipCullMask);
 438     }
 439
    /// Clips one SIMD batch of primitives and hands the resulting primitives to the binner.
    ///
    /// Steps, in order:
    ///  1. copy positions, the enabled attributes and the enabled user clip distances of every
    ///     input vertex into a local vertex store;
    ///  2. call ClipPrims on that store, which returns the per-lane clipped vertex count;
    ///  3. for each input primitive that still has at least NumVertsPerPrim vertices, gather
    ///     its clipped vertices out of the store into SIMD order, wrap them in a temporary
    ///     PA_STATE_OPT and bin every primitive assembled from it;
    ///  4. add the number of emitted primitives to the CPrimitives stat.
    ///
    /// @param prim          Positions of the NumVertsPerPrim vertices of each primitive.
    /// @param vPrimMask     Forwarded to ClipPrims.
    /// @param vClipMask     Forwarded to ClipPrims.
    /// @param pa            Primitive assembler the attributes are assembled from.
    /// @param vPrimId       Per-lane primitive id; the lane's value is broadcast to the binner.
    /// @param vViewportIdx  Per-lane viewport index; broadcast likewise.
    /// @param vRtIdx        Per-lane render target index; broadcast likewise.
    void ClipSimd(const typename SIMD_T::Vec4 prim[], const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, PA_STATE &pa,
                  const typename SIMD_T::Integer &vPrimId, const typename SIMD_T::Integer &vViewportIdx, const typename SIMD_T::Integer &vRtIdx)
    {
        // input/output vertex store for clipper
        SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle

        uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
        // vertex whose values are replicated for constant-interpolated attributes;
        // index 0 unless the topology is a triangle fan
        uint32_t provokingVertex = 0;
        if (pa.binTopology == TOP_TRIANGLE_FAN)
        {
            provokingVertex = state.frontendState.provokingVertex.triFan;
        }
        ///@todo: line topology for wireframe?

        // assemble pos
        typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim];
        for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
        {
            vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
        }

        // assemble attribs
        const SWR_BACKEND_STATE& backendState = state.backendState;

        // highest source attribute slot referenced; -1 when there are no attributes
        int32_t maxSlot = -1;
        for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
        {
            // Compute absolute attrib slot in vertex array
            uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
            maxSlot = std::max<int32_t>(maxSlot, mapSlot);
            uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;

            pa.Assemble(inputSlot, tmpVector);

            // if constant interpolation enabled for this attribute, assign the provoking
            // vertex values to all edges
            if (CheckBit(constantInterpMask, slot))
            {
                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
                {
                    vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
                }
            }
            else
            {
                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
                {
                    vertices[i].attrib[inputSlot] = tmpVector[i];
                }
            }
        }

        // assemble user clip distances if enabled
        // (low nibble of clipDistanceMask -> first clip/cull slot, high nibble -> the next slot)
        uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
        if (state.backendState.clipDistanceMask & 0xf)
        {
            pa.Assemble(vertexClipCullSlot, tmpVector);
            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
            {
                vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
            }
        }

        if (state.backendState.clipDistanceMask & 0xf0)
        {
            pa.Assemble(vertexClipCullSlot + 1, tmpVector);
            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
            {
                vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
            }
        }

        // number of attribute slots, starting at vertexAttribOffset, handed to the clipper
        // and transposed below
        uint32_t numAttribs = maxSlot + 1;

        // per-lane count of vertices left in 'vertices' after clipping
        typename SIMD_T::Integer vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);

        BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast);

        // set up new PA for binning clipped primitives
        PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
        if (NumVertsPerPrim == 3)
        {
            clipTopology = TOP_TRIANGLE_FAN;

            // so that the binner knows to bloat wide points later
            if (pa.binTopology == TOP_POINT_LIST)
            {
                clipTopology = TOP_POINT_LIST;
            }
        }
        else if (NumVertsPerPrim == 2)
        {
            clipTopology = TOP_LINE_LIST;
        }
        else
        {
            SWR_ASSERT(0 && "Unexpected points in clipper.");
        }

        // scalar views of the per-lane vectors, indexed by input primitive (lane) below
        const uint32_t *pVertexCount = reinterpret_cast<const uint32_t *>(&vNumClippedVerts);
        const uint32_t *pPrimitiveId = reinterpret_cast<const uint32_t *>(&vPrimId);
        const uint32_t *pViewportIdx = reinterpret_cast<const uint32_t *>(&vViewportIdx);
        const uint32_t *pRtIdx = reinterpret_cast<const uint32_t *>(&vRtIdx);

        // byte offsets of the 7 entries of 'vertices'; gather lane k reads from vertices[k]
        const SIMD256::Integer vOffsets = SIMD256::set_epi32(
            0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
            6 * sizeof(SIMDVERTEX_T<SIMD_T>),
            5 * sizeof(SIMDVERTEX_T<SIMD_T>),
            4 * sizeof(SIMDVERTEX_T<SIMD_T>),
            3 * sizeof(SIMDVERTEX_T<SIMD_T>),
            2 * sizeof(SIMDVERTEX_T<SIMD_T>),
            1 * sizeof(SIMDVERTEX_T<SIMD_T>),
            0 * sizeof(SIMDVERTEX_T<SIMD_T>));

        // only need to gather 7 verts
        // @todo dynamic mask based on actual # of verts generated per lane
        const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);

        uint32_t numClippedPrims = 0;

        // transpose clipper output so that each lane's vertices are in SIMD order
        // set aside space for 2 vertices, as the PA will try to read up to 16 verts
        // for triangle fan

#if defined(_DEBUG)
        // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
        // (heap allocation here is released by the matching AlignedFree at the end of the function)
        SIMDVERTEX_T<SIMD_T> *transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T> *>(AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64));

#else
        SIMDVERTEX_T<SIMD_T> transposedPrims[2];

#endif
        uint32_t numInputPrims = pa.NumPrims();
        for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            // too few vertices left to form a single primitive: nothing to bin for this lane
            if (numEmittedVerts < NumVertsPerPrim)
            {
                continue;
            }
            SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");

            uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
            SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");

            numClippedPrims += numEmittedPrims;

            // transpose clipper output so that each lane's vertices are in SIMD order
            // set aside space for 2 vertices, as the PA will try to read up to 16 verts
            // for triangle fan

            // transpose pos
            // pBase addresses lane 'inputPrim' of the first component in vertices[0];
            // vOffsets then steps the gather across vertices[0..6]
            uint8_t *pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;

#if 0
            // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
            static const float *dummy = reinterpret_cast<const float *>(pBase);

#endif
            for (uint32_t c = 0; c < 4; ++c)
            {
                SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
                transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
                // advance to the next component
                pBase += sizeof(typename SIMD_T::Float);
            }

            // transpose attribs
            pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;

            for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
            {
                uint32_t attribSlot = backendState.vertexAttribOffset + attrib;

                // pBase keeps advancing across attributes: 4 components per attribute slot
                for (uint32_t c = 0; c < 4; ++c)
                {
                    SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
                    transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
                    pBase += sizeof(typename SIMD_T::Float);
                }
            }

            // transpose user clip distances if enabled
            // NOTE(review): this declaration shadows the function-scope vertexClipCullSlot; both
            // are read from state.backendState.vertexClipCullOffset
            uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
            if (state.backendState.clipDistanceMask & 0x0f)
            {
                pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim;

                for (uint32_t c = 0; c < 4; ++c)
                {
                    SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
                    transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
                    pBase += sizeof(typename SIMD_T::Float);
                }
            }

            if (state.backendState.clipDistanceMask & 0xf0)
            {
                pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim;

                for (uint32_t c = 0; c < 4; ++c)
                {
                    SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
                    transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
                    pBase += sizeof(typename SIMD_T::Float);
                }
            }

            // temporary PA over the transposed vertices of this one input primitive
            PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, NumVertsPerPrim, clipTopology);
            clipPA.viewportArrayActive = pa.viewportArrayActive;
            clipPA.rtArrayActive = pa.rtArrayActive;

            // primMaskMap[n] has the low n bits set: one bit per emitted primitive
            static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f };

            const uint32_t primMask = primMaskMap[numEmittedPrims];

            // every primitive emitted from this input primitive shares its ids
            const typename SIMD_T::Integer primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
            const typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
            const typename SIMD_T::Integer rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);


            while (clipPA.GetNextStreamOutput())
            {
                do
                {
                    typename SIMD_T::Vec4 attrib[NumVertsPerPrim];

                    bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);

                    if (assemble)
                    {
                        binner.pfnBinFunc(pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
                    }

                } while (clipPA.NextPrim());
            }
        }

#if defined(_DEBUG)
        AlignedFree(transposedPrims);

#endif
        // update global pipeline stat
        UPDATE_STAT_FE(CPrimitives, numClippedPrims);
    }
 684
 685     void ExecuteStage(PA_STATE &pa, typename SIMD_T::Vec4 prim[], uint32_t primMask,
 686                       typename SIMD_T::Integer const &primId, typename SIMD_T::Integer const &viewportIdx, typename SIMD_T::Integer const &rtIdx)
 687     {
 688         SWR_ASSERT(pa.pDC != nullptr);
 689
 690         BinnerChooser<SIMD_T> binner(pa.binTopology, pa.pDC->pState->state.rastState.conservativeRast);
 691
 692         // update clipper invocations pipeline stat
 693         uint32_t numInvoc = _mm_popcnt_u32(primMask);
 694         UPDATE_STAT_FE(CInvocations, numInvoc);
 695
 696         ComputeClipCodes(prim, viewportIdx);
 697
 698         // cull prims with NAN coords
 699         primMask &= ~ComputeNaNMask(prim);
 700
 701         // user cull distance cull
 702         if (state.backendState.cullDistanceMask)
 703         {
 704             primMask &= ~ComputeUserClipCullMask(pa, prim);
 705         }
 706
 707         // cull prims outside view frustum
 708         typename SIMD_T::Float clipIntersection = ComputeClipCodeIntersection();
 709         int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
 710
 711         // skip clipping for points
 712         uint32_t clipMask = 0;
 713         if (NumVertsPerPrim != 1)
 714         {
 715             clipMask = primMask & ComputeClipMask();
 716         }
 717
 718         if (clipMask)
 719         {
 720             RDTSC_BEGIN(FEGuardbandClip, pa.pDC->drawId);
 721             // we have to clip tris, execute the clipper, which will also
 722             // call the binner
 723             ClipSimd(prim, SIMD_T::vmask_ps(primMask), SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx, rtIdx);
 724             RDTSC_END(FEGuardbandClip, 1);
 725         }
 726         else if (validMask)
 727         {
 728             // update CPrimitives pipeline state
 729             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
 730
 731             // forward valid prims directly to binner
 732             binner.pfnBinFunc(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
 733         }
 734     }
 735
 736 private:
 737     typename SIMD_T::Float ComputeInterpFactor(typename SIMD_T::Float const &boundaryCoord0, typename SIMD_T::Float const &boundaryCoord1)
 738     {
 739         return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
 740     }
 741
 742     typename SIMD_T::Integer ComputeOffsets(uint32_t attrib, typename SIMD_T::Integer const &vIndices, uint32_t component)
 743     {
 744         const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
 745         const uint32_t componentStride  = sizeof(typename SIMD_T::Float);
 746         const uint32_t attribStride     = sizeof(typename SIMD_T::Vec4);
 747
 748         static const OSALIGNSIMD16(uint32_t) elemOffset[16] =
 749         {
 750             0 * sizeof(float),
 751             1 * sizeof(float),
 752             2 * sizeof(float),
 753             3 * sizeof(float),
 754             4 * sizeof(float),
 755             5 * sizeof(float),
 756             6 * sizeof(float),
 757             7 * sizeof(float),
 758             8 * sizeof(float),
 759             9 * sizeof(float),
 760             10 * sizeof(float),
 761             11 * sizeof(float),
 762             12 * sizeof(float),
 763             13 * sizeof(float),
 764             14 * sizeof(float),
 765             15 * sizeof(float),
 766         };
 767
 768         static_assert(sizeof(typename SIMD_T::Integer) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
 769
 770         typename SIMD_T::Integer vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const typename SIMD_T::Integer *>(elemOffset));
 771
 772         // step to the simdvertex
 773         typename SIMD_T::Integer vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
 774
 775         // step to the attribute and component
 776         vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
 777
 778         // step to the lane
 779         vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
 780
 781         return vOffsets;
 782     }
 783
 784     typename SIMD_T::Float GatherComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component)
 785     {
 786         typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
 787         typename SIMD_T::Float vSrc = SIMD_T::setzero_ps();
 788
 789         return SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(vSrc, pBuffer, vOffsets, vMask);
 790     }
 791
 792     void ScatterComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component, typename SIMD_T::Float const &vSrc)
 793     {
 794         typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
 795
 796         const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets);
 797         const float *pSrc = reinterpret_cast<const float *>(&vSrc);
 798         uint32_t mask = SIMD_T::movemask_ps(vMask);
 799         DWORD lane;
 800         while (_BitScanForward(&lane, mask))
 801         {
 802             mask &= ~(1 << lane);
 803             const uint8_t *pBuf = reinterpret_cast<const uint8_t *>(pBuffer) + pOffsets[lane];
 804             *(float *)pBuf = pSrc[lane];
 805         }
 806     }
 807
 808     template<SWR_CLIPCODES ClippingPlane>
 809     void intersect(
 810         const typename SIMD_T::Float &vActiveMask,  // active lanes to operate on
 811         const typename SIMD_T::Integer &s,          // index to first edge vertex v0 in pInPts.
 812         const typename SIMD_T::Integer &p,          // index to second edge vertex v1 in pInPts.
 813         const typename SIMD_T::Vec4 &v1,            // vertex 0 position
 814         const typename SIMD_T::Vec4 &v2,            // vertex 1 position
 815         typename SIMD_T::Integer &outIndex,         // output index.
 816         const float *pInVerts,                      // array of all the input positions.
 817         uint32_t numInAttribs,                      // number of attributes per vertex.
 818         float *pOutVerts)                           // array of output positions. We'll write our new intersection point at i*4.
 819     {
 820         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
 821         uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
 822
 823         // compute interpolation factor
 824         typename SIMD_T::Float t;
 825         switch (ClippingPlane)
 826         {
 827         case FRUSTUM_LEFT:      t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break;
 828         case FRUSTUM_RIGHT:     t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); break;
 829         case FRUSTUM_TOP:       t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); break;
 830         case FRUSTUM_BOTTOM:    t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); break;
 831         case FRUSTUM_NEAR:
 832             // DX Znear plane is 0, GL is -w
 833             if (this->state.rastState.clipHalfZ)
 834             {
 835                 t = ComputeInterpFactor(v1[2], v2[2]);
 836             }
 837             else
 838             {
 839                 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
 840             }
 841             break;
 842         case FRUSTUM_FAR:       t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); break;
 843         default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
 844         };
 845
 846         // interpolate position and store
 847         for (uint32_t c = 0; c < 4; ++c)
 848         {
 849             typename SIMD_T::Float vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
 850             ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
 851         }
 852
 853         // interpolate attributes and store
 854         for (uint32_t a = 0; a < numInAttribs; ++a)
 855         {
 856             uint32_t attribSlot = vertexAttribOffset + a;
 857             for (uint32_t c = 0; c < 4; ++c)
 858             {
 859                 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 860                 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 861                 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 862                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
 863             }
 864         }
 865
 866         // interpolate clip distance if enabled
 867         if (this->state.backendState.clipDistanceMask & 0xf)
 868         {
 869             uint32_t attribSlot = vertexClipCullOffset;
 870             for (uint32_t c = 0; c < 4; ++c)
 871             {
 872                 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 873                 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 874                 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 875                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
 876             }
 877         }
 878
 879         if (this->state.backendState.clipDistanceMask & 0xf0)
 880         {
 881             uint32_t attribSlot = vertexClipCullOffset + 1;
 882             for (uint32_t c = 0; c < 4; ++c)
 883             {
 884                 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 885                 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 886                 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 887                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
 888             }
 889         }
 890     }
 891
 892     template<SWR_CLIPCODES ClippingPlane>
 893     typename SIMD_T::Float inside(const typename SIMD_T::Vec4 &v)
 894     {
 895         switch (ClippingPlane)
 896         {
 897         case FRUSTUM_LEFT:      return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
 898         case FRUSTUM_RIGHT:     return SIMD_T::cmple_ps(v[0], v[3]);
 899         case FRUSTUM_TOP:       return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
 900         case FRUSTUM_BOTTOM:    return SIMD_T::cmple_ps(v[1], v[3]);
 901         case FRUSTUM_NEAR:      return SIMD_T::cmpge_ps(v[2], this->state.rastState.clipHalfZ ? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
 902         case FRUSTUM_FAR:       return SIMD_T::cmple_ps(v[2], v[3]);
 903         default:
 904             SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
 905             return SIMD_T::setzero_ps();
 906         }
 907     }
 908
 909     template<SWR_CLIPCODES ClippingPlane>
 910     typename SIMD_T::Integer ClipTriToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
 911     {
 912         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
 913
 914         typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
 915         typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
 916         typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
 917
 918         while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
 919         {
 920             typename SIMD_T::Integer s = vCurIndex;
 921             typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
 922             typename SIMD_T::Integer underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
 923             p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
 924
 925             // gather position
 926             typename SIMD_T::Vec4 vInPos0, vInPos1;
 927             for (uint32_t c = 0; c < 4; ++c)
 928             {
 929                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
 930                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
 931             }
 932
 933             // compute inside mask
 934             typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
 935             typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
 936
 937             // compute intersection mask (s_in != p_in)
 938             typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
 939             intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
 940
 941             // store s if inside
 942             s_in = SIMD_T::and_ps(s_in, vActiveMask);
 943             if (!SIMD_T::testz_ps(s_in, s_in))
 944             {
 945                 // store position
 946                 for (uint32_t c = 0; c < 4; ++c)
 947                 {
 948                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
 949                 }
 950
 951                 // store attribs
 952                 for (uint32_t a = 0; a < numInAttribs; ++a)
 953                 {
 954                     uint32_t attribSlot = vertexAttribOffset + a;
 955                     for (uint32_t c = 0; c < 4; ++c)
 956                     {
 957                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
 958                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
 959                     }
 960                 }
 961
 962                 // store clip distance if enabled
 963                 uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
 964                 if (this->state.backendState.clipDistanceMask & 0xf)
 965                 {
 966                     uint32_t attribSlot = vertexClipCullSlot;
 967                     for (uint32_t c = 0; c < 4; ++c)
 968                     {
 969                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
 970                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
 971                     }
 972                 }
 973
 974                 if (this->state.backendState.clipDistanceMask & 0xf0)
 975                 {
 976                     uint32_t attribSlot = vertexClipCullSlot + 1;
 977                     for (uint32_t c = 0; c < 4; ++c)
 978                     {
 979                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
 980                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
 981                     }
 982                 }
 983
 984                 // increment outIndex
 985                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
 986             }
 987
 988             // compute and store intersection
 989             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
 990             {
 991                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
 992
 993                 // increment outIndex for active lanes
 994                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
 995             }
 996
 997             // increment loop index and update active mask
 998             vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
 999             vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1000         }
1001
1002         return vOutIndex;
1003     }
1004
1005     template<SWR_CLIPCODES ClippingPlane>
1006     typename SIMD_T::Integer ClipLineToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
1007     {
1008         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1009
1010         typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
1011         typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
1012         typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1013
1014         if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
1015         {
1016             typename SIMD_T::Integer s = vCurIndex;
1017             typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1018
1019             // gather position
1020             typename SIMD_T::Vec4 vInPos0, vInPos1;
1021             for (uint32_t c = 0; c < 4; ++c)
1022             {
1023                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1024                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1025             }
1026
1027             // compute inside mask
1028             typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
1029             typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
1030
1031             // compute intersection mask (s_in != p_in)
1032             typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
1033             intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
1034
1035             // store s if inside
1036             s_in = SIMD_T::and_ps(s_in, vActiveMask);
1037             if (!SIMD_T::testz_ps(s_in, s_in))
1038             {
1039                 for (uint32_t c = 0; c < 4; ++c)
1040                 {
1041                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1042                 }
1043
1044                 // interpolate attributes and store
1045                 for (uint32_t a = 0; a < numInAttribs; ++a)
1046                 {
1047                     uint32_t attribSlot = vertexAttribOffset + a;
1048                     for (uint32_t c = 0; c < 4; ++c)
1049                     {
1050                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1051                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1052                     }
1053                 }
1054
1055                 // increment outIndex
1056                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1057             }
1058
1059             // compute and store intersection
1060             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1061             {
1062                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
1063
1064                 // increment outIndex for active lanes
1065                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1066             }
1067
1068             // store p if inside
1069             p_in = SIMD_T::and_ps(p_in, vActiveMask);
1070             if (!SIMD_T::testz_ps(p_in, p_in))
1071             {
1072                 for (uint32_t c = 0; c < 4; ++c)
1073                 {
1074                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
1075                 }
1076
1077                 // interpolate attributes and store
1078                 for (uint32_t a = 0; a < numInAttribs; ++a)
1079                 {
1080                     uint32_t attribSlot = vertexAttribOffset + a;
1081                     for (uint32_t c = 0; c < 4; ++c)
1082                     {
1083                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
1084                         ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
1085                     }
1086                 }
1087
1088                 // increment outIndex
1089                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
1090             }
1091         }
1092
1093         return vOutIndex;
1094     }
1095
1096     typename SIMD_T::Integer ClipPrims(float *pVertices, const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, int numAttribs)
1097     {
1098         // temp storage
1099         float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices());
1100
1101         // zero out num input verts for non-active lanes
1102         typename SIMD_T::Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
1103         vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
1104
1105         // clip prims to frustum
1106         typename SIMD_T::Integer vNumOutPts;
1107         if (NumVertsPerPrim == 3)
1108         {
1109             vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1110             vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1111             vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1112             vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1113             vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1114             vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1115         }
1116         else
1117         {
1118             SWR_ASSERT(NumVertsPerPrim == 2);
1119             vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1120             vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1121             vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1122             vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1123             vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1124             vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1125         }
1126
1127         // restore num verts for non-clipped, active lanes
1128         typename SIMD_T::Float vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
1129         vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
1130
1131         return vNumOutPts;
1132     }
1133
1134     const uint32_t workerId{ 0 };
1135     DRAW_CONTEXT *pDC{ nullptr };
1136     const API_STATE &state;
1137     typename SIMD_T::Float clipCodes[NumVertsPerPrim];
1138 };
1139
1140
1141 // pipeline stage functions
1142 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1143 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1144 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1145 #if USE_SIMD16_FRONTEND
1146 void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1147 void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1148 void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1149 #endif
1150