src/gallium/drivers/swr/rasterizer/core/clip.h

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file clip.h
  24 *
  25 * @brief Definitions for clipping
  26 *
  27 ******************************************************************************/
  28 #pragma once
  29
  30 #include "common/simdintrin.h"
  31 #include "core/context.h"
  32 #include "core/pa.h"
  33 #include "rdtsc_core.h"
  34
  35 // Temp storage used by the clipper
  36 extern THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7];
  37 #if USE_SIMD16_FRONTEND
  38 extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7];
  39 #endif
  40
  41 enum SWR_CLIPCODES
  42 {
  43     // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
  44     // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
  45 #define CLIPCODE_SHIFT 23
  46     FRUSTUM_LEFT    = (0x01 << CLIPCODE_SHIFT),
  47     FRUSTUM_TOP     = (0x02 << CLIPCODE_SHIFT),
  48     FRUSTUM_RIGHT   = (0x04 << CLIPCODE_SHIFT),
  49     FRUSTUM_BOTTOM  = (0x08 << CLIPCODE_SHIFT),
  50
  51     FRUSTUM_NEAR    = (0x10 << CLIPCODE_SHIFT),
  52     FRUSTUM_FAR     = (0x20 << CLIPCODE_SHIFT),
  53
  54     NEGW            = (0x40 << CLIPCODE_SHIFT),
  55
  56     GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
  57     GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
  58     GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
  59     GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
  60 };
  61
  62 #define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
  63
  64 template<typename SIMD_T>
  65 void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &vertex, typename SIMD_T::Float &clipCodes, typename SIMD_T::Integer const &viewportIndexes)
  66 {
  67     clipCodes = SIMD_T::setzero_ps();
  68
  69     // -w
  70     typename SIMD_T::Float vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
  71
  72     // FRUSTUM_LEFT
  73     typename SIMD_T::Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
  74     clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
  75
  76     // FRUSTUM_TOP
  77     vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
  78     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
  79
  80     // FRUSTUM_RIGHT
  81     vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
  82     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
  83
  84     // FRUSTUM_BOTTOM
  85     vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
  86     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
  87
  88     if (state.rastState.depthClipEnable)
  89     {
  90         // FRUSTUM_NEAR
  91         // DX clips depth [0..w], GL clips [-w..w]
  92         if (state.rastState.clipHalfZ)
  93         {
  94             vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
  95         }
  96         else
  97         {
  98             vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
  99         }
 100         clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
 101
 102         // FRUSTUM_FAR
 103         vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
 104         clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
 105     }
 106
 107     // NEGW
 108     vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
 109     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
 110
 111     // GUARDBAND_LEFT
 112     typename SIMD_T::Float gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.left[0], viewportIndexes));
 113     vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
 114     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
 115
 116     // GUARDBAND_TOP
 117     gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.top[0], viewportIndexes));
 118     vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
 119     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
 120
 121     // GUARDBAND_RIGHT
 122     gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.right[0], viewportIndexes));
 123     vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
 124     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
 125
 126     // GUARDBAND_BOTTOM
 127     gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.bottom[0], viewportIndexes));
 128     vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
 129     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
 130 }
 131
 132 template<typename SIMD_T>
 133 struct BinnerChooser
 134 {
 135 };
 136
 137 template<>
 138 struct BinnerChooser<SIMD256>
 139 {
 140     PFN_PROCESS_PRIMS pfnBinFunc;
 141
 142     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
 143         :pfnBinFunc(nullptr)
 144     {
 145         if (numVertsPerPrim == 3)
 146         {
 147             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
 148
 149         }
 150         else if (numVertsPerPrim == 2)
 151         {
 152             pfnBinFunc = BinLines;
 153         }
 154         else
 155         {
 156             SWR_ASSERT(0 && "Unexpected points in clipper.");
 157         }
 158     }
 159
 160     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
 161         :pfnBinFunc(nullptr)
 162     {
 163         switch (topology)
 164         {
 165         case TOP_POINT_LIST:
 166             pfnBinFunc = BinPoints;
 167             break;
 168         case TOP_LINE_LIST:
 169         case TOP_LINE_STRIP:
 170         case TOP_LINE_LOOP:
 171         case TOP_LINE_LIST_ADJ:
 172         case TOP_LISTSTRIP_ADJ:
 173             pfnBinFunc = BinLines;
 174             break;
 175         default:
 176             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
 177             break;
 178         };
 179     }
 180
 181     void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD256::Vec4 prims[], uint32_t primMask, SIMD256::Integer const &primID, SIMD256::Integer &viewportIdx, SIMD256::Integer &rtIdx)
 182     {
 183         SWR_ASSERT(pfnBinFunc != nullptr);
 184
 185         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
 186     }
 187 };
 188
 189 #if USE_SIMD16_FRONTEND
 190 template<>
 191 struct BinnerChooser<SIMD512>
 192 {
 193     PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
 194
 195     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
 196         :pfnBinFunc(nullptr)
 197     {
 198         if (numVertsPerPrim == 3)
 199         {
 200             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
 201
 202         }
 203         else if (numVertsPerPrim == 2)
 204         {
 205             pfnBinFunc = BinLines_simd16;
 206         }
 207         else
 208         {
 209             SWR_ASSERT(0 && "Unexpected points in clipper.");
 210         }
 211     }
 212
 213     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
 214         :pfnBinFunc(nullptr)
 215     {
 216         switch (topology)
 217         {
 218         case TOP_POINT_LIST:
 219             pfnBinFunc = BinPoints_simd16;
 220             break;
 221         case TOP_LINE_LIST:
 222         case TOP_LINE_STRIP:
 223         case TOP_LINE_LOOP:
 224         case TOP_LINE_LIST_ADJ:
 225         case TOP_LISTSTRIP_ADJ:
 226             pfnBinFunc = BinLines_simd16;
 227             break;
 228         default:
 229             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
 230             break;
 231         };
 232     }
 233
 234     void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD512::Vec4 prims[], uint32_t primMask, SIMD512::Integer const &primID, SIMD512::Integer &viewportIdx, SIMD512::Integer &rtIdx)
 235     {
 236         SWR_ASSERT(pfnBinFunc != nullptr);
 237
 238         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
 239     }
 240 };
 241
 242 #endif
 243 template<typename SIMD_T>
 244 struct SimdHelper
 245 {
 246 };
 247
 248 template<>
 249 struct SimdHelper<SIMD256>
 250 {
 251     static SIMD256::Float insert_lo_ps(SIMD256::Float a)
 252     {
 253         return a;
 254     }
 255
 256     static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
 257     {
 258         return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
 259     }
 260 };
 261
 262 #if USE_SIMD16_FRONTEND
 263 template<>
 264 struct SimdHelper<SIMD512>
 265 {
 266     static SIMD512::Float insert_lo_ps(SIMD256::Float a)
 267     {
 268         return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
 269     }
 270
 271     static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
 272     {
 273         return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
 274     }
 275 };
 276
 277 #endif
 278 // Temp storage used by the clipper
 279 template<typename SIMD_T>
 280 struct ClipHelper
 281 {
 282 };
 283
 284 template<>
 285 struct ClipHelper<SIMD256>
 286 {
 287     static SIMDVERTEX_T<SIMD256> *GetTempVertices()
 288     {
 289         return tlsTempVertices;
 290     }
 291 };
 292
 293 #if USE_SIMD16_FRONTEND
 294 template<>
 295 struct ClipHelper<SIMD512>
 296 {
 297     static SIMDVERTEX_T<SIMD512> *GetTempVertices()
 298     {
 299         return tlsTempVertices_simd16;
 300     }
 301 };
 302
 303 #endif
 304 template<typename SIMD_T, uint32_t NumVertsPerPrim>
 305 class Clipper
 306 {
 307 public:
 308     INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
 309         workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
 310     {
 311         static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
 312     }
 313
 314     void ComputeClipCodes(typename SIMD_T::Vec4 vertex[], const typename SIMD_T::Integer &viewportIndexes)
 315     {
 316         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 317         {
 318             ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
 319         }
 320     }
 321
 322     typename SIMD_T::Float ComputeClipCodeIntersection()
 323     {
 324         typename SIMD_T::Float result = clipCodes[0];
 325
 326         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
 327         {
 328             result = SIMD_T::and_ps(result, clipCodes[i]);
 329         }
 330
 331         return result;
 332     }
 333
 334     typename SIMD_T::Float ComputeClipCodeUnion()
 335     {
 336         typename SIMD_T::Float result = clipCodes[0];
 337
 338         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
 339         {
 340             result = SIMD_T::or_ps(result, clipCodes[i]);
 341         }
 342
 343         return result;
 344     }
 345
 346     int ComputeClipMask()
 347     {
 348         typename SIMD_T::Float clipUnion = ComputeClipCodeUnion();
 349
 350         clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
 351
 352         return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
 353     }
 354
 355     // clipper is responsible for culling any prims with NAN coordinates
 356     int ComputeNaNMask(typename SIMD_T::Vec4 prim[])
 357     {
 358         typename SIMD_T::Float vNanMask = SIMD_T::setzero_ps();
 359
 360         for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
 361         {
 362             typename SIMD_T::Float vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
 363             vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
 364
 365             typename SIMD_T::Float vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
 366             vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
 367         }
 368
 369         return SIMD_T::movemask_ps(vNanMask);
 370     }
 371
 372     int ComputeUserClipCullMask(PA_STATE &pa, typename SIMD_T::Vec4 prim[])
 373     {
 374         uint8_t cullMask = state.backendState.cullDistanceMask;
 375         uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
 376
 377         typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps();
 378
 379         typename SIMD_T::Vec4 vClipCullDistLo[3];
 380         typename SIMD_T::Vec4 vClipCullDistHi[3];
 381
 382         pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
 383         pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
 384
 385         DWORD index;
 386         while (_BitScanForward(&index, cullMask))
 387         {
 388             cullMask &= ~(1 << index);
 389             uint32_t slot = index >> 2;
 390             uint32_t component = index & 0x3;
 391
 392             typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f);
 393             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
 394             {
 395                 typename SIMD_T::Float vCullComp;
 396                 if (slot == 0)
 397                 {
 398                     vCullComp = vClipCullDistLo[e][component];
 399                 }
 400                 else
 401                 {
 402                     vCullComp = vClipCullDistHi[e][component];
 403                 }
 404
 405                 // cull if cull distance < 0 || NAN
 406                 typename SIMD_T::Float vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
 407                 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
 408             }
 409             vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
 410         }
 411
 412         // clipper should also discard any primitive with NAN clip distance
 413         uint8_t clipMask = state.backendState.clipDistanceMask;
 414         while (_BitScanForward(&index, clipMask))
 415         {
 416             clipMask &= ~(1 << index);
 417             uint32_t slot = index >> 2;
 418             uint32_t component = index & 0x3;
 419
 420             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
 421             {
 422                 typename SIMD_T::Float vClipComp;
 423                 if (slot == 0)
 424                 {
 425                     vClipComp = vClipCullDistLo[e][component];
 426                 }
 427                 else
 428                 {
 429                     vClipComp = vClipCullDistHi[e][component];
 430                 }
 431
 432                 typename SIMD_T::Float vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
 433                 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
 434             }
 435         }
 436
 437         return SIMD_T::movemask_ps(vClipCullMask);
 438     }
 439
 440     void ClipSimd(const typename SIMD_T::Vec4 prim[], const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, PA_STATE &pa,
 441                   const typename SIMD_T::Integer &vPrimId, const typename SIMD_T::Integer &vViewportIdx, const typename SIMD_T::Integer &vRtIdx)
 442     {
 443         // input/output vertex store for clipper
 444         SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle
 445
 446         uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
 447         uint32_t provokingVertex = 0;
 448         if (pa.binTopology == TOP_TRIANGLE_FAN)
 449         {
 450             provokingVertex = state.frontendState.provokingVertex.triFan;
 451         }
 452         ///@todo: line topology for wireframe?
 453
 454         // assemble pos
 455         typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim];
 456         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 457         {
 458             vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
 459         }
 460
 461         // assemble attribs
 462         const SWR_BACKEND_STATE& backendState = state.backendState;
 463
 464         int32_t maxSlot = -1;
 465         for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
 466         {
 467             // Compute absolute attrib slot in vertex array
 468             uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
 469             maxSlot = std::max<int32_t>(maxSlot, mapSlot);
 470             uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
 471
 472             pa.Assemble(inputSlot, tmpVector);
 473
 474             // if constant interpolation enabled for this attribute, assign the provoking
 475             // vertex values to all edges
 476             if (CheckBit(constantInterpMask, slot))
 477             {
 478                 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 479                 {
 480                     vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
 481                 }
 482             }
 483             else
 484             {
 485                 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 486                 {
 487                     vertices[i].attrib[inputSlot] = tmpVector[i];
 488                 }
 489             }
 490         }
 491
 492         // assemble user clip distances if enabled
 493         uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
 494         if (state.backendState.clipDistanceMask & 0xf)
 495         {
 496             pa.Assemble(vertexClipCullSlot, tmpVector);
 497             for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 498             {
 499                 vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
 500             }
 501         }
 502
 503         if (state.backendState.clipDistanceMask & 0xf0)
 504         {
 505             pa.Assemble(vertexClipCullSlot + 1, tmpVector);
 506             for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 507             {
 508                 vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
 509             }
 510         }
 511
 512         uint32_t numAttribs = maxSlot + 1;
 513
 514         typename SIMD_T::Integer vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
 515
 516         BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast);
 517
 518         // set up new PA for binning clipped primitives
 519         PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
 520         if (NumVertsPerPrim == 3)
 521         {
 522             clipTopology = TOP_TRIANGLE_FAN;
 523
 524             // so that the binner knows to bloat wide points later
 525             if (pa.binTopology == TOP_POINT_LIST)
 526             {
 527                 clipTopology = TOP_POINT_LIST;
 528             }
 529         }
 530         else if (NumVertsPerPrim == 2)
 531         {
 532             clipTopology = TOP_LINE_LIST;
 533         }
 534         else
 535         {
 536             SWR_ASSERT(0 && "Unexpected points in clipper.");
 537         }
 538
 539         const uint32_t *pVertexCount = reinterpret_cast<const uint32_t *>(&vNumClippedVerts);
 540         const uint32_t *pPrimitiveId = reinterpret_cast<const uint32_t *>(&vPrimId);
 541         const uint32_t *pViewportIdx = reinterpret_cast<const uint32_t *>(&vViewportIdx);
 542         const uint32_t *pRtIdx = reinterpret_cast<const uint32_t *>(&vRtIdx);
 543
 544         const SIMD256::Integer vOffsets = SIMD256::set_epi32(
 545             0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
 546             6 * sizeof(SIMDVERTEX_T<SIMD_T>),
 547             5 * sizeof(SIMDVERTEX_T<SIMD_T>),
 548             4 * sizeof(SIMDVERTEX_T<SIMD_T>),
 549             3 * sizeof(SIMDVERTEX_T<SIMD_T>),
 550             2 * sizeof(SIMDVERTEX_T<SIMD_T>),
 551             1 * sizeof(SIMDVERTEX_T<SIMD_T>),
 552             0 * sizeof(SIMDVERTEX_T<SIMD_T>));
 553
 554         // only need to gather 7 verts
 555         // @todo dynamic mask based on actual # of verts generated per lane
 556         const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
 557
 558         uint32_t numClippedPrims = 0;
 559
 560         // tranpose clipper output so that each lane's vertices are in SIMD order
 561         // set aside space for 2 vertices, as the PA will try to read up to 16 verts
 562         // for triangle fan
 563
 564 #if defined(_DEBUG)
 565         // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
 566         SIMDVERTEX_T<SIMD_T> *transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T> *>(AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64));
 567
 568 #else
 569         SIMDVERTEX_T<SIMD_T> transposedPrims[2];
 570
 571 #endif
 572         uint32_t numInputPrims = pa.NumPrims();
 573         for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
 574         {
 575             uint32_t numEmittedVerts = pVertexCount[inputPrim];
 576             if (numEmittedVerts < NumVertsPerPrim)
 577             {
 578                 continue;
 579             }
 580             SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
 581
 582             uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
 583             SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
 584
 585             numClippedPrims += numEmittedPrims;
 586
 587             // tranpose clipper output so that each lane's vertices are in SIMD order
 588             // set aside space for 2 vertices, as the PA will try to read up to 16 verts
 589             // for triangle fan
 590
 591             // transpose pos
 592             uint8_t *pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
 593
 594 #if 0
 595             // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
 596             static const float *dummy = reinterpret_cast<const float *>(pBase);
 597
 598 #endif
 599             for (uint32_t c = 0; c < 4; ++c)
 600             {
 601                 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
 602                 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
 603                 pBase += sizeof(typename SIMD_T::Float);
 604             }
 605
 606             // transpose attribs
 607             pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
 608
 609             for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
 610             {
 611                 uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
 612
 613                 for (uint32_t c = 0; c < 4; ++c)
 614                 {
 615                     SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
 616                     transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
 617                     pBase += sizeof(typename SIMD_T::Float);
 618                 }
 619             }
 620
 621             // transpose user clip distances if enabled
 622             uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
 623             if (state.backendState.clipDistanceMask & 0x0f)
 624             {
 625                 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim;
 626
 627                 for (uint32_t c = 0; c < 4; ++c)
 628                 {
 629                     SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
 630                     transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
 631                     pBase += sizeof(typename SIMD_T::Float);
 632                 }
 633             }
 634
 635             if (state.backendState.clipDistanceMask & 0xf0)
 636             {
 637                 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim;
 638
 639                 for (uint32_t c = 0; c < 4; ++c)
 640                 {
 641                     SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
 642                     transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
 643                     pBase += sizeof(typename SIMD_T::Float);
 644                 }
 645             }
 646
 647             PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, NumVertsPerPrim, clipTopology);
 648             clipPA.viewportArrayActive = pa.viewportArrayActive;
 649             clipPA.rtArrayActive = pa.rtArrayActive;
 650
 651             static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f };
 652
 653             const uint32_t primMask = primMaskMap[numEmittedPrims];
 654
 655             const typename SIMD_T::Integer primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
 656             const typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
 657             const typename SIMD_T::Integer rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
 658
 659
 660             while (clipPA.GetNextStreamOutput())
 661             {
 662                 do
 663                 {
 664                     typename SIMD_T::Vec4 attrib[NumVertsPerPrim];
 665
 666                     bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
 667
 668                     if (assemble)
 669                     {
 670                         binner.pfnBinFunc(pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
 671                     }
 672
 673                 } while (clipPA.NextPrim());
 674             }
 675         }
 676
 677 #if defined(_DEBUG)
 678         AlignedFree(transposedPrims);
 679
 680 #endif
 681         // update global pipeline stat
 682         UPDATE_STAT_FE(CPrimitives, numClippedPrims);
 683     }
 684
 685     void ExecuteStage(PA_STATE &pa, typename SIMD_T::Vec4 prim[], uint32_t primMask,
 686                       typename SIMD_T::Integer const &primId, typename SIMD_T::Integer const &viewportIdx, typename SIMD_T::Integer const &rtIdx)
 687     {
 688         SWR_ASSERT(pa.pDC != nullptr);
 689
 690         SWR_CONTEXT *pContext = pa.pDC->pContext;
 691
 692         BinnerChooser<SIMD_T> binner(pa.binTopology, pa.pDC->pState->state.rastState.conservativeRast);
 693
 694         // update clipper invocations pipeline stat
 695         uint32_t numInvoc = _mm_popcnt_u32(primMask);
 696         UPDATE_STAT_FE(CInvocations, numInvoc);
 697
 698         ComputeClipCodes(prim, viewportIdx);
 699
 700         // cull prims with NAN coords
 701         primMask &= ~ComputeNaNMask(prim);
 702
 703         // user cull distance cull
 704         if (state.backendState.cullDistanceMask)
 705         {
 706             primMask &= ~ComputeUserClipCullMask(pa, prim);
 707         }
 708
 709         // cull prims outside view frustum
 710         typename SIMD_T::Float clipIntersection = ComputeClipCodeIntersection();
 711         int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
 712
 713         // skip clipping for points
 714         uint32_t clipMask = 0;
 715         if (NumVertsPerPrim != 1)
 716         {
 717             clipMask = primMask & ComputeClipMask();
 718         }
 719
 720         if (clipMask)
 721         {
 722             AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
 723             // we have to clip tris, execute the clipper, which will also
 724             // call the binner
 725             ClipSimd(prim, SIMD_T::vmask_ps(primMask), SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx, rtIdx);
 726             AR_END(FEGuardbandClip, 1);
 727         }
 728         else if (validMask)
 729         {
 730             // update CPrimitives pipeline state
 731             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
 732
 733             // forward valid prims directly to binner
 734             binner.pfnBinFunc(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
 735         }
 736     }
 737
 738 private:
 739     typename SIMD_T::Float ComputeInterpFactor(typename SIMD_T::Float const &boundaryCoord0, typename SIMD_T::Float const &boundaryCoord1)
 740     {
 741         return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
 742     }
 743
 744     typename SIMD_T::Integer ComputeOffsets(uint32_t attrib, typename SIMD_T::Integer const &vIndices, uint32_t component)
 745     {
 746         const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
 747         const uint32_t componentStride  = sizeof(typename SIMD_T::Float);
 748         const uint32_t attribStride     = sizeof(typename SIMD_T::Vec4);
 749
 750         static const OSALIGNSIMD16(uint32_t) elemOffset[16] =
 751         {
 752             0 * sizeof(float),
 753             1 * sizeof(float),
 754             2 * sizeof(float),
 755             3 * sizeof(float),
 756             4 * sizeof(float),
 757             5 * sizeof(float),
 758             6 * sizeof(float),
 759             7 * sizeof(float),
 760             8 * sizeof(float),
 761             9 * sizeof(float),
 762             10 * sizeof(float),
 763             11 * sizeof(float),
 764             12 * sizeof(float),
 765             13 * sizeof(float),
 766             14 * sizeof(float),
 767             15 * sizeof(float),
 768         };
 769
 770         static_assert(sizeof(typename SIMD_T::Integer) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
 771
 772         typename SIMD_T::Integer vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const typename SIMD_T::Integer *>(elemOffset));
 773
 774         // step to the simdvertex
 775         typename SIMD_T::Integer vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
 776
 777         // step to the attribute and component
 778         vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
 779
 780         // step to the lane
 781         vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
 782
 783         return vOffsets;
 784     }
 785
 786     typename SIMD_T::Float GatherComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component)
 787     {
 788         typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
 789         typename SIMD_T::Float vSrc = SIMD_T::setzero_ps();
 790
 791         return SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(vSrc, pBuffer, vOffsets, vMask);
 792     }
 793
 794     void ScatterComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component, typename SIMD_T::Float const &vSrc)
 795     {
 796         typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
 797
 798         const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets);
 799         const float *pSrc = reinterpret_cast<const float *>(&vSrc);
 800         uint32_t mask = SIMD_T::movemask_ps(vMask);
 801         DWORD lane;
 802         while (_BitScanForward(&lane, mask))
 803         {
 804             mask &= ~(1 << lane);
 805             const uint8_t *pBuf = reinterpret_cast<const uint8_t *>(pBuffer) + pOffsets[lane];
 806             *(float *)pBuf = pSrc[lane];
 807         }
 808     }
 809
 810     template<SWR_CLIPCODES ClippingPlane>
 811     void intersect(
 812         const typename SIMD_T::Float &vActiveMask,  // active lanes to operate on
 813         const typename SIMD_T::Integer &s,          // index to first edge vertex v0 in pInPts.
 814         const typename SIMD_T::Integer &p,          // index to second edge vertex v1 in pInPts.
 815         const typename SIMD_T::Vec4 &v1,            // vertex 0 position
 816         const typename SIMD_T::Vec4 &v2,            // vertex 1 position
 817         typename SIMD_T::Integer &outIndex,         // output index.
 818         const float *pInVerts,                      // array of all the input positions.
 819         uint32_t numInAttribs,                      // number of attributes per vertex.
 820         float *pOutVerts)                           // array of output positions. We'll write our new intersection point at i*4.
 821     {
 822         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
 823         uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
 824
 825         // compute interpolation factor
 826         typename SIMD_T::Float t;
 827         switch (ClippingPlane)
 828         {
 829         case FRUSTUM_LEFT:      t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break;
 830         case FRUSTUM_RIGHT:     t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); break;
 831         case FRUSTUM_TOP:       t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); break;
 832         case FRUSTUM_BOTTOM:    t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); break;
 833         case FRUSTUM_NEAR:
 834             // DX Znear plane is 0, GL is -w
 835             if (this->state.rastState.clipHalfZ)
 836             {
 837                 t = ComputeInterpFactor(v1[2], v2[2]);
 838             }
 839             else
 840             {
 841                 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
 842             }
 843             break;
 844         case FRUSTUM_FAR:       t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); break;
 845         default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
 846         };
 847
 848         // interpolate position and store
 849         for (uint32_t c = 0; c < 4; ++c)
 850         {
 851             typename SIMD_T::Float vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
 852             ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
 853         }
 854
 855         // interpolate attributes and store
 856         for (uint32_t a = 0; a < numInAttribs; ++a)
 857         {
 858             uint32_t attribSlot = vertexAttribOffset + a;
 859             for (uint32_t c = 0; c < 4; ++c)
 860             {
 861                 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 862                 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 863                 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 864                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
 865             }
 866         }
 867
 868         // interpolate clip distance if enabled
 869         if (this->state.backendState.clipDistanceMask & 0xf)
 870         {
 871             uint32_t attribSlot = vertexClipCullOffset;
 872             for (uint32_t c = 0; c < 4; ++c)
 873             {
 874                 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 875                 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 876                 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 877                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
 878             }
 879         }
 880
 881         if (this->state.backendState.clipDistanceMask & 0xf0)
 882         {
 883             uint32_t attribSlot = vertexClipCullOffset + 1;
 884             for (uint32_t c = 0; c < 4; ++c)
 885             {
 886                 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 887                 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 888                 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 889                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
 890             }
 891         }
 892     }
 893
 894     template<SWR_CLIPCODES ClippingPlane>
 895     typename SIMD_T::Float inside(const typename SIMD_T::Vec4 &v)
 896     {
 897         switch (ClippingPlane)
 898         {
 899         case FRUSTUM_LEFT:      return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
 900         case FRUSTUM_RIGHT:     return SIMD_T::cmple_ps(v[0], v[3]);
 901         case FRUSTUM_TOP:       return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
 902         case FRUSTUM_BOTTOM:    return SIMD_T::cmple_ps(v[1], v[3]);
 903         case FRUSTUM_NEAR:      return SIMD_T::cmpge_ps(v[2], this->state.rastState.clipHalfZ ? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
 904         case FRUSTUM_FAR:       return SIMD_T::cmple_ps(v[2], v[3]);
 905         default:
 906             SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
 907             return SIMD_T::setzero_ps();
 908         }
 909     }
 910
 911     template<SWR_CLIPCODES ClippingPlane>
 912     typename SIMD_T::Integer ClipTriToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
 913     {
 914         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
 915
 916         typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
 917         typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
 918         typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
 919
 920         while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
 921         {
 922             typename SIMD_T::Integer s = vCurIndex;
 923             typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
 924             typename SIMD_T::Integer underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
 925             p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
 926
 927             // gather position
 928             typename SIMD_T::Vec4 vInPos0, vInPos1;
 929             for (uint32_t c = 0; c < 4; ++c)
 930             {
 931                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
 932                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
 933             }
 934
 935             // compute inside mask
 936             typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
 937             typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
 938
 939             // compute intersection mask (s_in != p_in)
 940             typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
 941             intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
 942
 943             // store s if inside
 944             s_in = SIMD_T::and_ps(s_in, vActiveMask);
 945             if (!SIMD_T::testz_ps(s_in, s_in))
 946             {
 947                 // store position
 948                 for (uint32_t c = 0; c < 4; ++c)
 949                 {
 950                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
 951                 }
 952
 953                 // store attribs
 954                 for (uint32_t a = 0; a < numInAttribs; ++a)
 955                 {
 956                     uint32_t attribSlot = vertexAttribOffset + a;
 957                     for (uint32_t c = 0; c < 4; ++c)
 958                     {
 959                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
 960                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
 961                     }
 962                 }
 963
 964                 // store clip distance if enabled
 965                 uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
 966                 if (this->state.backendState.clipDistanceMask & 0xf)
 967                 {
 968                     uint32_t attribSlot = vertexClipCullSlot;
 969                     for (uint32_t c = 0; c < 4; ++c)
 970                     {
 971                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
 972                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
 973                     }
 974                 }
 975
 976                 if (this->state.backendState.clipDistanceMask & 0xf0)
 977                 {
 978                     uint32_t attribSlot = vertexClipCullSlot + 1;
 979                     for (uint32_t c = 0; c < 4; ++c)
 980                     {
 981                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
 982                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
 983                     }
 984                 }
 985
 986                 // increment outIndex
 987                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
 988             }
 989
 990             // compute and store intersection
 991             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
 992             {
 993                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
 994
 995                 // increment outIndex for active lanes
 996                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
 997             }
 998
 999             // increment loop index and update active mask
1000             vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
1001             vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1002         }
1003
1004         return vOutIndex;
1005     }
1006
1007     template<SWR_CLIPCODES ClippingPlane>
1008     typename SIMD_T::Integer ClipLineToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
1009     {
1010         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1011
1012         typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
1013         typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
1014         typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1015
1016         if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
1017         {
1018             typename SIMD_T::Integer s = vCurIndex;
1019             typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1020
1021             // gather position
1022             typename SIMD_T::Vec4 vInPos0, vInPos1;
1023             for (uint32_t c = 0; c < 4; ++c)
1024             {
1025                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1026                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1027             }
1028
1029             // compute inside mask
1030             typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
1031             typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
1032
1033             // compute intersection mask (s_in != p_in)
1034             typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
1035             intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
1036
1037             // store s if inside
1038             s_in = SIMD_T::and_ps(s_in, vActiveMask);
1039             if (!SIMD_T::testz_ps(s_in, s_in))
1040             {
1041                 for (uint32_t c = 0; c < 4; ++c)
1042                 {
1043                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1044                 }
1045
1046                 // interpolate attributes and store
1047                 for (uint32_t a = 0; a < numInAttribs; ++a)
1048                 {
1049                     uint32_t attribSlot = vertexAttribOffset + a;
1050                     for (uint32_t c = 0; c < 4; ++c)
1051                     {
1052                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1053                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1054                     }
1055                 }
1056
1057                 // increment outIndex
1058                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1059             }
1060
1061             // compute and store intersection
1062             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1063             {
1064                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
1065
1066                 // increment outIndex for active lanes
1067                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1068             }
1069
1070             // store p if inside
1071             p_in = SIMD_T::and_ps(p_in, vActiveMask);
1072             if (!SIMD_T::testz_ps(p_in, p_in))
1073             {
1074                 for (uint32_t c = 0; c < 4; ++c)
1075                 {
1076                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
1077                 }
1078
1079                 // interpolate attributes and store
1080                 for (uint32_t a = 0; a < numInAttribs; ++a)
1081                 {
1082                     uint32_t attribSlot = vertexAttribOffset + a;
1083                     for (uint32_t c = 0; c < 4; ++c)
1084                     {
1085                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
1086                         ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
1087                     }
1088                 }
1089
1090                 // increment outIndex
1091                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
1092             }
1093         }
1094
1095         return vOutIndex;
1096     }
1097
1098     typename SIMD_T::Integer ClipPrims(float *pVertices, const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, int numAttribs)
1099     {
1100         // temp storage
1101         float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices());
1102
1103         // zero out num input verts for non-active lanes
1104         typename SIMD_T::Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
1105         vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
1106
1107         // clip prims to frustum
1108         typename SIMD_T::Integer vNumOutPts;
1109         if (NumVertsPerPrim == 3)
1110         {
1111             vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1112             vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1113             vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1114             vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1115             vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1116             vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1117         }
1118         else
1119         {
1120             SWR_ASSERT(NumVertsPerPrim == 2);
1121             vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1122             vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1123             vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1124             vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1125             vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1126             vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1127         }
1128
1129         // restore num verts for non-clipped, active lanes
1130         typename SIMD_T::Float vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
1131         vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
1132
1133         return vNumOutPts;
1134     }
1135
1136     const uint32_t workerId{ 0 };
1137     DRAW_CONTEXT *pDC{ nullptr };
1138     const API_STATE &state;
1139     typename SIMD_T::Float clipCodes[NumVertsPerPrim];
1140 };
1141
1142
1143 // pipeline stage functions
1144 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1145 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1146 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1147 #if USE_SIMD16_FRONTEND
1148 void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1149 void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1150 void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1151 #endif
1152