src/gallium/drivers/swr/rasterizer/core/clip.h

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file clip.h
  24 *
  25 * @brief Definitions for clipping
  26 *
  27 ******************************************************************************/
  28 #pragma once
  29
  30 #include "common/simdintrin.h"
  31 #include "core/context.h"
  32 #include "core/pa.h"
  33 #include "rdtsc_core.h"
  34
// Temp storage used by the clipper.
// Both arrays hold 7 SIMD vertices and are declared with the project's THREAD storage
// qualifier (presumably thread-local -- confirm against the THREAD macro definition).
// They are only declared here; the definitions live in another translation unit.
// ClipHelper<SIMD_T>::GetTempVertices() hands out the array matching the SIMD width.
extern THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7];
#if USE_SIMD16_FRONTEND
// SIMD16 (SIMD512) counterpart, only available when the SIMD16 frontend is compiled in.
extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7];
#endif
  40
  41 enum SWR_CLIPCODES
  42 {
  43     // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
  44     // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
  45 #define CLIPCODE_SHIFT 23
  46     FRUSTUM_LEFT    = (0x01 << CLIPCODE_SHIFT),
  47     FRUSTUM_TOP     = (0x02 << CLIPCODE_SHIFT),
  48     FRUSTUM_RIGHT   = (0x04 << CLIPCODE_SHIFT),
  49     FRUSTUM_BOTTOM  = (0x08 << CLIPCODE_SHIFT),
  50
  51     FRUSTUM_NEAR    = (0x10 << CLIPCODE_SHIFT),
  52     FRUSTUM_FAR     = (0x20 << CLIPCODE_SHIFT),
  53
  54     NEGW            = (0x40 << CLIPCODE_SHIFT),
  55
  56     GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
  57     GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
  58     GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
  59     GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
  60 };
  61
  62 #define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
  63 #define FRUSTUM_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|FRUSTUM_LEFT|FRUSTUM_RIGHT|FRUSTUM_TOP|FRUSTUM_BOTTOM)
  64
  65 template<typename SIMD_T>
  66 void ComputeClipCodes(const API_STATE &state, const Vec4<SIMD_T> &vertex, Float<SIMD_T> &clipCodes, Integer<SIMD_T> const &viewportIndexes)
  67 {
  68     clipCodes = SIMD_T::setzero_ps();
  69
  70     // -w
  71     Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
  72
  73     // FRUSTUM_LEFT
  74     Float<SIMD_T> vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
  75     clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
  76
  77     // FRUSTUM_TOP
  78     vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
  79     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
  80
  81     // FRUSTUM_RIGHT
  82     vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
  83     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
  84
  85     // FRUSTUM_BOTTOM
  86     vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
  87     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
  88
  89     if (state.rastState.depthClipEnable)
  90     {
  91         // FRUSTUM_NEAR
  92         // DX clips depth [0..w], GL clips [-w..w]
  93         if (state.rastState.clipHalfZ)
  94         {
  95             vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
  96         }
  97         else
  98         {
  99             vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
 100         }
 101         clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
 102
 103         // FRUSTUM_FAR
 104         vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
 105         clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
 106     }
 107
 108     // NEGW
 109     vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
 110     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
 111
 112     // GUARDBAND_LEFT
 113     Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.left[0], viewportIndexes));
 114     vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
 115     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
 116
 117     // GUARDBAND_TOP
 118     gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.top[0], viewportIndexes));
 119     vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
 120     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
 121
 122     // GUARDBAND_RIGHT
 123     gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.right[0], viewportIndexes));
 124     vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
 125     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
 126
 127     // GUARDBAND_BOTTOM
 128     gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.bottom[0], viewportIndexes));
 129     vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
 130     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
 131 }
 132
 133 template<typename SIMD_T>
 134 struct BinnerChooser
 135 {
 136 };
 137
 138 template<>
 139 struct BinnerChooser<SIMD256>
 140 {
 141     PFN_PROCESS_PRIMS pfnBinFunc;
 142
 143     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
 144         :pfnBinFunc(nullptr)
 145     {
 146         if (numVertsPerPrim == 3)
 147         {
 148             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
 149
 150         }
 151         else if (numVertsPerPrim == 2)
 152         {
 153             pfnBinFunc = BinLines;
 154         }
 155         else
 156         {
 157             SWR_ASSERT(0 && "Unexpected points in clipper.");
 158         }
 159     }
 160
 161     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
 162         :pfnBinFunc(nullptr)
 163     {
 164         switch (topology)
 165         {
 166         case TOP_POINT_LIST:
 167             pfnBinFunc = BinPoints;
 168             break;
 169         case TOP_LINE_LIST:
 170         case TOP_LINE_STRIP:
 171         case TOP_LINE_LOOP:
 172         case TOP_LINE_LIST_ADJ:
 173         case TOP_LISTSTRIP_ADJ:
 174             pfnBinFunc = BinLines;
 175             break;
 176         default:
 177             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
 178             break;
 179         };
 180     }
 181
 182     void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD256::Vec4 prims[], uint32_t primMask, SIMD256::Integer const &primID, SIMD256::Integer &viewportIdx, SIMD256::Integer &rtIdx)
 183     {
 184         SWR_ASSERT(pfnBinFunc != nullptr);
 185
 186         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
 187     }
 188 };
 189
 190 #if USE_SIMD16_FRONTEND
 191 template<>
 192 struct BinnerChooser<SIMD512>
 193 {
 194     PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
 195
 196     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
 197         :pfnBinFunc(nullptr)
 198     {
 199         if (numVertsPerPrim == 3)
 200         {
 201             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
 202
 203         }
 204         else if (numVertsPerPrim == 2)
 205         {
 206             pfnBinFunc = BinLines_simd16;
 207         }
 208         else
 209         {
 210             SWR_ASSERT(0 && "Unexpected points in clipper.");
 211         }
 212     }
 213
 214     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
 215         :pfnBinFunc(nullptr)
 216     {
 217         switch (topology)
 218         {
 219         case TOP_POINT_LIST:
 220             pfnBinFunc = BinPoints_simd16;
 221             break;
 222         case TOP_LINE_LIST:
 223         case TOP_LINE_STRIP:
 224         case TOP_LINE_LOOP:
 225         case TOP_LINE_LIST_ADJ:
 226         case TOP_LISTSTRIP_ADJ:
 227             pfnBinFunc = BinLines_simd16;
 228             break;
 229         default:
 230             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
 231             break;
 232         };
 233     }
 234
 235     void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD512::Vec4 prims[], uint32_t primMask, SIMD512::Integer const &primID, SIMD512::Integer &viewportIdx, SIMD512::Integer &rtIdx)
 236     {
 237         SWR_ASSERT(pfnBinFunc != nullptr);
 238
 239         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
 240     }
 241 };
 242
 243 #endif
 244 template<typename SIMD_T>
 245 struct SimdHelper
 246 {
 247 };
 248
 249 template<>
 250 struct SimdHelper<SIMD256>
 251 {
 252     static SIMD256::Float insert_lo_ps(SIMD256::Float a)
 253     {
 254         return a;
 255     }
 256
 257     static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
 258     {
 259         return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
 260     }
 261 };
 262
 263 #if USE_SIMD16_FRONTEND
 264 template<>
 265 struct SimdHelper<SIMD512>
 266 {
 267     static SIMD512::Float insert_lo_ps(SIMD256::Float a)
 268     {
 269         return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
 270     }
 271
 272     static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
 273     {
 274         return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
 275     }
 276 };
 277
 278 #endif
 279 // Temp storage used by the clipper
 280 template<typename SIMD_T>
 281 struct ClipHelper
 282 {
 283 };
 284
 285 template<>
 286 struct ClipHelper<SIMD256>
 287 {
 288     static SIMDVERTEX_T<SIMD256> *GetTempVertices()
 289     {
 290         return tlsTempVertices;
 291     }
 292 };
 293
 294 #if USE_SIMD16_FRONTEND
 295 template<>
 296 struct ClipHelper<SIMD512>
 297 {
 298     static SIMDVERTEX_T<SIMD512> *GetTempVertices()
 299     {
 300         return tlsTempVertices_simd16;
 301     }
 302 };
 303
 304 #endif
 305 template<typename SIMD_T, uint32_t NumVertsPerPrim>
 306 class Clipper
 307 {
 308 public:
 309     INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
 310         workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
 311     {
 312         static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
 313     }
 314
 315     void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T> &viewportIndexes)
 316     {
 317         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 318         {
 319             ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
 320         }
 321     }
 322
 323     Float<SIMD_T> ComputeClipCodeIntersection()
 324     {
 325         Float<SIMD_T> result = clipCodes[0];
 326
 327         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
 328         {
 329             result = SIMD_T::and_ps(result, clipCodes[i]);
 330         }
 331
 332         return result;
 333     }
 334
 335     Float<SIMD_T> ComputeClipCodeUnion()
 336     {
 337         Float<SIMD_T> result = clipCodes[0];
 338
 339         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
 340         {
 341             result = SIMD_T::or_ps(result, clipCodes[i]);
 342         }
 343
 344         return result;
 345     }
 346
 347     int ComputeClipMask()
 348     {
 349         Float<SIMD_T> clipUnion = ComputeClipCodeUnion();
 350
 351         clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
 352
 353         return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
 354     }
 355
 356     // clipper is responsible for culling any prims with NAN coordinates
 357     int ComputeNaNMask(Vec4<SIMD_T> prim[])
 358     {
 359         Float<SIMD_T> vNanMask = SIMD_T::setzero_ps();
 360
 361         for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
 362         {
 363             Float<SIMD_T> vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
 364             vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
 365
 366             Float<SIMD_T> vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
 367             vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
 368         }
 369
 370         return SIMD_T::movemask_ps(vNanMask);
 371     }
 372
 373     int ComputeUserClipCullMask(PA_STATE &pa, Vec4<SIMD_T> prim[])
 374     {
 375         uint8_t cullMask = state.backendState.cullDistanceMask;
 376         uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
 377
 378         Float<SIMD_T> vClipCullMask = SIMD_T::setzero_ps();
 379
 380         Vec4<SIMD_T> vClipCullDistLo[3];
 381         Vec4<SIMD_T> vClipCullDistHi[3];
 382
 383         pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
 384         pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
 385
 386         DWORD index;
 387         while (_BitScanForward(&index, cullMask))
 388         {
 389             cullMask &= ~(1 << index);
 390             uint32_t slot = index >> 2;
 391             uint32_t component = index & 0x3;
 392
 393             Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
 394             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
 395             {
 396                 Float<SIMD_T> vCullComp;
 397                 if (slot == 0)
 398                 {
 399                     vCullComp = vClipCullDistLo[e][component];
 400                 }
 401                 else
 402                 {
 403                     vCullComp = vClipCullDistHi[e][component];
 404                 }
 405
 406                 // cull if cull distance < 0 || NAN
 407                 Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
 408                 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
 409             }
 410             vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
 411         }
 412
 413         // clipper should also discard any primitive with NAN clip distance
 414         uint8_t clipMask = state.backendState.clipDistanceMask;
 415         while (_BitScanForward(&index, clipMask))
 416         {
 417             clipMask &= ~(1 << index);
 418             uint32_t slot = index >> 2;
 419             uint32_t component = index & 0x3;
 420
 421             Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
 422             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
 423             {
 424                 Float<SIMD_T> vClipComp;
 425                 if (slot == 0)
 426                 {
 427                     vClipComp = vClipCullDistLo[e][component];
 428                 }
 429                 else
 430                 {
 431                     vClipComp = vClipCullDistHi[e][component];
 432                 }
 433
 434                 Float<SIMD_T> vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
 435                 Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vClipComp);
 436                 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
 437                 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
 438             }
 439             vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
 440         }
 441
 442         return SIMD_T::movemask_ps(vClipCullMask);
 443     }
 444
    /// @brief Clips a SIMD batch of primitives and sends the resulting primitives to the binner.
    /// Steps: gather position, attributes and user clip distances into a local vertex store,
    /// run ClipPrims() on it, then for each input primitive transpose its clipped vertices into
    /// SIMD order and bin them through a temporary PA_STATE_OPT.
    /// @param prim - NumVertsPerPrim vertex positions of the input primitives.
    /// @param vPrimMask - forwarded to ClipPrims() (the caller in this file passes the valid-prim mask).
    /// @param vClipMask - forwarded to ClipPrims() (the caller in this file passes the needs-clip mask).
    /// @param pa - primitive assembly state the attributes are fetched from.
    /// @param vPrimId - per-lane primitive id, broadcast per input primitive to the binner.
    /// @param vViewportIdx - per-lane viewport index, broadcast per input primitive to the binner.
    /// @param vRtIdx - per-lane render target index, broadcast per input primitive to the binner.
    void ClipSimd(const Vec4<SIMD_T> prim[], const Float<SIMD_T> &vPrimMask, const Float<SIMD_T> &vClipMask, PA_STATE &pa,
                  const Integer<SIMD_T> &vPrimId, const Integer<SIMD_T> &vViewportIdx, const Integer<SIMD_T> &vRtIdx)
    {
        // input/output vertex store for clipper
        SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle

        uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
        // vertex whose value is replicated for constant-interpolated attributes;
        // vertex 0 unless the bin topology is a triangle fan
        uint32_t provokingVertex = 0;
        if (pa.binTopology == TOP_TRIANGLE_FAN)
        {
            provokingVertex = state.frontendState.provokingVertex.triFan;
        }
        ///@todo: line topology for wireframe?

        // assemble pos
        Vec4<SIMD_T> tmpVector[NumVertsPerPrim];
        for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
        {
            vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
        }

        // assemble attribs
        const SWR_BACKEND_STATE& backendState = state.backendState;

        // highest source attribute slot referenced; stays -1 when there are no attributes
        int32_t maxSlot = -1;
        for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
        {
            // Compute absolute attrib slot in vertex array
            uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
            maxSlot = std::max<int32_t>(maxSlot, mapSlot);
            uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;

            pa.Assemble(inputSlot, tmpVector);

            // if constant interpolation enabled for this attribute, assign the provoking
            // vertex values to all edges
            if (CheckBit(constantInterpMask, slot))
            {
                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
                {
                    vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
                }
            }
            else
            {
                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
                {
                    vertices[i].attrib[inputSlot] = tmpVector[i];
                }
            }
        }

        // assemble user clip distances if enabled
        // (low nibble of the mask selects the first clip/cull slot, high nibble the second)
        uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
        if (state.backendState.clipDistanceMask & 0xf)
        {
            pa.Assemble(vertexClipCullSlot, tmpVector);
            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
            {
                vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
            }
        }

        if (state.backendState.clipDistanceMask & 0xf0)
        {
            pa.Assemble(vertexClipCullSlot + 1, tmpVector);
            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
            {
                vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
            }
        }

        uint32_t numAttribs = maxSlot + 1;

        // run the clipper on the vertex store; the result is read below as one 32-bit
        // vertex count per input primitive (ClipPrims is defined outside this view)
        Integer<SIMD_T> vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);

        BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast);

        // set up new PA for binning clipped primitives
        PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
        if (NumVertsPerPrim == 3)
        {
            clipTopology = TOP_TRIANGLE_FAN;

            // so that the binner knows to bloat wide points later
            if (pa.binTopology == TOP_POINT_LIST)
            {
                clipTopology = TOP_POINT_LIST;
            }
        }
        else if (NumVertsPerPrim == 2)
        {
            clipTopology = TOP_LINE_LIST;
        }
        else
        {
            SWR_ASSERT(0 && "Unexpected points in clipper.");
        }

        // scalar views of the per-lane SIMD values, indexed by input primitive below
        const uint32_t *pVertexCount = reinterpret_cast<const uint32_t *>(&vNumClippedVerts);
        const uint32_t *pPrimitiveId = reinterpret_cast<const uint32_t *>(&vPrimId);
        const uint32_t *pViewportIdx = reinterpret_cast<const uint32_t *>(&vViewportIdx);
        const uint32_t *pRtIdx = reinterpret_cast<const uint32_t *>(&vRtIdx);

        // byte offsets of vertices 0..6 in the vertex store, one per gather lane
        // (set_epi32 lists the highest lane first)
        const SIMD256::Integer vOffsets = SIMD256::set_epi32(
            0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
            6 * sizeof(SIMDVERTEX_T<SIMD_T>),
            5 * sizeof(SIMDVERTEX_T<SIMD_T>),
            4 * sizeof(SIMDVERTEX_T<SIMD_T>),
            3 * sizeof(SIMDVERTEX_T<SIMD_T>),
            2 * sizeof(SIMDVERTEX_T<SIMD_T>),
            1 * sizeof(SIMDVERTEX_T<SIMD_T>),
            0 * sizeof(SIMDVERTEX_T<SIMD_T>));

        // only need to gather 7 verts
        // @todo dynamic mask based on actual # of verts generated per lane
        const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);

        uint32_t numClippedPrims = 0;

        // transpose clipper output so that each lane's vertices are in SIMD order
        // set aside space for 2 vertices, as the PA will try to read up to 16 verts
        // for triangle fan

#if defined(_DEBUG)
        // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
        // heap allocation in debug builds; released by the AlignedFree at the end of this function
        SIMDVERTEX_T<SIMD_T> *transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T> *>(AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64));

#else
        SIMDVERTEX_T<SIMD_T> transposedPrims[2];

#endif
        uint32_t numInputPrims = pa.NumPrims();
        for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
        {
            // skip lanes for which fewer vertices than one full primitive were emitted
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts < NumVertsPerPrim)
            {
                continue;
            }
            SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");

            uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
            SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");

            numClippedPrims += numEmittedPrims;

            // transpose this lane's clipper output into transposedPrims[0]:
            // each gather reads the same component of up to 7 vertices for lane 'inputPrim'

            // transpose pos
            uint8_t *pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;

#if 0
            // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
            static const float *dummy = reinterpret_cast<const float *>(pBase);

#endif
            for (uint32_t c = 0; c < 4; ++c)
            {
                SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
                transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
                // advance to the next component of the same attribute
                pBase += sizeof(Float<SIMD_T>);
            }

            // transpose attribs
            pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;

            // pBase keeps advancing one component at a time across consecutive attribute slots
            for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
            {
                uint32_t attribSlot = backendState.vertexAttribOffset + attrib;

                for (uint32_t c = 0; c < 4; ++c)
                {
                    SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
                    transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
                    pBase += sizeof(Float<SIMD_T>);
                }
            }

            // transpose user clip distances if enabled
            // NOTE(review): this declaration shadows the function-scope vertexClipCullSlot;
            // both are initialized from the same backendState.vertexClipCullOffset
            uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
            if (state.backendState.clipDistanceMask & 0x0f)
            {
                pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim;

                for (uint32_t c = 0; c < 4; ++c)
                {
                    SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
                    transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
                    pBase += sizeof(Float<SIMD_T>);
                }
            }

            if (state.backendState.clipDistanceMask & 0xf0)
            {
                pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim;

                for (uint32_t c = 0; c < 4; ++c)
                {
                    SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
                    transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
                    pBase += sizeof(Float<SIMD_T>);
                }
            }

            // temporary PA over the transposed vertices, using the clip topology chosen above
            PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, NumVertsPerPrim, clipTopology);
            clipPA.viewportArrayActive = pa.viewportArrayActive;
            clipPA.rtArrayActive = pa.rtArrayActive;

            // primMaskMap[n] has the low n bits set: one bit per emitted primitive
            static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f };

            const uint32_t primMask = primMaskMap[numEmittedPrims];

            // every primitive emitted for this input primitive inherits its id and indices
            const Integer<SIMD_T> primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
            const Integer<SIMD_T> viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
            const Integer<SIMD_T> rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);


            // assemble positions from the temporary PA and bin whatever it produces
            while (clipPA.GetNextStreamOutput())
            {
                do
                {
                    Vec4<SIMD_T> attrib[NumVertsPerPrim];

                    bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);

                    if (assemble)
                    {
                        binner.pfnBinFunc(pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
                    }

                } while (clipPA.NextPrim());
            }
        }

#if defined(_DEBUG)
        AlignedFree(transposedPrims);

#endif
        // update global pipeline stat
        UPDATE_STAT_FE(CPrimitives, numClippedPrims);
    }
 689
 690     void ExecuteStage(PA_STATE &pa, Vec4<SIMD_T> prim[], uint32_t primMask,
 691                       Integer<SIMD_T> const &primId, Integer<SIMD_T> const &viewportIdx, Integer<SIMD_T> const &rtIdx)
 692     {
 693         SWR_ASSERT(pa.pDC != nullptr);
 694
 695         BinnerChooser<SIMD_T> binner(pa.binTopology, pa.pDC->pState->state.rastState.conservativeRast);
 696
 697         // update clipper invocations pipeline stat
 698         uint32_t numInvoc = _mm_popcnt_u32(primMask);
 699         UPDATE_STAT_FE(CInvocations, numInvoc);
 700
 701         ComputeClipCodes(prim, viewportIdx);
 702
 703         // cull prims with NAN coords
 704         primMask &= ~ComputeNaNMask(prim);
 705
 706         // user cull distance cull
 707         if (state.backendState.cullDistanceMask | state.backendState.clipDistanceMask)
 708         {
 709             primMask &= ~ComputeUserClipCullMask(pa, prim);
 710         }
 711
 712         Float<SIMD_T> clipIntersection = ComputeClipCodeIntersection();
 713         // Mask out non-frustum codes
 714         clipIntersection = SIMD_T::and_ps(clipIntersection, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK)));
 715
 716         // cull prims outside view frustum
 717         int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
 718
 719         // skip clipping for points
 720         uint32_t clipMask = 0;
 721         if (NumVertsPerPrim != 1)
 722         {
 723             clipMask = validMask & ComputeClipMask();
 724         }
 725
 726         AR_EVENT(ClipInfoEvent(numInvoc, validMask, clipMask));
 727
 728         if (clipMask)
 729         {
 730             RDTSC_BEGIN(FEGuardbandClip, pa.pDC->drawId);
 731             // we have to clip tris, execute the clipper, which will also
 732             // call the binner
 733             ClipSimd(prim, SIMD_T::vmask_ps(validMask), SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx, rtIdx);
 734             RDTSC_END(FEGuardbandClip, 1);
 735         }
 736         else if (validMask)
 737         {
 738             // update CPrimitives pipeline state
 739             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
 740
 741             // forward valid prims directly to binner
 742             binner.pfnBinFunc(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
 743         }
 744     }
 745
 746 private:
 747     Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const &boundaryCoord0, Float<SIMD_T> const &boundaryCoord1)
 748     {
 749         return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
 750     }
 751
 752     Integer<SIMD_T> ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const &vIndices, uint32_t component)
 753     {
 754         const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
 755         const uint32_t componentStride  = sizeof(Float<SIMD_T>);
 756         const uint32_t attribStride     = sizeof(Vec4<SIMD_T>);
 757
 758         static const OSALIGNSIMD16(uint32_t) elemOffset[16] =
 759         {
 760             0 * sizeof(float),
 761             1 * sizeof(float),
 762             2 * sizeof(float),
 763             3 * sizeof(float),
 764             4 * sizeof(float),
 765             5 * sizeof(float),
 766             6 * sizeof(float),
 767             7 * sizeof(float),
 768             8 * sizeof(float),
 769             9 * sizeof(float),
 770             10 * sizeof(float),
 771             11 * sizeof(float),
 772             12 * sizeof(float),
 773             13 * sizeof(float),
 774             14 * sizeof(float),
 775             15 * sizeof(float),
 776         };
 777
 778         static_assert(sizeof(Integer<SIMD_T>) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
 779
 780         Integer<SIMD_T> vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const Integer<SIMD_T> *>(elemOffset));
 781
 782         // step to the simdvertex
 783         Integer<SIMD_T> vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
 784
 785         // step to the attribute and component
 786         vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
 787
 788         // step to the lane
 789         vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
 790
 791         return vOffsets;
 792     }
 793
 794     Float<SIMD_T> GatherComponent(const float* pBuffer, uint32_t attrib, Float<SIMD_T> const &vMask, Integer<SIMD_T> const &vIndices, uint32_t component)
 795     {
 796         Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
 797         Float<SIMD_T> vSrc = SIMD_T::setzero_ps();
 798
 799         return SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(vSrc, pBuffer, vOffsets, vMask);
 800     }
 801
 802     void ScatterComponent(const float* pBuffer, uint32_t attrib, Float<SIMD_T> const &vMask, Integer<SIMD_T> const &vIndices, uint32_t component, Float<SIMD_T> const &vSrc)
 803     {
 804         Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
 805
 806         const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets);
 807         const float *pSrc = reinterpret_cast<const float *>(&vSrc);
 808         uint32_t mask = SIMD_T::movemask_ps(vMask);
 809         DWORD lane;
 810         while (_BitScanForward(&lane, mask))
 811         {
 812             mask &= ~(1 << lane);
 813             const uint8_t *pBuf = reinterpret_cast<const uint8_t *>(pBuffer) + pOffsets[lane];
 814             *(float *)pBuf = pSrc[lane];
 815         }
 816     }
 817
 818     template<SWR_CLIPCODES ClippingPlane>
 819     void intersect(
 820         const Float<SIMD_T> &vActiveMask,  // active lanes to operate on
 821         const Integer<SIMD_T> &s,          // index to first edge vertex v0 in pInPts.
 822         const Integer<SIMD_T> &p,          // index to second edge vertex v1 in pInPts.
 823         const Vec4<SIMD_T> &v1,            // vertex 0 position
 824         const Vec4<SIMD_T> &v2,            // vertex 1 position
 825         Integer<SIMD_T> &outIndex,         // output index.
 826         const float *pInVerts,                      // array of all the input positions.
 827         uint32_t numInAttribs,                      // number of attributes per vertex.
 828         float *pOutVerts)                           // array of output positions. We'll write our new intersection point at i*4.
 829     {
 830         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
 831         uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
 832
 833         // compute interpolation factor
 834         Float<SIMD_T> t;
 835         switch (ClippingPlane)
 836         {
 837         case FRUSTUM_LEFT:      t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break;
 838         case FRUSTUM_RIGHT:     t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); break;
 839         case FRUSTUM_TOP:       t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); break;
 840         case FRUSTUM_BOTTOM:    t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); break;
 841         case FRUSTUM_NEAR:
 842             // DX Znear plane is 0, GL is -w
 843             if (this->state.rastState.clipHalfZ)
 844             {
 845                 t = ComputeInterpFactor(v1[2], v2[2]);
 846             }
 847             else
 848             {
 849                 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
 850             }
 851             break;
 852         case FRUSTUM_FAR:       t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); break;
 853         default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
 854         };
 855
 856         // interpolate position and store
 857         for (uint32_t c = 0; c < 4; ++c)
 858         {
 859             Float<SIMD_T> vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
 860             ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
 861         }
 862
 863         // interpolate attributes and store
 864         for (uint32_t a = 0; a < numInAttribs; ++a)
 865         {
 866             uint32_t attribSlot = vertexAttribOffset + a;
 867             for (uint32_t c = 0; c < 4; ++c)
 868             {
 869                 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 870                 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 871                 Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 872                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
 873             }
 874         }
 875
 876         // interpolate clip distance if enabled
 877         if (this->state.backendState.clipDistanceMask & 0xf)
 878         {
 879             uint32_t attribSlot = vertexClipCullOffset;
 880             for (uint32_t c = 0; c < 4; ++c)
 881             {
 882                 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 883                 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 884                 Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 885                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
 886             }
 887         }
 888
 889         if (this->state.backendState.clipDistanceMask & 0xf0)
 890         {
 891             uint32_t attribSlot = vertexClipCullOffset + 1;
 892             for (uint32_t c = 0; c < 4; ++c)
 893             {
 894                 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
 895                 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
 896                 Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
 897                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
 898             }
 899         }
 900     }
 901
 902     template<SWR_CLIPCODES ClippingPlane>
 903     Float<SIMD_T> inside(const Vec4<SIMD_T> &v)
 904     {
 905         switch (ClippingPlane)
 906         {
 907         case FRUSTUM_LEFT:      return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
 908         case FRUSTUM_RIGHT:     return SIMD_T::cmple_ps(v[0], v[3]);
 909         case FRUSTUM_TOP:       return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
 910         case FRUSTUM_BOTTOM:    return SIMD_T::cmple_ps(v[1], v[3]);
 911         case FRUSTUM_NEAR:      return SIMD_T::cmpge_ps(v[2], this->state.rastState.clipHalfZ ? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
 912         case FRUSTUM_FAR:       return SIMD_T::cmple_ps(v[2], v[3]);
 913         default:
 914             SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
 915             return SIMD_T::setzero_ps();
 916         }
 917     }
 918
 919     template<SWR_CLIPCODES ClippingPlane>
 920     Integer<SIMD_T> ClipTriToPlane(const float *pInVerts, const Integer<SIMD_T> &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
 921     {
 922         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
 923
 924         Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
 925         Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
 926         Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
 927
 928         while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
 929         {
 930             Integer<SIMD_T> s = vCurIndex;
 931             Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
 932             Integer<SIMD_T> underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
 933             p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
 934
 935             // gather position
 936             Vec4<SIMD_T> vInPos0, vInPos1;
 937             for (uint32_t c = 0; c < 4; ++c)
 938             {
 939                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
 940                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
 941             }
 942
 943             // compute inside mask
 944             Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
 945             Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
 946
 947             // compute intersection mask (s_in != p_in)
 948             Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
 949             intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
 950
 951             // store s if inside
 952             s_in = SIMD_T::and_ps(s_in, vActiveMask);
 953             if (!SIMD_T::testz_ps(s_in, s_in))
 954             {
 955                 // store position
 956                 for (uint32_t c = 0; c < 4; ++c)
 957                 {
 958                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
 959                 }
 960
 961                 // store attribs
 962                 for (uint32_t a = 0; a < numInAttribs; ++a)
 963                 {
 964                     uint32_t attribSlot = vertexAttribOffset + a;
 965                     for (uint32_t c = 0; c < 4; ++c)
 966                     {
 967                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
 968                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
 969                     }
 970                 }
 971
 972                 // store clip distance if enabled
 973                 uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
 974                 if (this->state.backendState.clipDistanceMask & 0xf)
 975                 {
 976                     uint32_t attribSlot = vertexClipCullSlot;
 977                     for (uint32_t c = 0; c < 4; ++c)
 978                     {
 979                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
 980                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
 981                     }
 982                 }
 983
 984                 if (this->state.backendState.clipDistanceMask & 0xf0)
 985                 {
 986                     uint32_t attribSlot = vertexClipCullSlot + 1;
 987                     for (uint32_t c = 0; c < 4; ++c)
 988                     {
 989                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
 990                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
 991                     }
 992                 }
 993
 994                 // increment outIndex
 995                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
 996             }
 997
 998             // compute and store intersection
 999             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1000             {
1001                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
1002
1003                 // increment outIndex for active lanes
1004                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1005             }
1006
1007             // increment loop index and update active mask
1008             vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
1009             vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1010         }
1011
1012         return vOutIndex;
1013     }
1014
1015     template<SWR_CLIPCODES ClippingPlane>
1016     Integer<SIMD_T> ClipLineToPlane(const float *pInVerts, const Integer<SIMD_T> &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
1017     {
1018         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1019
1020         Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
1021         Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
1022         Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1023
1024         if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
1025         {
1026             Integer<SIMD_T> s = vCurIndex;
1027             Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1028
1029             // gather position
1030             Vec4<SIMD_T> vInPos0, vInPos1;
1031             for (uint32_t c = 0; c < 4; ++c)
1032             {
1033                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1034                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1035             }
1036
1037             // compute inside mask
1038             Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
1039             Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
1040
1041             // compute intersection mask (s_in != p_in)
1042             Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
1043             intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
1044
1045             // store s if inside
1046             s_in = SIMD_T::and_ps(s_in, vActiveMask);
1047             if (!SIMD_T::testz_ps(s_in, s_in))
1048             {
1049                 for (uint32_t c = 0; c < 4; ++c)
1050                 {
1051                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1052                 }
1053
1054                 // interpolate attributes and store
1055                 for (uint32_t a = 0; a < numInAttribs; ++a)
1056                 {
1057                     uint32_t attribSlot = vertexAttribOffset + a;
1058                     for (uint32_t c = 0; c < 4; ++c)
1059                     {
1060                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1061                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1062                     }
1063                 }
1064
1065                 // increment outIndex
1066                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1067             }
1068
1069             // compute and store intersection
1070             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1071             {
1072                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
1073
1074                 // increment outIndex for active lanes
1075                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1076             }
1077
1078             // store p if inside
1079             p_in = SIMD_T::and_ps(p_in, vActiveMask);
1080             if (!SIMD_T::testz_ps(p_in, p_in))
1081             {
1082                 for (uint32_t c = 0; c < 4; ++c)
1083                 {
1084                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
1085                 }
1086
1087                 // interpolate attributes and store
1088                 for (uint32_t a = 0; a < numInAttribs; ++a)
1089                 {
1090                     uint32_t attribSlot = vertexAttribOffset + a;
1091                     for (uint32_t c = 0; c < 4; ++c)
1092                     {
1093                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
1094                         ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
1095                     }
1096                 }
1097
1098                 // increment outIndex
1099                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
1100             }
1101         }
1102
1103         return vOutIndex;
1104     }
1105
1106     Integer<SIMD_T> ClipPrims(float *pVertices, const Float<SIMD_T> &vPrimMask, const Float<SIMD_T> &vClipMask, int numAttribs)
1107     {
1108         // temp storage
1109         float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices());
1110
1111         // zero out num input verts for non-active lanes
1112         Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
1113         vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
1114
1115         // clip prims to frustum
1116         Integer<SIMD_T> vNumOutPts;
1117         if (NumVertsPerPrim == 3)
1118         {
1119             vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1120             vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1121             vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1122             vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1123             vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1124             vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1125         }
1126         else
1127         {
1128             SWR_ASSERT(NumVertsPerPrim == 2);
1129             vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1130             vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1131             vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1132             vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1133             vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1134             vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1135         }
1136
1137         // restore num verts for non-clipped, active lanes
1138         Float<SIMD_T> vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
1139         vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
1140
1141         return vNumOutPts;
1142     }
1143
1144     const uint32_t workerId{ 0 };
1145     DRAW_CONTEXT *pDC{ nullptr };
1146     const API_STATE &state;
1147     Float<SIMD_T> clipCodes[NumVertsPerPrim];
1148 };
1149
1150
1151 // pipeline stage functions
1152 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1153 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1154 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1155 #if USE_SIMD16_FRONTEND
1156 void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1157 void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1158 void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1159 #endif
1160