/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file clip.h
-*
-* @brief Definitions for clipping
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file clip.h
+ *
+ * @brief Definitions for clipping
+ *
+ ******************************************************************************/
#pragma once
#include "common/simdintrin.h"
#include "core/pa.h"
#include "rdtsc_core.h"
-// Temp storage used by the clipper
-extern THREAD simdvertex tlsTempVertices[7];
-
enum SWR_CLIPCODES
{
- // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
- // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
+// Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
+// Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union,
+// rather than intersection, of clipcodes.
#define CLIPCODE_SHIFT 23
- FRUSTUM_LEFT = (0x01 << CLIPCODE_SHIFT),
- FRUSTUM_TOP = (0x02 << CLIPCODE_SHIFT),
- FRUSTUM_RIGHT = (0x04 << CLIPCODE_SHIFT),
- FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT),
+ FRUSTUM_LEFT = (0x01 << CLIPCODE_SHIFT),
+ FRUSTUM_TOP = (0x02 << CLIPCODE_SHIFT),
+ FRUSTUM_RIGHT = (0x04 << CLIPCODE_SHIFT),
+ FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT),
- FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT),
- FRUSTUM_FAR = (0x20 << CLIPCODE_SHIFT),
+ FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT),
+ FRUSTUM_FAR = (0x20 << CLIPCODE_SHIFT),
- NEGW = (0x40 << CLIPCODE_SHIFT),
+ NEGW = (0x40 << CLIPCODE_SHIFT),
GUARDBAND_LEFT = (0x80 << CLIPCODE_SHIFT | 0x1),
GUARDBAND_TOP = (0x80 << CLIPCODE_SHIFT | 0x2),
GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
};
-#define FRUSTUM_CLIP_MASK (FRUSTUM_LEFT|FRUSTUM_TOP|FRUSTUM_RIGHT|FRUSTUM_BOTTOM|FRUSTUM_NEAR|FRUSTUM_FAR)
-#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
-
-void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles,
- int *numVerts, float *pOutAttribs);
-
-INLINE
-void ComputeClipCodes(DRIVER_TYPE type, const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes)
+#define GUARDBAND_CLIP_MASK \
+ (FRUSTUM_NEAR | FRUSTUM_FAR | GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | \
+ GUARDBAND_BOTTOM | NEGW)
+#define FRUSTUM_CLIP_MASK \
+ (FRUSTUM_NEAR | FRUSTUM_FAR | FRUSTUM_LEFT | FRUSTUM_RIGHT | FRUSTUM_TOP | FRUSTUM_BOTTOM)
+
+template <typename SIMD_T>
+void ComputeClipCodes(const API_STATE& state,
+ const Vec4<SIMD_T>& vertex,
+ Float<SIMD_T>& clipCodes,
+ Integer<SIMD_T> const& viewportIndexes)
{
- clipCodes = _simd_setzero_ps();
+ clipCodes = SIMD_T::setzero_ps();
// -w
- simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f));
+ Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w, SIMD_T::set1_ps(-1.0f));
// FRUSTUM_LEFT
- simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW);
- clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT)));
+ Float<SIMD_T> vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
+ clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
// FRUSTUM_TOP
- vRes = _simd_cmplt_ps(vertex.y, vNegW);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP))));
+ vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
+ clipCodes = SIMD_T::or_ps(
+ clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
// FRUSTUM_RIGHT
- vRes = _simd_cmpgt_ps(vertex.x, vertex.w);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT))));
+ vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
+ clipCodes = SIMD_T::or_ps(
+ clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
// FRUSTUM_BOTTOM
- vRes = _simd_cmpgt_ps(vertex.y, vertex.w);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM))));
+ vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
+ clipCodes = SIMD_T::or_ps(
+ clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
if (state.rastState.depthClipEnable)
{
// FRUSTUM_NEAR
// DX clips depth [0..w], GL clips [-w..w]
- if (type == DX)
+ if (state.rastState.clipHalfZ)
{
- vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps());
+ vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
}
else
{
- vRes = _simd_cmplt_ps(vertex.z, vNegW);
+ vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
}
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR))));
+ clipCodes = SIMD_T::or_ps(
+ clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
// FRUSTUM_FAR
- vRes = _simd_cmpgt_ps(vertex.z, vertex.w);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR))));
+ vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
+ clipCodes = SIMD_T::or_ps(
+ clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
}
// NEGW
- vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps());
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW))));
+ vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
+ clipCodes =
+ SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
// GUARDBAND_LEFT
- simdscalar gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.left));
- vRes = _simd_cmplt_ps(vertex.x, gbMult);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT))));
+ Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW,
+ SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
+ &state.gbState.left[0], viewportIndexes));
+ vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
+ clipCodes = SIMD_T::or_ps(
+ clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
// GUARDBAND_TOP
- gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.top));
- vRes = _simd_cmplt_ps(vertex.y, gbMult);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP))));
+ gbMult = SIMD_T::mul_ps(vNegW,
+ SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
+ &state.gbState.top[0], viewportIndexes));
+ vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
+ clipCodes = SIMD_T::or_ps(
+ clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
// GUARDBAND_RIGHT
- gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.right));
- vRes = _simd_cmpgt_ps(vertex.x, gbMult);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT))));
+ gbMult = SIMD_T::mul_ps(vertex.w,
+ SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
+ &state.gbState.right[0], viewportIndexes));
+ vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
+ clipCodes = SIMD_T::or_ps(
+ clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
// GUARDBAND_BOTTOM
- gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.bottom));
- vRes = _simd_cmpgt_ps(vertex.y, gbMult);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM))));
+ gbMult = SIMD_T::mul_ps(vertex.w,
+ SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
+ &state.gbState.bottom[0], viewportIndexes));
+ vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
+ clipCodes = SIMD_T::or_ps(
+ clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
}
-template<uint32_t NumVertsPerPrim>
-class Clipper
+template <typename SIMD_T>
+struct BinnerChooser
{
-public:
- Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
- workerId(in_workerId), driverType(in_pDC->pContext->driverType), pDC(in_pDC), state(GetApiState(in_pDC))
+};
+
+template <>
+struct BinnerChooser<SIMD256>
+{
+ PFN_PROCESS_PRIMS pfnBinFunc;
+
+ BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
+ :
+ pfnBinFunc(nullptr)
+ {
+ if (numVertsPerPrim == 3)
+ {
+ pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
+
+ }
+ else if (numVertsPerPrim == 2)
+ {
+ pfnBinFunc = BinLines;
+ }
+ else
+ {
+ SWR_ASSERT(0 && "Unexpected points in clipper.");
+ }
+ }
+
+ BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
+ :
+ pfnBinFunc(nullptr)
+ {
+ switch (topology)
+ {
+ case TOP_POINT_LIST:
+ pfnBinFunc = BinPoints;
+ break;
+ case TOP_LINE_LIST:
+ case TOP_LINE_STRIP:
+ case TOP_LINE_LOOP:
+ case TOP_LINE_LIST_ADJ:
+ case TOP_LISTSTRIP_ADJ:
+ pfnBinFunc = BinLines;
+ break;
+ default:
+ pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
+ break;
+ };
+ }
+
+ void BinFunc(DRAW_CONTEXT* pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ SIMD256::Vec4 prims[],
+ uint32_t primMask,
+ SIMD256::Integer const& primID,
+ SIMD256::Integer& viewportIdx,
+ SIMD256::Integer& rtIdx)
+ {
+ SWR_ASSERT(pfnBinFunc != nullptr);
+
+ pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
+ }
+};
+
+#if USE_SIMD16_FRONTEND
+template <>
+struct BinnerChooser<SIMD512>
+{
+ PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
+
+ BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
+ :
+ pfnBinFunc(nullptr)
+ {
+ if (numVertsPerPrim == 3)
+ {
+ pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
+
+ }
+ else if (numVertsPerPrim == 2)
+ {
+ pfnBinFunc = BinLines_simd16;
+ }
+ else
+ {
+ SWR_ASSERT(0 && "Unexpected points in clipper.");
+ }
+ }
+
+ BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
+ :
+ pfnBinFunc(nullptr)
+ {
+ switch (topology)
+ {
+ case TOP_POINT_LIST:
+ pfnBinFunc = BinPoints_simd16;
+ break;
+ case TOP_LINE_LIST:
+ case TOP_LINE_STRIP:
+ case TOP_LINE_LOOP:
+ case TOP_LINE_LIST_ADJ:
+ case TOP_LISTSTRIP_ADJ:
+ pfnBinFunc = BinLines_simd16;
+ break;
+ default:
+ pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
+ break;
+ };
+ }
+
+ void BinFunc(DRAW_CONTEXT* pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ SIMD512::Vec4 prims[],
+ uint32_t primMask,
+ SIMD512::Integer const& primID,
+ SIMD512::Integer& viewportIdx,
+ SIMD512::Integer& rtIdx)
+ {
+ SWR_ASSERT(pfnBinFunc != nullptr);
+
+ pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
+ }
+};
+
+#endif
+template <typename SIMD_T>
+struct SimdHelper
+{
+};
+
+template <>
+struct SimdHelper<SIMD256>
+{
+ static SIMD256::Float insert_lo_ps(SIMD256::Float a) { return a; }
+
+ static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
+ {
+ return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
+ }
+};
+
+#if USE_SIMD16_FRONTEND
+template <>
+struct SimdHelper<SIMD512>
+{
+ static SIMD512::Float insert_lo_ps(SIMD256::Float a)
{
- static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
+ return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
}
- void ComputeClipCodes(simdvector vertex[])
+ static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
{
- for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+ return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
+ }
+};
+#endif
+
+template <typename SIMD_T, uint32_t NumVertsPerPrimT>
+class Clipper
+{
+public:
+ INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
+ workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
+ {
+ static_assert(NumVertsPerPrimT >= 1 && NumVertsPerPrimT <= 3, "Invalid NumVertsPerPrim");
+ THREAD_DATA &thread_data = in_pDC->pContext->threadPool.pThreadData[workerId];
+
+ if (thread_data.clipperData == nullptr)
{
- ::ComputeClipCodes(this->driverType, this->state, vertex[i], this->clipCodes[i]);
+ // 7 vertex temp data
+ // 7 post-clipped vertices
+ // 2 transposed verts for binning
+ size_t alloc_size = sizeof(SIMDVERTEX_T<SIMD_T>) * (7 + 7 + 2);
+ thread_data.clipperData = AlignedMalloc(alloc_size, KNOB_SIMD16_BYTES);
}
+ SWR_ASSERT(thread_data.clipperData);
+
+ this->clippedVerts = (SIMDVERTEX_T<SIMD_T>*)thread_data.clipperData;
+ this->tmpVerts = this->clippedVerts + 7;
+ this->transposedVerts = this->tmpVerts + 7;
}
- simdscalar ComputeClipCodeIntersection()
+ void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T>& viewportIndexes)
{
- simdscalar result = this->clipCodes[0];
- for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
+ for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
{
- result = _simd_and_ps(result, this->clipCodes[i]);
+ ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
}
- return result;
}
- simdscalar ComputeClipCodeUnion()
+ Float<SIMD_T> ComputeClipCodeIntersection()
{
- simdscalar result = this->clipCodes[0];
- for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
+ Float<SIMD_T> result = clipCodes[0];
+
+ for (uint32_t i = 1; i < NumVertsPerPrimT; ++i)
{
- result = _simd_or_ps(result, this->clipCodes[i]);
+ result = SIMD_T::and_ps(result, clipCodes[i]);
}
+
return result;
}
- int ComputeNegWMask()
+ Float<SIMD_T> ComputeClipCodeUnion()
{
- simdscalar clipCodeUnion = ComputeClipCodeUnion();
- clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW)));
- return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps()));
+ Float<SIMD_T> result = clipCodes[0];
+
+ for (uint32_t i = 1; i < NumVertsPerPrimT; ++i)
+ {
+ result = SIMD_T::or_ps(result, clipCodes[i]);
+ }
+
+ return result;
}
int ComputeClipMask()
{
- simdscalar clipUnion = ComputeClipCodeUnion();
- clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK)));
- return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps()));
+ Float<SIMD_T> clipUnion = ComputeClipCodeUnion();
+
+ clipUnion =
+ SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
+
+ return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
}
// clipper is responsible for culling any prims with NAN coordinates
- int ComputeNaNMask(simdvector prim[])
+ int ComputeNaNMask(Vec4<SIMD_T> prim[])
{
- simdscalar vNanMask = _simd_setzero_ps();
- for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
+ Float<SIMD_T> vNanMask = SIMD_T::setzero_ps();
+
+ for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
{
- simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q);
- vNanMask = _simd_or_ps(vNanMask, vNan01);
- simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q);
- vNanMask = _simd_or_ps(vNanMask, vNan23);
+ Float<SIMD_T> vNan01 =
+ SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
+ vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
+
+ Float<SIMD_T> vNan23 =
+ SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
+ vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
}
- return _simd_movemask_ps(vNanMask);
+ return SIMD_T::movemask_ps(vNanMask);
}
- int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
+ int ComputeUserClipCullMask(PA_STATE& pa, Vec4<SIMD_T> prim[])
{
- uint8_t cullMask = this->state.rastState.cullDistanceMask;
- simdscalar vClipCullMask = _simd_setzero_ps();
- DWORD index;
+ uint8_t cullMask = state.backendState.cullDistanceMask;
+ uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
+
+ Float<SIMD_T> vClipCullMask = SIMD_T::setzero_ps();
- simdvector vClipCullDistLo[3];
- simdvector vClipCullDistHi[3];
+ Vec4<SIMD_T> vClipCullDistLo[3];
+ Vec4<SIMD_T> vClipCullDistHi[3];
- pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
- pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
+ pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
+ pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
+
+ DWORD index;
while (_BitScanForward(&index, cullMask))
{
cullMask &= ~(1 << index);
- uint32_t slot = index >> 2;
+ uint32_t slot = index >> 2;
uint32_t component = index & 0x3;
- simdscalar vCullMaskElem = _simd_set1_ps(-1.0f);
- for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
+ Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
+ for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
{
- simdscalar vCullComp;
+ Float<SIMD_T> vCullComp;
if (slot == 0)
{
vCullComp = vClipCullDistLo[e][component];
}
// cull if cull distance < 0 || NAN
- simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ);
- vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull);
+ Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(
+ SIMD_T::setzero_ps(), vCullComp);
+ vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
}
- vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem);
+ vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
}
// clipper should also discard any primitive with NAN clip distance
- uint8_t clipMask = this->state.rastState.clipDistanceMask;
+ uint8_t clipMask = state.backendState.clipDistanceMask;
while (_BitScanForward(&index, clipMask))
{
clipMask &= ~(1 << index);
- uint32_t slot = index >> 2;
+ uint32_t slot = index >> 2;
uint32_t component = index & 0x3;
- for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
+ Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
+ for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
{
- simdscalar vClipComp;
+ Float<SIMD_T> vClipComp;
if (slot == 0)
{
vClipComp = vClipCullDistLo[e][component];
vClipComp = vClipCullDistHi[e][component];
}
- simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q);
- vClipCullMask = _simd_or_ps(vClipCullMask, vClip);
+ Float<SIMD_T> vClip =
+ SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
+ Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(
+ SIMD_T::setzero_ps(), vClipComp);
+ vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
+ vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
}
+ vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
}
- return _simd_movemask_ps(vClipCullMask);
+ return SIMD_T::movemask_ps(vClipCullMask);
}
- // clip a single primitive
- int ClipScalar(PA_STATE& pa, uint32_t primIndex, float* pOutPos, float* pOutAttribs)
- {
- OSALIGNSIMD(float) inVerts[3 * 4];
- OSALIGNSIMD(float) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4];
-
- // transpose primitive position
- __m128 verts[3];
- pa.AssembleSingle(VERTEX_POSITION_SLOT, primIndex, verts);
- _mm_store_ps(&inVerts[0], verts[0]);
- _mm_store_ps(&inVerts[4], verts[1]);
- _mm_store_ps(&inVerts[8], verts[2]);
-
- // transpose attribs
- uint32_t numScalarAttribs = this->state.linkageCount * 4;
-
- int idx = 0;
- DWORD slot = 0;
- uint32_t mapIdx = 0;
- uint32_t tmpLinkage = uint32_t(this->state.linkageMask);
- while (_BitScanForward(&slot, tmpLinkage))
- {
- tmpLinkage &= ~(1 << slot);
- // Compute absolute attrib slot in vertex array
- uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + this->state.linkageMap[mapIdx++];
- __m128 attrib[3]; // triangle attribs (always 4 wide)
- pa.AssembleSingle(inputSlot, primIndex, attrib);
- _mm_store_ps(&inAttribs[idx], attrib[0]);
- _mm_store_ps(&inAttribs[idx + numScalarAttribs], attrib[1]);
- _mm_store_ps(&inAttribs[idx + numScalarAttribs * 2], attrib[2]);
- idx += 4;
- }
-
- int numVerts;
- Clip(inVerts, inAttribs, numScalarAttribs, pOutPos, &numVerts, pOutAttribs);
-
- return numVerts;
- }
-
- // clip SIMD primitives
- void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId)
+ void ClipSimd(const Vec4<SIMD_T> prim[],
+ const Float<SIMD_T>& vPrimMask,
+ const Float<SIMD_T>& vClipMask,
+ PA_STATE& pa,
+ const Integer<SIMD_T>& vPrimId,
+ const Integer<SIMD_T>& vViewportIdx,
+ const Integer<SIMD_T>& vRtIdx)
{
// input/output vertex store for clipper
- simdvertex vertices[7]; // maximum 7 verts generated per triangle
+ SIMDVERTEX_T<SIMD_T>* vertices = this->clippedVerts;
- LONG constantInterpMask = this->state.backendState.constantInterpolationMask;
- uint32_t provokingVertex = 0;
- if(pa.binTopology == TOP_TRIANGLE_FAN)
+ uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
+ uint32_t provokingVertex = 0;
+ if (pa.binTopology == TOP_TRIANGLE_FAN)
{
- provokingVertex = this->state.frontendState.provokingVertex.triFan;
+ provokingVertex = state.frontendState.provokingVertex.triFan;
}
///@todo: line topology for wireframe?
// assemble pos
- simdvector tmpVector[NumVertsPerPrim];
- pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
- for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+ Vec4<SIMD_T> tmpVector[NumVertsPerPrimT];
+ for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
{
- vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
+ vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
}
// assemble attribs
- DWORD slot = 0;
- uint32_t mapIdx = 0;
- uint32_t tmpLinkage = this->state.linkageMask;
+ const SWR_BACKEND_STATE& backendState = state.backendState;
int32_t maxSlot = -1;
- while (_BitScanForward(&slot, tmpLinkage))
+ for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
{
- tmpLinkage &= ~(1 << slot);
// Compute absolute attrib slot in vertex array
- uint32_t mapSlot = this->state.linkageMap[mapIdx++];
- maxSlot = std::max<int32_t>(maxSlot, mapSlot);
- uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot;
+ uint32_t mapSlot =
+ backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
+ maxSlot = std::max<int32_t>(maxSlot, mapSlot);
+ uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
pa.Assemble(inputSlot, tmpVector);
// if constant interpolation enabled for this attribute, assign the provoking
// vertex values to all edges
- if (_bittest(&constantInterpMask, slot))
+ if (CheckBit(constantInterpMask, slot))
{
- for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+ for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
{
vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
}
}
else
{
- for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+ for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
{
vertices[i].attrib[inputSlot] = tmpVector[i];
}
}
// assemble user clip distances if enabled
- if (this->state.rastState.clipDistanceMask & 0xf)
+ uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
+ if (state.backendState.clipDistanceMask & 0xf)
{
- pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
- for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+ pa.Assemble(vertexClipCullSlot, tmpVector);
+ for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
{
- vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
+ vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
}
}
- if (this->state.rastState.clipDistanceMask & 0xf0)
+ if (state.backendState.clipDistanceMask & 0xf0)
{
- pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
- for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+ pa.Assemble(vertexClipCullSlot + 1, tmpVector);
+ for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
{
- vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
+ vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
}
}
uint32_t numAttribs = maxSlot + 1;
- simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
+ Integer<SIMD_T> vNumClippedVerts =
+ ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
+
+ BinnerChooser<SIMD_T> binner(NumVertsPerPrimT,
+ pa.pDC->pState->state.rastState.conservativeRast);
// set up new PA for binning clipped primitives
- PFN_PROCESS_PRIMS pfnBinFunc = nullptr;
PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
- if (NumVertsPerPrim == 3)
+ if (NumVertsPerPrimT == 3)
{
- pfnBinFunc = BinTriangles;
clipTopology = TOP_TRIANGLE_FAN;
// so that the binner knows to bloat wide points later
if (pa.binTopology == TOP_POINT_LIST)
+ {
clipTopology = TOP_POINT_LIST;
+ }
+ else if (pa.binTopology == TOP_RECT_LIST)
+ {
+ clipTopology = TOP_RECT_LIST;
+ }
}
- else if (NumVertsPerPrim == 2)
+ else if (NumVertsPerPrimT == 2)
{
- pfnBinFunc = BinLines;
clipTopology = TOP_LINE_LIST;
}
else
{
SWR_ASSERT(0 && "Unexpected points in clipper.");
}
-
-
- uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
- uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
- const simdscalari vOffsets = _mm256_set_epi32(
- 0 * sizeof(simdvertex), // unused lane
- 6 * sizeof(simdvertex),
- 5 * sizeof(simdvertex),
- 4 * sizeof(simdvertex),
- 3 * sizeof(simdvertex),
- 2 * sizeof(simdvertex),
- 1 * sizeof(simdvertex),
- 0 * sizeof(simdvertex));
+ const uint32_t* pVertexCount = reinterpret_cast<const uint32_t*>(&vNumClippedVerts);
+ const uint32_t* pPrimitiveId = reinterpret_cast<const uint32_t*>(&vPrimId);
+ const uint32_t* pViewportIdx = reinterpret_cast<const uint32_t*>(&vViewportIdx);
+ const uint32_t* pRtIdx = reinterpret_cast<const uint32_t*>(&vRtIdx);
+
+ const SIMD256::Integer vOffsets =
+ SIMD256::set_epi32(0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
+ 6 * sizeof(SIMDVERTEX_T<SIMD_T>),
+ 5 * sizeof(SIMDVERTEX_T<SIMD_T>),
+ 4 * sizeof(SIMDVERTEX_T<SIMD_T>),
+ 3 * sizeof(SIMDVERTEX_T<SIMD_T>),
+ 2 * sizeof(SIMDVERTEX_T<SIMD_T>),
+ 1 * sizeof(SIMDVERTEX_T<SIMD_T>),
+ 0 * sizeof(SIMDVERTEX_T<SIMD_T>));
// only need to gather 7 verts
// @todo dynamic mask based on actual # of verts generated per lane
- const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
+ const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
uint32_t numClippedPrims = 0;
- for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
+
+ // transpose clipper output so that each lane's vertices are in SIMD order
+ // set aside space for 2 vertices, as the PA will try to read up to 16 verts
+ // for triangle fan
+ SIMDVERTEX_T<SIMD_T>* transposedPrims = this->transposedVerts;
+
+ uint32_t numInputPrims = pa.NumPrims();
+ for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
{
uint32_t numEmittedVerts = pVertexCount[inputPrim];
- if (numEmittedVerts < NumVertsPerPrim)
+ if (numEmittedVerts < NumVertsPerPrimT)
{
continue;
}
SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
+ SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
+
numClippedPrims += numEmittedPrims;
// tranpose clipper output so that each lane's vertices are in SIMD order
// set aside space for 2 vertices, as the PA will try to read up to 16 verts
// for triangle fan
- simdvertex transposedPrims[2];
// transpose pos
- uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
+ float const* pBase =
+ reinterpret_cast<float const*>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) +
+ inputPrim;
+
for (uint32_t c = 0; c < 4; ++c)
{
- transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
- pBase += sizeof(simdscalar);
+ SIMD256::Float temp =
+ SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
+ transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] =
+ SimdHelper<SIMD_T>::insert_lo_ps(temp);
+ pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
}
// transpose attribs
- pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim;
+ pBase = reinterpret_cast<float const*>(
+ &vertices[0].attrib[backendState.vertexAttribOffset]) +
+ inputPrim;
+
for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
{
- uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
+ uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
+
for (uint32_t c = 0; c < 4; ++c)
{
- transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
- pBase += sizeof(simdscalar);
+ SIMD256::Float temp =
+ SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
+ transposedPrims[0].attrib[attribSlot][c] =
+ SimdHelper<SIMD_T>::insert_lo_ps(temp);
+ pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
}
}
// transpose user clip distances if enabled
- if (this->state.rastState.clipDistanceMask & 0xf)
+ uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
+ if (state.backendState.clipDistanceMask & 0x0f)
{
- pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
+ pBase = reinterpret_cast<float const*>(&vertices[0].attrib[vertexClipCullSlot]) +
+ inputPrim;
+
for (uint32_t c = 0; c < 4; ++c)
{
- transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
- pBase += sizeof(simdscalar);
+ SIMD256::Float temp =
+ SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
+ transposedPrims[0].attrib[vertexClipCullSlot][c] =
+ SimdHelper<SIMD_T>::insert_lo_ps(temp);
+ pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
}
}
- if (this->state.rastState.clipDistanceMask & 0xf0)
+ if (state.backendState.clipDistanceMask & 0xf0)
{
- pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
+ pBase =
+ reinterpret_cast<float const*>(&vertices[0].attrib[vertexClipCullSlot + 1]) +
+ inputPrim;
+
for (uint32_t c = 0; c < 4; ++c)
{
- transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
- pBase += sizeof(simdscalar);
+ SIMD256::Float temp =
+ SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
+ transposedPrims[0].attrib[vertexClipCullSlot + 1][c] =
+ SimdHelper<SIMD_T>::insert_lo_ps(temp);
+ pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
}
}
- PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
+ PA_STATE_OPT clipPA(pDC,
+ numEmittedPrims,
+ reinterpret_cast<uint8_t*>(&transposedPrims[0]),
+ numEmittedVerts,
+ SWR_VTX_NUM_SLOTS,
+ true,
+ NumVertsPerPrimT,
+ clipTopology);
+ clipPA.viewportArrayActive = pa.viewportArrayActive;
+ clipPA.rtArrayActive = pa.rtArrayActive;
- while (clipPa.GetNextStreamOutput())
+ static const uint32_t primMaskMap[] = {0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f};
+
+ const uint32_t primMask = primMaskMap[numEmittedPrims];
+
+ const Integer<SIMD_T> primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
+ const Integer<SIMD_T> viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
+ const Integer<SIMD_T> rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
+
+ while (clipPA.GetNextStreamOutput())
{
do
{
- simdvector attrib[NumVertsPerPrim];
- bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib);
+ Vec4<SIMD_T> attrib[NumVertsPerPrimT];
+
+ bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
+
if (assemble)
{
- static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
- pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]));
+ binner.pfnBinFunc(
+ pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
}
- } while (clipPa.NextPrim());
+
+ } while (clipPA.NextPrim());
}
}
// update global pipeline stat
- SWR_CONTEXT* pContext = this->pDC->pContext;
- UPDATE_STAT(CPrimitives, numClippedPrims);
+ UPDATE_STAT_FE(CPrimitives, numClippedPrims);
}
-
- // execute the clipper stage
- void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId)
+
+ void ExecuteStage(PA_STATE& pa,
+ Vec4<SIMD_T> prim[],
+ uint32_t primMask,
+ Integer<SIMD_T> const& primId,
+ Integer<SIMD_T> const& viewportIdx,
+ Integer<SIMD_T> const& rtIdx)
{
- // set up binner based on PA state
- PFN_PROCESS_PRIMS pfnBinner;
- switch (pa.binTopology)
- {
- case TOP_POINT_LIST:
- pfnBinner = BinPoints;
- break;
- case TOP_LINE_LIST:
- case TOP_LINE_STRIP:
- case TOP_LINE_LOOP:
- case TOP_LINE_LIST_ADJ:
- case TOP_LISTSTRIP_ADJ:
- pfnBinner = BinLines;
- break;
- default:
- pfnBinner = BinTriangles;
- break;
- };
+ SWR_ASSERT(pa.pDC != nullptr);
+
+ BinnerChooser<SIMD_T> binner(pa.binTopology,
+ pa.pDC->pState->state.rastState.conservativeRast);
// update clipper invocations pipeline stat
- SWR_CONTEXT* pContext = this->pDC->pContext;
uint32_t numInvoc = _mm_popcnt_u32(primMask);
- UPDATE_STAT(CInvocations, numInvoc);
+ UPDATE_STAT_FE(CInvocations, numInvoc);
- ComputeClipCodes(prim);
+ ComputeClipCodes(prim, viewportIdx);
// cull prims with NAN coords
primMask &= ~ComputeNaNMask(prim);
- // user cull distance cull
- if (this->state.rastState.cullDistanceMask)
+ // user cull distance cull
+ if (state.backendState.cullDistanceMask | state.backendState.clipDistanceMask)
{
primMask &= ~ComputeUserClipCullMask(pa, prim);
}
+ Float<SIMD_T> clipIntersection = ComputeClipCodeIntersection();
+ // Mask out non-frustum codes
+ clipIntersection = SIMD_T::and_ps(clipIntersection,
+ SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK)));
+
// cull prims outside view frustum
- simdscalar clipIntersection = ComputeClipCodeIntersection();
- int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps()));
+ int validMask =
+ primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
// skip clipping for points
uint32_t clipMask = 0;
- if (NumVertsPerPrim != 1)
+ if (NumVertsPerPrimT != 1)
{
- clipMask = primMask & ComputeClipMask();
+ clipMask = validMask & ComputeClipMask();
}
+ AR_EVENT(ClipInfoEvent(numInvoc, validMask, clipMask));
+
if (clipMask)
{
- RDTSC_START(FEGuardbandClip);
+ RDTSC_BEGIN(FEGuardbandClip, pa.pDC->drawId);
// we have to clip tris, execute the clipper, which will also
// call the binner
- ClipSimd(vMask(primMask), vMask(clipMask), pa, primId);
- RDTSC_STOP(FEGuardbandClip, 1, 0);
+ ClipSimd(prim,
+ SIMD_T::vmask_ps(validMask),
+ SIMD_T::vmask_ps(clipMask),
+ pa,
+ primId,
+ viewportIdx,
+ rtIdx);
+ RDTSC_END(FEGuardbandClip, 1);
}
else if (validMask)
{
// update CPrimitives pipeline state
- SWR_CONTEXT* pContext = this->pDC->pContext;
- UPDATE_STAT(CPrimitives, _mm_popcnt_u32(validMask));
+ UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
// forward valid prims directly to binner
- pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
+ binner.pfnBinFunc(
+ this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
}
}
private:
- inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1)
+ Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const& boundaryCoord0,
+ Float<SIMD_T> const& boundaryCoord1)
{
- return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1));
+ return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
}
- inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component)
+ Integer<SIMD_T>
+ ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const& vIndices, uint32_t component)
{
- const uint32_t simdVertexStride = sizeof(simdvertex);
- const uint32_t componentStride = sizeof(simdscalar);
- const uint32_t attribStride = sizeof(simdvector);
- const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
- 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
+ const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
+ const uint32_t componentStride = sizeof(Float<SIMD_T>);
+ const uint32_t attribStride = sizeof(Vec4<SIMD_T>);
+
+ static const OSALIGNSIMD16(uint32_t) elemOffset[16] = {
+ 0 * sizeof(float),
+ 1 * sizeof(float),
+ 2 * sizeof(float),
+ 3 * sizeof(float),
+ 4 * sizeof(float),
+ 5 * sizeof(float),
+ 6 * sizeof(float),
+ 7 * sizeof(float),
+ 8 * sizeof(float),
+ 9 * sizeof(float),
+ 10 * sizeof(float),
+ 11 * sizeof(float),
+ 12 * sizeof(float),
+ 13 * sizeof(float),
+ 14 * sizeof(float),
+ 15 * sizeof(float),
+ };
+
+ static_assert(sizeof(Integer<SIMD_T>) <= sizeof(elemOffset),
+ "Clipper::ComputeOffsets, Increase number of element offsets.");
+
+ Integer<SIMD_T> vElemOffset =
+ SIMD_T::loadu_si(reinterpret_cast<const Integer<SIMD_T>*>(elemOffset));
// step to the simdvertex
- simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride));
+ Integer<SIMD_T> vOffsets =
+ SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
// step to the attribute and component
- vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component));
+ vOffsets = SIMD_T::add_epi32(
+ vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
// step to the lane
- vOffsets = _simd_add_epi32(vOffsets, vElemOffset);
+ vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
return vOffsets;
}
- // gathers a single component for a given attribute for each SIMD lane
- inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component)
+ Float<SIMD_T> GatherComponent(const float* pBuffer,
+ uint32_t attrib,
+ Float<SIMD_T> const& vMask,
+ Integer<SIMD_T> const& vIndices,
+ uint32_t component)
{
- simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
- simdscalar vSrc = _mm256_undefined_ps();
- return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
+ Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
+ Float<SIMD_T> vSrc = SIMD_T::setzero_ps();
+
+ return SIMD_T::mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask);
}
- inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc)
+ void ScatterComponent(const float* pBuffer,
+ uint32_t attrib,
+ Float<SIMD_T> const& vMask,
+ Integer<SIMD_T> const& vIndices,
+ uint32_t component,
+ Float<SIMD_T> const& vSrc)
{
- simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
+ Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
- uint32_t* pOffsets = (uint32_t*)&vOffsets;
- float* pSrc = (float*)&vSrc;
- uint32_t mask = _simd_movemask_ps(vMask);
- DWORD lane;
+ const uint32_t* pOffsets = reinterpret_cast<const uint32_t*>(&vOffsets);
+ const float* pSrc = reinterpret_cast<const float*>(&vSrc);
+ uint32_t mask = SIMD_T::movemask_ps(vMask);
+ DWORD lane;
while (_BitScanForward(&lane, mask))
{
mask &= ~(1 << lane);
- uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane];
- *(float*)pBuf = pSrc[lane];
+ const uint8_t* pBuf = reinterpret_cast<const uint8_t*>(pBuffer) + pOffsets[lane];
+ *(float*)pBuf = pSrc[lane];
}
}
- template<SWR_CLIPCODES ClippingPlane>
- inline void intersect(
- const simdscalar& vActiveMask, // active lanes to operate on
- const simdscalari& s, // index to first edge vertex v0 in pInPts.
- const simdscalari& p, // index to second edge vertex v1 in pInPts.
- const simdvector& v1, // vertex 0 position
- const simdvector& v2, // vertex 1 position
- simdscalari& outIndex, // output index.
- const float *pInVerts, // array of all the input positions.
- uint32_t numInAttribs, // number of attributes per vertex.
- float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4.
+ template <SWR_CLIPCODES ClippingPlane>
+ void intersect(const Float<SIMD_T>& vActiveMask, // active lanes to operate on
+ const Integer<SIMD_T>& s, // index to first edge vertex v0 in pInPts.
+ const Integer<SIMD_T>& p, // index to second edge vertex v1 in pInPts.
+ const Vec4<SIMD_T>& v1, // vertex 0 position
+ const Vec4<SIMD_T>& v2, // vertex 1 position
+ Integer<SIMD_T>& outIndex, // output index.
+ const float* pInVerts, // array of all the input positions.
+ uint32_t numInAttribs, // number of attributes per vertex.
+ float* pOutVerts) // array of output positions. We'll write our new intersection
+ // point at i*4.
{
+ uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
+ uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
+
// compute interpolation factor
- simdscalar t;
+ Float<SIMD_T> t;
switch (ClippingPlane)
{
- case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break;
- case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break;
- case FRUSTUM_TOP: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break;
- case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break;
- case FRUSTUM_NEAR:
+ case FRUSTUM_LEFT:
+ t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0]));
+ break;
+ case FRUSTUM_RIGHT:
+ t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0]));
+ break;
+ case FRUSTUM_TOP:
+ t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1]));
+ break;
+ case FRUSTUM_BOTTOM:
+ t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1]));
+ break;
+ case FRUSTUM_NEAR:
// DX Znear plane is 0, GL is -w
- if (this->driverType == DX)
+ if (this->state.rastState.clipHalfZ)
{
t = ComputeInterpFactor(v1[2], v2[2]);
}
else
{
- t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2]));
+ t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
}
break;
- case FRUSTUM_FAR: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break;
- default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
+ case FRUSTUM_FAR:
+ t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2]));
+ break;
+ default:
+ SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
};
// interpolate position and store
for (uint32_t c = 0; c < 4; ++c)
{
- simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]);
+ Float<SIMD_T> vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
}
// interpolate attributes and store
for (uint32_t a = 0; a < numInAttribs; ++a)
{
- uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+ uint32_t attribSlot = vertexAttribOffset + a;
for (uint32_t c = 0; c < 4; ++c)
{
- simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
- simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
- simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+ Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+ Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+ Float<SIMD_T> vOutAttrib =
+ SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
}
}
// interpolate clip distance if enabled
- if (this->state.rastState.clipDistanceMask & 0xf)
+ if (this->state.backendState.clipDistanceMask & 0xf)
{
- uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
+ uint32_t attribSlot = vertexClipCullOffset;
for (uint32_t c = 0; c < 4; ++c)
{
- simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
- simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
- simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+ Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+ Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+ Float<SIMD_T> vOutAttrib =
+ SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
}
}
- if (this->state.rastState.clipDistanceMask & 0xf0)
+ if (this->state.backendState.clipDistanceMask & 0xf0)
{
- uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
+ uint32_t attribSlot = vertexClipCullOffset + 1;
for (uint32_t c = 0; c < 4; ++c)
{
- simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
- simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
- simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+ Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+ Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+ Float<SIMD_T> vOutAttrib =
+ SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
}
}
}
- template<SWR_CLIPCODES ClippingPlane>
- inline simdscalar inside(const simdvector& v)
+ template <SWR_CLIPCODES ClippingPlane>
+ Float<SIMD_T> inside(const Vec4<SIMD_T>& v)
{
switch (ClippingPlane)
{
- case FRUSTUM_LEFT: return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
- case FRUSTUM_RIGHT: return _simd_cmple_ps(v[0], v[3]);
- case FRUSTUM_TOP: return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
- case FRUSTUM_BOTTOM: return _simd_cmple_ps(v[1], v[3]);
- case FRUSTUM_NEAR: return _simd_cmpge_ps(v[2], this->driverType == DX ? _simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
- case FRUSTUM_FAR: return _simd_cmple_ps(v[2], v[3]);
+ case FRUSTUM_LEFT:
+ return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
+ case FRUSTUM_RIGHT:
+ return SIMD_T::cmple_ps(v[0], v[3]);
+ case FRUSTUM_TOP:
+ return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
+ case FRUSTUM_BOTTOM:
+ return SIMD_T::cmple_ps(v[1], v[3]);
+ case FRUSTUM_NEAR:
+ return SIMD_T::cmpge_ps(v[2],
+ this->state.rastState.clipHalfZ
+ ? SIMD_T::setzero_ps()
+ : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
+ case FRUSTUM_FAR:
+ return SIMD_T::cmple_ps(v[2], v[3]);
default:
- SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
- return _simd_setzero_ps();
+ SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
+ return SIMD_T::setzero_ps();
}
}
- template<SWR_CLIPCODES ClippingPlane>
- simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
+ template <SWR_CLIPCODES ClippingPlane>
+ Integer<SIMD_T> ClipTriToPlane(const float* pInVerts,
+ const Integer<SIMD_T>& vNumInPts,
+ uint32_t numInAttribs,
+ float* pOutVerts)
{
- simdscalari vCurIndex = _simd_setzero_si();
- simdscalari vOutIndex = _simd_setzero_si();
- simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
+ uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
+
+ Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
+ Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
+ Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
- while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
+ while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
{
- simdscalari s = vCurIndex;
- simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
- simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p);
- p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask)));
+ Integer<SIMD_T> s = vCurIndex;
+ Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
+ Integer<SIMD_T> underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
+ p = SIMD_T::castps_si(SIMD_T::blendv_ps(
+ SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
// gather position
- simdvector vInPos0, vInPos1;
+ Vec4<SIMD_T> vInPos0, vInPos1;
for (uint32_t c = 0; c < 4; ++c)
{
vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
}
// compute inside mask
- simdscalar s_in = inside<ClippingPlane>(vInPos0);
- simdscalar p_in = inside<ClippingPlane>(vInPos1);
+ Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
+ Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
// compute intersection mask (s_in != p_in)
- simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
- intersectMask = _simd_and_ps(intersectMask, vActiveMask);
+ Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
+ intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
// store s if inside
- s_in = _simd_and_ps(s_in, vActiveMask);
- if (!_simd_testz_ps(s_in, s_in))
+ s_in = SIMD_T::and_ps(s_in, vActiveMask);
+ if (!SIMD_T::testz_ps(s_in, s_in))
{
// store position
for (uint32_t c = 0; c < 4; ++c)
{
- ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
+ ScatterComponent(
+ pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
}
// store attribs
for (uint32_t a = 0; a < numInAttribs; ++a)
{
- uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+ uint32_t attribSlot = vertexAttribOffset + a;
for (uint32_t c = 0; c < 4; ++c)
{
- simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
}
}
// store clip distance if enabled
- if (this->state.rastState.clipDistanceMask & 0xf)
+ uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
+ if (this->state.backendState.clipDistanceMask & 0xf)
{
- uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
+ uint32_t attribSlot = vertexClipCullSlot;
for (uint32_t c = 0; c < 4; ++c)
{
- simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
}
}
- if (this->state.rastState.clipDistanceMask & 0xf0)
+ if (this->state.backendState.clipDistanceMask & 0xf0)
{
- uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
+ uint32_t attribSlot = vertexClipCullSlot + 1;
for (uint32_t c = 0; c < 4; ++c)
{
- simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
}
}
// increment outIndex
- vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
+ vOutIndex = SIMD_T::blendv_epi32(
+ vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
}
// compute and store intersection
- if (!_simd_testz_ps(intersectMask, intersectMask))
+ if (!SIMD_T::testz_ps(intersectMask, intersectMask))
{
- intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
+ intersect<ClippingPlane>(intersectMask,
+ s,
+ p,
+ vInPos0,
+ vInPos1,
+ vOutIndex,
+ pInVerts,
+ numInAttribs,
+ pOutVerts);
// increment outIndex for active lanes
- vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
+ vOutIndex = SIMD_T::blendv_epi32(
+ vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
}
// increment loop index and update active mask
- vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1));
- vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
+ vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
+ vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
}
return vOutIndex;
}
- template<SWR_CLIPCODES ClippingPlane>
- simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
+ template <SWR_CLIPCODES ClippingPlane>
+ Integer<SIMD_T> ClipLineToPlane(const float* pInVerts,
+ const Integer<SIMD_T>& vNumInPts,
+ uint32_t numInAttribs,
+ float* pOutVerts)
{
- simdscalari vCurIndex = _simd_setzero_si();
- simdscalari vOutIndex = _simd_setzero_si();
- simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
+ uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
- if (!_simd_testz_ps(vActiveMask, vActiveMask))
+ Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
+ Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
+ Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
+
+ if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
{
- simdscalari s = vCurIndex;
- simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
+ Integer<SIMD_T> s = vCurIndex;
+ Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
// gather position
- simdvector vInPos0, vInPos1;
+ Vec4<SIMD_T> vInPos0, vInPos1;
for (uint32_t c = 0; c < 4; ++c)
{
vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
}
// compute inside mask
- simdscalar s_in = inside<ClippingPlane>(vInPos0);
- simdscalar p_in = inside<ClippingPlane>(vInPos1);
+ Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
+ Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
// compute intersection mask (s_in != p_in)
- simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
- intersectMask = _simd_and_ps(intersectMask, vActiveMask);
+ Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
+ intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
// store s if inside
- s_in = _simd_and_ps(s_in, vActiveMask);
- if (!_simd_testz_ps(s_in, s_in))
+ s_in = SIMD_T::and_ps(s_in, vActiveMask);
+ if (!SIMD_T::testz_ps(s_in, s_in))
{
for (uint32_t c = 0; c < 4; ++c)
{
- ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
+ ScatterComponent(
+ pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
}
// interpolate attributes and store
for (uint32_t a = 0; a < numInAttribs; ++a)
{
- uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+ uint32_t attribSlot = vertexAttribOffset + a;
for (uint32_t c = 0; c < 4; ++c)
{
- simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
}
}
// increment outIndex
- vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
+ vOutIndex = SIMD_T::blendv_epi32(
+ vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
}
// compute and store intersection
- if (!_simd_testz_ps(intersectMask, intersectMask))
+ if (!SIMD_T::testz_ps(intersectMask, intersectMask))
{
- intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
+ intersect<ClippingPlane>(intersectMask,
+ s,
+ p,
+ vInPos0,
+ vInPos1,
+ vOutIndex,
+ pInVerts,
+ numInAttribs,
+ pOutVerts);
// increment outIndex for active lanes
- vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
+ vOutIndex = SIMD_T::blendv_epi32(
+ vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
}
// store p if inside
- p_in = _simd_and_ps(p_in, vActiveMask);
- if (!_simd_testz_ps(p_in, p_in))
+ p_in = SIMD_T::and_ps(p_in, vActiveMask);
+ if (!SIMD_T::testz_ps(p_in, p_in))
{
for (uint32_t c = 0; c < 4; ++c)
{
- ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
+ ScatterComponent(
+ pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
}
// interpolate attributes and store
for (uint32_t a = 0; a < numInAttribs; ++a)
{
- uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
+ uint32_t attribSlot = vertexAttribOffset + a;
for (uint32_t c = 0; c < 4; ++c)
{
- simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
+ Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
}
}
// increment outIndex
- vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in);
+ vOutIndex = SIMD_T::blendv_epi32(
+ vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
}
}
return vOutIndex;
}
- //////////////////////////////////////////////////////////////////////////
- /// @brief Vertical clipper. Clips SIMD primitives at a time
- /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
- /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
- /// @param numAttribs - number of valid input attribs, including position
- simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
+ Integer<SIMD_T> ClipPrims(float* pVertices,
+ const Float<SIMD_T>& vPrimMask,
+ const Float<SIMD_T>& vClipMask,
+ int numAttribs)
{
// temp storage
- float* pTempVerts = (float*)&tlsTempVertices[0];
+ float* pTempVerts = reinterpret_cast<float*>(this->tmpVerts);
// zero out num input verts for non-active lanes
- simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
- vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask);
+ Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrimT);
+ vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
// clip prims to frustum
- simdscalari vNumOutPts;
- if (NumVertsPerPrim == 3)
+ Integer<SIMD_T> vNumOutPts;
+ if (NumVertsPerPrimT == 3)
{
vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
- vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
- vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
- vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
+ vNumOutPts =
+ ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
+ vNumOutPts =
+ ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
+ vNumOutPts =
+ ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
}
else
{
- SWR_ASSERT(NumVertsPerPrim == 2);
- vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
- vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
- vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
- vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
- vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
- vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
+ SWR_ASSERT(NumVertsPerPrimT == 2);
+ vNumOutPts =
+ ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
+ vNumOutPts =
+ ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
+ vNumOutPts =
+ ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
+ vNumOutPts =
+ ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
+ vNumOutPts =
+ ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
+ vNumOutPts =
+ ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
}
// restore num verts for non-clipped, active lanes
- simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask);
- vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask);
+ Float<SIMD_T> vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
+ vNumOutPts =
+ SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrimT), vNonClippedMask);
return vNumOutPts;
}
- const uint32_t workerId{ 0 };
- const DRIVER_TYPE driverType{ DX };
- DRAW_CONTEXT* pDC{ nullptr };
+ const uint32_t workerId{0};
+ DRAW_CONTEXT* pDC{nullptr};
const API_STATE& state;
- simdscalar clipCodes[NumVertsPerPrim];
+ Float<SIMD_T> clipCodes[NumVertsPerPrimT];
+ SIMDVERTEX_T<SIMD_T>* clippedVerts;
+ SIMDVERTEX_T<SIMD_T>* tmpVerts;
+ SIMDVERTEX_T<SIMD_T>* transposedVerts;
};
-
// pipeline stage functions
-void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
-void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
-void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
+void ClipRectangles(DRAW_CONTEXT* pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ simdvector prims[],
+ uint32_t primMask,
+ simdscalari const& primId,
+ simdscalari const& viewportIdx,
+ simdscalari const& rtIdx);
+void ClipTriangles(DRAW_CONTEXT* pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ simdvector prims[],
+ uint32_t primMask,
+ simdscalari const& primId,
+ simdscalari const& viewportIdx,
+ simdscalari const& rtIdx);
+void ClipLines(DRAW_CONTEXT* pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ simdvector prims[],
+ uint32_t primMask,
+ simdscalari const& primId,
+ simdscalari const& viewportIdx,
+ simdscalari const& rtIdx);
+void ClipPoints(DRAW_CONTEXT* pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ simdvector prims[],
+ uint32_t primMask,
+ simdscalari const& primId,
+ simdscalari const& viewportIdx,
+ simdscalari const& rtIdx);
+#if USE_SIMD16_FRONTEND
+void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT* pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ simd16vector prims[],
+ uint32_t primMask,
+ simd16scalari const& primId,
+ simd16scalari const& viewportIdx,
+ simd16scalari const& rtIdx);
+void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT* pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ simd16vector prims[],
+ uint32_t primMask,
+ simd16scalari const& primId,
+ simd16scalari const& viewportIdx,
+ simd16scalari const& rtIdx);
+void SIMDCALL ClipLines_simd16(DRAW_CONTEXT* pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ simd16vector prims[],
+ uint32_t primMask,
+ simd16scalari const& primId,
+ simd16scalari const& viewportIdx,
+ simd16scalari const& rtIdx);
+void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT* pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ simd16vector prims[],
+ uint32_t primMask,
+ simd16scalari const& primId,
+ simd16scalari const& viewportIdx,
+ simd16scalari const& rtIdx);
+#endif