swr: [rasterizer core] Replace all naked OSALIGN macro uses with OSALIGNSIMD / OSALIG...
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / clip.h
index 49494a4e3747e268909dc966c063fcf6f61efd55..67a4c4f47bbc54786798ae759f471e2b762b82d4 100644 (file)
@@ -32,6 +32,9 @@
 #include "core/pa.h"
 #include "rdtsc_core.h"
 
+// Temp storage used by the clipper
+extern THREAD simdvertex tlsTempVertices[7];
+
 enum SWR_CLIPCODES
 {
     // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
@@ -262,8 +265,8 @@ public:
     // clip a single primitive
     int ClipScalar(PA_STATE& pa, uint32_t primIndex, float* pOutPos, float* pOutAttribs)
     {
-        OSALIGN(float, 16) inVerts[3 * 4];
-        OSALIGN(float, 16) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4];
+        OSALIGNSIMD(float) inVerts[3 * 4];
+        OSALIGNSIMD(float) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4];
 
         // transpose primitive position
         __m128 verts[3];
@@ -354,6 +357,25 @@ public:
             }
         }
 
+        // assemble user clip distances if enabled
+        if (this->state.rastState.clipDistanceMask & 0xf)
+        {
+            pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
+            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+            {
+                vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
+            }
+        }
+
+        if (this->state.rastState.clipDistanceMask & 0xf0)
+        {
+            pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
+            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+            {
+                vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
+            }
+        }
+
         uint32_t numAttribs = maxSlot + 1;
 
         simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
@@ -436,6 +458,27 @@ public:
                 }
             }
 
+            // transpose user clip distances if enabled
+            if (this->state.rastState.clipDistanceMask & 0xf)
+            {
+                pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
+                for (uint32_t c = 0; c < 4; ++c)
+                {
+                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
+                    pBase += sizeof(simdscalar);
+                }
+            }
+
+            if (this->state.rastState.clipDistanceMask & 0xf0)
+            {
+                pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
+                for (uint32_t c = 0; c < 4; ++c)
+                {
+                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
+                    pBase += sizeof(simdscalar);
+                }
+            }
+
             PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
 
             while (clipPa.GetNextStreamOutput())
@@ -630,6 +673,31 @@ private:
                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
             }
         }
+
+        // interpolate clip distance if enabled
+        if (this->state.rastState.clipDistanceMask & 0xf)
+        {
+            uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
+            for (uint32_t c = 0; c < 4; ++c)
+            {
+                simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+                simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+                simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
+            }
+        }
+
+        if (this->state.rastState.clipDistanceMask & 0xf0)
+        {
+            uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
+            for (uint32_t c = 0; c < 4; ++c)
+            {
+                simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+                simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+                simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
+            }
+        }
     }
 
     template<SWR_CLIPCODES ClippingPlane>
@@ -700,6 +768,27 @@ private:
                     }
                 }
 
+                // store clip distance if enabled
+                if (this->state.rastState.clipDistanceMask & 0xf)
+                {
+                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
+                    for (uint32_t c = 0; c < 4; ++c)
+                    {
+                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
+                    }
+                }
+
+                if (this->state.rastState.clipDistanceMask & 0xf0)
+                {
+                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
+                    for (uint32_t c = 0; c < 4; ++c)
+                    {
+                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
+                    }
+                }
+
                 // increment outIndex
                 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
             }
@@ -818,8 +907,7 @@ private:
     simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
     {
         // temp storage
-        simdvertex tempVertices[7];
-        float* pTempVerts = (float*)&tempVertices[0];
+        float* pTempVerts = (float*)&tlsTempVertices[0];
 
         // zero out num input verts for non-active lanes
         simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
@@ -854,9 +942,9 @@ private:
         return vNumOutPts;
     }
 
-    const uint32_t workerId;
-    const DRIVER_TYPE driverType;
-    DRAW_CONTEXT* pDC;
+    const uint32_t workerId{ 0 };
+    const DRIVER_TYPE driverType{ DX };
+    DRAW_CONTEXT* pDC{ nullptr };
     const API_STATE& state;
     simdscalar clipCodes[NumVertsPerPrim];
 };