#define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b)
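// SIMD16_EMU_AVX512_{1,2,3} generate simd16 wrappers for 1-, 2- and 3-operand ops by applying
// the corresponding 8-wide _simd_* op to the lo and hi halves of the emulated 16-wide type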
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps)
+SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _simd_andnot_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps)
+SIMD16_EMU_AVX512_2(simd16scalar, _simd16_xor_ps, _simd_xor_ps)
+
SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps)
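// reinterpret the float lanes as doubles (a bit-cast, not a value conversion)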
return *reinterpret_cast<simd16scalard *>(&a);
}
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _mm256_andnot_ps)
-
template <int mode>
INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
{
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32)
+
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)
+
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32)
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps)
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps)
-//__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
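// the gather scale must be a compile-time constant, so it is routed through a template parameter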
template <int scale>
INLINE simd16scalar _simd16_i32gather_ps_temp(const float *m, simd16scalari index)
{
#define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp<scale>(m, index)
-//__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
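// masked variant of the gather above; the same template trick keeps the scale a compile-time constant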
template <int scale>
INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
{
return result;
}
-#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, mask, index)
+#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
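// native AVX512 path: these simd16 ops map directly onto the corresponding 512-bit intrinsics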
#define _simd16_castpd_ps _mm512_castpd_ps
#define _simd16_castps_pd _mm512_castps_pd
+#define _simd16_and_ps _mm512_and_ps
#define _simd16_andnot_ps _mm512_andnot_ps
+#define _simd16_or_ps _mm512_or_ps
+#define _simd16_xor_ps _mm512_xor_ps
template <int mode>
INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
#define _simd16_min_epu32 _mm512_min_epu32
#define _simd16_max_epu32 _mm512_max_epu32
#define _simd16_add_epi32 _mm512_add_epi32
+
#define _simd16_and_si _mm512_and_si512
#define _simd16_andnot_si _mm512_andnot_si512
#define _simd16_or_si _mm512_or_si512
#define _simd16_fmsub_ps _mm512_fmsub_ps
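// note: _mm512_i32gather_ps takes (index, base, scale), so the wrapper swaps the first two arguments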
#define _simd16_i32gather_ps(m, index, scale) _mm512_i32gather_ps(index, m, scale)
-#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _mm512_mask_i32gather_ps(a, m, index, mask, scale)
+
+template <int scale>
+INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
+{
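+    // collapse the vector mask (any nonzero lane is enabled) into the __mmask16 the AVX512 gather expects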
+ __mmask16 k = _mm512_cmpneq_epi32_mask(mask, _mm512_setzero_si512());
+
+ return _mm512_mask_i32gather_ps(a, k, index, m, scale);
+}
+
+#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)
#define _simd16_abs_epi32 _mm512_abs_epi32
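
// Sketch of a 64-bit integer compare helper, following the "_temp" wrapper pattern used elsewhere in
// this file (the name _simd16_cmp_epi64_temp is assumed, not taken from the original source): AVX512
// integer compares return a k-mask, so it is expanded back into a full-width all-ones/all-zeros vector.
template <int comp>
INLINE simd16scalari _simd16_cmp_epi64_temp(simd16scalari a, simd16scalari b)
{
    __mmask8 k = _mm512_cmp_epi64_mask(a, b, comp);

    // true lanes become all ones, false lanes all zeros, matching the AVX2-style compare results
    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi64(-1LL));
}
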
#define _simd16_cmpeq_epi64(a, b) _simd16_cmp_epi64_temp<_MM_CMPINT_EQ>(a, b)
bool nextReset{ false };
bool isStreaming{ false };
- SIMDMASK tmpIndices{ 0 }; // temporary index store for unused virtual function
+    SIMDMASK junkIndices{ 0 };  // temporary index store for unused virtual function
PA_STATE_OPT() {}
PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
SIMDMASK& GetNextVsIndices()
{
// unused in optimized PA, pass junk buffer back
- return tmpIndices;
+ return junkIndices;
}
bool GetNextStreamOutput()
bool processCutVerts{ false };  // if true, vertex indices at cuts are processed normally; otherwise they
                                // are ignored. The fetch shader sends invalid verts on cuts that should be
                                // ignored, while the GS sends valid verts for every index
+
+ simdvector junkVector; // junk simdvector for unimplemented API
+#if ENABLE_AVX512_SIMD16
+ simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
+#endif
+
// Topology state tracking
uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
uint32_t curIndex{ 0 };
{
// unused
SWR_ASSERT(0 && "Not implemented");
- static simdvector junk;
- return junk;
+ return junkVector;
}
#if ENABLE_AVX512_SIMD16
{
// unused
SWR_ASSERT(0 && "Not implemented");
- static simd16vector junk;
- return junk;
+ return junkVector_simd16;
}
#endif
#if USE_SIMD16_FRONTEND
simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);
- verts[v].v[c] = useAlternateOffset ? temp.hi : temp.lo;
+ verts[v].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
#endif
#if USE_SIMD16_FRONTEND
verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
#else
- verts[v].v[c].lo = _simd_i32gather_ps(pBase, offsets, 1);
- verts[v].v[c].hi = _simd_setzero_ps();
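+            // widen the 8-wide gather result into the low half of a zeroed simd16 vector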
+            verts[v].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
#endif
// move base to next component
simdvector& GetSimdVector(uint32_t index, uint32_t slot)
{
SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
- static simdvector junk;
- return junk;
+ return junkVector;
}
#if ENABLE_AVX512_SIMD16
simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
{
SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
- static simd16vector junk;
- return junk;
+ return junkVector_simd16;
}
#endif
mask,
4 /* gcc doesn't like sizeof(float) */);
- verts[i].v[c] = useAlternateOffset ? temp.hi : temp.lo;
+ verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
verts[i].v[c] = _simd_mask_i32gather_ps(
_simd_setzero_ps(),
mask,
4 /* gcc doesn't like sizeof(float) */);
#else
- verts[i].v[c].lo = _simd_mask_i32gather_ps(
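+                    // gather 8 lanes under the mask, then widen into the low half of a zeroed simd16 vector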
+ simdscalar temp = _simd_mask_i32gather_ps(
_simd_setzero_ps(),
pBase,
indices,
_simd_castsi_ps(mask),
4 /* gcc doesn't like sizeof(float) */);
- verts[i].v[c].hi = _simd_setzero_ps();
+ verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#endif
pBase += m_attributeStrideInVectors * SIMD_WIDTH;
}
SIMDVERTEX& GetNextVsOutput()
{
SWR_NOT_IMPL;
- static SIMDVERTEX junk;
- return junk;
+ return junkVertex;
}
bool GetNextStreamOutput()
SIMDMASK& GetNextVsIndices()
{
SWR_NOT_IMPL;
- static SIMDMASK junk;
- return junk;
+ return junkIndices;
}
uint32_t NumPrims()
uint32_t m_numVertsPerPrim = 0;
SIMDSCALARI m_vPrimId;
+
+ simdvector junkVector; // junk simdvector for unimplemented API
+#if ENABLE_AVX512_SIMD16
+ simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
+#endif
+ SIMDVERTEX junkVertex; // junk SIMDVERTEX for unimplemented API
+ SIMDMASK junkIndices; // temporary index store for unused virtual function
};
// Primitive Assembler factory class, responsible for creating and initializing the correct assembler