};
#if KNOB_SIMD_WIDTH == 8
-extern const __m256 vCenterOffsetsX;
-extern const __m256 vCenterOffsetsY;
-extern const __m256 vULOffsetsX;
-extern const __m256 vULOffsetsY;
+extern const simdscalar vCenterOffsetsX;
+extern const simdscalar vCenterOffsetsY;
+extern const simdscalar vULOffsetsX;
+extern const simdscalar vULOffsetsY;
#define MASK 0xff
#endif
// will need to update for avx512
assert(KNOB_SIMD_WIDTH == 8);
- __m256i mask[2];
- __m256i sampleCoverage[2];
+ simdscalari mask[2];
+ simdscalari sampleCoverage[2];
if(T::bIsCenterPattern)
{
mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
// pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
- __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
+ simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
- __m256i packedCoverage1;
+ simdscalari packedCoverage1;
if(T::MultisampleT::numSamples > 8)
{
// pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
#if (KNOB_ARCH == KNOB_ARCH_AVX)
// pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
- __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
- __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+ simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
+ simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
- __m256i packedSampleCoverage;
+ simdscalari packedSampleCoverage;
if(T::MultisampleT::numSamples > 8)
{
// pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
packedSampleCoverage = packedCoverage0;
}
#else
- __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
+ simdscalari permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
// pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
- __m256i packedSampleCoverage;
+ simdscalari packedSampleCoverage;
if(T::MultisampleT::numSamples > 8)
{
permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
}
}
- INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
+ INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
{
uint32_t inputMask[KNOB_SIMD_WIDTH];
generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
template<typename T>
struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
{
- INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
+ INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
{
// will need to update for avx512
assert(KNOB_SIMD_WIDTH == 8);
- __m256i vec = _mm256_set1_epi32(coverageMask[0]);
- const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+ simdscalari vec = _mm256_set1_epi32(coverageMask[0]);
+ const simdscalari bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
vec = _simd_and_si(vec, bit);
vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
// Case (1) and case (3b) - All samples covered or not covered with full SampleMask
- static const __m256i vFullyCoveredMask = T::MultisampleT::FullSampleMask();
- __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
- __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
+ static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
+ simdscalari vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
+ simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
- static const __m256i vZero = _simd_setzero_si();
- const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
- __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
- __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
- __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
+ static const simdscalari vZero = _simd_setzero_si();
+ const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
+ simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
+ simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
+ simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
- __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
+ simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
// set the centroid position based on results from above
psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
// Case (3a) No samples covered and partial sample mask
- __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
+ simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
// sample mask should never be all 0's for this case, but handle it anyways
unsigned long firstCoveredSampleMaskSample = 0;
(sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
- __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
+ simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample));
vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
row3 = _mm_unpackhi_epi64(row3, vTemp);
}
-#define GCC_VERSION (__GNUC__ * 10000 \
- + __GNUC_MINOR__ * 100 \
- + __GNUC_PATCHLEVEL__)
-
-#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
-#define _mm_undefined_ps _mm_setzero_ps
-#define _mm_undefined_si128 _mm_setzero_si128
-#if KNOB_SIMD_WIDTH == 8
-#define _mm256_undefined_ps _mm256_setzero_ps
-#endif
-#endif
-
#if KNOB_SIMD_WIDTH == 8
INLINE
-void vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2)
+void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
{
- __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
- __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
- __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
- __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
+ simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
+ simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); //y0w0y1w1 y4w4y5w5
+ simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
+ simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
- r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
- r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77
- __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
- __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
+ r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
+ r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps()); //y2w2y3w3 y6w6yw77
+ simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
+ simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
- vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
- vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
- vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
- vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
+ vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
+ vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
+ vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
+ vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
}
INLINE
-void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3)
+void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
{
- __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
- __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
- __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
- __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
+ simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
+ simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
+ simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
+ simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5
- r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
- r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77
- __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
- __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
+ r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7
+ r1rx = _simd_unpackhi_ps(vSrc1, vSrc3); //y2w2y3w3 y6w6yw77
+ simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
+ simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
- vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
- vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
- vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
- vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
+ vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
+ vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
+ vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
+ vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
}
#if ENABLE_AVX512_SIMD16
#endif
INLINE
-void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
+void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar &vMask0, const simdscalar &vMask1, const simdscalar &vMask2, const simdscalar &vMask3, const simdscalar &vMask4, const simdscalar &vMask5, const simdscalar &vMask6, const simdscalar &vMask7)
{
- __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
- __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
- __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
- __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
- __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
- __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
- __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
- __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
- __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
- __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
- __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
- __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
- __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
- __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
- __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
- __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
- vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
- vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
- vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
- vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
- vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
- vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
- vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
- vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
+ simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1);
+ simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1);
+ simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3);
+ simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3);
+ simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5);
+ simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5);
+ simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7);
+ simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7);
+ simdscalar __tt0 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
+ simdscalar __tt1 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
+ simdscalar __tt2 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
+ simdscalar __tt3 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
+ simdscalar __tt4 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
+ simdscalar __tt5 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
+ simdscalar __tt6 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
+ simdscalar __tt7 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
+ vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
+ vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
+ vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
+ vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
+ vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
+ vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
+ vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
+ vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
}
INLINE
-void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
+void vTranspose8x8(simdscalar (&vDst)[8], const simdscalari &vMask0, const simdscalari &vMask1, const simdscalari &vMask2, const simdscalari &vMask3, const simdscalari &vMask4, const simdscalari &vMask5, const simdscalari &vMask6, const simdscalari &vMask7)
{
- vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
- _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
+ vTranspose8x8(vDst, _simd_castsi_ps(vMask0), _simd_castsi_ps(vMask1), _simd_castsi_ps(vMask2), _simd_castsi_ps(vMask3),
+ _simd_castsi_ps(vMask4), _simd_castsi_ps(vMask5), _simd_castsi_ps(vMask6), _simd_castsi_ps(vMask7));
}
#endif
_mm_store_si128((__m128i*)pDst, c0123lo);
_mm_store_si128((__m128i*)(pDst + 16), c0123hi);
#else
- simdscalari dst01 = _mm256_shuffle_epi8(src,
- _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
+ simdscalari dst01 = _simd_shuffle_epi8(src,
+ _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
- dst23 = _mm256_shuffle_epi8(dst23,
- _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
- simdscalari dst = _mm256_or_si256(dst01, dst23);
+ dst23 = _simd_shuffle_epi8(dst23,
+ _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
+ simdscalari dst = _simd_or_si(dst01, dst23);
_simd_store_si((simdscalari*)pDst, dst);
#endif
#else
// pack
simdscalari packed = _simd_castps_si(dst.x);
- packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetBPC(0)));
- packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetBPC(0) +
- FormatTraits<DstFormat>::GetBPC(1)));
+ packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetConstBPC(0)));
+ packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetConstBPC(0) +
+ FormatTraits<DstFormat>::GetConstBPC(1)));
// pack low 16 bits of each 32 bit lane to low 128 bits of dst
uint32_t *pPacked = (uint32_t*)&packed;
vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
// moving to 8 wide integer vector types
- __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
- __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
- __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
- __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
+ simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
+ simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
+ simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
+ simdscalari src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
#if KNOB_ARCH <= KNOB_ARCH_AVX
__m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
__m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
- __m256i final = _mm256_castsi128_si256(vRow00);
+ simdscalari final = _mm256_castsi128_si256(vRow00);
final = _mm256_insertf128_si256(final, vRow10, 1);
#else
src0 = _mm256_or_si256(src0, src1);
src2 = _mm256_or_si256(src2, src3);
- __m256i final = _mm256_or_si256(src0, src2);
+ simdscalari final = _mm256_or_si256(src0, src2);
// adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
final = _mm256_permute4x64_epi64(final, 0xD8);
vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
// moving to 8 wide integer vector types
- __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
- __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
- __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
+ simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
+ simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
+ simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
#if KNOB_ARCH <= KNOB_ARCH_AVX
__m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
__m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
- __m256i final = _mm256_castsi128_si256(vRow00);
+ simdscalari final = _mm256_castsi128_si256(vRow00);
final = _mm256_insertf128_si256(final, vRow10, 1);
#else
src0 = _mm256_or_si256(src0, src1);
- __m256i final = _mm256_or_si256(src0, src2);
+ simdscalari final = _mm256_or_si256(src0, src2);
// adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
final = _mm256_permute4x64_epi64(final, 0xD8);