/// @todo Optimize this
+#if USE_SIMD16_FRONTEND
+ if (pa.useAlternateOffset)
+ {
+ primIndex += KNOB_SIMD_WIDTH;
+ }
+
+#endif
float* pOutVec = (float*)verts;
for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
{
uint32_t input_cp = primIndex * TotalControlPoints + cp;
+#if USE_SIMD16_FRONTEND
+ uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
+ uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
+
+#else
uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
+#endif
// Loop over all components of the attribute
for (uint32_t i = 0; i < 4; ++i)
{
+#if USE_SIMD16_FRONTEND
+ const float* pInputVec = (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
+#else
const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
+#endif
pOutVec[cp * 4 + i] = pInputVec[input_lane];
}
}
/// @todo Optimize this
+#if USE_SIMD16_FRONTEND
+ uint32_t lane_offset = 0;
+
+ if (pa.useAlternateOffset)
+ {
+ lane_offset = KNOB_SIMD_WIDTH;
+ }
+
+#endif
// Loop over all components of the attribute
for (uint32_t i = 0; i < 4; ++i)
{
float vec[KNOB_SIMD_WIDTH];
for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane)
{
+#if USE_SIMD16_FRONTEND
+ uint32_t input_cp = (lane + lane_offset) * TotalControlPoints + cp;
+ uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
+ uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
+
+ const float* pInputVec = (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
+#else
uint32_t input_cp = lane * TotalControlPoints + cp;
uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
+#endif
vec[lane] = pInputVec[input_lane];
}
verts[cp][i] = _simd_loadu_ps(vec);
return true;
}
+#if ENABLE_AVX512_SIMD16
+template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
+static bool PaPatchList_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
+{
+ SetNextPaState_simd16(
+ pa,
+ PaPatchList_simd16<TotalControlPoints, CurrentControlPoints + 1>,
+ PaPatchListSingle<TotalControlPoints>);
+
+ return false;
+}
+
+template<uint32_t TotalControlPoints>
+static bool PaPatchListTerm_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
+{
+ // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
+ // KNOB_SIMD16_WIDTH * 1 patch. This function is called once per attribute.
+ // Each attribute has 4 components.
+
+ /// @todo Optimize this
+
+ // Loop over all components of the attribute
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
+ {
+ float vec[KNOB_SIMD16_WIDTH];
+ for (uint32_t lane = 0; lane < KNOB_SIMD16_WIDTH; ++lane)
+ {
+ uint32_t input_cp = lane * TotalControlPoints + cp;
+ uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
+ uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
+
+ const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
+ vec[lane] = pInputVec[input_lane];
+ }
+ verts[cp][i] = _simd16_loadu_ps(vec);
+ }
+ }
+
+ SetNextPaState_simd16(
+ pa,
+ PaPatchList_simd16<TotalControlPoints>,
+ PaPatchListSingle<TotalControlPoints>,
+ 0,
+ KNOB_SIMD16_WIDTH,
+ true);
+
+ return true;
+}
+
+#endif
#define PA_PATCH_LIST_TERMINATOR(N) \
template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\
{ return PaPatchListTerm<N>(pa, slot, verts); }
PA_PATCH_LIST_TERMINATOR(32)
#undef PA_PATCH_LIST_TERMINATOR
+#if ENABLE_AVX512_SIMD16
+#define PA_PATCH_LIST_TERMINATOR_SIMD16(N) \
+ template<> bool PaPatchList_simd16<N, N>(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])\
+ { return PaPatchListTerm_simd16<N>(pa, slot, verts); }
+PA_PATCH_LIST_TERMINATOR_SIMD16(1)
+PA_PATCH_LIST_TERMINATOR_SIMD16(2)
+PA_PATCH_LIST_TERMINATOR_SIMD16(3)
+PA_PATCH_LIST_TERMINATOR_SIMD16(4)
+PA_PATCH_LIST_TERMINATOR_SIMD16(5)
+PA_PATCH_LIST_TERMINATOR_SIMD16(6)
+PA_PATCH_LIST_TERMINATOR_SIMD16(7)
+PA_PATCH_LIST_TERMINATOR_SIMD16(8)
+PA_PATCH_LIST_TERMINATOR_SIMD16(9)
+PA_PATCH_LIST_TERMINATOR_SIMD16(10)
+PA_PATCH_LIST_TERMINATOR_SIMD16(11)
+PA_PATCH_LIST_TERMINATOR_SIMD16(12)
+PA_PATCH_LIST_TERMINATOR_SIMD16(13)
+PA_PATCH_LIST_TERMINATOR_SIMD16(14)
+PA_PATCH_LIST_TERMINATOR_SIMD16(15)
+PA_PATCH_LIST_TERMINATOR_SIMD16(16)
+PA_PATCH_LIST_TERMINATOR_SIMD16(17)
+PA_PATCH_LIST_TERMINATOR_SIMD16(18)
+PA_PATCH_LIST_TERMINATOR_SIMD16(19)
+PA_PATCH_LIST_TERMINATOR_SIMD16(20)
+PA_PATCH_LIST_TERMINATOR_SIMD16(21)
+PA_PATCH_LIST_TERMINATOR_SIMD16(22)
+PA_PATCH_LIST_TERMINATOR_SIMD16(23)
+PA_PATCH_LIST_TERMINATOR_SIMD16(24)
+PA_PATCH_LIST_TERMINATOR_SIMD16(25)
+PA_PATCH_LIST_TERMINATOR_SIMD16(26)
+PA_PATCH_LIST_TERMINATOR_SIMD16(27)
+PA_PATCH_LIST_TERMINATOR_SIMD16(28)
+PA_PATCH_LIST_TERMINATOR_SIMD16(29)
+PA_PATCH_LIST_TERMINATOR_SIMD16(30)
+PA_PATCH_LIST_TERMINATOR_SIMD16(31)
+PA_PATCH_LIST_TERMINATOR_SIMD16(32)
+#undef PA_PATCH_LIST_TERMINATOR_SIMD16
+
+#endif
bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
{
SetNextPaState(pa, PaTriList1, PaTriListSingle0);
}
}
- __m256 tmp0, tmp1, tmp2;
+ simd16vector &v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
+ simd16vector &v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
+ simd16vector &v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
// Loop over each component in the simdvector.
for (int i = 0; i < 4; i += 1)
{
- simd16vector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
+ simdscalar v0_lo; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
+ simdscalar v1_lo; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
+ simdscalar v2_lo; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
+
+ __m256 tmp0, tmp1, tmp2;
+
tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
- v0[i].lo = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
- tmp1 = _mm256_permute_ps(v0[i].lo, 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
- v0[i].lo = _mm256_permute_ps(v0[i].lo, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
- v0[i].lo = _mm256_blend_ps(tmp1, v0[i].lo, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
+ v0_lo = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
+ tmp1 = _mm256_permute_ps(v0_lo, 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
+ v0_lo = _mm256_permute_ps(v0_lo, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
+ v0_lo = _mm256_blend_ps(tmp1, v0_lo, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
/// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
/// AVX2 should make this much cheaper.
- simd16vector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
- v1[i].lo = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
+ v1_lo = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
- tmp2 = _mm256_blend_ps(v1[i].lo, tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
+ tmp2 = _mm256_blend_ps(v1_lo, tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * }
- v1[i].lo = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
- v1[i].lo = _mm256_blend_ps(tmp2, v1[i].lo, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
- v1[i].lo = _mm256_blend_ps(v1[i].lo, tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
+ v1_lo = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
+ v1_lo = _mm256_blend_ps(tmp2, v1_lo, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
+ v1_lo = _mm256_blend_ps(v1_lo, tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
// verts[2] = { v2, w, v5, x, v8, y, v11, z }
- simd16vector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
- v2[i].lo = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
+ v2_lo = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
- v2[i].lo = _mm256_blend_ps(tmp1, v2[i].lo, 0xF0);
+ v2_lo = _mm256_blend_ps(tmp1, v2_lo, 0xF0);
// Need to compute 4th implied vertex for the rectangle.
- tmp2 = _mm256_sub_ps(v0[i].lo, v1[i].lo);
- tmp2 = _mm256_add_ps(tmp2, v2[i].lo); // tmp2 = { w, *, x, *, y, *, z, * }
+ tmp2 = _mm256_sub_ps(v0_lo, v1_lo);
+ tmp2 = _mm256_add_ps(tmp2, v2_lo); // tmp2 = { w, *, x, *, y, *, z, * }
tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
- v2[i].lo = _mm256_blend_ps(v2[i].lo, tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
+ v2_lo = _mm256_blend_ps(v2_lo, tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
- v0[i].hi = _simd_setzero_ps();
- v1[i].hi = _simd_setzero_ps();
- v2[i].hi = _simd_setzero_ps();
+ v0[i] = _simd16_insert_ps(_simd16_setzero_ps(), v0_lo, 0);
+ v1[i] = _simd16_insert_ps(_simd16_setzero_ps(), v1_lo, 0);
+ v2[i] = _simd16_insert_ps(_simd16_setzero_ps(), v2_lo, 0);
}
SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true);
case TOP_PATCHLIST_1:
this->pfnPaFunc = PaPatchList<1>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<1>;
+#endif
break;
case TOP_PATCHLIST_2:
this->pfnPaFunc = PaPatchList<2>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<2>;
+#endif
break;
case TOP_PATCHLIST_3:
this->pfnPaFunc = PaPatchList<3>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<3>;
+#endif
break;
case TOP_PATCHLIST_4:
this->pfnPaFunc = PaPatchList<4>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<4>;
+#endif
break;
case TOP_PATCHLIST_5:
this->pfnPaFunc = PaPatchList<5>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<5>;
+#endif
break;
case TOP_PATCHLIST_6:
this->pfnPaFunc = PaPatchList<6>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<6>;
+#endif
break;
case TOP_PATCHLIST_7:
this->pfnPaFunc = PaPatchList<7>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<7>;
+#endif
break;
case TOP_PATCHLIST_8:
this->pfnPaFunc = PaPatchList<8>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<8>;
+#endif
break;
case TOP_PATCHLIST_9:
this->pfnPaFunc = PaPatchList<9>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<9>;
+#endif
break;
case TOP_PATCHLIST_10:
this->pfnPaFunc = PaPatchList<10>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<10>;
+#endif
break;
case TOP_PATCHLIST_11:
this->pfnPaFunc = PaPatchList<11>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<11>;
+#endif
break;
case TOP_PATCHLIST_12:
this->pfnPaFunc = PaPatchList<12>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<12>;
+#endif
break;
case TOP_PATCHLIST_13:
this->pfnPaFunc = PaPatchList<13>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<13>;
+#endif
break;
case TOP_PATCHLIST_14:
this->pfnPaFunc = PaPatchList<14>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<14>;
+#endif
break;
case TOP_PATCHLIST_15:
this->pfnPaFunc = PaPatchList<15>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<15>;
+#endif
break;
case TOP_PATCHLIST_16:
this->pfnPaFunc = PaPatchList<16>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<16>;
+#endif
break;
case TOP_PATCHLIST_17:
this->pfnPaFunc = PaPatchList<17>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<17>;
+#endif
break;
case TOP_PATCHLIST_18:
this->pfnPaFunc = PaPatchList<18>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<18>;
+#endif
break;
case TOP_PATCHLIST_19:
this->pfnPaFunc = PaPatchList<19>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<19>;
+#endif
break;
case TOP_PATCHLIST_20:
this->pfnPaFunc = PaPatchList<20>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<20>;
+#endif
break;
case TOP_PATCHLIST_21:
this->pfnPaFunc = PaPatchList<21>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<21>;
+#endif
break;
case TOP_PATCHLIST_22:
this->pfnPaFunc = PaPatchList<22>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<22>;
+#endif
break;
case TOP_PATCHLIST_23:
this->pfnPaFunc = PaPatchList<23>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<23>;
+#endif
break;
case TOP_PATCHLIST_24:
this->pfnPaFunc = PaPatchList<24>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<24>;
+#endif
break;
case TOP_PATCHLIST_25:
this->pfnPaFunc = PaPatchList<25>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<25>;
+#endif
break;
case TOP_PATCHLIST_26:
this->pfnPaFunc = PaPatchList<26>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<26>;
+#endif
break;
case TOP_PATCHLIST_27:
this->pfnPaFunc = PaPatchList<27>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<27>;
+#endif
break;
case TOP_PATCHLIST_28:
this->pfnPaFunc = PaPatchList<28>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<28>;
+#endif
break;
case TOP_PATCHLIST_29:
this->pfnPaFunc = PaPatchList<29>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<29>;
+#endif
break;
case TOP_PATCHLIST_30:
this->pfnPaFunc = PaPatchList<30>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<30>;
+#endif
break;
case TOP_PATCHLIST_31:
this->pfnPaFunc = PaPatchList<31>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<31>;
+#endif
break;
case TOP_PATCHLIST_32:
this->pfnPaFunc = PaPatchList<32>;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = PaPatchList_simd16<32>;
+#endif
break;
default: