// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
- Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
// after PERMD: move and pack xy components into each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
if (info.numComps > 2)
{
Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
- vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
}
for (uint32_t i = 0; i < 4; i++)
// 256i - 0 1 2 3 4 5 6 7
// xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
- Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+ Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
// after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
Value* vi128ZW = nullptr;
if (info.numComps > 2)
{
- vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+ vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
}
// sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
}
- //////////////////////////////////////////////////////////////////////////
- /// @brief Generate a VPERMD operation (shuffle 32 bit integer values across 128 bit
- /// lanes) in LLVM IR. If not supported on the underlying platform, emulate it
- /// @param a - 256bit SIMD lane(8x32bit) of integer values.
- /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
- /// @return the value built by whichever path ran: VPERMD, VSHUFFLE, or the per-lane loop
- Value *Builder::PERMD(Value* a, Value* idx)
- {
- Value* res;
- // use avx2 permute instruction if available (as reported by JM()->mArch.AVX2())
- if(JM()->mArch.AVX2())
- {
- res = VPERMD(a, idx);
- }
- else // no AVX2: emulate the permute
- {
- if (isa<Constant>(idx)) // constant indices: one VSHUFFLE of a with itself, using idx as the mask
- {
- res = VSHUFFLE(a, a, idx);
- }
- else // non-constant indices: assemble the result one lane at a time
- {
- res = VUNDEF_I(); // starting value; every lane 0..mVWidth-1 is overwritten by the loop below
- for (uint32_t l = 0; l < JM()->mVWidth; ++l)
- {
- Value* pIndex = VEXTRACT(idx, C(l)); // index stored in lane l of idx
- Value* pVal = VEXTRACT(a, pIndex); // element of a selected by that index
- res = VINSERT(res, pVal, C(l)); // place it in lane l of the result
- }
- }
- }
- return res;
- }
-
- //////////////////////////////////////////////////////////////////////////
- /// @brief Generate a VPERMPS operation (shuffle 32 bit float values across 128 bit
- /// lanes) in LLVM IR. If not supported on the underlying platform, emulate it
- /// @param a - 256bit SIMD lane(8x32bit) of float values.
- /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
- /// @return the value built by whichever path ran: VPERMPS, VSHUFFLE, or the per-lane loop
- Value *Builder::PERMPS(Value* a, Value* idx)
- {
- Value* res;
- // use avx2 permute instruction if available (as reported by JM()->mArch.AVX2())
- if (JM()->mArch.AVX2())
- {
- // llvm 3.6.0 swapped the order of the args to vpermd -- NOTE(review): this call is VPERMPS, and PERMD passes (a, idx) unswapped; presumably "vpermps" is meant, confirm
- res = VPERMPS(idx, a);
- }
- else // no AVX2: emulate the permute
- {
- if (isa<Constant>(idx)) // constant indices: one VSHUFFLE of a with itself, using idx as the mask
- {
- res = VSHUFFLE(a, a, idx);
- }
- else // non-constant indices: assemble the result one lane at a time
- {
- res = VUNDEF_F(); // starting value; every lane 0..mVWidth-1 is overwritten by the loop below
- for (uint32_t l = 0; l < JM()->mVWidth; ++l)
- {
- Value* pIndex = VEXTRACT(idx, C(l)); // index stored in lane l of idx
- Value* pVal = VEXTRACT(a, pIndex); // element of a selected by that index
- res = VINSERT(res, pVal, C(l)); // place it in lane l of the result
- }
- }
- }
-
- return res;
- }
-
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
/// in LLVM IR. If not supported on the underlying platform, emulate it
Value *vi128XY_hi = nullptr;
if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
{
- vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
- vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+ vi128XY_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+ vi128XY_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
// after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
Value *vi128ZW_hi = nullptr;
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
{
- vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
- vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+ vi128ZW_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+ vi128ZW_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
}
// init denormalize variables if needed
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
- vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
- vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128XY_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128XY_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
// after PERMD: move and pack xy components into each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
- vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
- vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128ZW_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128ZW_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
}
// init denormalize variables if needed
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
- vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
// after PERMD: move and pack xy components into each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
Value* vi128ZW = nullptr;
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
- vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
}
// init denormalize variables if needed