+#if USE_SIMD16_GATHERS
+// Emits IR that rearranges the results of 16-bit-per-component SIMD16 gathers into
+// one 16-wide vector per enabled component, widening each component to 32 bits
+// (sign extension, zero extension, or half-float to float) and optionally
+// converting to float and scaling.
+//
+// args is a tuple, unpacked by index:
+//   0: vGatherResult[2]      - gather results; [0] is used for x/y, [1] for z/w
+//   1: pVtxOut               - destination passed to StoreVertexElements16
+//   2: extendType            - cast op selecting the sign/zero/FP extension path
+//   3: conversionType        - CONVERT_NONE / NORMALIZED / SSCALED / USCALED
+//   4: currentVertexElement  - in/out index of the next slot in vVertexElements
+//   5: outputElt             - in/out index of the next output element to store
+//   6: compMask              - which of the 4 components are enabled
+//   7: compCtrl[4]           - per-component control (StoreSrc or a generated value)
+//   8: vVertexElements[4]    - staging array; flushed whenever 4 slots are filled
+//
+// The SIMD16 input is processed as two SIMD8 halves (lo/hi) which are re-joined
+// with JOIN_16 before being written to vVertexElements.
+void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
+{
+    // Unpack tuple args
+    Value* (&vGatherResult)[2] = std::get<0>(args);
+    Value* pVtxOut = std::get<1>(args);
+    const Instruction::CastOps extendType = std::get<2>(args);
+    const ConversionType conversionType = std::get<3>(args);
+    uint32_t &currentVertexElement = std::get<4>(args);
+    uint32_t &outputElt = std::get<5>(args);
+    const ComponentEnable compMask = std::get<6>(args);
+    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
+    Value* (&vVertexElements)[4] = std::get<8>(args);
+
+    // cast types
+    Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+    Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
+
+    // have to do extra work for sign extending
+    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
+    {
+        // is this PP float?
+        const bool bFP = (extendType == Instruction::CastOps::FPExt);
+
+        Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
+        Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
+
+        // shuffle mask: within each 128bit lane, collect the low 16 bits of every
+        // dword first, followed by the high 16 bits of every dword
+        Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+                                      0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
+        Value *vi128XY_lo = nullptr;
+        Value *vi128XY_hi = nullptr;
+        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
+        {
+            // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
+
+            Value *vGatherResult_lo = EXTRACT_16(vGatherResult[0], 0);
+            Value *vGatherResult_hi = EXTRACT_16(vGatherResult[0], 1);
+
+            Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
+            Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
+
+            // after pshufb: group components together in each 128bit lane
+            // 256i - 0    1    2    3    4    5    6    7
+            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
+
+            vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+
+            // after PERMD: move and pack xy components into each 128bit lane
+            // 256i - 0    1    2    3    4    5    6    7
+            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
+        }
+
+        // do the same for zw components
+        Value *vi128ZW_lo = nullptr;
+        Value *vi128ZW_hi = nullptr;
+        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
+        {
+            Value *vGatherResult_lo = EXTRACT_16(vGatherResult[1], 0);
+            Value *vGatherResult_hi = EXTRACT_16(vGatherResult[1], 1);
+
+            Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
+            Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
+
+            vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+        }
+
+        // init denormalize variables if needed
+        // (initialized up front so the invalid/default arms below never leave
+        // an indeterminate cast op behind)
+        Instruction::CastOps IntToFpCast = Instruction::CastOps::SIToFP;
+        Value *conversionFactor = nullptr;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            IntToFpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
+            break;
+        case CONVERT_SSCALED:
+            IntToFpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0));
+            break;
+        case CONVERT_USCALED:
+            SWR_INVALID("Type should not be sign extended!");
+            conversionFactor = nullptr;
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
+
+        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
+        for (uint32_t i = 0; i < 4; i++)
+        {
+            if (isComponentEnabled(compMask, i))
+            {
+                if (compCtrl[i] == ComponentControl::StoreSrc)
+                {
+                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+                    // if x or y, use vi128XY permute result, else use vi128ZW
+                    Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
+                    Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
+
+                    if (bFP)
+                    {
+                        // extract 128 bit lanes and convert each half-float component to float
+                        Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
+                        Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
+
+                        vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
+                    }
+                    else
+                    {
+                        // extract 128 bit lanes to sign extend each component
+                        Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
+                        Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
+
+                        // denormalize if needed; a factor is only set for the
+                        // conversion types that are valid on this path
+                        if (conversionFactor != nullptr)
+                        {
+                            temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
+                            temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
+                        }
+
+                        vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
+                    }
+
+                    currentVertexElement += 1;
+                }
+                else
+                {
+                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
+                }
+
+                if (currentVertexElement > 3)
+                {
+                    StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
+                    // reset to the next vVertexElement to output
+                    currentVertexElement = 0;
+                }
+            }
+        }
+    }
+    // else zero extend
+    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
+    {
+        // pshufb masks for each component; an entry is only built when a
+        // component that needs it is enabled
+        Value *vConstMask[2] = { nullptr, nullptr };
+
+        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
+        {
+            // x/z shuffle mask
+            vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+                                      0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
+        }
+
+        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
+        {
+            // y/w shuffle mask
+            vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
+                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
+        }
+
+        // init denormalize variables if needed
+        // (initialized up front so the invalid/default arms below never leave
+        // an indeterminate cast op behind)
+        Instruction::CastOps fpCast = Instruction::CastOps::UIToFP;
+        Value* conversionFactor = nullptr;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
+            break;
+        case CONVERT_USCALED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0f));
+            break;
+        case CONVERT_SSCALED:
+            SWR_INVALID("Type should not be zero extended!");
+            conversionFactor = nullptr;
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
+
+        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
+        for (uint32_t i = 0; i < 4; i++)
+        {
+            if (isComponentEnabled(compMask, i))
+            {
+                if (compCtrl[i] == ComponentControl::StoreSrc)
+                {
+                    // select correct constMask for x/z or y/w pshufb
+                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
+                    // if x or y, read from the first gather result, else from the second
+                    uint32_t selectedGather = (i < 2) ? 0 : 1;
+
+                    // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
+
+                    Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
+                    Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
+
+                    Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+                    Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+
+                    // after pshufb mask for x channel; z uses the same shuffle from the second gather
+                    // 256i - 0    1    2    3    4    5    6    7
+                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
+
+                    // denormalize if needed; a factor is only set for the
+                    // conversion types that are valid on this path
+                    if (conversionFactor != nullptr)
+                    {
+                        temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
+                        temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
+                    }
+
+                    vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
+
+                    currentVertexElement += 1;
+                }
+                else
+                {
+                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
+                }
+
+                if (currentVertexElement > 3)
+                {
+                    StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
+                    // reset to the next vVertexElement to output
+                    currentVertexElement = 0;
+                }
+            }
+        }
+    }
+    else
+    {
+        SWR_INVALID("Unsupported conversion type");
+    }
+}
+
+#else
+#if USE_SIMD16_SHADERS
+void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
+#else