+ StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
+ // reset to the next vVertexElement to output
+ currentVertexElement = 0;
+ }
+ }
+ }
+ }
+ else
+ {
+ SWR_INVALID("Unsupported conversion type");
+ }
+}
+
+#else
+#if USE_SIMD16_SHADERS
+void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
+#else
+void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
+#endif
+{
+ // Unpack tuple args
+ Value* (&vGatherResult)[2] = std::get<0>(args);
+ Value* pVtxOut = std::get<1>(args);
+ const Instruction::CastOps extendType = std::get<2>(args);
+ const ConversionType conversionType = std::get<3>(args);
+ uint32_t ¤tVertexElement = std::get<4>(args);
+ uint32_t &outputElt = std::get<5>(args);
+ const ComponentEnable compMask = std::get<6>(args);
+ const ComponentControl(&compCtrl)[4] = std::get<7>(args);
+ Value* (&vVertexElements)[4] = std::get<8>(args);
+
+ // cast types
+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+ Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
+
+ // have to do extra work for sign extending
+ if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
+ (extendType == Instruction::CastOps::FPExt))
+ {
+ // is this PP float?
+ bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
+
+ Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
+ Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
+
+ // shuffle mask
+ Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
+ Value* vi128XY = nullptr;
+ if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) {
+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
+ // after pshufb: group components together in each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
+
+ vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ // after PERMD: move and pack xy components into each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
+ }
+
+ // do the same for zw components
+ Value* vi128ZW = nullptr;
+ if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
+ vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ }
+
+ // init denormalize variables if needed
+ Instruction::CastOps IntToFpCast;
+ Value* conversionFactor;
+
+ switch (conversionType)
+ {
+ case CONVERT_NORMALIZED:
+ IntToFpCast = Instruction::CastOps::SIToFP;
+ conversionFactor = VIMMED1((float)(1.0 / 32767.0));
+ break;
+ case CONVERT_SSCALED:
+ IntToFpCast = Instruction::CastOps::SIToFP;
+ conversionFactor = VIMMED1((float)(1.0));
+ break;
+ case CONVERT_USCALED:
+ SWR_INVALID("Type should not be sign extended!");
+ conversionFactor = nullptr;
+ break;
+ default:
+ SWR_ASSERT(conversionType == CONVERT_NONE);
+ conversionFactor = nullptr;
+ break;
+ }
+
+ // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
+ for (uint32_t i = 0; i < 4; i++)
+ {
+ if (isComponentEnabled(compMask, i))
+ {
+ if (compCtrl[i] == ComponentControl::StoreSrc)
+ {
+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+ uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+ // if x or y, use vi128XY permute result, else use vi128ZW
+ Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
+
+ if (bFP) {
+ // extract 128 bit lanes to sign extend each component
+ vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
+ }
+ else {
+ // extract 128 bit lanes to sign extend each component
+ vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
+
+ // denormalize if needed
+ if (conversionType != CONVERT_NONE) {
+ vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+ }
+ }
+ currentVertexElement++;
+ }
+ else
+ {
+#if USE_SIMD16_SHADERS
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
+#else
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+#endif
+ }
+
+ if (currentVertexElement > 3)
+ {
+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+ // reset to the next vVertexElement to output
+ currentVertexElement = 0;
+ }
+ }
+ }
+ }
+ // else zero extend
+ else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
+ {
+ // pshufb masks for each component
+ Value* vConstMask[2];
+ if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) {
+ // x/z shuffle mask
+ vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
+ }
+
+ if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) {
+ // y/w shuffle mask
+ vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
+ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
+ }
+
+ // init denormalize variables if needed
+ Instruction::CastOps fpCast;
+ Value* conversionFactor;
+
+ switch (conversionType)
+ {
+ case CONVERT_NORMALIZED:
+ fpCast = Instruction::CastOps::UIToFP;
+ conversionFactor = VIMMED1((float)(1.0 / 65535.0));
+ break;
+ case CONVERT_USCALED:
+ fpCast = Instruction::CastOps::UIToFP;
+ conversionFactor = VIMMED1((float)(1.0f));
+ break;
+ case CONVERT_SSCALED:
+ SWR_INVALID("Type should not be zero extended!");
+ conversionFactor = nullptr;
+ break;
+ default:
+ SWR_ASSERT(conversionType == CONVERT_NONE);
+ conversionFactor = nullptr;
+ break;
+ }
+
+ // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
+ for (uint32_t i = 0; i < 4; i++)
+ {
+ if (isComponentEnabled(compMask, i))
+ {
+ if (compCtrl[i] == ComponentControl::StoreSrc)
+ {
+ // select correct constMask for x/z or y/w pshufb
+ uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
+ // if x or y, use vi128XY permute result, else use vi128ZW
+ uint32_t selectedGather = (i < 2) ? 0 : 1;
+
+ vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+ // after pshufb mask for x channel; z uses the same shuffle from the second gather
+ // 256i - 0 1 2 3 4 5 6 7
+ // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
+
+ // denormalize if needed
+ if (conversionType != CONVERT_NONE)
+ {
+ vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+ }
+ currentVertexElement++;
+ }
+ else
+ {
+#if USE_SIMD16_SHADERS
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
+#else
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+#endif
+ }
+
+ if (currentVertexElement > 3)
+ {
+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);