- // cast types
- Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
- Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
-
- // input could either be float or int vector; do shuffle work in int
- vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
- vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
-
- if(bPackedOutput)
- {
- Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
-
- // shuffle mask
- Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
- 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
- Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
- // after pshufb: group components together in each 128bit lane
- // 256i - 0 1 2 3 4 5 6 7
- // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
-
- Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
- // after PERMD: move and pack xy components into each 128bit lane
- // 256i - 0 1 2 3 4 5 6 7
- // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
-
- // do the same for zw components
- Value* vi128ZW = nullptr;
- if(info.numComps > 2)
- {
- Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
- vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
- }
-
- for(uint32_t i = 0; i < 4; i++)
- {
- uint32_t swizzleIndex = info.swizzle[i];
- // todo: fixed for packed
- Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
- if(i >= info.numComps)
- {
- // set the default component val
- vGatherOutput[swizzleIndex] = vGatherMaskedVal;
- continue;
- }
-
- // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
- uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
- // if x or y, use vi128XY permute result, else use vi128ZW
- Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
-
- // extract packed component 128 bit lanes
- vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
- }
-
- }
- else
- {
- // pshufb masks for each component
- Value* vConstMask[2];
- // x/z shuffle mask
- vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
- 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
-
- // y/w shuffle mask
- vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
- 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
-