swr/rast: Handling removed LLVM intrinsics in trunk
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / functionpasses / lower_x86.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file lower_x86.cpp
24 *
 25  * @brief LLVM pass to lower meta code to x86
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30
31 #include "jit_pch.hpp"
32 #include "passes.h"
33 #include "JitManager.h"
34
35 #include <unordered_map>
36
37 namespace llvm
38 {
 39     // forward declare the initializer
40 void initializeLowerX86Pass(PassRegistry &);
41 } // namespace llvm
42
43 namespace SwrJit
44 {
45 using namespace llvm;
46
47 enum TargetArch
48 {
49 AVX = 0,
50 AVX2 = 1,
51 AVX512 = 2
52 };
53
54 enum TargetWidth
55 {
56 W256 = 0,
57 W512 = 1,
58 NUM_WIDTHS = 2
59 };
60
61 struct LowerX86;
62
63 typedef std::function<Instruction *(LowerX86 *, TargetArch, TargetWidth, CallInst *)> EmuFunc;
64
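    // Describes how a single meta intrinsic is lowered: the native intrinsic to use at each
    // target width, plus an emulation callback used when no native form is available.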
65 struct X86Intrinsic
66 {
67 Intrinsic::ID intrin[NUM_WIDTHS];
68 EmuFunc emuFunc;
69 };
70
71 // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
72 // previous behavior of mapping directly to avx/avx2 intrinsics.
73 static std::map<std::string, Intrinsic::ID> intrinsicMap = {
74 {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
75 {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
76 {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
77 {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
78 {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
79 {"meta.intrinsic.VFMADDPS", Intrinsic::x86_fma_vfmadd_ps_256},
80 {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
81 {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
82 {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
83 };
84
85 // Forward decls
86 Instruction *NO_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
87 Instruction *
88 VPERM_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
89 Instruction *
90 VGATHER_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
91 Instruction *
92 VROUND_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
93 Instruction *
94 VHSUB_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
95 Instruction *
96 VCONVERT_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
97
98 Instruction *DOUBLE_EMU(LowerX86 * pThis,
99 TargetArch arch,
100 TargetWidth width,
101 CallInst * pCallInst,
102 Intrinsic::ID intrin);
103
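    // Sentinel intrinsic ID: a table entry of DOUBLE means "no native intrinsic at this width;
    // double pump the next smaller width's intrinsic instead" (handled via DOUBLE_EMU).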
104 static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
105
106 static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
107 // 256 wide 512 wide
108 {
109 // AVX
110 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
111 {"meta.intrinsic.VPERMPS",
112 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
113 {"meta.intrinsic.VPERMD",
114 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
115 {"meta.intrinsic.VGATHERPD",
116 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
117 {"meta.intrinsic.VGATHERPS",
118 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
119 {"meta.intrinsic.VGATHERDD",
120 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
121 {"meta.intrinsic.VCVTPD2PS",
122 {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
123 {"meta.intrinsic.VCVTPH2PS",
124 {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
125 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
126 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
127 },
128 {
129 // AVX2
130 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
131 {"meta.intrinsic.VPERMPS",
132 {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
133 {"meta.intrinsic.VPERMD",
134 {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
135 {"meta.intrinsic.VGATHERPD",
136 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
137 {"meta.intrinsic.VGATHERPS",
138 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
139 {"meta.intrinsic.VGATHERDD",
140 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
141 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
142 {"meta.intrinsic.VCVTPH2PS",
143 {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
144 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
145 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
146 },
147 {
148 // AVX512
149 {"meta.intrinsic.VRCPPS",
150 {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
151 #if LLVM_VERSION_MAJOR < 7
152 {"meta.intrinsic.VPERMPS",
153 {{Intrinsic::x86_avx512_mask_permvar_sf_256,
154 Intrinsic::x86_avx512_mask_permvar_sf_512},
155 NO_EMU}},
156 {"meta.intrinsic.VPERMD",
157 {{Intrinsic::x86_avx512_mask_permvar_si_256,
158 Intrinsic::x86_avx512_mask_permvar_si_512},
159 NO_EMU}},
160 #else
161 {"meta.intrinsic.VPERMPS",
162 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
163 {"meta.intrinsic.VPERMD",
164 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
165 #endif
166 {"meta.intrinsic.VGATHERPD",
167 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
168 {"meta.intrinsic.VGATHERPS",
169 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
170 {"meta.intrinsic.VGATHERDD",
171 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
172 #if LLVM_VERSION_MAJOR < 7
173 {"meta.intrinsic.VCVTPD2PS",
174 {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512},
175 NO_EMU}},
176 #else
177 {"meta.intrinsic.VCVTPD2PS",
178 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}},
179 #endif
180 {"meta.intrinsic.VCVTPH2PS",
181 {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512},
182 NO_EMU}},
183 {"meta.intrinsic.VROUND",
184 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
185 {"meta.intrinsic.VHSUBPS",
186 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
187 }};
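    // Example of how the table resolves: on AVX2 a 512-wide "meta.intrinsic.VRCPPS" hits the
    // DOUBLE entry, so it is lowered by double pumping x86_avx_rcp_ps_256 (see DOUBLE_EMU);
    // on AVX512 the same call maps directly to x86_avx512_rcp14_ps_512.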
188
189 struct LowerX86 : public FunctionPass
190 {
191 LowerX86(Builder *b = nullptr) : FunctionPass(ID), B(b)
192 {
193 initializeLowerX86Pass(*PassRegistry::getPassRegistry());
194
195 // Determine target arch
196 if (JM()->mArch.AVX512F())
197 {
198 mTarget = AVX512;
199 }
200 else if (JM()->mArch.AVX2())
201 {
202 mTarget = AVX2;
203 }
204 else if (JM()->mArch.AVX())
205 {
206 mTarget = AVX;
207 }
208 else
209 {
210 SWR_ASSERT(false, "Unsupported AVX architecture.");
211 mTarget = AVX;
212 }
213 }
214
215 // Try to decipher the vector type of the instruction. This does not work properly
216 // across all intrinsics, and will have to be rethought. Probably need something
217         // similar to LLVM's getDeclaration() utility to map a set of inputs to a specific typed
218 // intrinsic.
219 void GetRequestedWidthAndType(CallInst * pCallInst,
220 const StringRef intrinName,
221 TargetWidth * pWidth,
222 Type ** pTy)
223 {
224 Type *pVecTy = pCallInst->getType();
225
226 // Check for intrinsic specific types
227 // VCVTPD2PS type comes from src, not dst
228 if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
229 {
230 pVecTy = pCallInst->getOperand(0)->getType();
231 }
232
233 if (!pVecTy->isVectorTy())
234 {
235 for (auto &op : pCallInst->arg_operands())
236 {
237 if (op.get()->getType()->isVectorTy())
238 {
239 pVecTy = op.get()->getType();
240 break;
241 }
242 }
243 }
244 SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");
245
246 uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
247 switch (width)
248 {
249 case 256:
250 *pWidth = W256;
251 break;
252 case 512:
253 *pWidth = W512;
254 break;
255 default:
256 SWR_ASSERT(false, "Unhandled vector width %d", width);
257 *pWidth = W256;
258 }
259
260 *pTy = pVecTy->getScalarType();
261 }
262
263 Value *GetZeroVec(TargetWidth width, Type *pTy)
264 {
265 uint32_t numElem = 0;
266 switch (width)
267 {
268 case W256:
269 numElem = 8;
270 break;
271 case W512:
272 numElem = 16;
273 break;
274 default:
275 SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
276 }
277
278 return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
279 }
280
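        // Builds the "all lanes enabled" execution mask in the integer form the AVX512 masked
        // intrinsics expect: i8 covers the 8 lanes of a 256-bit op, i16 the 16 lanes of a 512-bit op.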
281 Value *GetMask(TargetWidth width)
282 {
283 Value *mask;
284 switch (width)
285 {
286 case W256:
287 mask = B->C((uint8_t)-1);
288 break;
289 case W512:
290 mask = B->C((uint16_t)-1);
291 break;
292 default:
293 SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
294 }
295 return mask;
296 }
297
298 // Convert <N x i1> mask to <N x i32> x86 mask
299 Value *VectorMask(Value *vi1Mask)
300 {
301 uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
302 return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem));
303 }
304
305 Instruction *ProcessIntrinsicAdvanced(CallInst *pCallInst)
306 {
307 Function * pFunc = pCallInst->getCalledFunction();
308 auto & intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
309 TargetWidth vecWidth;
310 Type * pElemTy;
311 GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);
312
313 // Check if there is a native intrinsic for this instruction
314 Intrinsic::ID id = intrinsic.intrin[vecWidth];
315 if (id == DOUBLE)
316 {
317 // Double pump the next smaller SIMD intrinsic
318 SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
319 Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
320 SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
321 "Cannot find intrinsic to double pump.");
322 return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
323 }
324 else if (id != Intrinsic::not_intrinsic)
325 {
326 Function *pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
327 SmallVector<Value *, 8> args;
328 for (auto &arg : pCallInst->arg_operands())
329 {
330 args.push_back(arg.get());
331 }
332
333 // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and
334                 // full mask for now, assuming the intrinsics are consistent and place the src
335 // operand and mask last in the argument list.
336 if (mTarget == AVX512)
337 {
338 if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
339 {
340 args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
341 args.push_back(GetMask(W256));
342 // for AVX512 VCVTPD2PS, we also have to add rounding mode
343 args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
344 }
345 else
346 {
347 args.push_back(GetZeroVec(vecWidth, pElemTy));
348 args.push_back(GetMask(vecWidth));
349 }
350 }
351
352 return B->CALLA(pIntrin, args);
353 }
354 else
355 {
356 // No native intrinsic, call emulation function
357 return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
358 }
359
360 SWR_ASSERT(false);
361 return nullptr;
362 }
363
364 Instruction *ProcessIntrinsic(CallInst *pCallInst)
365 {
366 Function *pFunc = pCallInst->getCalledFunction();
367
368 // Forward to the advanced support if found
369 if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
370 {
371 return ProcessIntrinsicAdvanced(pCallInst);
372 }
373
374 SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(),
375 "Unimplemented intrinsic %s.",
376 pFunc->getName());
377
378 Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
379 Function * pX86IntrinFunc =
380 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
381
382 SmallVector<Value *, 8> args;
383 for (auto &arg : pCallInst->arg_operands())
384 {
385 args.push_back(arg.get());
386 }
387 return B->CALLA(pX86IntrinFunc, args);
388 }
389
390 //////////////////////////////////////////////////////////////////////////
391         /// @brief LLVM function pass run method.
392         /// @param F - The function we're working on with this pass.
393 virtual bool runOnFunction(Function &F)
394 {
395 std::vector<Instruction *> toRemove;
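            // Replacements happen while walking the IR, but the dead meta calls are only erased
            // after the walk so the basic block instruction iterators stay valid.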
396
397 for (auto &BB : F.getBasicBlockList())
398 {
399 for (auto &I : BB.getInstList())
400 {
401 if (CallInst *pCallInst = dyn_cast<CallInst>(&I))
402 {
403 Function *pFunc = pCallInst->getCalledFunction();
404 if (pFunc)
405 {
406 if (pFunc->getName().startswith("meta.intrinsic"))
407 {
408 B->IRB()->SetInsertPoint(&I);
409 Instruction *pReplace = ProcessIntrinsic(pCallInst);
410 SWR_ASSERT(pReplace);
411 toRemove.push_back(pCallInst);
412 pCallInst->replaceAllUsesWith(pReplace);
413 }
414 }
415 }
416 }
417 }
418
419 for (auto *pInst : toRemove)
420 {
421 pInst->eraseFromParent();
422 }
423
424 JitManager::DumpToFile(&F, "lowerx86");
425
426 return true;
427 }
428
429 virtual void getAnalysisUsage(AnalysisUsage &AU) const {}
430
431 JitManager *JM() { return B->JM(); }
432
433 Builder *B;
434
435 TargetArch mTarget;
436
437 static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
438 };
439
440 char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
441
442 FunctionPass *createLowerX86Pass(Builder *b) { return new LowerX86(b); }
443
444 Instruction *NO_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
445 {
446 SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
447 return nullptr;
448 }
449
450 Instruction *VPERM_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
451 {
452 // Only need vperm emulation for AVX
453 SWR_ASSERT(arch == AVX);
454
455 Builder *B = pThis->B;
456 auto v32A = pCallInst->getArgOperand(0);
457 auto vi32Index = pCallInst->getArgOperand(1);
458
459 Value *v32Result;
460 if (isa<Constant>(vi32Index))
461 {
462             // Can use LLVM shufflevector directly with constant shuffle indices
463 v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
464 }
465 else
466 {
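            // Variable (non-constant) indices: emulate the permute one lane at a time by
            // extracting each index, fetching that element from the source, and inserting it
            // into the result.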
467 v32Result = UndefValue::get(v32A->getType());
468 for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
469 {
470 auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
471 auto val = B->VEXTRACT(v32A, i32Index);
472 v32Result = B->VINSERT(v32Result, val, B->C(l));
473 }
474 }
475 return cast<Instruction>(v32Result);
476 }
477
478 Instruction *
479 VGATHER_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
480 {
481 Builder *B = pThis->B;
482 auto vSrc = pCallInst->getArgOperand(0);
483 auto pBase = pCallInst->getArgOperand(1);
484 auto vi32Indices = pCallInst->getArgOperand(2);
485 auto vi1Mask = pCallInst->getArgOperand(3);
486 auto i8Scale = pCallInst->getArgOperand(4);
487
488 pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
489 uint32_t numElem = vSrc->getType()->getVectorNumElements();
490 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
491 auto srcTy = vSrc->getType()->getVectorElementType();
492 Value * v32Gather;
493 if (arch == AVX)
494 {
495 // Full emulation for AVX
496 // Store source on stack to provide a valid address to load from inactive lanes
497 auto pStack = B->STACKSAVE();
498 auto pTmp = B->ALLOCA(vSrc->getType());
499 B->STORE(vSrc, pTmp);
500
501 v32Gather = UndefValue::get(vSrc->getType());
502 auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
503 auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);
504
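            // Scalar per-lane gather: inactive lanes load from the stack copy of vSrc instead of
            // the computed address, so they keep their source value and avoid dereferencing a
            // potentially invalid address.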
505 for (uint32_t i = 0; i < numElem; ++i)
506 {
507 auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
508 auto pLoadAddress = B->GEP(pBase, i32Offset);
509 pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
510 auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
511 auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
512 auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
513 auto val = B->LOAD(pValidAddress);
514 v32Gather = B->VINSERT(v32Gather, val, B->C(i));
515 }
516
517 B->STACKRESTORE(pStack);
518 }
519 else if (arch == AVX2 || (arch == AVX512 && width == W256))
520 {
521 Function *pX86IntrinFunc;
522 if (srcTy == B->mFP32Ty)
523 {
524 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
525 Intrinsic::x86_avx2_gather_d_ps_256);
526 }
527 else if (srcTy == B->mInt32Ty)
528 {
529 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
530 Intrinsic::x86_avx2_gather_d_d_256);
531 }
532 else if (srcTy == B->mDoubleTy)
533 {
534 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
535 Intrinsic::x86_avx2_gather_d_q_256);
536 }
537 else
538 {
539 SWR_ASSERT(false, "Unsupported vector element type for gather.");
540 }
541
542 if (width == W256)
543 {
544 auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
545 v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
546 }
547 else if (width == W512)
548 {
549 // Double pump 4-wide for 64bit elements
550 if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
551 {
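                // The 4-wide AVX2 gather operates on 64-bit lanes, so the i1 mask is widened to
                // 64 bits per lane, and sources, indices and masks are split into two 4-wide
                // halves for the two gather calls.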
552 auto v64Mask = pThis->VectorMask(vi1Mask);
553 v64Mask = B->S_EXT(
554 v64Mask,
555 VectorType::get(B->mInt64Ty, v64Mask->getType()->getVectorNumElements()));
556 v64Mask = B->BITCAST(v64Mask, vSrc->getType());
557
558 Value *src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
559 Value *src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));
560
561 Value *indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
562 Value *indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));
563
564 Value *mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
565 Value *mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));
566
567 src0 = B->BITCAST(
568 src0,
569 VectorType::get(B->mInt64Ty, src0->getType()->getVectorNumElements()));
570 mask0 = B->BITCAST(
571 mask0,
572 VectorType::get(B->mInt64Ty, mask0->getType()->getVectorNumElements()));
573 Value *gather0 =
574 B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
575 src1 = B->BITCAST(
576 src1,
577 VectorType::get(B->mInt64Ty, src1->getType()->getVectorNumElements()));
578 mask1 = B->BITCAST(
579 mask1,
580 VectorType::get(B->mInt64Ty, mask1->getType()->getVectorNumElements()));
581 Value *gather1 =
582 B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
583
584 v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
585 v32Gather = B->BITCAST(v32Gather, vSrc->getType());
586 }
587 else
588 {
589 // Double pump 8-wide for 32bit elements
590 auto v32Mask = pThis->VectorMask(vi1Mask);
591 v32Mask = B->BITCAST(v32Mask, vSrc->getType());
592 Value *src0 = B->EXTRACT_16(vSrc, 0);
593 Value *src1 = B->EXTRACT_16(vSrc, 1);
594
595 Value *indices0 = B->EXTRACT_16(vi32Indices, 0);
596 Value *indices1 = B->EXTRACT_16(vi32Indices, 1);
597
598 Value *mask0 = B->EXTRACT_16(v32Mask, 0);
599 Value *mask1 = B->EXTRACT_16(v32Mask, 1);
600
601 Value *gather0 =
602 B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
603 Value *gather1 =
604 B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
605
606 v32Gather = B->JOIN_16(gather0, gather1);
607 }
608 }
609 }
610 else if (arch == AVX512)
611 {
612 Value * iMask;
613 Function *pX86IntrinFunc;
614 if (srcTy == B->mFP32Ty)
615 {
616 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
617 Intrinsic::x86_avx512_gather_dps_512);
618 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
619 }
620 else if (srcTy == B->mInt32Ty)
621 {
622 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
623 Intrinsic::x86_avx512_gather_dpi_512);
624 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
625 }
626 else if (srcTy == B->mDoubleTy)
627 {
628 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
629 Intrinsic::x86_avx512_gather_dpd_512);
630 iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
631 }
632 else
633 {
634 SWR_ASSERT(false, "Unsupported vector element type for gather.");
635 }
636
637 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
638 v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
639 }
640
641 return cast<Instruction>(v32Gather);
642 }
643
644     // No support for vroundps in AVX512 (it is available in KNCNI), so emulate with AVX
645     // instructions
646 Instruction *
647 VROUND_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
648 {
649 SWR_ASSERT(arch == AVX512);
650
651 auto B = pThis->B;
652 auto vf32Src = pCallInst->getOperand(0);
653 auto i8Round = pCallInst->getOperand(1);
654 auto pfnFunc =
655 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);
656
657 if (width == W256)
658 {
659 return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
660 }
661 else if (width == W512)
662 {
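            // 512-wide: split the 16-element source into two 8-wide halves, round each with the
            // 256-bit AVX intrinsic, then join the halves back together.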
663 auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
664 auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);
665
666 auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
667 auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);
668
669 return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
670 }
671 else
672 {
673 SWR_ASSERT(false, "Unimplemented vector width.");
674 }
675
676 return nullptr;
677 }
678
679 Instruction *VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
680 {
681 SWR_ASSERT(arch == AVX512);
682
683 auto B = pThis->B;
684 auto vf32Src = pCallInst->getOperand(0);
685
686 if (width == W256)
687 {
688 auto vf32SrcRound = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);
689 return cast<Instruction>(B->FP_TRUNC(vf32SrcRound, B->mFP32Ty));
690 }
691 else if (width == W512)
692 {
693 // 512 can use intrinsic
694 auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_mask_cvtpd2ps_512);
695 return cast<Instruction>(B->CALL(pfnFunc, vf32Src));
696 }
697 else
698 {
699 SWR_ASSERT(false, "Unimplemented vector width.");
700 }
701
702 return nullptr;
703 }
704
705 // No support for hsub in AVX512
706 Instruction *VHSUB_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
707 {
708 SWR_ASSERT(arch == AVX512);
709
710 auto B = pThis->B;
711 auto src0 = pCallInst->getOperand(0);
712 auto src1 = pCallInst->getOperand(1);
713
714 // 256b hsub can just use avx intrinsic
715 if (width == W256)
716 {
717 auto pX86IntrinFunc =
718 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
719 return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
720 }
721 else if (width == W512)
722 {
723 // 512b hsub can be accomplished with shuf/sub combo
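            // hsub subtracts each odd-indexed element from the even-indexed element beside it, so
            // one shuffle collects the even elements (minuend), the other the odd elements
            // (subtrahend), and a vector subtract finishes the job.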
724 auto minuend = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
725 auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
726 return cast<Instruction>(B->SUB(minuend, subtrahend));
727 }
728 else
729 {
730 SWR_ASSERT(false, "Unimplemented vector width.");
731 return nullptr;
732 }
733 }
734
735     // Double pump the 256-wide intrinsic passed in via 'intrin'. This blindly extracts lower and upper 256 from
736     // each vector argument, calls the 256 wide intrinsic on each half, then merges the results to 512 wide
737 Instruction *DOUBLE_EMU(LowerX86 * pThis,
738 TargetArch arch,
739 TargetWidth width,
740 CallInst * pCallInst,
741 Intrinsic::ID intrin)
742 {
743 auto B = pThis->B;
744 SWR_ASSERT(width == W512);
745 Value * result[2];
746 Function *pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
747 for (uint32_t i = 0; i < 2; ++i)
748 {
749 SmallVector<Value *, 8> args;
750 for (auto &arg : pCallInst->arg_operands())
751 {
752 auto argType = arg.get()->getType();
753 if (argType->isVectorTy())
754 {
755 uint32_t vecWidth = argType->getVectorNumElements();
756 Value * lanes = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
757 Value * argToPush = B->VSHUFFLE(
758 arg.get(), B->VUNDEF(argType->getVectorElementType(), vecWidth), lanes);
759 args.push_back(argToPush);
760 }
761 else
762 {
763 args.push_back(arg.get());
764 }
765 }
766 result[i] = B->CALLA(pX86IntrinFunc, args);
767 }
768 uint32_t vecWidth;
769 if (result[0]->getType()->isVectorTy())
770 {
771 assert(result[1]->getType()->isVectorTy());
772 vecWidth = result[0]->getType()->getVectorNumElements() +
773 result[1]->getType()->getVectorNumElements();
774 }
775 else
776 {
777 vecWidth = 2;
778 }
779 Value *lanes = B->CInc<int>(0, vecWidth);
780 return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
781 }
782
783 } // namespace SwrJit
784
785 using namespace SwrJit;
786
787 INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
788 INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)