1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file lower_x86.cpp
24 *
25 * @brief LLVM pass to lower meta intrinsics to x86
26 *
27 * Notes:
28 *
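 *    This pass replaces calls to the generic meta.intrinsic.* placeholders emitted by the
 *    builder with native x86 intrinsics (or emulation sequences) for the architecture detected
 *    at JIT time. Illustrative sketch (exact names follow the tables below): on AVX, a call such as
 *        %r = call <8 x float> @meta.intrinsic.VRCPPS(<8 x float> %a)
 *    is lowered to the native @llvm.x86.avx.rcp.ps.256 intrinsic, while operations with no native
 *    form on the target fall back to the *_EMU helpers defined in this file.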
29 ******************************************************************************/
30
31 #include "jit_pch.hpp"
32 #include "passes.h"
33 #include "JitManager.h"
34
35 #include <unordered_map>
36
37
38 namespace llvm
39 {
40         // Forward declare the pass initializer
41 void initializeLowerX86Pass(PassRegistry&);
42 }
43
44 namespace SwrJit
45 {
46 using namespace llvm;
47
48 enum TargetArch
49 {
50 AVX = 0,
51 AVX2 = 1,
52 AVX512 = 2
53 };
54
55 enum TargetWidth
56 {
57 W256 = 0,
58 W512 = 1,
59 NUM_WIDTHS = 2
60 };
61
62 struct LowerX86;
63
64 typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;
65
66 struct X86Intrinsic
67 {
68 Intrinsic::ID intrin[NUM_WIDTHS];
69 EmuFunc emuFunc;
70 };
71
72         // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these keep the previous
73         // behavior of mapping directly to AVX/AVX2 intrinsics.
74 static std::map<std::string, Intrinsic::ID> intrinsicMap = {
75 {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
76 {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
77 {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
78 {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
79 {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
80 {"meta.intrinsic.VFMADDPS", Intrinsic::x86_fma_vfmadd_ps_256},
81 {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
82 {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
83 {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
84 };
85
86 // Forward decls
87 Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
88 Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
89 Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
90 Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
91 Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
92
93 Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst, Intrinsic::ID intrin);
94
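    // Sentinel intrinsic ID: when a table entry below holds DOUBLE, the next smaller SIMD width's
    // intrinsic is double pumped to produce the wider result (see DOUBLE_EMU).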
95 static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
96
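    // Per-architecture lowering table, indexed first by TargetArch and then by meta intrinsic name.
    // Each entry supplies the native intrinsic for the 256-wide and 512-wide variants plus an
    // emulation callback used when no native intrinsic is available.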
97 static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
98 // 256 wide 512 wide
99 { // AVX
100 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
101 {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
102 {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
103 {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
104 {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
105 {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
106 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
107 {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
108 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
109 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
110 },
111 { // AVX2
112 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
113 {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
114 {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
115 {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
116 {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
117 {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
118 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
119 {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
120 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
121 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
122 },
123 { // AVX512
124 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
125 {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
126 {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
127 {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
128 {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
129 {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
130 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512 }, NO_EMU}},
131 {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512 }, NO_EMU}},
132 {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
133 {"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
134 }
135 };
136
137 struct LowerX86 : public FunctionPass
138 {
139 LowerX86(JitManager* pJitMgr = nullptr, Builder* b = nullptr)
140 : FunctionPass(ID), mpJitMgr(pJitMgr), B(b)
141 {
142 initializeLowerX86Pass(*PassRegistry::getPassRegistry());
143
144 // Determine target arch
145 if (mpJitMgr->mArch.AVX512F())
146 {
147 mTarget = AVX512;
148 }
149 else if (mpJitMgr->mArch.AVX2())
150 {
151 mTarget = AVX2;
152 }
153 else if (mpJitMgr->mArch.AVX())
154 {
155 mTarget = AVX;
157 }
158 else
159 {
160 SWR_ASSERT(false, "Unsupported AVX architecture.");
161 mTarget = AVX;
162 }
163 }
164
165         // Try to determine the vector type the intrinsic operates on. This does not work properly
166         // across all intrinsics and will have to be rethought. Something similar to LLVM's
167         // getDeclaration() utility, which maps a set of inputs to a specific typed intrinsic,
168         // is probably needed.
169 void GetRequestedWidthAndType(CallInst* pCallInst, const StringRef intrinName, TargetWidth* pWidth, Type** pTy)
170 {
171 Type* pVecTy = pCallInst->getType();
172
173 // Check for intrinsic specific types
174 // VCVTPD2PS type comes from src, not dst
175 if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
176 {
177 pVecTy = pCallInst->getOperand(0)->getType();
178 }
179
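            // If the call's return type isn't a vector, fall back to the first vector-typed argument.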
180 if (!pVecTy->isVectorTy())
181 {
182 for (auto& op : pCallInst->arg_operands())
183 {
184 if (op.get()->getType()->isVectorTy())
185 {
186 pVecTy = op.get()->getType();
187 break;
188 }
189 }
190 }
191 SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");
192
193 uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
194 switch (width)
195 {
196 case 256: *pWidth = W256; break;
197 case 512: *pWidth = W512; break;
198 default: SWR_ASSERT(false, "Unhandled vector width %d", width);
199 *pWidth = W256;
200 }
201
202 *pTy = pVecTy->getScalarType();
203 }
204
205 Value* GetZeroVec(TargetWidth width, Type* pTy)
206 {
207 uint32_t numElem = 0;
208 switch (width)
209 {
210 case W256: numElem = 8; break;
211 case W512: numElem = 16; break;
212 default: SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
213 }
214
215 return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
216 }
217
218 Value* GetMask(TargetWidth width)
219 {
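            // All-lanes-enabled execution mask: 8 mask bits for W256, 16 for W512.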
220 Value* mask;
221 switch (width)
222 {
223 case W256: mask = B->C((uint8_t)-1); break;
224 case W512: mask = B->C((uint16_t)-1); break;
225 default: SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
226 }
227 return mask;
228 }
229
230 // Convert <N x i1> mask to <N x i32> x86 mask
231 Value* VectorMask(Value* vi1Mask)
232 {
233 uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
234 return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem));
235 }
236
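        // Lower one meta intrinsic call using the per-arch table: DOUBLE entries are double pumped
        // through the next smaller width, valid intrinsic IDs are called directly (with the extra
        // source/mask operands on AVX512), and anything else goes through the emulation callback.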
237 Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
238 {
239 Function* pFunc = pCallInst->getCalledFunction();
240 auto& intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
241 TargetWidth vecWidth;
242 Type* pElemTy;
243 GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);
244
245 // Check if there is a native intrinsic for this instruction
246 Intrinsic::ID id = intrinsic.intrin[vecWidth];
247 if (id == DOUBLE)
248 {
249 // Double pump the next smaller SIMD intrinsic
250 SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
251 Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
252 SWR_ASSERT(id2 != Intrinsic::not_intrinsic, "Cannot find intrinsic to double pump.");
253 return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
254 }
255 else if (id != Intrinsic::not_intrinsic)
256 {
257 Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
258 SmallVector<Value*, 8> args;
259 for (auto& arg : pCallInst->arg_operands())
260 {
261 args.push_back(arg.get());
262 }
263
264                 // On AVX512, the masked intrinsics take an extra source operand and mask. Pass a zero source and a full
265                 // mask for now, assuming the intrinsics are consistent and place these last in the argument list.
266 if (mTarget == AVX512)
267 {
268 if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS")) {
269 args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
270 args.push_back(GetMask(W256));
271 // for AVX512 VCVTPD2PS, we also have to add rounding mode
272 args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT |
273 _MM_FROUND_NO_EXC));
274 } else {
275 args.push_back(GetZeroVec(vecWidth, pElemTy));
276 args.push_back(GetMask(vecWidth));
277 }
278 }
279
280 return B->CALLA(pIntrin, args);
281 }
282 else
283 {
284 // No native intrinsic, call emulation function
285 return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
286 }
287
288 SWR_ASSERT(false);
289 return nullptr;
290 }
291
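        // Lower a single meta intrinsic call, preferring the per-arch table and falling back to the
        // legacy direct AVX/AVX2 mapping.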
292 Instruction* ProcessIntrinsic(CallInst* pCallInst)
293 {
294 Function* pFunc = pCallInst->getCalledFunction();
295
296 // Forward to the advanced support if found
297 if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
298 {
299 return ProcessIntrinsicAdvanced(pCallInst);
300 }
301
302 SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(), "Unimplemented intrinsic %s.", pFunc->getName());
303
304 Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
305 Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
306
307 SmallVector<Value*, 8> args;
308 for (auto& arg : pCallInst->arg_operands())
309 {
310 args.push_back(arg.get());
311 }
312 return B->CALLA(pX86IntrinFunc, args);
313 }
314
315 //////////////////////////////////////////////////////////////////////////
316         /// @brief LLVM function pass run method.
317         /// @param F - The function this pass operates on.
318 virtual bool runOnFunction(Function& F)
319 {
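            // Collect the meta.intrinsic calls first and erase them after the walk so the
            // instruction iterators stay valid while replacements are generated in place.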
320 std::vector<Instruction*> toRemove;
321
322 for (auto& BB : F.getBasicBlockList())
323 {
324 for (auto& I : BB.getInstList())
325 {
326 if (CallInst* pCallInst = dyn_cast<CallInst>(&I))
327 {
328 Function* pFunc = pCallInst->getCalledFunction();
329 if (pFunc)
330 {
331 if (pFunc->getName().startswith("meta.intrinsic"))
332 {
333 B->IRB()->SetInsertPoint(&I);
334 Instruction* pReplace = ProcessIntrinsic(pCallInst);
335 SWR_ASSERT(pReplace);
336 toRemove.push_back(pCallInst);
337 pCallInst->replaceAllUsesWith(pReplace);
338 }
339 }
340
341 }
342 }
343 }
344
345 for (auto* pInst : toRemove)
346 {
347 pInst->eraseFromParent();
348 }
349
350 JitManager::DumpToFile(&F, "lowerx86");
351
352 return true;
353 }
354
355 virtual void getAnalysisUsage(AnalysisUsage& AU) const
356 {
357 }
358
359 JitManager* JM() { return mpJitMgr; }
360
361 JitManager* mpJitMgr;
362 Builder* B;
363
364 TargetArch mTarget;
365
366 static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
367 };
368
369 char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
370
371 FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b)
372 {
373 return new LowerX86(pJitMgr, b);
374 }
375
376 Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
377 {
378 SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
379 return nullptr;
380 }
381
382 Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
383 {
384 // Only need vperm emulation for AVX
385 SWR_ASSERT(arch == AVX);
386
387 Builder* B = pThis->B;
388 auto v32A = pCallInst->getArgOperand(0);
389 auto vi32Index = pCallInst->getArgOperand(1);
390
391 Value* v32Result;
392 if (isa<Constant>(vi32Index))
393 {
394 // Can use llvm shuffle vector directly with constant shuffle indices
395 v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
396 }
397 else
398 {
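                // Variable indices: emulate the dynamic permute with a per-lane extract/insert loop.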
399 v32Result = UndefValue::get(v32A->getType());
400 for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
401 {
402 auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
403 auto val = B->VEXTRACT(v32A, i32Index);
404 v32Result = B->VINSERT(v32Result, val, B->C(l));
405 }
406 }
407 return cast<Instruction>(v32Result);
408 }
409
410 Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
411 {
412 Builder* B = pThis->B;
413 auto vSrc = pCallInst->getArgOperand(0);
414 auto pBase = pCallInst->getArgOperand(1);
415 auto vi32Indices = pCallInst->getArgOperand(2);
416 auto vi1Mask = pCallInst->getArgOperand(3);
417 auto i8Scale = pCallInst->getArgOperand(4);
418
419 pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
420 uint32_t numElem = vSrc->getType()->getVectorNumElements();
421 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
422 auto srcTy = vSrc->getType()->getVectorElementType();
423 Value* v32Gather;
424 if (arch == AVX)
425 {
426             // Full emulation for AVX:
427             // store the source on the stack so inactive lanes have a valid address to load from.
428 auto pStack = B->STACKSAVE();
429 auto pTmp = B->ALLOCA(vSrc->getType());
430 B->STORE(vSrc, pTmp);
431
432 v32Gather = UndefValue::get(vSrc->getType());
433 auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
434 auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);
435
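            // For each lane, load from the computed address when the mask bit is set; otherwise
            // reload the lane's original value from the stack copy.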
436 for (uint32_t i = 0; i < numElem; ++i)
437 {
438 auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
439 auto pLoadAddress = B->GEP(pBase, i32Offset);
440 pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
441 auto pMaskedLoadAddress = B->GEP(pTmp, { 0, i });
442 auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
443 auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
444 auto val = B->LOAD(pValidAddress);
445 v32Gather = B->VINSERT(v32Gather, val, B->C(i));
446 }
447
448 B->STACKRESTORE(pStack);
449 }
450 else if (arch == AVX2 || (arch == AVX512 && width == W256))
451 {
452 Function* pX86IntrinFunc;
453 if (srcTy == B->mFP32Ty)
454 {
455 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_ps_256);
456 }
457 else if (srcTy == B->mInt32Ty)
458 {
459 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_d_256);
460 }
461 else if (srcTy == B->mDoubleTy)
462 {
463 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_q_256);
464 }
465 else
466 {
467 SWR_ASSERT(false, "Unsupported vector element type for gather.");
468 }
469
470 if (width == W256)
471 {
472 auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
473 v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, v32Mask, i8Scale });
474 }
475 else if (width == W512)
476 {
477 // Double pump 4-wide for 64bit elements
478 if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
479 {
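                    // The 4-wide 64-bit gather takes i64 source and mask vectors: widen the mask to
                    // 64 bits, then split source, indices, and mask into two 4-wide halves.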
480 auto v64Mask = pThis->VectorMask(vi1Mask);
481 v64Mask = B->S_EXT(v64Mask,
482 VectorType::get(B->mInt64Ty, v64Mask->getType()->getVectorNumElements()));
483 v64Mask = B->BITCAST(v64Mask, vSrc->getType());
484
485 Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({ 0, 1, 2, 3 }));
486 Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({ 4, 5, 6, 7 }));
487
488 Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({ 0, 1, 2, 3 }));
489 Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({ 4, 5, 6, 7 }));
490
491 Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({ 0, 1, 2, 3 }));
492 Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({ 4, 5, 6, 7 }));
493
494 src0 = B->BITCAST(src0, VectorType::get(B->mInt64Ty, src0->getType()->getVectorNumElements()));
495 mask0 = B->BITCAST(mask0, VectorType::get(B->mInt64Ty, mask0->getType()->getVectorNumElements()));
496 Value* gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale });
497 src1 = B->BITCAST(src1, VectorType::get(B->mInt64Ty, src1->getType()->getVectorNumElements()));
498 mask1 = B->BITCAST(mask1, VectorType::get(B->mInt64Ty, mask1->getType()->getVectorNumElements()));
499 Value* gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale });
500
501 v32Gather = B->VSHUFFLE(gather0, gather1, B->C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
502 v32Gather = B->BITCAST(v32Gather, vSrc->getType());
503 }
504 else
505 {
506 // Double pump 8-wide for 32bit elements
507 auto v32Mask = pThis->VectorMask(vi1Mask);
508 v32Mask = B->BITCAST(v32Mask, vSrc->getType());
509 Value* src0 = B->EXTRACT_16(vSrc, 0);
510 Value* src1 = B->EXTRACT_16(vSrc, 1);
511
512 Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
513 Value* indices1 = B->EXTRACT_16(vi32Indices, 1);
514
515 Value* mask0 = B->EXTRACT_16(v32Mask, 0);
516 Value* mask1 = B->EXTRACT_16(v32Mask, 1);
517
518 Value* gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale });
519 Value* gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale });
520
521 v32Gather = B->JOIN_16(gather0, gather1);
522 }
523 }
524 }
525 else if (arch == AVX512)
526 {
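            // Native 512-wide gathers take an integer bitmask: i16 for 32-bit elements, i8 for 64-bit elements.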
527 Value* iMask;
528 Function* pX86IntrinFunc;
529 if (srcTy == B->mFP32Ty)
530 {
531 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dps_512);
532 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
533 }
534 else if (srcTy == B->mInt32Ty)
535 {
536 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpi_512);
537 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
538 }
539 else if (srcTy == B->mDoubleTy)
540 {
541 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpd_512);
542 iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
543 }
544 else
545 {
546 SWR_ASSERT(false, "Unsupported vector element type for gather.");
547 }
548
549 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
550 v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, iMask, i32Scale });
551 }
552
553 return cast<Instruction>(v32Gather);
554 }
555
556     // AVX512 has no vroundps (it is only available in kncni), so emulate it with AVX instructions
557 Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
558 {
559 SWR_ASSERT(arch == AVX512);
560
561 auto B = pThis->B;
562 auto vf32Src = pCallInst->getOperand(0);
563 auto i8Round = pCallInst->getOperand(1);
564 auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);
565
566 if (width == W256)
567 {
568 return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
569 }
570 else if (width == W512)
571 {
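            // Split the 16-wide source into two 8-wide halves, round each with the AVX intrinsic, and rejoin.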
572 auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
573 auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);
574
575 auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
576 auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);
577
578 return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
579 }
580 else
581 {
582 SWR_ASSERT(false, "Unimplemented vector width.");
583 }
584
585 return nullptr;
586 }
587
588 // No support for hsub in AVX512
589 Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
590 {
591 SWR_ASSERT(arch == AVX512);
592
593 auto B = pThis->B;
594 auto src0 = pCallInst->getOperand(0);
595 auto src1 = pCallInst->getOperand(1);
596
597 // 256b hsub can just use avx intrinsic
598 if (width == W256)
599 {
600 auto pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
601 return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
602 }
603 else if (width == W512)
604 {
605 // 512b hsub can be accomplished with shuf/sub combo
606 auto minuend = B->VSHUFFLE(src0, src1, B->C({ 0, 2, 8, 10, 4, 6, 12, 14 }));
607 auto subtrahend = B->VSHUFFLE(src0, src1, B->C({ 1, 3, 9, 11, 5, 7, 13, 15 }));
608 return cast<Instruction>(B->SUB(minuend, subtrahend));
609 }
610 else
611 {
612 SWR_ASSERT(false, "Unimplemented vector width.");
613 return nullptr;
614 }
615 }
616
617     // Double pump the inputs through the supplied 256-wide intrinsic. This blindly extracts the lower and upper 256 bits
618     // of each vector argument, calls the 256-wide intrinsic on each half, then merges the results back to 512 wide.
619 Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst, Intrinsic::ID intrin)
620 {
621 auto B = pThis->B;
622 SWR_ASSERT(width == W512);
623 Value* result[2];
624 Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
625 for (uint32_t i = 0; i < 2; ++i)
626 {
627 SmallVector<Value*, 8> args;
628 for (auto& arg : pCallInst->arg_operands())
629 {
630 auto argType = arg.get()->getType();
631 if (argType->isVectorTy())
632 {
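                    // Select the lower (i == 0) or upper (i == 1) half of this vector argument.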
633 uint32_t vecWidth = argType->getVectorNumElements();
634 Value *lanes = B->CInc<int>(i*vecWidth/2, vecWidth/2);
635 Value *argToPush = B->VSHUFFLE(arg.get(), B->VUNDEF(argType->getVectorElementType(), vecWidth), lanes);
636 args.push_back(argToPush);
637 }
638 else
639 {
640 args.push_back(arg.get());
641 }
642 }
643 result[i] = B->CALLA(pX86IntrinFunc, args);
644 }
645 uint32_t vecWidth;
646 if (result[0]->getType()->isVectorTy())
647 {
648 assert(result[1]->getType()->isVectorTy());
649 vecWidth = result[0]->getType()->getVectorNumElements() +
650 result[1]->getType()->getVectorNumElements();
651 }
652 else
653 {
654 vecWidth = 2;
655 }
656 Value *lanes = B->CInc<int>(0, vecWidth);
657 return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
658 }
659
660 }
661
662 using namespace SwrJit;
663
664 INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
665 INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)
666