src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp

   1 /****************************************************************************
   2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * @file lower_x86.cpp
  24  *
  25  * @brief llvm pass to lower meta code to x86
  26  *
  27  * Notes:
  28  *
  29  ******************************************************************************/
  30
  31 #include "jit_pch.hpp"
  32 #include "passes.h"
  33 #include "JitManager.h"
  34
  35 #include "common/simdlib.hpp"
  36
  37 #include <unordered_map>
  38
     // Host-side 256-wide scatter routine, declared here and defined elsewhere. The LowerX86
     // constructor registers its address under the name "ScatterPS_256" and declares a matching
     // prototype in the current module (pBase, vIndices, vSrc, mask, scale).
  39 extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t);
  40
  41 namespace llvm
  42 {
  43     // Forward declare the pass initializer; the LowerX86 constructor calls it with the
         // global PassRegistry.
  44     void initializeLowerX86Pass(PassRegistry&);
  45 } // namespace llvm
  46
  47 namespace SwrJit
  48 {
  49     using namespace llvm;
  50
  51     enum TargetArch
  52     {
  53         AVX    = 0,
  54         AVX2   = 1,
  55         AVX512 = 2
  56     };
  57
  58     enum TargetWidth
  59     {
  60         W256       = 0,
  61         W512       = 1,
  62         NUM_WIDTHS = 2
  63     };
  64
  65     struct LowerX86;
  66
  67     typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;
  68
  69     struct X86Intrinsic
  70     {
  71         IntrinsicID intrin[NUM_WIDTHS];
  72         EmuFunc       emuFunc;
  73     };
  74
  75     // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
  76     // previous behavior of mapping directly to avx/avx2 intrinsics.
         // Keys are the names of the meta intrinsic functions. ProcessIntrinsic consults this map
         // only when the name is absent from intrinsicMap2 for the current target, and forwards the
         // call's operands unchanged to the mapped intrinsic.
  77     static std::map<std::string, IntrinsicID> intrinsicMap = {
  78         {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
  79         {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
  80         {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
  81         {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
  82         {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
  83         {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
  84         {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
  85         {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
  86     };
  87
  88     // Forward decls of the emulation routines referenced by the intrinsicMap2 table below.
         // Except for DOUBLE_EMU they all share the EmuFunc signature.
  89     Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
  90     Instruction*
  91     VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
  92     Instruction*
  93     VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
  94     Instruction*
  95     VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
  96     Instruction*
  97     VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
  98     Instruction*
  99     VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
 100     Instruction*
 101     VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
 102
         // Takes one extra argument compared to EmuFunc: the narrower intrinsic to double pump.
         // It is called directly by ProcessIntrinsicAdvanced rather than through the table.
 103     Instruction* DOUBLE_EMU(LowerX86*     pThis,
 104                             TargetArch    arch,
 105                             TargetWidth   width,
 106                             CallInst*     pCallInst,
 107                             Intrinsic::ID intrin);
 108
         // Sentinel stored in an X86Intrinsic::intrin slot. When ProcessIntrinsicAdvanced finds it,
         // it looks up the slot of the next narrower width and passes that intrinsic to DOUBLE_EMU.
 109     static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
 110
 111     // clang-format off
         // Per-architecture lowering table, indexed by TargetArch (AVX, AVX2, AVX512). Each entry maps
         // a meta intrinsic name to {native intrinsic for 256 wide, native intrinsic for 512 wide}
         // plus an emulation routine. ProcessIntrinsicAdvanced reads the selected slot as follows:
         //   - DOUBLE                   -> double pump the intrinsic in the next narrower slot
         //   - Intrinsic::not_intrinsic -> call the entry's emulation routine
         //   - anything else            -> call that native intrinsic
 112     static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
 113         //                               256 wide                               512 wide
 114         {
 115             // AVX
 116             {"meta.intrinsic.VRCPPS",    {{Intrinsic::x86_avx_rcp_ps_256,       DOUBLE},                    NO_EMU}},
 117             {"meta.intrinsic.VPERMPS",   {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic},  VPERM_EMU}},
 118             {"meta.intrinsic.VPERMD",    {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic},  VPERM_EMU}},
 119             {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic},  VGATHER_EMU}},
 120             {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic},  VGATHER_EMU}},
 121             {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic},  VGATHER_EMU}},
 122             {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic}, VSCATTER_EMU}},
 123             {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256,   Intrinsic::not_intrinsic},  NO_EMU}},
 124             {"meta.intrinsic.VROUND",    {{Intrinsic::x86_avx_round_ps_256,     DOUBLE},                    NO_EMU}},
 125             {"meta.intrinsic.VHSUBPS",   {{Intrinsic::x86_avx_hsub_ps_256,      DOUBLE},                    NO_EMU}},
 126         },
 127         {
 128             // AVX2
 129             {"meta.intrinsic.VRCPPS",       {{Intrinsic::x86_avx_rcp_ps_256,    DOUBLE},                    NO_EMU}},
 130             {"meta.intrinsic.VPERMPS",      {{Intrinsic::x86_avx2_permps,       Intrinsic::not_intrinsic},  VPERM_EMU}},
 131             {"meta.intrinsic.VPERMD",       {{Intrinsic::x86_avx2_permd,        Intrinsic::not_intrinsic},  VPERM_EMU}},
 132             {"meta.intrinsic.VGATHERPD",    {{Intrinsic::not_intrinsic,         Intrinsic::not_intrinsic},  VGATHER_EMU}},
 133             {"meta.intrinsic.VGATHERPS",    {{Intrinsic::not_intrinsic,         Intrinsic::not_intrinsic},  VGATHER_EMU}},
 134             {"meta.intrinsic.VGATHERDD",    {{Intrinsic::not_intrinsic,         Intrinsic::not_intrinsic},  VGATHER_EMU}},
 135             {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic}, VSCATTER_EMU}},
 136             {"meta.intrinsic.VCVTPD2PS",    {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE},                   NO_EMU}},
 137             {"meta.intrinsic.VROUND",       {{Intrinsic::x86_avx_round_ps_256,  DOUBLE},                    NO_EMU}},
 138             {"meta.intrinsic.VHSUBPS",      {{Intrinsic::x86_avx_hsub_ps_256,   DOUBLE},                    NO_EMU}},
 139         },
 140         {
 141             // AVX512
 142             {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256,     Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
                 // The masked permvar intrinsics are only referenced when building against LLVM < 7;
                 // newer LLVM versions fall back to the emulation routine.
 143 #if LLVM_VERSION_MAJOR < 7
 144             {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
 145             {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
 146 #else
 147             {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic,              Intrinsic::not_intrinsic}, VPERM_EMU}},
 148             {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic,               Intrinsic::not_intrinsic}, VPERM_EMU}},
 149 #endif
 150             {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic}, VGATHER_EMU}},
 151             {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic}, VGATHER_EMU}},
 152             {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic}, VGATHER_EMU}},
 153             {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic,           Intrinsic::not_intrinsic}, VSCATTER_EMU}},
                 // Same split for the masked cvtpd2ps intrinsics: native before LLVM 7, emulated after.
 154 #if LLVM_VERSION_MAJOR < 7
 155             {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}},
 156 #else
 157             {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::not_intrinsic,            Intrinsic::not_intrinsic}, VCONVERT_EMU}},
 158 #endif
 159             {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic,               Intrinsic::not_intrinsic}, VROUND_EMU}},
 160             {"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic,              Intrinsic::not_intrinsic}, VHSUB_EMU}},
 161         }};
 162     // clang-format on
 163
 164     static uint32_t getBitWidth(VectorType *pVTy)
 165     {
 166 #if LLVM_VERSION_MAJOR >= 11
 167         return pVTy->getNumElements() * pVTy->getElementType()->getPrimitiveSizeInBits();
 168 #else
 169         return pVTy->getBitWidth();
 170 #endif
 171     }
 172
 173     struct LowerX86 : public FunctionPass
 174     {
 175         LowerX86(Builder* b = nullptr) : FunctionPass(ID), B(b)
 176         {
 177             initializeLowerX86Pass(*PassRegistry::getPassRegistry());
 178
 179             // Determine target arch
 180             if (JM()->mArch.AVX512F())
 181             {
 182                 mTarget = AVX512;
 183             }
 184             else if (JM()->mArch.AVX2())
 185             {
 186                 mTarget = AVX2;
 187             }
 188             else if (JM()->mArch.AVX())
 189             {
 190                 mTarget = AVX;
 191             }
 192             else
 193             {
 194                 SWR_ASSERT(false, "Unsupported AVX architecture.");
 195                 mTarget = AVX;
 196             }
 197
 198             // Setup scatter function for 256 wide
 199             uint32_t curWidth = B->mVWidth;
 200             B->SetTargetWidth(8);
 201             std::vector<Type*> args = {
 202                 B->mInt8PtrTy,   // pBase
 203                 B->mSimdInt32Ty, // vIndices
 204                 B->mSimdFP32Ty,  // vSrc
 205                 B->mInt8Ty,      // mask
 206                 B->mInt32Ty      // scale
 207             };
 208
 209             FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false);
 210             mPfnScatter256             = cast<Function>(
 211 #if LLVM_VERSION_MAJOR >= 9
 212                 B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy).getCallee());
 213 #else
 214                 B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy));
 215 #endif
 216             if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr)
 217             {
 218                 sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256);
 219             }
 220
 221             B->SetTargetWidth(curWidth);
 222         }
 223
 224         // Try to decipher the vector type of the instruction. This does not work properly
 225         // across all intrinsics, and will have to be rethought. Probably need something
 226         // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
 227         // intrinsic.
 228         void GetRequestedWidthAndType(CallInst*       pCallInst,
 229                                       const StringRef intrinName,
 230                                       TargetWidth*    pWidth,
 231                                       Type**          pTy)
 232         {
 233             assert(pCallInst);
 234             Type* pVecTy = pCallInst->getType();
 235
 236             // Check for intrinsic specific types
 237             // VCVTPD2PS type comes from src, not dst
 238             if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
 239             {
 240                 Value* pOp = pCallInst->getOperand(0);
 241                 assert(pOp);
 242                 pVecTy = pOp->getType();
 243             }
 244
 245             if (!pVecTy->isVectorTy())
 246             {
 247                 for (auto& op : pCallInst->arg_operands())
 248                 {
 249                     if (op.get()->getType()->isVectorTy())
 250                     {
 251                         pVecTy = op.get()->getType();
 252                         break;
 253                     }
 254                 }
 255             }
 256             SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");
 257
 258             uint32_t width = getBitWidth(cast<VectorType>(pVecTy));
 259             switch (width)
 260             {
 261             case 256:
 262                 *pWidth = W256;
 263                 break;
 264             case 512:
 265                 *pWidth = W512;
 266                 break;
 267             default:
 268                 SWR_ASSERT(false, "Unhandled vector width %d", width);
 269                 *pWidth = W256;
 270             }
 271
 272             *pTy = pVecTy->getScalarType();
 273         }
 274
 275         Value* GetZeroVec(TargetWidth width, Type* pTy)
 276         {
 277             uint32_t numElem = 0;
 278             switch (width)
 279             {
 280             case W256:
 281                 numElem = 8;
 282                 break;
 283             case W512:
 284                 numElem = 16;
 285                 break;
 286             default:
 287                 SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
 288             }
 289
 290             return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
 291         }
 292
 293         Value* GetMask(TargetWidth width)
 294         {
 295             Value* mask;
 296             switch (width)
 297             {
 298             case W256:
 299                 mask = B->C((uint8_t)-1);
 300                 break;
 301             case W512:
 302                 mask = B->C((uint16_t)-1);
 303                 break;
 304             default:
 305                 SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
 306             }
 307             return mask;
 308         }
 309
 310         // Convert <N x i1> mask to <N x i32> x86 mask
 311         Value* VectorMask(Value* vi1Mask)
 312         {
 313 #if LLVM_VERSION_MAJOR >= 11
 314             uint32_t numElem = cast<VectorType>(vi1Mask->getType())->getNumElements();
 315 #else
 316             uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
 317 #endif
 318             return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem));
 319         }
 320
 321         Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
 322         {
 323             Function*   pFunc = pCallInst->getCalledFunction();
 324             assert(pFunc);
 325
 326             auto&       intrinsic = intrinsicMap2[mTarget][pFunc->getName().str()];
 327             TargetWidth vecWidth;
 328             Type*       pElemTy;
 329             GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);
 330
 331             // Check if there is a native intrinsic for this instruction
 332             IntrinsicID id = intrinsic.intrin[vecWidth];
 333             if (id == DOUBLE)
 334             {
 335                 // Double pump the next smaller SIMD intrinsic
 336                 SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
 337                 Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
 338                 SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
 339                            "Cannot find intrinsic to double pump.");
 340                 return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
 341             }
 342             else if (id != Intrinsic::not_intrinsic)
 343             {
 344                 Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
 345                 SmallVector<Value*, 8> args;
 346                 for (auto& arg : pCallInst->arg_operands())
 347                 {
 348                     args.push_back(arg.get());
 349                 }
 350
 351                 // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and
 352                 // full mask for now Assuming the intrinsics are consistent and place the src
 353                 // operand and mask last in the argument list.
 354                 if (mTarget == AVX512)
 355                 {
 356                     if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
 357                     {
 358                         args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
 359                         args.push_back(GetMask(W256));
 360                         // for AVX512 VCVTPD2PS, we also have to add rounding mode
 361                         args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
 362                     }
 363                     else
 364                     {
 365                         args.push_back(GetZeroVec(vecWidth, pElemTy));
 366                         args.push_back(GetMask(vecWidth));
 367                     }
 368                 }
 369
 370                 return B->CALLA(pIntrin, args);
 371             }
 372             else
 373             {
 374                 // No native intrinsic, call emulation function
 375                 return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
 376             }
 377
 378             SWR_ASSERT(false);
 379             return nullptr;
 380         }
 381
 382         Instruction* ProcessIntrinsic(CallInst* pCallInst)
 383         {
 384             Function* pFunc = pCallInst->getCalledFunction();
 385             assert(pFunc);
 386
 387             // Forward to the advanced support if found
 388             if (intrinsicMap2[mTarget].find(pFunc->getName().str()) != intrinsicMap2[mTarget].end())
 389             {
 390                 return ProcessIntrinsicAdvanced(pCallInst);
 391             }
 392
 393             SWR_ASSERT(intrinsicMap.find(pFunc->getName().str()) != intrinsicMap.end(),
 394                        "Unimplemented intrinsic %s.",
 395                        pFunc->getName().str().c_str());
 396
 397             Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName().str()];
 398             Function*     pX86IntrinFunc =
 399                 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
 400
 401             SmallVector<Value*, 8> args;
 402             for (auto& arg : pCallInst->arg_operands())
 403             {
 404                 args.push_back(arg.get());
 405             }
 406             return B->CALLA(pX86IntrinFunc, args);
 407         }
 408
 409         //////////////////////////////////////////////////////////////////////////
 410         /// @brief LLVM funtion pass run method.
 411         /// @param f- The function we're working on with this pass.
 412         virtual bool runOnFunction(Function& F)
 413         {
 414             std::vector<Instruction*> toRemove;
 415             std::vector<BasicBlock*>  bbs;
 416
 417             // Make temp copy of the basic blocks and instructions, as the intrinsic
 418             // replacement code might invalidate the iterators
 419             for (auto& b : F.getBasicBlockList())
 420             {
 421                 bbs.push_back(&b);
 422             }
 423
 424             for (auto* BB : bbs)
 425             {
 426                 std::vector<Instruction*> insts;
 427                 for (auto& i : BB->getInstList())
 428                 {
 429                     insts.push_back(&i);
 430                 }
 431
 432                 for (auto* I : insts)
 433                 {
 434                     if (CallInst* pCallInst = dyn_cast<CallInst>(I))
 435                     {
 436                         Function* pFunc = pCallInst->getCalledFunction();
 437                         if (pFunc)
 438                         {
 439                             if (pFunc->getName().startswith("meta.intrinsic"))
 440                             {
 441                                 B->IRB()->SetInsertPoint(I);
 442                                 Instruction* pReplace = ProcessIntrinsic(pCallInst);
 443                                 toRemove.push_back(pCallInst);
 444                                 if (pReplace)
 445                                 {
 446                                     pCallInst->replaceAllUsesWith(pReplace);
 447                                 }
 448                             }
 449                         }
 450                     }
 451                 }
 452             }
 453
 454             for (auto* pInst : toRemove)
 455             {
 456                 pInst->eraseFromParent();
 457             }
 458
 459             JitManager::DumpToFile(&F, "lowerx86");
 460
 461             return true;
 462         }
 463
 464         virtual void getAnalysisUsage(AnalysisUsage& AU) const {}
 465
 466         JitManager* JM() { return B->JM(); }
 467         Builder*    B;
 468         TargetArch  mTarget;
 469         Function*   mPfnScatter256;
 470
 471         static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
 472     };
 473
 474     char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
 475
         // Factory for the pass: returns a heap-allocated LowerX86 bound to the given Builder.
         // NOTE(review): ownership presumably passes to whoever adds the pass to a pass manager;
         // confirm at the call sites.
 476     FunctionPass* createLowerX86Pass(Builder* b) { return new LowerX86(b); }
 477
         // Emulation stub used by table entries that have no emulation routine. Reaching it
         // triggers the assert below and returns nullptr, i.e. no replacement instruction.
         // All parameters are unused.
 478     Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
 479     {
 480         SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
 481         return nullptr;
 482     }
 483
 484     Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
 485     {
 486         // Only need vperm emulation for AVX
 487         SWR_ASSERT(arch == AVX);
 488
 489         Builder* B         = pThis->B;
 490         auto     v32A      = pCallInst->getArgOperand(0);
 491         auto     vi32Index = pCallInst->getArgOperand(1);
 492
 493         Value* v32Result;
 494         if (isa<Constant>(vi32Index))
 495         {
 496             // Can use llvm shuffle vector directly with constant shuffle indices
 497             v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
 498         }
 499         else
 500         {
 501             v32Result = UndefValue::get(v32A->getType());
 502 #if LLVM_VERSION_MAJOR >= 11
 503             uint32_t numElem = cast<VectorType>(v32A->getType())->getNumElements();
 504 #else
 505             uint32_t numElem = v32A->getType()->getVectorNumElements();
 506 #endif
 507             for (uint32_t l = 0; l < numElem; ++l)
 508             {
 509                 auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
 510                 auto val      = B->VEXTRACT(v32A, i32Index);
 511                 v32Result     = B->VINSERT(v32Result, val, B->C(l));
 512             }
 513         }
 514         return cast<Instruction>(v32Result);
 515     }
 516
         //////////////////////////////////////////////////////////////////////////
         /// @brief Lower a meta gather call for the given architecture and width.
         ///        Call operands: 0 = source vector (its lanes are what masked-off lanes load
         ///        in the AVX path), 1 = base pointer, 2 = 32-bit indices, 3 = <N x i1> lane
         ///        mask, 4 = i8 scale applied to the indices.
         /// @param pThis - Pass instance; supplies the Builder and the VectorMask helper.
         /// @param arch - Selects per-lane emulation (AVX), AVX2 gathers, or AVX512 gathers.
         /// @param width - Requested SIMD width; 512 wide on the AVX2 path is split in two halves.
         /// @param pCallInst - The meta gather call being replaced.
         /// @return The gathered vector value, cast to Instruction.
 517     Instruction*
 518     VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
 519     {
 520         Builder* B           = pThis->B;
 521         auto     vSrc        = pCallInst->getArgOperand(0);
 522         auto     pBase       = pCallInst->getArgOperand(1);
 523         auto     vi32Indices = pCallInst->getArgOperand(2);
 524         auto     vi1Mask     = pCallInst->getArgOperand(3);
 525         auto     i8Scale     = pCallInst->getArgOperand(4);
 526
             // Address arithmetic below is done in bytes, so view the base as an i8 pointer.
 527         pBase              = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
 528 #if LLVM_VERSION_MAJOR >= 11
 529         VectorType* pVectorType = cast<VectorType>(vSrc->getType());
 530         uint32_t    numElem     = pVectorType->getNumElements();
 531         auto        srcTy       = pVectorType->getElementType();
 532 #else
 533         uint32_t numElem   = vSrc->getType()->getVectorNumElements();
 534         auto     srcTy     = vSrc->getType()->getVectorElementType();
 535 #endif
 536         auto     i32Scale  = B->Z_EXT(i8Scale, B->mInt32Ty);
 537
 538         Value*   v32Gather = nullptr;
 539         if (arch == AVX)
 540         {
 541             // Full emulation for AVX
 542             // Store source on stack to provide a valid address to load from inactive lanes
 543             auto pStack = B->STACKSAVE();
 544             auto pTmp   = B->ALLOCA(vSrc->getType());
 545             B->STORE(vSrc, pTmp);
 546
 547             v32Gather        = UndefValue::get(vSrc->getType());
                 // NOTE(review): cast<ConstantInt> assumes the zero-extended scale is a constant;
                 // confirm that callers always pass an immediate scale.
 548 #if LLVM_VERSION_MAJOR > 10
 549             auto vi32Scale   = ConstantVector::getSplat(ElementCount(numElem, false), cast<ConstantInt>(i32Scale));
 550 #else
 551             auto vi32Scale   = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
 552 #endif
 553             auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);
 554
                 // One scalar load per lane: active lanes read pBase + offset, inactive lanes read
                 // their own element of the stack copy of vSrc.
 555             for (uint32_t i = 0; i < numElem; ++i)
 556             {
 557                 auto i32Offset          = B->VEXTRACT(vi32Offsets, B->C(i));
 558                 auto pLoadAddress       = B->GEP(pBase, i32Offset);
 559                 pLoadAddress            = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
 560                 auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
 561                 auto i1Mask             = B->VEXTRACT(vi1Mask, B->C(i));
 562                 auto pValidAddress      = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
 563                 auto val                = B->LOAD(pValidAddress);
 564                 v32Gather               = B->VINSERT(v32Gather, val, B->C(i));
 565             }
 566
 567             B->STACKRESTORE(pStack);
 568         }
             // AVX2 gather intrinsics; also taken by AVX512 targets for 256-wide requests.
 569         else if (arch == AVX2 || (arch == AVX512 && width == W256))
 570         {
 571             Function* pX86IntrinFunc = nullptr;
 572             if (srcTy == B->mFP32Ty)
 573             {
 574                 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
 575                                                            Intrinsic::x86_avx2_gather_d_ps_256);
 576             }
 577             else if (srcTy == B->mInt32Ty)
 578             {
 579                 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
 580                                                            Intrinsic::x86_avx2_gather_d_d_256);
 581             }
 582             else if (srcTy == B->mDoubleTy)
 583             {
 584                 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
 585                                                            Intrinsic::x86_avx2_gather_d_q_256);
 586             }
 587             else
 588             {
                     // NOTE(review): pX86IntrinFunc stays nullptr on this path and is still passed
                     // to B->CALL below when the assert is compiled out.
 589                 SWR_ASSERT(false, "Unsupported vector element type for gather.");
 590             }
 591
 592             if (width == W256)
 593             {
 594                 auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
 595                 v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
 596             }
 597             else if (width == W512)
 598             {
 599                 // Double pump 4-wide for 64bit elements
 600 #if LLVM_VERSION_MAJOR >= 11
 601                 if (cast<VectorType>(vSrc->getType())->getElementType() == B->mDoubleTy)
 602 #else
 603                 if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
 604 #endif
 605                 {
 606                     auto v64Mask = pThis->VectorMask(vi1Mask);
                         // NOTE: this numElem shadows the function-level numElem declared above.
 607 #if LLVM_VERSION_MAJOR >= 11
 608                     uint32_t numElem = cast<VectorType>(v64Mask->getType())->getNumElements();
 609 #else
 610                     uint32_t numElem = v64Mask->getType()->getVectorNumElements();
 611 #endif
                         // Widen the i32 lane mask to i64 so it has the same bit size as the
                         // double vector it is bitcast to.
 612                     v64Mask = B->S_EXT(v64Mask, VectorType::get(B->mInt64Ty, numElem));
 613                     v64Mask = B->BITCAST(v64Mask, vSrc->getType());
 614
                         // Split source, indices and mask into lanes 0-3 and lanes 4-7.
 615                     Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
 616                     Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));
 617
 618                     Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
 619                     Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));
 620
 621                     Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
 622                     Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));
 623
 624 #if LLVM_VERSION_MAJOR >= 11
 625                     uint32_t numElemSrc0  = cast<VectorType>(src0->getType())->getNumElements();
 626                     uint32_t numElemMask0 = cast<VectorType>(mask0->getType())->getNumElements();
 627                     uint32_t numElemSrc1  = cast<VectorType>(src1->getType())->getNumElements();
 628                     uint32_t numElemMask1 = cast<VectorType>(mask1->getType())->getNumElements();
 629 #else
 630                     uint32_t numElemSrc0  = src0->getType()->getVectorNumElements();
 631                     uint32_t numElemMask0 = mask0->getType()->getVectorNumElements();
 632                     uint32_t numElemSrc1  = src1->getType()->getVectorNumElements();
 633                     uint32_t numElemMask1 = mask1->getType()->getVectorNumElements();
 634 #endif
                         // Source and mask halves are passed to the gather as i64 vectors; the
                         // joined result is bitcast back to the source type at the end.
 635                     src0 = B->BITCAST(src0, VectorType::get(B->mInt64Ty, numElemSrc0));
 636                     mask0 = B->BITCAST(mask0, VectorType::get(B->mInt64Ty, numElemMask0));
 637                     Value* gather0 =
 638                         B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
 639                     src1 = B->BITCAST(src1, VectorType::get(B->mInt64Ty, numElemSrc1));
 640                     mask1 = B->BITCAST(mask1, VectorType::get(B->mInt64Ty, numElemMask1));
 641                     Value* gather1 =
 642                         B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
 643                     v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
 644                     v32Gather = B->BITCAST(v32Gather, vSrc->getType());
 645                 }
 646                 else
 647                 {
 648                     // Double pump 8-wide for 32bit elements
 649                     auto v32Mask = pThis->VectorMask(vi1Mask);
 650                     v32Mask      = B->BITCAST(v32Mask, vSrc->getType());
 651                     Value* src0  = B->EXTRACT_16(vSrc, 0);
 652                     Value* src1  = B->EXTRACT_16(vSrc, 1);
 653
 654                     Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
 655                     Value* indices1 = B->EXTRACT_16(vi32Indices, 1);
 656
 657                     Value* mask0 = B->EXTRACT_16(v32Mask, 0);
 658                     Value* mask1 = B->EXTRACT_16(v32Mask, 1);
 659
 660                     Value* gather0 =
 661                         B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
 662                     Value* gather1 =
 663                         B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
 664
 665                     v32Gather = B->JOIN_16(gather0, gather1);
 666                 }
 667             }
 668         }
             // Remaining AVX512 case (the 256-wide request was handled above): 512-wide gather
             // intrinsics, with the <N x i1> mask bitcast to an integer (i16 for 32-bit elements,
             // i8 for doubles).
 669         else if (arch == AVX512)
 670         {
 671             Value*    iMask = nullptr;
 672             Function* pX86IntrinFunc = nullptr;
 673             if (srcTy == B->mFP32Ty)
 674             {
 675                 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
 676                                                            Intrinsic::x86_avx512_gather_dps_512);
 677                 iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
 678             }
 679             else if (srcTy == B->mInt32Ty)
 680             {
 681                 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
 682                                                            Intrinsic::x86_avx512_gather_dpi_512);
 683                 iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
 684             }
 685             else if (srcTy == B->mDoubleTy)
 686             {
 687                 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
 688                                                            Intrinsic::x86_avx512_gather_dpd_512);
 689                 iMask          = B->BITCAST(vi1Mask, B->mInt8Ty);
 690             }
 691             else
 692             {
                     // NOTE(review): pX86IntrinFunc and iMask stay nullptr on this path and are
                     // still passed to B->CALL below when the assert is compiled out.
 693                 SWR_ASSERT(false, "Unsupported vector element type for gather.");
 694             }
 695
                 // NOTE: shadows the i32Scale computed at the top of the function.
 696             auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
 697             v32Gather     = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
 698         }
 699
 700         return cast<Instruction>(v32Gather);
 701     }
 702     Instruction*
 703     VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
 704     {
 705         Builder* B           = pThis->B;
 706         auto     pBase       = pCallInst->getArgOperand(0);
 707         auto     vi1Mask     = pCallInst->getArgOperand(1);
 708         auto     vi32Indices = pCallInst->getArgOperand(2);
 709         auto     v32Src      = pCallInst->getArgOperand(3);
 710         auto     i32Scale    = pCallInst->getArgOperand(4);
 711
 712         if (arch != AVX512)
 713         {
 714             // Call into C function to do the scatter. This has significantly better compile perf
 715             // compared to jitting scatter loops for every scatter
 716             if (width == W256)
 717             {
 718                 auto mask = B->BITCAST(vi1Mask, B->mInt8Ty);
 719                 B->CALL(pThis->mPfnScatter256, {pBase, vi32Indices, v32Src, mask, i32Scale});
 720             }
 721             else
 722             {
 723                 // Need to break up 512 wide scatter to two 256 wide
 724                 auto maskLo = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
 725                 auto indicesLo =
 726                     B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
 727                 auto srcLo = B->VSHUFFLE(v32Src, v32Src, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
 728
 729                 auto mask = B->BITCAST(maskLo, B->mInt8Ty);
 730                 B->CALL(pThis->mPfnScatter256, {pBase, indicesLo, srcLo, mask, i32Scale});
 731
 732                 auto maskHi = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
 733                 auto indicesHi =
 734                     B->VSHUFFLE(vi32Indices, vi32Indices, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
 735                 auto srcHi = B->VSHUFFLE(v32Src, v32Src, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
 736
 737                 mask = B->BITCAST(maskHi, B->mInt8Ty);
 738                 B->CALL(pThis->mPfnScatter256, {pBase, indicesHi, srcHi, mask, i32Scale});
 739             }
 740             return nullptr;
 741         }
 742
 743         Value*    iMask;
 744         Function* pX86IntrinFunc;
 745         if (width == W256)
 746         {
 747             // No direct intrinsic supported in llvm to scatter 8 elem with 32bit indices, but we
 748             // can use the scatter of 8 elements with 64bit indices
 749             pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
 750                                                        Intrinsic::x86_avx512_scatter_qps_512);
 751
 752             auto vi32IndicesExt = B->Z_EXT(vi32Indices, B->mSimdInt64Ty);
 753             iMask               = B->BITCAST(vi1Mask, B->mInt8Ty);
 754             B->CALL(pX86IntrinFunc, {pBase, iMask, vi32IndicesExt, v32Src, i32Scale});
 755         }
 756         else if (width == W512)
 757         {
 758             pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
 759                                                        Intrinsic::x86_avx512_scatter_dps_512);
 760             iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
 761             B->CALL(pX86IntrinFunc, {pBase, iMask, vi32Indices, v32Src, i32Scale});
 762         }
 763         return nullptr;
 764     }
 765
 766     // No support for vroundps in avx512 (it is available in kncni), so emulate with avx
 767     // instructions
 768     Instruction*
 769     VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
 770     {
 771         SWR_ASSERT(arch == AVX512);
 772
 773         auto B       = pThis->B;
 774         auto vf32Src = pCallInst->getOperand(0);
 775         assert(vf32Src);
 776         auto i8Round = pCallInst->getOperand(1);
 777         assert(i8Round);
 778         auto pfnFunc =
 779             Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);
 780
 781         if (width == W256)
 782         {
 783             return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
 784         }
 785         else if (width == W512)
 786         {
 787             auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
 788             auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);
 789
 790             auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
 791             auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);
 792
 793             return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
 794         }
 795         else
 796         {
 797             SWR_ASSERT(false, "Unimplemented vector width.");
 798         }
 799
 800         return nullptr;
 801     }
 802
 803     Instruction*
 804     VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
 805     {
 806         SWR_ASSERT(arch == AVX512);
 807
 808         auto B       = pThis->B;
 809         auto vf32Src = pCallInst->getOperand(0);
 810
 811         if (width == W256)
 812         {
 813             auto vf32SrcRound = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
 814                                                           Intrinsic::x86_avx_round_ps_256);
 815             return cast<Instruction>(B->FP_TRUNC(vf32SrcRound, B->mFP32Ty));
 816         }
 817         else if (width == W512)
 818         {
 819             // 512 can use intrinsic
 820             auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
 821                                                      Intrinsic::x86_avx512_mask_cvtpd2ps_512);
 822             return cast<Instruction>(B->CALL(pfnFunc, vf32Src));
 823         }
 824         else
 825         {
 826             SWR_ASSERT(false, "Unimplemented vector width.");
 827         }
 828
 829         return nullptr;
 830     }
 831
 832     // No support for hsub in AVX512
 833     Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
 834     {
 835         SWR_ASSERT(arch == AVX512);
 836
 837         auto B    = pThis->B;
 838         auto src0 = pCallInst->getOperand(0);
 839         auto src1 = pCallInst->getOperand(1);
 840
 841         // 256b hsub can just use avx intrinsic
 842         if (width == W256)
 843         {
 844             auto pX86IntrinFunc =
 845                 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
 846             return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
 847         }
 848         else if (width == W512)
 849         {
 850             // 512b hsub can be accomplished with shuf/sub combo
 851             auto minuend    = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
 852             auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
 853             return cast<Instruction>(B->SUB(minuend, subtrahend));
 854         }
 855         else
 856         {
 857             SWR_ASSERT(false, "Unimplemented vector width.");
 858             return nullptr;
 859         }
 860     }
 861
 862     // Double pump input using Intrin template arg. This blindly extracts lower and upper 256 from
 863     // each vector argument and calls the 256 wide intrinsic, then merges the results to 512 wide
 864     Instruction* DOUBLE_EMU(LowerX86*     pThis,
 865                             TargetArch    arch,
 866                             TargetWidth   width,
 867                             CallInst*     pCallInst,
 868                             Intrinsic::ID intrin)
 869     {
 870         auto B = pThis->B;
 871         SWR_ASSERT(width == W512);
 872         Value*    result[2];
 873         Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
 874         for (uint32_t i = 0; i < 2; ++i)
 875         {
 876             SmallVector<Value*, 8> args;
 877             for (auto& arg : pCallInst->arg_operands())
 878             {
 879                 auto argType = arg.get()->getType();
 880                 if (argType->isVectorTy())
 881                 {
 882 #if LLVM_VERSION_MAJOR >= 11
 883                     uint32_t vecWidth  = cast<VectorType>(argType)->getNumElements();
 884                     auto     elemTy    = cast<VectorType>(argType)->getElementType();
 885 #else
 886                     uint32_t vecWidth  = argType->getVectorNumElements();
 887                     auto     elemTy    = argType->getVectorElementType();
 888 #endif
 889                     Value*   lanes     = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
 890                     Value*   argToPush = B->VSHUFFLE(arg.get(), B->VUNDEF(elemTy, vecWidth), lanes);
 891                     args.push_back(argToPush);
 892                 }
 893                 else
 894                 {
 895                     args.push_back(arg.get());
 896                 }
 897             }
 898             result[i] = B->CALLA(pX86IntrinFunc, args);
 899         }
 900         uint32_t vecWidth;
 901         if (result[0]->getType()->isVectorTy())
 902         {
 903             assert(result[1]->getType()->isVectorTy());
 904 #if LLVM_VERSION_MAJOR >= 11
 905             vecWidth = cast<VectorType>(result[0]->getType())->getNumElements() +
 906                        cast<VectorType>(result[1]->getType())->getNumElements();
 907 #else
 908             vecWidth = result[0]->getType()->getVectorNumElements() +
 909                        result[1]->getType()->getVectorNumElements();
 910 #endif
 911         }
 912         else
 913         {
 914             vecWidth = 2;
 915         }
 916         Value* lanes = B->CInc<int>(0, vecWidth);
 917         return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
 918     }
 919
 920 } // namespace SwrJit
 921
using namespace SwrJit;

// Register the LowerX86 pass with LLVM. This BEGIN/END macro pair supplies the definition
// of llvm::initializeLowerX86Pass(), which is forward declared near the top of this file.
// Arguments: pass class, command-line argument, readable name, then the "CFG only" and
// "is analysis" flags (both false here).
INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)