gallium/swr: Fix vcvtph2ps llvm intrinsic compile error
[mesa.git] src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file lower_x86.cpp
24 *
25 * @brief llvm pass to lower meta code to x86
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30
31 #include "jit_pch.hpp"
32 #include "passes.h"
33 #include "JitManager.h"
34
35 #include "common/simdlib.hpp"
36
37 #include <unordered_map>
38
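// C fallback used by the JIT'd code for scatter stores on targets without a native
// scatter instruction (pre-AVX512). It is registered with the JIT's symbol table in
// the LowerX86 constructor and called from the code emitted by VSCATTER_EMU.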
39 extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t);
40
41 namespace llvm
42 {
43     // forward declare the initializer
44 void initializeLowerX86Pass(PassRegistry&);
45 } // namespace llvm
46
47 namespace SwrJit
48 {
49 using namespace llvm;
50
51 #if LLVM_VERSION_MAJOR > 10
52 typedef unsigned IntrinsicID;
53 #else
54 typedef Intrinsic::ID IntrinsicID;
55 #endif
56
57 enum TargetArch
58 {
59 AVX = 0,
60 AVX2 = 1,
61 AVX512 = 2
62 };
63
64 enum TargetWidth
65 {
66 W256 = 0,
67 W512 = 1,
68 NUM_WIDTHS = 2
69 };
70
71 struct LowerX86;
72
73 typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;
74
75 struct X86Intrinsic
76 {
77 IntrinsicID intrin[NUM_WIDTHS];
78 EmuFunc emuFunc;
79 };
80
81 // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
82 // previous behavior of mapping directly to avx/avx2 intrinsics.
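// These entries are handled by ProcessIntrinsic(), which simply replaces the meta call
// with a call to the listed LLVM intrinsic using the original arguments unchanged,
// e.g. meta.intrinsic.VPSHUFB -> llvm.x86.avx2.pshuf.b.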
83 static std::map<std::string, IntrinsicID> intrinsicMap = {
84 {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
85 {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
86 {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
87 {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
88 {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
89 {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
90 {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
91 {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
92 };
93
94 // Forward decls
95 Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
96 Instruction*
97 VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
98 Instruction*
99 VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
100 Instruction*
101 VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
102 Instruction*
103 VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
104 Instruction*
105 VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
106 Instruction*
107 VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
108
109 Instruction* DOUBLE_EMU(LowerX86* pThis,
110 TargetArch arch,
111 TargetWidth width,
112 CallInst* pCallInst,
113 Intrinsic::ID intrin);
114
115 static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
116
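// Per-arch intrinsic table, indexed by TargetArch. Each X86Intrinsic entry holds the
// native intrinsic to use at 256 and 512 bit widths plus an emulation callback; a slot
// of Intrinsic::not_intrinsic means "call the emulation function", and the DOUBLE
// sentinel above means "double pump the next-smaller width" (see DOUBLE_EMU). For
// example, on AVX a 512 wide meta.intrinsic.VRCPPS resolves to DOUBLE and is emitted
// as two x86_avx_rcp_ps_256 calls.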
117 // clang-format off
118 static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
119 // 256 wide 512 wide
120 {
121 // AVX
122 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
123 {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
124 {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
125 {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
126 {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
127 {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
128 {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
129 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
130 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
131 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
132 },
133 {
134 // AVX2
135 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
136 {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
137 {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
138 {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
139 {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
140 {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
141 {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
142 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
143 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
144 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
145 },
146 {
147 // AVX512
148 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
149 #if LLVM_VERSION_MAJOR < 7
150 {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
151 {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
152 #else
153 {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
154 {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
155 #endif
156 {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
157 {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
158 {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
159 {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
160 #if LLVM_VERSION_MAJOR < 7
161 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}},
162 #else
163 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}},
164 #endif
165 {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
166 {"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
167 }};
168 // clang-format on
169
170 struct LowerX86 : public FunctionPass
171 {
172 LowerX86(Builder* b = nullptr) : FunctionPass(ID), B(b)
173 {
174 initializeLowerX86Pass(*PassRegistry::getPassRegistry());
175
176 // Determine target arch
177 if (JM()->mArch.AVX512F())
178 {
179 mTarget = AVX512;
180 }
181 else if (JM()->mArch.AVX2())
182 {
183 mTarget = AVX2;
184 }
185 else if (JM()->mArch.AVX())
186 {
187 mTarget = AVX;
188 }
189 else
190 {
191 SWR_ASSERT(false, "Unsupported AVX architecture.");
192 mTarget = AVX;
193 }
194
195 // Setup scatter function for 256 wide
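// (ScatterPS_256 is a plain C helper; conceptually, for each lane i whose mask bit is
// set it performs *(float*)(pBase + vIndices[i] * scale) = vSrc[i]. This is only a
// sketch of its behavior - the actual implementation lives outside this pass.)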
196 uint32_t curWidth = B->mVWidth;
197 B->SetTargetWidth(8);
198 std::vector<Type*> args = {
199 B->mInt8PtrTy, // pBase
200 B->mSimdInt32Ty, // vIndices
201 B->mSimdFP32Ty, // vSrc
202 B->mInt8Ty, // mask
203 B->mInt32Ty // scale
204 };
205
206 FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false);
207 mPfnScatter256 = cast<Function>(
208 #if LLVM_VERSION_MAJOR >= 9
209 B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy).getCallee());
210 #else
211 B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy));
212 #endif
213 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr)
214 {
215 sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256);
216 }
217
218 B->SetTargetWidth(curWidth);
219 }
220
221 // Try to decipher the vector type of the instruction. This does not work properly
222 // across all intrinsics, and will have to be rethought. Probably need something
223 // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
224 // intrinsic.
225 void GetRequestedWidthAndType(CallInst* pCallInst,
226 const StringRef intrinName,
227 TargetWidth* pWidth,
228 Type** pTy)
229 {
230 assert(pCallInst);
231 Type* pVecTy = pCallInst->getType();
232
233 // Check for intrinsic specific types
234 // VCVTPD2PS type comes from src, not dst
235 if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
236 {
237 Value* pOp = pCallInst->getOperand(0);
238 assert(pOp);
239 pVecTy = pOp->getType();
240 }
241
242 if (!pVecTy->isVectorTy())
243 {
244 for (auto& op : pCallInst->arg_operands())
245 {
246 if (op.get()->getType()->isVectorTy())
247 {
248 pVecTy = op.get()->getType();
249 break;
250 }
251 }
252 }
253 SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");
254
255 uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
256 switch (width)
257 {
258 case 256:
259 *pWidth = W256;
260 break;
261 case 512:
262 *pWidth = W512;
263 break;
264 default:
265 SWR_ASSERT(false, "Unhandled vector width %d", width);
266 *pWidth = W256;
267 }
268
269 *pTy = pVecTy->getScalarType();
270 }
271
272 Value* GetZeroVec(TargetWidth width, Type* pTy)
273 {
274 uint32_t numElem = 0;
275 switch (width)
276 {
277 case W256:
278 numElem = 8;
279 break;
280 case W512:
281 numElem = 16;
282 break;
283 default:
284 SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
285 }
286
287 return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
288 }
289
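// Full execution mask for the AVX512 masked intrinsics: one bit per lane, so an
// all-ones i8 covers the 8 lanes of a 256 bit vector and an all-ones i16 covers the
// 16 lanes of a 512 bit vector.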
290 Value* GetMask(TargetWidth width)
291 {
292 Value* mask;
293 switch (width)
294 {
295 case W256:
296 mask = B->C((uint8_t)-1);
297 break;
298 case W512:
299 mask = B->C((uint16_t)-1);
300 break;
301 default:
302 SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
303 }
304 return mask;
305 }
306
307 // Convert <N x i1> mask to <N x i32> x86 mask
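// Sign-extending each i1 produces 0 or ~0 in the corresponding 32 bit lane, which is
// the form the AVX/AVX2 gather intrinsics expect (they test the element's sign bit).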
308 Value* VectorMask(Value* vi1Mask)
309 {
310 uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
311 return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem));
312 }
313
314 Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
315 {
316 Function* pFunc = pCallInst->getCalledFunction();
317 assert(pFunc);
318
319 auto& intrinsic = intrinsicMap2[mTarget][pFunc->getName().str()];
320 TargetWidth vecWidth;
321 Type* pElemTy;
322 GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);
323
324 // Check if there is a native intrinsic for this instruction
325 IntrinsicID id = intrinsic.intrin[vecWidth];
326 if (id == DOUBLE)
327 {
328 // Double pump the next smaller SIMD intrinsic
329 SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
330 Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
331 SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
332 "Cannot find intrinsic to double pump.");
333 return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
334 }
335 else if (id != Intrinsic::not_intrinsic)
336 {
337 Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
338 SmallVector<Value*, 8> args;
339 for (auto& arg : pCallInst->arg_operands())
340 {
341 args.push_back(arg.get());
342 }
343
344                 // If AVX512, all instructions add a src operand and mask. We'll pass in a 0 src and
345                 // a full mask for now, assuming the intrinsics are consistent and place the src
346                 // operand and mask last in the argument list.
347 if (mTarget == AVX512)
348 {
349 if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
350 {
351 args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
352 args.push_back(GetMask(W256));
353 // for AVX512 VCVTPD2PS, we also have to add rounding mode
354 args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
355 }
356 else
357 {
358 args.push_back(GetZeroVec(vecWidth, pElemTy));
359 args.push_back(GetMask(vecWidth));
360 }
361 }
362
363 return B->CALLA(pIntrin, args);
364 }
365 else
366 {
367 // No native intrinsic, call emulation function
368 return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
369 }
370
371 SWR_ASSERT(false);
372 return nullptr;
373 }
374
375 Instruction* ProcessIntrinsic(CallInst* pCallInst)
376 {
377 Function* pFunc = pCallInst->getCalledFunction();
378 assert(pFunc);
379
380 // Forward to the advanced support if found
381 if (intrinsicMap2[mTarget].find(pFunc->getName().str()) != intrinsicMap2[mTarget].end())
382 {
383 return ProcessIntrinsicAdvanced(pCallInst);
384 }
385
386 SWR_ASSERT(intrinsicMap.find(pFunc->getName().str()) != intrinsicMap.end(),
387 "Unimplemented intrinsic %s.",
388 pFunc->getName().str().c_str());
389
390 Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName().str()];
391 Function* pX86IntrinFunc =
392 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
393
394 SmallVector<Value*, 8> args;
395 for (auto& arg : pCallInst->arg_operands())
396 {
397 args.push_back(arg.get());
398 }
399 return B->CALLA(pX86IntrinFunc, args);
400 }
401
402 //////////////////////////////////////////////////////////////////////////
403         /// @brief LLVM function pass run method.
404         /// @param F - The function we're working on with this pass.
405 virtual bool runOnFunction(Function& F)
406 {
407 std::vector<Instruction*> toRemove;
408 std::vector<BasicBlock*> bbs;
409
410 // Make temp copy of the basic blocks and instructions, as the intrinsic
411 // replacement code might invalidate the iterators
412 for (auto& b : F.getBasicBlockList())
413 {
414 bbs.push_back(&b);
415 }
416
417 for (auto* BB : bbs)
418 {
419 std::vector<Instruction*> insts;
420 for (auto& i : BB->getInstList())
421 {
422 insts.push_back(&i);
423 }
424
425 for (auto* I : insts)
426 {
427 if (CallInst* pCallInst = dyn_cast<CallInst>(I))
428 {
429 Function* pFunc = pCallInst->getCalledFunction();
430 if (pFunc)
431 {
432 if (pFunc->getName().startswith("meta.intrinsic"))
433 {
434 B->IRB()->SetInsertPoint(I);
435 Instruction* pReplace = ProcessIntrinsic(pCallInst);
436 toRemove.push_back(pCallInst);
437 if (pReplace)
438 {
439 pCallInst->replaceAllUsesWith(pReplace);
440 }
441 }
442 }
443 }
444 }
445 }
446
447 for (auto* pInst : toRemove)
448 {
449 pInst->eraseFromParent();
450 }
451
452 JitManager::DumpToFile(&F, "lowerx86");
453
454 return true;
455 }
456
457 virtual void getAnalysisUsage(AnalysisUsage& AU) const {}
458
459 JitManager* JM() { return B->JM(); }
460 Builder* B;
461 TargetArch mTarget;
462 Function* mPfnScatter256;
463
464 static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
465 };
466
467 char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
468
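// The pass is created through the factory below and run like any other LLVM function
// pass. A minimal sketch of how a caller might drive it (pJitMgr, pBuilder and pFunc
// stand in for the caller's objects; the real call sites live elsewhere in the jitter):
//   llvm::legacy::FunctionPassManager passes(pJitMgr->mpCurrentModule);
//   passes.add(createLowerX86Pass(pBuilder));
//   passes.run(*pFunc);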
469 FunctionPass* createLowerX86Pass(Builder* b) { return new LowerX86(b); }
470
471 Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
472 {
473 SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
474 return nullptr;
475 }
476
477 Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
478 {
479 // Only need vperm emulation for AVX
480 SWR_ASSERT(arch == AVX);
481
482 Builder* B = pThis->B;
483 auto v32A = pCallInst->getArgOperand(0);
484 auto vi32Index = pCallInst->getArgOperand(1);
485
486 Value* v32Result;
487 if (isa<Constant>(vi32Index))
488 {
489 // Can use llvm shuffle vector directly with constant shuffle indices
490 v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
491 }
492 else
493 {
494 v32Result = UndefValue::get(v32A->getType());
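            // Variable indices: emulate lane by lane, v32Result[l] = v32A[vi32Index[l]]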
495 for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
496 {
497 auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
498 auto val = B->VEXTRACT(v32A, i32Index);
499 v32Result = B->VINSERT(v32Result, val, B->C(l));
500 }
501 }
502 return cast<Instruction>(v32Result);
503 }
504
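    // Gather emulation. Conceptually each lane computes
    //   dst[i] = mask[i] ? *(T*)(pBase + vi32Indices[i] * scale) : vSrc[i]
    // AVX has no gather instruction, so the loads are emitted one lane at a time through
    // a stack copy of vSrc; AVX2 (and AVX512 at 256 bits) uses the AVX2 gather
    // intrinsics, double pumping them for 512 wide operations; AVX512 at 512 bits uses
    // the native AVX512 gathers with a compressed k-mask.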
505 Instruction*
506 VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
507 {
508 Builder* B = pThis->B;
509 auto vSrc = pCallInst->getArgOperand(0);
510 auto pBase = pCallInst->getArgOperand(1);
511 auto vi32Indices = pCallInst->getArgOperand(2);
512 auto vi1Mask = pCallInst->getArgOperand(3);
513 auto i8Scale = pCallInst->getArgOperand(4);
514
515 pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
516 uint32_t numElem = vSrc->getType()->getVectorNumElements();
517 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
518 auto srcTy = vSrc->getType()->getVectorElementType();
519 Value* v32Gather = nullptr;
520 if (arch == AVX)
521 {
522 // Full emulation for AVX
523             // Store the source on the stack to provide a valid address for inactive lanes to load from
524 auto pStack = B->STACKSAVE();
525 auto pTmp = B->ALLOCA(vSrc->getType());
526 B->STORE(vSrc, pTmp);
527
528 v32Gather = UndefValue::get(vSrc->getType());
529 auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
530 auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);
531
532 for (uint32_t i = 0; i < numElem; ++i)
533 {
534 auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
535 auto pLoadAddress = B->GEP(pBase, i32Offset);
536 pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
537 auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
538 auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
539 auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
540 auto val = B->LOAD(pValidAddress);
541 v32Gather = B->VINSERT(v32Gather, val, B->C(i));
542 }
543
544 B->STACKRESTORE(pStack);
545 }
546 else if (arch == AVX2 || (arch == AVX512 && width == W256))
547 {
548 Function* pX86IntrinFunc = nullptr;
549 if (srcTy == B->mFP32Ty)
550 {
551 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
552 Intrinsic::x86_avx2_gather_d_ps_256);
553 }
554 else if (srcTy == B->mInt32Ty)
555 {
556 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
557 Intrinsic::x86_avx2_gather_d_d_256);
558 }
559 else if (srcTy == B->mDoubleTy)
560 {
561 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
562 Intrinsic::x86_avx2_gather_d_q_256);
563 }
564 else
565 {
566 SWR_ASSERT(false, "Unsupported vector element type for gather.");
567 }
568
569 if (width == W256)
570 {
571 auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
572 v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
573 }
574 else if (width == W512)
575 {
576 // Double pump 4-wide for 64bit elements
577 if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
578 {
579 auto v64Mask = pThis->VectorMask(vi1Mask);
580 v64Mask = B->S_EXT(
581 v64Mask,
582 VectorType::get(B->mInt64Ty, v64Mask->getType()->getVectorNumElements()));
583 v64Mask = B->BITCAST(v64Mask, vSrc->getType());
584
585 Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
586 Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));
587
588 Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
589 Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));
590
591 Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
592 Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));
593
594 src0 = B->BITCAST(
595 src0,
596 VectorType::get(B->mInt64Ty, src0->getType()->getVectorNumElements()));
597 mask0 = B->BITCAST(
598 mask0,
599 VectorType::get(B->mInt64Ty, mask0->getType()->getVectorNumElements()));
600 Value* gather0 =
601 B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
602 src1 = B->BITCAST(
603 src1,
604 VectorType::get(B->mInt64Ty, src1->getType()->getVectorNumElements()));
605 mask1 = B->BITCAST(
606 mask1,
607 VectorType::get(B->mInt64Ty, mask1->getType()->getVectorNumElements()));
608 Value* gather1 =
609 B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
610
611 v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
612 v32Gather = B->BITCAST(v32Gather, vSrc->getType());
613 }
614 else
615 {
616 // Double pump 8-wide for 32bit elements
617 auto v32Mask = pThis->VectorMask(vi1Mask);
618 v32Mask = B->BITCAST(v32Mask, vSrc->getType());
619 Value* src0 = B->EXTRACT_16(vSrc, 0);
620 Value* src1 = B->EXTRACT_16(vSrc, 1);
621
622 Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
623 Value* indices1 = B->EXTRACT_16(vi32Indices, 1);
624
625 Value* mask0 = B->EXTRACT_16(v32Mask, 0);
626 Value* mask1 = B->EXTRACT_16(v32Mask, 1);
627
628 Value* gather0 =
629 B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
630 Value* gather1 =
631 B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
632
633 v32Gather = B->JOIN_16(gather0, gather1);
634 }
635 }
636 }
637 else if (arch == AVX512)
638 {
639 Value* iMask = nullptr;
640 Function* pX86IntrinFunc = nullptr;
641 if (srcTy == B->mFP32Ty)
642 {
643 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
644 Intrinsic::x86_avx512_gather_dps_512);
645 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
646 }
647 else if (srcTy == B->mInt32Ty)
648 {
649 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
650 Intrinsic::x86_avx512_gather_dpi_512);
651 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
652 }
653 else if (srcTy == B->mDoubleTy)
654 {
655 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
656 Intrinsic::x86_avx512_gather_dpd_512);
657 iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
658 }
659 else
660 {
661 SWR_ASSERT(false, "Unsupported vector element type for gather.");
662 }
663
664 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
665 v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
666 }
667
668 return cast<Instruction>(v32Gather);
669 }
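
    // Scatter emulation. Pre-AVX512 targets have no scatter instruction, so stores are
    // routed through the ScatterPS_256 C callback registered in the pass constructor (a
    // 512 wide scatter is split into two 256 wide calls). AVX512 uses the native scatter
    // intrinsics: the 256 wide case goes through the 64 bit-index qps form with
    // zero-extended indices, the 512 wide case uses the dps form with a 16 bit k-mask.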
670 Instruction*
671 VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
672 {
673 Builder* B = pThis->B;
674 auto pBase = pCallInst->getArgOperand(0);
675 auto vi1Mask = pCallInst->getArgOperand(1);
676 auto vi32Indices = pCallInst->getArgOperand(2);
677 auto v32Src = pCallInst->getArgOperand(3);
678 auto i32Scale = pCallInst->getArgOperand(4);
679
680 if (arch != AVX512)
681 {
682 // Call into C function to do the scatter. This has significantly better compile perf
683 // compared to jitting scatter loops for every scatter
684 if (width == W256)
685 {
686 auto mask = B->BITCAST(vi1Mask, B->mInt8Ty);
687 B->CALL(pThis->mPfnScatter256, {pBase, vi32Indices, v32Src, mask, i32Scale});
688 }
689 else
690 {
691                 // Need to break up the 512 wide scatter into two 256 wide scatters
692 auto maskLo = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
693 auto indicesLo =
694 B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
695 auto srcLo = B->VSHUFFLE(v32Src, v32Src, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
696
697 auto mask = B->BITCAST(maskLo, B->mInt8Ty);
698 B->CALL(pThis->mPfnScatter256, {pBase, indicesLo, srcLo, mask, i32Scale});
699
700 auto maskHi = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
701 auto indicesHi =
702 B->VSHUFFLE(vi32Indices, vi32Indices, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
703 auto srcHi = B->VSHUFFLE(v32Src, v32Src, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
704
705 mask = B->BITCAST(maskHi, B->mInt8Ty);
706 B->CALL(pThis->mPfnScatter256, {pBase, indicesHi, srcHi, mask, i32Scale});
707 }
708 return nullptr;
709 }
710
711 Value* iMask;
712 Function* pX86IntrinFunc;
713 if (width == W256)
714 {
715             // LLVM has no direct intrinsic to scatter 8 elements with 32-bit indices, but we
716             // can use the scatter of 8 elements with 64-bit indices
717 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
718 Intrinsic::x86_avx512_scatter_qps_512);
719
720 auto vi32IndicesExt = B->Z_EXT(vi32Indices, B->mSimdInt64Ty);
721 iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
722 B->CALL(pX86IntrinFunc, {pBase, iMask, vi32IndicesExt, v32Src, i32Scale});
723 }
724 else if (width == W512)
725 {
726 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
727 Intrinsic::x86_avx512_scatter_dps_512);
728 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
729 B->CALL(pX86IntrinFunc, {pBase, iMask, vi32Indices, v32Src, i32Scale});
730 }
731 return nullptr;
732 }
733
734 // No support for vroundps in avx512 (it is available in kncni), so emulate with avx
735 // instructions
736 Instruction*
737 VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
738 {
739 SWR_ASSERT(arch == AVX512);
740
741 auto B = pThis->B;
742 auto vf32Src = pCallInst->getOperand(0);
743 assert(vf32Src);
744 auto i8Round = pCallInst->getOperand(1);
745 assert(i8Round);
746 auto pfnFunc =
747 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);
748
749 if (width == W256)
750 {
751 return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
752 }
753 else if (width == W512)
754 {
755 auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
756 auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);
757
758 auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
759 auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);
760
761 return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
762 }
763 else
764 {
765 SWR_ASSERT(false, "Unimplemented vector width.");
766 }
767
768 return nullptr;
769 }
770
771 Instruction*
772 VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
773 {
774 SWR_ASSERT(arch == AVX512);
775
776 auto B = pThis->B;
777 auto vf32Src = pCallInst->getOperand(0);
778
779 if (width == W256)
780 {
781             // A 256 bit cvtpd2ps is just an fptrunc of the <4 x double> source to <4 x float>
782             return cast<Instruction>(B->FP_TRUNC(
783                 vf32Src, VectorType::get(B->mFP32Ty, vf32Src->getType()->getVectorNumElements())));
784 }
785 else if (width == W512)
786 {
787 // 512 can use intrinsic
788 auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
789 Intrinsic::x86_avx512_mask_cvtpd2ps_512);
790 return cast<Instruction>(B->CALL(pfnFunc, vf32Src));
791 }
792 else
793 {
794 SWR_ASSERT(false, "Unimplemented vector width.");
795 }
796
797 return nullptr;
798 }
799
800 // No support for hsub in AVX512
801 Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
802 {
803 SWR_ASSERT(arch == AVX512);
804
805 auto B = pThis->B;
806 auto src0 = pCallInst->getOperand(0);
807 auto src1 = pCallInst->getOperand(1);
808
809 // 256b hsub can just use avx intrinsic
810 if (width == W256)
811 {
812 auto pX86IntrinFunc =
813 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
814 return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
815 }
816 else if (width == W512)
817 {
818 // 512b hsub can be accomplished with shuf/sub combo
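            // The even-indexed elements of the two sources form the minuend and the
            // odd-indexed elements the subtrahend, reproducing vhsubps' interleaved
            // output ordering.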
819 auto minuend = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
820 auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
821 return cast<Instruction>(B->SUB(minuend, subtrahend));
822 }
823 else
824 {
825 SWR_ASSERT(false, "Unimplemented vector width.");
826 return nullptr;
827 }
828 }
829
830     // Double pump the input using the given 256 wide intrinsic. This blindly extracts the lower and
831     // upper 256 from each vector argument, calls the intrinsic on each half, then merges the results to 512 wide
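    // (e.g. a 512 wide VROUND on AVX2 becomes two x86_avx_round_ps_256 calls whose 8 wide
    // results are shuffled back together into one 16 wide result)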
832 Instruction* DOUBLE_EMU(LowerX86* pThis,
833 TargetArch arch,
834 TargetWidth width,
835 CallInst* pCallInst,
836 Intrinsic::ID intrin)
837 {
838 auto B = pThis->B;
839 SWR_ASSERT(width == W512);
840 Value* result[2];
841 Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
842 for (uint32_t i = 0; i < 2; ++i)
843 {
844 SmallVector<Value*, 8> args;
845 for (auto& arg : pCallInst->arg_operands())
846 {
847 auto argType = arg.get()->getType();
848 if (argType->isVectorTy())
849 {
850 uint32_t vecWidth = argType->getVectorNumElements();
851 Value* lanes = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
852 Value* argToPush = B->VSHUFFLE(
853 arg.get(), B->VUNDEF(argType->getVectorElementType(), vecWidth), lanes);
854 args.push_back(argToPush);
855 }
856 else
857 {
858 args.push_back(arg.get());
859 }
860 }
861 result[i] = B->CALLA(pX86IntrinFunc, args);
862 }
863 uint32_t vecWidth;
864 if (result[0]->getType()->isVectorTy())
865 {
866 assert(result[1]->getType()->isVectorTy());
867 vecWidth = result[0]->getType()->getVectorNumElements() +
868 result[1]->getType()->getVectorNumElements();
869 }
870 else
871 {
872 vecWidth = 2;
873 }
874 Value* lanes = B->CInc<int>(0, vecWidth);
875 return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
876 }
877
878 } // namespace SwrJit
879
880 using namespace SwrJit;
881
882 INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
883 INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)