gallium/swr: fix gcc warnings
[mesa.git] src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file lower_x86.cpp
 *
 * @brief llvm pass to lower meta code to x86
 *
 * Notes:
 *
 ******************************************************************************/

#include "jit_pch.hpp"
#include "passes.h"
#include "JitManager.h"

#include "common/simdlib.hpp"

#include <unordered_map>

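// C fallback for 256-wide scatters; on pre-AVX512 targets the jitted code calls this routine
// instead of emitting a native scatter (see VSCATTER_EMU below).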
extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t);

namespace llvm
{
    // forward declare the initializer
    void initializeLowerX86Pass(PassRegistry&);
} // namespace llvm

namespace SwrJit
{
    using namespace llvm;

    enum TargetArch
    {
        AVX    = 0,
        AVX2   = 1,
        AVX512 = 2
    };

    enum TargetWidth
    {
        W256       = 0,
        W512       = 1,
        NUM_WIDTHS = 2
    };

    struct LowerX86;

    typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;

    struct X86Intrinsic
    {
        Intrinsic::ID intrin[NUM_WIDTHS];
        EmuFunc       emuFunc;
    };

    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
    // previous behavior of mapping directly to avx/avx2 intrinsics.
    static std::map<std::string, Intrinsic::ID> intrinsicMap = {
        {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
        {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
        {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
        {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
        {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
        {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
        {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
        {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
    };

    // Forward decls
    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);

    Instruction* DOUBLE_EMU(LowerX86*     pThis,
                            TargetArch    arch,
                            TargetWidth   width,
                            CallInst*     pCallInst,
                            Intrinsic::ID intrin);

    static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;

    // clang-format off
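    // Per-arch intrinsic table: indexed by TargetArch and keyed by meta intrinsic name. Each entry
    // holds the native intrinsic ID for each TargetWidth (not_intrinsic if there is none, DOUBLE to
    // double-pump the next smaller width) plus an emulation callback used when no native mapping
    // exists.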
    static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
        //                            256 wide                                512 wide
        {
            // AVX
            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
            {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
            {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
        },
        {
            // AVX2
            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
            {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
        },
        {
            // AVX512
            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
#if LLVM_VERSION_MAJOR < 7
            {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
            {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
#else
            {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
#endif
            {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
#if LLVM_VERSION_MAJOR < 7
            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}},
#else
            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}},
#endif
            {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512}, NO_EMU}},
            {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
            {"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
        }};
    // clang-format on

    struct LowerX86 : public FunctionPass
    {
        LowerX86(Builder* b = nullptr) : FunctionPass(ID), B(b)
        {
            initializeLowerX86Pass(*PassRegistry::getPassRegistry());

            // Determine target arch
            if (JM()->mArch.AVX512F())
            {
                mTarget = AVX512;
            }
            else if (JM()->mArch.AVX2())
            {
                mTarget = AVX2;
            }
            else if (JM()->mArch.AVX())
            {
                mTarget = AVX;
            }
            else
            {
                SWR_ASSERT(false, "Unsupported AVX architecture.");
                mTarget = AVX;
            }

            // Set up the scatter function for the 256-wide case
            uint32_t curWidth = B->mVWidth;
            B->SetTargetWidth(8);
            std::vector<Type*> args = {
                B->mInt8PtrTy,   // pBase
                B->mSimdInt32Ty, // vIndices
                B->mSimdFP32Ty,  // vSrc
                B->mInt8Ty,      // mask
                B->mInt32Ty      // scale
            };

            FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false);
            mPfnScatter256 = cast<Function>(
#if LLVM_VERSION_MAJOR >= 9
                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy).getCallee());
#else
                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy));
#endif
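            // Register the C ScatterPS_256 entry point with the JIT's symbol resolver if it isn't
            // already visible, so jitted calls to it can be resolved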
            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256);
            }

            B->SetTargetWidth(curWidth);
        }

        // Try to decipher the vector type of the instruction. This does not work properly
        // across all intrinsics, and will have to be rethought. Probably need something
        // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
        // intrinsic.
        void GetRequestedWidthAndType(CallInst*       pCallInst,
                                      const StringRef intrinName,
                                      TargetWidth*    pWidth,
                                      Type**          pTy)
        {
            Type* pVecTy = pCallInst->getType();

            // Check for intrinsic specific types
            // VCVTPD2PS type comes from src, not dst
            if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
            {
                pVecTy = pCallInst->getOperand(0)->getType();
            }

            if (!pVecTy->isVectorTy())
            {
                for (auto& op : pCallInst->arg_operands())
                {
                    if (op.get()->getType()->isVectorTy())
                    {
                        pVecTy = op.get()->getType();
                        break;
                    }
                }
            }
            SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");

            uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
            switch (width)
            {
            case 256:
                *pWidth = W256;
                break;
            case 512:
                *pWidth = W512;
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width %d", width);
                *pWidth = W256;
            }

            *pTy = pVecTy->getScalarType();
        }

        Value* GetZeroVec(TargetWidth width, Type* pTy)
        {
            uint32_t numElem = 0;
            switch (width)
            {
            case W256:
                numElem = 8;
                break;
            case W512:
                numElem = 16;
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }

            return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
        }

        Value* GetMask(TargetWidth width)
        {
            Value* mask = nullptr; // initialized so the assert fallthrough below can't return an indeterminate value
            switch (width)
            {
            case W256:
                mask = B->C((uint8_t)-1);
                break;
            case W512:
                mask = B->C((uint16_t)-1);
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }
            return mask;
        }

        // Convert <N x i1> mask to <N x i32> x86 mask
        Value* VectorMask(Value* vi1Mask)
        {
            uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
            return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem));
        }

        Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
        {
            Function*   pFunc     = pCallInst->getCalledFunction();
            auto&       intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
            TargetWidth vecWidth;
            Type*       pElemTy;
            GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);

            // Check if there is a native intrinsic for this instruction
            Intrinsic::ID id = intrinsic.intrin[vecWidth];
            if (id == DOUBLE)
            {
                // Double pump the next smaller SIMD intrinsic
                SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
                Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
                SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
                           "Cannot find intrinsic to double pump.");
                return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
            }
            else if (id != Intrinsic::not_intrinsic)
            {
                Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
                SmallVector<Value*, 8> args;
                for (auto& arg : pCallInst->arg_operands())
                {
                    args.push_back(arg.get());
                }

                // If AVX512, all instructions add a src operand and mask. We'll pass in a zero src
                // and a full mask for now, assuming the intrinsics are consistent and place the src
                // operand and mask last in the argument list.
                if (mTarget == AVX512)
                {
                    if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
                    {
                        args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
                        args.push_back(GetMask(W256));
                        // for AVX512 VCVTPD2PS, we also have to add rounding mode
                        args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
                    }
                    else
                    {
                        args.push_back(GetZeroVec(vecWidth, pElemTy));
                        args.push_back(GetMask(vecWidth));
                    }
                }

                return B->CALLA(pIntrin, args);
            }
            else
            {
                // No native intrinsic, call emulation function
                return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
            }

            SWR_ASSERT(false);
            return nullptr;
        }

        Instruction* ProcessIntrinsic(CallInst* pCallInst)
        {
            Function* pFunc = pCallInst->getCalledFunction();

            // Forward to the advanced support if found
            if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
            {
                return ProcessIntrinsicAdvanced(pCallInst);
            }

            SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(),
                       "Unimplemented intrinsic %s.",
                       pFunc->getName());

            Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
            Function*     pX86IntrinFunc =
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);

            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                args.push_back(arg.get());
            }
            return B->CALLA(pX86IntrinFunc, args);
        }

        //////////////////////////////////////////////////////////////////////////
        /// @brief LLVM function pass run method.
        /// @param F - The function we're working on with this pass.
        virtual bool runOnFunction(Function& F)
        {
            std::vector<Instruction*> toRemove;
            std::vector<BasicBlock*>  bbs;

            // Make temp copy of the basic blocks and instructions, as the intrinsic
            // replacement code might invalidate the iterators
            for (auto& b : F.getBasicBlockList())
            {
                bbs.push_back(&b);
            }

            for (auto* BB : bbs)
            {
                std::vector<Instruction*> insts;
                for (auto& i : BB->getInstList())
                {
                    insts.push_back(&i);
                }

                for (auto* I : insts)
                {
                    if (CallInst* pCallInst = dyn_cast<CallInst>(I))
                    {
                        Function* pFunc = pCallInst->getCalledFunction();
                        if (pFunc)
                        {
                            if (pFunc->getName().startswith("meta.intrinsic"))
                            {
                                B->IRB()->SetInsertPoint(I);
                                Instruction* pReplace = ProcessIntrinsic(pCallInst);
                                toRemove.push_back(pCallInst);
                                if (pReplace)
                                {
                                    pCallInst->replaceAllUsesWith(pReplace);
                                }
                            }
                        }
                    }
                }
            }

            for (auto* pInst : toRemove)
            {
                pInst->eraseFromParent();
            }

            JitManager::DumpToFile(&F, "lowerx86");

            return true;
        }

        virtual void getAnalysisUsage(AnalysisUsage& AU) const {}

        JitManager* JM() { return B->JM(); }
        Builder*    B;
        TargetArch  mTarget;
        Function*   mPfnScatter256;

        static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
    };

    char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.

    FunctionPass* createLowerX86Pass(Builder* b) { return new LowerX86(b); }

    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
        return nullptr;
    }

    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        // Only need vperm emulation for AVX
        SWR_ASSERT(arch == AVX);

        Builder* B         = pThis->B;
        auto     v32A      = pCallInst->getArgOperand(0);
        auto     vi32Index = pCallInst->getArgOperand(1);

        Value* v32Result;
        if (isa<Constant>(vi32Index))
        {
            // Can use llvm shuffle vector directly with constant shuffle indices
            v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
        }
        else
        {
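            // Non-constant indices: emulate the permute by extracting each index and inserting the
            // selected source lane one element at a time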
            v32Result = UndefValue::get(v32A->getType());
            for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
            {
                auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
                auto val      = B->VEXTRACT(v32A, i32Index);
                v32Result     = B->VINSERT(v32Result, val, B->C(l));
            }
        }
        return cast<Instruction>(v32Result);
    }

    Instruction*
    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        Builder* B           = pThis->B;
        auto     vSrc        = pCallInst->getArgOperand(0);
        auto     pBase       = pCallInst->getArgOperand(1);
        auto     vi32Indices = pCallInst->getArgOperand(2);
        auto     vi1Mask     = pCallInst->getArgOperand(3);
        auto     i8Scale     = pCallInst->getArgOperand(4);

        pBase              = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
        uint32_t numElem   = vSrc->getType()->getVectorNumElements();
        auto     i32Scale  = B->Z_EXT(i8Scale, B->mInt32Ty);
        auto     srcTy     = vSrc->getType()->getVectorElementType();
        Value*   v32Gather = nullptr;
        if (arch == AVX)
        {
            // Full emulation for AVX
            // Store source on stack to provide a valid address to load from inactive lanes
            auto pStack = B->STACKSAVE();
            auto pTmp   = B->ALLOCA(vSrc->getType());
            B->STORE(vSrc, pTmp);

            v32Gather        = UndefValue::get(vSrc->getType());
            auto vi32Scale   = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
            auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);

            for (uint32_t i = 0; i < numElem; ++i)
            {
                auto i32Offset          = B->VEXTRACT(vi32Offsets, B->C(i));
                auto pLoadAddress       = B->GEP(pBase, i32Offset);
                pLoadAddress            = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
                auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
                auto i1Mask             = B->VEXTRACT(vi1Mask, B->C(i));
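                // Inactive lanes load from the stack copy of the source instead of the gather
                // address, so they simply keep their original value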
                auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
                auto val           = B->LOAD(pValidAddress);
                v32Gather          = B->VINSERT(v32Gather, val, B->C(i));
            }

            B->STACKRESTORE(pStack);
        }
        else if (arch == AVX2 || (arch == AVX512 && width == W256))
        {
            Function* pX86IntrinFunc = nullptr;
            if (srcTy == B->mFP32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_ps_256);
            }
            else if (srcTy == B->mInt32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_d_256);
            }
            else if (srcTy == B->mDoubleTy)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_q_256);
            }
            else
            {
                SWR_ASSERT(false, "Unsupported vector element type for gather.");
            }

            if (width == W256)
            {
                auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
                v32Gather    = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
            }
            else if (width == W512)
            {
                // Double pump 4-wide for 64bit elements
                if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
                {
                    auto v64Mask = pThis->VectorMask(vi1Mask);
                    v64Mask      = B->S_EXT(
                        v64Mask,
                        VectorType::get(B->mInt64Ty, v64Mask->getType()->getVectorNumElements()));
                    v64Mask = B->BITCAST(v64Mask, vSrc->getType());

                    Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
                    Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));

                    Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
                    Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));

                    Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
                    Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));

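                    // The 256-bit avx2 gather for 64-bit elements operates on i64 vectors, so
                    // bitcast each src/mask half before calling it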
                    src0 = B->BITCAST(
                        src0,
                        VectorType::get(B->mInt64Ty, src0->getType()->getVectorNumElements()));
                    mask0 = B->BITCAST(
                        mask0,
                        VectorType::get(B->mInt64Ty, mask0->getType()->getVectorNumElements()));
                    Value* gather0 =
                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                    src1 = B->BITCAST(
                        src1,
                        VectorType::get(B->mInt64Ty, src1->getType()->getVectorNumElements()));
                    mask1 = B->BITCAST(
                        mask1,
                        VectorType::get(B->mInt64Ty, mask1->getType()->getVectorNumElements()));
                    Value* gather1 =
                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});

                    v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                    v32Gather = B->BITCAST(v32Gather, vSrc->getType());
                }
                else
                {
                    // Double pump 8-wide for 32bit elements
                    auto v32Mask = pThis->VectorMask(vi1Mask);
                    v32Mask      = B->BITCAST(v32Mask, vSrc->getType());
                    Value* src0  = B->EXTRACT_16(vSrc, 0);
                    Value* src1  = B->EXTRACT_16(vSrc, 1);

                    Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
                    Value* indices1 = B->EXTRACT_16(vi32Indices, 1);

                    Value* mask0 = B->EXTRACT_16(v32Mask, 0);
                    Value* mask1 = B->EXTRACT_16(v32Mask, 1);

                    Value* gather0 =
                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                    Value* gather1 =
                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});

                    v32Gather = B->JOIN_16(gather0, gather1);
                }
            }
        }
        else if (arch == AVX512)
        {
            Value*    iMask          = nullptr;
            Function* pX86IntrinFunc = nullptr;
            if (srcTy == B->mFP32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dps_512);
                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mInt32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dpi_512);
                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mDoubleTy)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dpd_512);
                iMask          = B->BITCAST(vi1Mask, B->mInt8Ty);
            }
            else
            {
                SWR_ASSERT(false, "Unsupported vector element type for gather.");
            }

            auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
            v32Gather     = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
        }

        return cast<Instruction>(v32Gather);
    }

    Instruction*
    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        Builder* B           = pThis->B;
        auto     pBase       = pCallInst->getArgOperand(0);
        auto     vi1Mask     = pCallInst->getArgOperand(1);
        auto     vi32Indices = pCallInst->getArgOperand(2);
        auto     v32Src      = pCallInst->getArgOperand(3);
        auto     i32Scale    = pCallInst->getArgOperand(4);

        if (arch != AVX512)
        {
            // Call into C function to do the scatter. This has significantly better compile perf
            // compared to jitting scatter loops for every scatter
            if (width == W256)
            {
                auto mask = B->BITCAST(vi1Mask, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, vi32Indices, v32Src, mask, i32Scale});
            }
            else
            {
                // Need to break up 512 wide scatter to two 256 wide
                auto maskLo    = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                auto indicesLo =
                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                auto srcLo = B->VSHUFFLE(v32Src, v32Src, B->C({0, 1, 2, 3, 4, 5, 6, 7}));

                auto mask = B->BITCAST(maskLo, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, indicesLo, srcLo, mask, i32Scale});

                auto maskHi    = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
                auto indicesHi =
                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
                auto srcHi = B->VSHUFFLE(v32Src, v32Src, B->C({8, 9, 10, 11, 12, 13, 14, 15}));

                mask = B->BITCAST(maskHi, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, indicesHi, srcHi, mask, i32Scale});
            }
            return nullptr;
        }

        Value*    iMask;
        Function* pX86IntrinFunc;
        if (width == W256)
        {
            // No direct intrinsic supported in llvm to scatter 8 elem with 32bit indices, but we
            // can use the scatter of 8 elements with 64bit indices
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx512_scatter_qps_512);

            auto vi32IndicesExt = B->Z_EXT(vi32Indices, B->mSimdInt64Ty);
            iMask               = B->BITCAST(vi1Mask, B->mInt8Ty);
            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32IndicesExt, v32Src, i32Scale});
        }
        else if (width == W512)
        {
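            // 16 elements with 32-bit indices map directly onto the avx512 dps scatter, using a
            // 16-bit lane mask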
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx512_scatter_dps_512);
            iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32Indices, v32Src, i32Scale});
        }
        return nullptr;
    }

    // No support for vroundps in avx512 (it is available in kncni), so emulate with avx
    // instructions
    Instruction*
    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B       = pThis->B;
        auto vf32Src = pCallInst->getOperand(0);
        auto i8Round = pCallInst->getOperand(1);
        auto pfnFunc =
            Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);

        if (width == W256)
        {
            return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
        }
        else if (width == W512)
        {
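            // Split the 16-wide source into two 8-wide halves, round each with the avx intrinsic,
            // then rejoin the halves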
            auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
            auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);

            auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
            auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);

            return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
        }

        return nullptr;
    }

    Instruction*
    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B       = pThis->B;
        auto vf32Src = pCallInst->getOperand(0);

        if (width == W256)
        {
            // No 256-wide cvtpd2ps intrinsic is called here; emulate the conversion with an
            // fptrunc of the double-precision source to a float vector of the same element count
            return cast<Instruction>(B->FP_TRUNC(
                vf32Src,
                VectorType::get(B->mFP32Ty, vf32Src->getType()->getVectorNumElements())));
        }
        else if (width == W512)
        {
            // 512 can use intrinsic
            auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                     Intrinsic::x86_avx512_mask_cvtpd2ps_512);
            return cast<Instruction>(B->CALL(pfnFunc, vf32Src));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
        }

        return nullptr;
    }

    // No support for hsub in AVX512
    Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B    = pThis->B;
        auto src0 = pCallInst->getOperand(0);
        auto src1 = pCallInst->getOperand(1);

        // 256b hsub can just use avx intrinsic
        if (width == W256)
        {
            auto pX86IntrinFunc =
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
            return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
        }
        else if (width == W512)
        {
            // 512b hsub can be accomplished with shuf/sub combo
            auto minuend    = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
            auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
            return cast<Instruction>(B->SUB(minuend, subtrahend));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
            return nullptr;
        }
    }

    // Double pump the input using the provided 256-wide intrinsic. This blindly extracts the lower
    // and upper 256 bits of each vector argument, calls the 256 wide intrinsic on each half, then
    // merges the results back to 512 wide
    Instruction* DOUBLE_EMU(LowerX86*     pThis,
                            TargetArch    arch,
                            TargetWidth   width,
                            CallInst*     pCallInst,
                            Intrinsic::ID intrin)
    {
        auto B = pThis->B;
        SWR_ASSERT(width == W512);
        Value*    result[2];
        Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
        for (uint32_t i = 0; i < 2; ++i)
        {
            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                auto argType = arg.get()->getType();
                if (argType->isVectorTy())
                {
                    uint32_t vecWidth = argType->getVectorNumElements();
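                    // Shuffle lanes select the lower half (i == 0) or the upper half (i == 1) of
                    // this vector argument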
                    Value* lanes     = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
                    Value* argToPush = B->VSHUFFLE(
                        arg.get(), B->VUNDEF(argType->getVectorElementType(), vecWidth), lanes);
                    args.push_back(argToPush);
                }
                else
                {
                    args.push_back(arg.get());
                }
            }
            result[i] = B->CALLA(pX86IntrinFunc, args);
        }
        uint32_t vecWidth;
        if (result[0]->getType()->isVectorTy())
        {
            assert(result[1]->getType()->isVectorTy());
            vecWidth = result[0]->getType()->getVectorNumElements() +
                       result[1]->getType()->getVectorNumElements();
        }
        else
        {
            vecWidth = 2;
        }
        Value* lanes = B->CInc<int>(0, vecWidth);
        return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
    }

} // namespace SwrJit

using namespace SwrJit;

INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)