1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file lower_x86.cpp
24 *
25 * @brief llvm pass to lower meta code to x86
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30
31 #include "jit_pch.hpp"
32 #include "passes.h"
33 #include "JitManager.h"
34
35 #include <unordered_map>
36
37 namespace llvm
38 {
39 // forward declare the initializer
40 void initializeLowerX86Pass(PassRegistry &);
41 } // namespace llvm
42
43 namespace SwrJit
44 {
45 using namespace llvm;
46
47 enum TargetArch
48 {
49 AVX = 0,
50 AVX2 = 1,
51 AVX512 = 2
52 };
53
54 enum TargetWidth
55 {
56 W256 = 0,
57 W512 = 1,
58 NUM_WIDTHS = 2
59 };
60
61 struct LowerX86;
62
63 typedef std::function<Instruction *(LowerX86 *, TargetArch, TargetWidth, CallInst *)> EmuFunc;
64
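// Table entry: the native intrinsic ID for each TargetWidth (Intrinsic::not_intrinsic if none),
// plus an emulation callback used when no native intrinsic is available.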
65 struct X86Intrinsic
66 {
67 Intrinsic::ID intrin[NUM_WIDTHS];
68 EmuFunc emuFunc;
69 };
70
71 // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
72 // previous behavior of mapping directly to avx/avx2 intrinsics.
73 static std::map<std::string, Intrinsic::ID> intrinsicMap = {
74 {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
75 {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
76 {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
77 {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
78 {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
79 {"meta.intrinsic.VFMADDPS", Intrinsic::x86_fma_vfmadd_ps_256},
80 {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
81 {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
82 {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
83 };
84
85 // Forward decls
86 Instruction *NO_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
87 Instruction *
88 VPERM_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
89 Instruction *
90 VGATHER_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
91 Instruction *
92 VROUND_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
93 Instruction *
94 VHSUB_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
95
96 Instruction *DOUBLE_EMU(LowerX86 * pThis,
97 TargetArch arch,
98 TargetWidth width,
99 CallInst * pCallInst,
100 Intrinsic::ID intrin);
101
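// Sentinel intrinsic ID: marks table entries that are lowered by double pumping the
// next-narrower native intrinsic (see DOUBLE_EMU).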
102 static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
103
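// Per-architecture intrinsic tables, indexed by TargetArch: map a meta intrinsic name to its
// native intrinsics for each TargetWidth plus an emulation fallback.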
104 static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
105 // 256 wide 512 wide
106 {
107 // AVX
108 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
109 {"meta.intrinsic.VPERMPS",
110 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
111 {"meta.intrinsic.VPERMD",
112 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
113 {"meta.intrinsic.VGATHERPD",
114 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
115 {"meta.intrinsic.VGATHERPS",
116 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
117 {"meta.intrinsic.VGATHERDD",
118 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
119 {"meta.intrinsic.VCVTPD2PS",
120 {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
121 {"meta.intrinsic.VCVTPH2PS",
122 {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
123 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
124 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
125 },
126 {
127 // AVX2
128 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
129 {"meta.intrinsic.VPERMPS",
130 {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
131 {"meta.intrinsic.VPERMD",
132 {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
133 {"meta.intrinsic.VGATHERPD",
134 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
135 {"meta.intrinsic.VGATHERPS",
136 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
137 {"meta.intrinsic.VGATHERDD",
138 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
139 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
140 {"meta.intrinsic.VCVTPH2PS",
141 {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
142 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
143 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
144 },
145 {
146 // AVX512
147 {"meta.intrinsic.VRCPPS",
148 {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
149 {"meta.intrinsic.VPERMPS",
150 {{Intrinsic::x86_avx512_mask_permvar_sf_256,
151 Intrinsic::x86_avx512_mask_permvar_sf_512},
152 NO_EMU}},
153 {"meta.intrinsic.VPERMD",
154 {{Intrinsic::x86_avx512_mask_permvar_si_256,
155 Intrinsic::x86_avx512_mask_permvar_si_512},
156 NO_EMU}},
157 {"meta.intrinsic.VGATHERPD",
158 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
159 {"meta.intrinsic.VGATHERPS",
160 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
161 {"meta.intrinsic.VGATHERDD",
162 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
163 {"meta.intrinsic.VCVTPD2PS",
164 {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512},
165 NO_EMU}},
166 {"meta.intrinsic.VCVTPH2PS",
167 {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512},
168 NO_EMU}},
169 {"meta.intrinsic.VROUND",
170 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
171 {"meta.intrinsic.VHSUBPS",
172 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
173 }};
174
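// Function pass that lowers meta.intrinsic.* calls to native x86 intrinsics or emulation
// sequences for the target architecture detected from the JitManager.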
175 struct LowerX86 : public FunctionPass
176 {
177 LowerX86(Builder *b = nullptr) : FunctionPass(ID), B(b)
178 {
179 initializeLowerX86Pass(*PassRegistry::getPassRegistry());
180
181 // Determine target arch
182 if (JM()->mArch.AVX512F())
183 {
184 mTarget = AVX512;
185 }
186 else if (JM()->mArch.AVX2())
187 {
188 mTarget = AVX2;
189 }
190 else if (JM()->mArch.AVX())
191 {
192 mTarget = AVX;
193 }
194 else
195 {
196 SWR_ASSERT(false, "Unsupported AVX architecture.");
197 mTarget = AVX;
198 }
199 }
200
201 // Try to decipher the vector type of the instruction. This does not work properly
202 // across all intrinsics, and will have to be rethought. Probably need something
203 // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
204 // intrinsic.
205 void GetRequestedWidthAndType(CallInst * pCallInst,
206 const StringRef intrinName,
207 TargetWidth * pWidth,
208 Type ** pTy)
209 {
210 Type *pVecTy = pCallInst->getType();
211
212 // Check for intrinsic specific types
213 // VCVTPD2PS type comes from src, not dst
214 if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
215 {
216 pVecTy = pCallInst->getOperand(0)->getType();
217 }
218
219 if (!pVecTy->isVectorTy())
220 {
221 for (auto &op : pCallInst->arg_operands())
222 {
223 if (op.get()->getType()->isVectorTy())
224 {
225 pVecTy = op.get()->getType();
226 break;
227 }
228 }
229 }
230 SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");
231
232 uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
233 switch (width)
234 {
235 case 256:
236 *pWidth = W256;
237 break;
238 case 512:
239 *pWidth = W512;
240 break;
241 default:
242 SWR_ASSERT(false, "Unhandled vector width %d", width);
243 *pWidth = W256;
244 }
245
246 *pTy = pVecTy->getScalarType();
247 }
248
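// Build a zero vector of the given width and element type; used as the pass-through src
// operand for masked AVX512 intrinsics.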
249 Value *GetZeroVec(TargetWidth width, Type *pTy)
250 {
251 uint32_t numElem = 0;
252 switch (width)
253 {
254 case W256:
255 numElem = 8;
256 break;
257 case W512:
258 numElem = 16;
259 break;
260 default:
261 SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
262 }
263
264 return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
265 }
266
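// Build an all-ones execution mask for the given width (i8 for 8 lanes, i16 for 16 lanes).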
267 Value *GetMask(TargetWidth width)
268 {
269 Value *mask;
270 switch (width)
271 {
272 case W256:
273 mask = B->C((uint8_t)-1);
274 break;
275 case W512:
276 mask = B->C((uint16_t)-1);
277 break;
278 default:
279 SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
280 }
281 return mask;
282 }
283
284 // Convert <N x i1> mask to <N x i32> x86 mask
285 Value *VectorMask(Value *vi1Mask)
286 {
287 uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
288 return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem));
289 }
290
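// Lower a call using the per-architecture table: double pump when the entry is DOUBLE, call
// the native intrinsic when one exists (appending src and mask operands on AVX512), otherwise
// invoke the emulation function.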
291 Instruction *ProcessIntrinsicAdvanced(CallInst *pCallInst)
292 {
293 Function * pFunc = pCallInst->getCalledFunction();
294 auto & intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
295 TargetWidth vecWidth;
296 Type * pElemTy;
297 GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);
298
299 // Check if there is a native intrinsic for this instruction
300 Intrinsic::ID id = intrinsic.intrin[vecWidth];
301 if (id == DOUBLE)
302 {
303 // Double pump the next smaller SIMD intrinsic
304 SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
305 Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
306 SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
307 "Cannot find intrinsic to double pump.");
308 return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
309 }
310 else if (id != Intrinsic::not_intrinsic)
311 {
312 Function *pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
313 SmallVector<Value *, 8> args;
314 for (auto &arg : pCallInst->arg_operands())
315 {
316 args.push_back(arg.get());
317 }
318
319 // On AVX512, these intrinsics all take an extra src operand and mask. We pass in a zero src
320 // and a full mask for now, assuming the intrinsics are consistent and place the src operand
321 // and mask last in the argument list.
322 if (mTarget == AVX512)
323 {
324 if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
325 {
326 args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
327 args.push_back(GetMask(W256));
328 // for AVX512 VCVTPD2PS, we also have to add rounding mode
329 args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
330 }
331 else
332 {
333 args.push_back(GetZeroVec(vecWidth, pElemTy));
334 args.push_back(GetMask(vecWidth));
335 }
336 }
337
338 return B->CALLA(pIntrin, args);
339 }
340 else
341 {
342 // No native intrinsic, call emulation function
343 return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
344 }
345
346 SWR_ASSERT(false);
347 return nullptr;
348 }
349
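// Lower a meta intrinsic call: prefer the table-driven path above, otherwise fall back to the
// legacy direct mapping in intrinsicMap.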
350 Instruction *ProcessIntrinsic(CallInst *pCallInst)
351 {
352 Function *pFunc = pCallInst->getCalledFunction();
353
354 // Forward to the advanced support if found
355 if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
356 {
357 return ProcessIntrinsicAdvanced(pCallInst);
358 }
359
360 SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(),
361 "Unimplemented intrinsic %s.",
362 pFunc->getName());
363
364 Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
365 Function * pX86IntrinFunc =
366 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
367
368 SmallVector<Value *, 8> args;
369 for (auto &arg : pCallInst->arg_operands())
370 {
371 args.push_back(arg.get());
372 }
373 return B->CALLA(pX86IntrinFunc, args);
374 }
375
376 //////////////////////////////////////////////////////////////////////////
377 /// @brief LLVM function pass run method.
378 /// @param F - The function we're working on with this pass.
379 virtual bool runOnFunction(Function &F)
380 {
381 std::vector<Instruction *> toRemove;
382
383 for (auto &BB : F.getBasicBlockList())
384 {
385 for (auto &I : BB.getInstList())
386 {
387 if (CallInst *pCallInst = dyn_cast<CallInst>(&I))
388 {
389 Function *pFunc = pCallInst->getCalledFunction();
390 if (pFunc)
391 {
392 if (pFunc->getName().startswith("meta.intrinsic"))
393 {
394 B->IRB()->SetInsertPoint(&I);
395 Instruction *pReplace = ProcessIntrinsic(pCallInst);
396 SWR_ASSERT(pReplace);
397 toRemove.push_back(pCallInst);
398 pCallInst->replaceAllUsesWith(pReplace);
399 }
400 }
401 }
402 }
403 }
404
405 for (auto *pInst : toRemove)
406 {
407 pInst->eraseFromParent();
408 }
409
410 JitManager::DumpToFile(&F, "lowerx86");
411
412 return true;
413 }
414
415 virtual void getAnalysisUsage(AnalysisUsage &AU) const {}
416
417 JitManager *JM() { return B->JM(); }
418
419 Builder *B;
420
421 TargetArch mTarget;
422
423 static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
424 };
425
426 char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
427
428 FunctionPass *createLowerX86Pass(Builder *b) { return new LowerX86(b); }
429
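// Emulation stub for entries that should always resolve to a native intrinsic.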
430 Instruction *NO_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
431 {
432 SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
433 return nullptr;
434 }
435
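// Emulate VPERMPS/VPERMD on AVX: use a shufflevector when the indices are constant, otherwise
// extract and insert element by element.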
436 Instruction *VPERM_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
437 {
438 // Only need vperm emulation for AVX
439 SWR_ASSERT(arch == AVX);
440
441 Builder *B = pThis->B;
442 auto v32A = pCallInst->getArgOperand(0);
443 auto vi32Index = pCallInst->getArgOperand(1);
444
445 Value *v32Result;
446 if (isa<Constant>(vi32Index))
447 {
448 // Can use llvm shuffle vector directly with constant shuffle indices
449 v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
450 }
451 else
452 {
453 v32Result = UndefValue::get(v32A->getType());
454 for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
455 {
456 auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
457 auto val = B->VEXTRACT(v32A, i32Index);
458 v32Result = B->VINSERT(v32Result, val, B->C(l));
459 }
460 }
461 return cast<Instruction>(v32Result);
462 }
463
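// Emulate a masked gather: element-wise loads on AVX; AVX2 dword-index gather intrinsics on
// AVX2 (double pumped at 512-bit width) and on AVX512 at 256-bit width; native 512-bit gather
// intrinsics otherwise on AVX512.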
464 Instruction *
465 VGATHER_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
466 {
467 Builder *B = pThis->B;
468 auto vSrc = pCallInst->getArgOperand(0);
469 auto pBase = pCallInst->getArgOperand(1);
470 auto vi32Indices = pCallInst->getArgOperand(2);
471 auto vi1Mask = pCallInst->getArgOperand(3);
472 auto i8Scale = pCallInst->getArgOperand(4);
473
474 pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
475 uint32_t numElem = vSrc->getType()->getVectorNumElements();
476 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
477 auto srcTy = vSrc->getType()->getVectorElementType();
478 Value * v32Gather;
479 if (arch == AVX)
480 {
481 // Full emulation for AVX
482 // Store the source on the stack to provide a valid address to load from for inactive lanes
483 auto pStack = B->STACKSAVE();
484 auto pTmp = B->ALLOCA(vSrc->getType());
485 B->STORE(vSrc, pTmp);
486
487 v32Gather = UndefValue::get(vSrc->getType());
488 auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
489 auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);
490
491 for (uint32_t i = 0; i < numElem; ++i)
492 {
493 auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
494 auto pLoadAddress = B->GEP(pBase, i32Offset);
495 pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
496 auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
497 auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
498 auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
499 auto val = B->LOAD(pValidAddress);
500 v32Gather = B->VINSERT(v32Gather, val, B->C(i));
501 }
502
503 B->STACKRESTORE(pStack);
504 }
505 else if (arch == AVX2 || (arch == AVX512 && width == W256))
506 {
507 Function *pX86IntrinFunc;
508 if (srcTy == B->mFP32Ty)
509 {
510 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
511 Intrinsic::x86_avx2_gather_d_ps_256);
512 }
513 else if (srcTy == B->mInt32Ty)
514 {
515 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
516 Intrinsic::x86_avx2_gather_d_d_256);
517 }
518 else if (srcTy == B->mDoubleTy)
519 {
520 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
521 Intrinsic::x86_avx2_gather_d_q_256);
522 }
523 else
524 {
525 SWR_ASSERT(false, "Unsupported vector element type for gather.");
526 }
527
528 if (width == W256)
529 {
530 auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
531 v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
532 }
533 else if (width == W512)
534 {
535 // Double pump 4-wide for 64bit elements
536 if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
537 {
538 auto v64Mask = pThis->VectorMask(vi1Mask);
539 v64Mask = B->S_EXT(
540 v64Mask,
541 VectorType::get(B->mInt64Ty, v64Mask->getType()->getVectorNumElements()));
542 v64Mask = B->BITCAST(v64Mask, vSrc->getType());
543
544 Value *src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
545 Value *src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));
546
547 Value *indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
548 Value *indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));
549
550 Value *mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
551 Value *mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));
552
553 src0 = B->BITCAST(
554 src0,
555 VectorType::get(B->mInt64Ty, src0->getType()->getVectorNumElements()));
556 mask0 = B->BITCAST(
557 mask0,
558 VectorType::get(B->mInt64Ty, mask0->getType()->getVectorNumElements()));
559 Value *gather0 =
560 B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
561 src1 = B->BITCAST(
562 src1,
563 VectorType::get(B->mInt64Ty, src1->getType()->getVectorNumElements()));
564 mask1 = B->BITCAST(
565 mask1,
566 VectorType::get(B->mInt64Ty, mask1->getType()->getVectorNumElements()));
567 Value *gather1 =
568 B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
569
570 v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
571 v32Gather = B->BITCAST(v32Gather, vSrc->getType());
572 }
573 else
574 {
575 // Double pump 8-wide for 32bit elements
576 auto v32Mask = pThis->VectorMask(vi1Mask);
577 v32Mask = B->BITCAST(v32Mask, vSrc->getType());
578 Value *src0 = B->EXTRACT_16(vSrc, 0);
579 Value *src1 = B->EXTRACT_16(vSrc, 1);
580
581 Value *indices0 = B->EXTRACT_16(vi32Indices, 0);
582 Value *indices1 = B->EXTRACT_16(vi32Indices, 1);
583
584 Value *mask0 = B->EXTRACT_16(v32Mask, 0);
585 Value *mask1 = B->EXTRACT_16(v32Mask, 1);
586
587 Value *gather0 =
588 B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
589 Value *gather1 =
590 B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
591
592 v32Gather = B->JOIN_16(gather0, gather1);
593 }
594 }
595 }
596 else if (arch == AVX512)
597 {
598 Value * iMask;
599 Function *pX86IntrinFunc;
600 if (srcTy == B->mFP32Ty)
601 {
602 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
603 Intrinsic::x86_avx512_gather_dps_512);
604 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
605 }
606 else if (srcTy == B->mInt32Ty)
607 {
608 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
609 Intrinsic::x86_avx512_gather_dpi_512);
610 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
611 }
612 else if (srcTy == B->mDoubleTy)
613 {
614 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
615 Intrinsic::x86_avx512_gather_dpd_512);
616 iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
617 }
618 else
619 {
620 SWR_ASSERT(false, "Unsupported vector element type for gather.");
621 }
622
623 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
624 v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
625 }
626
627 return cast<Instruction>(v32Gather);
628 }
629
630 // No support for vroundps in avx512 (it is available in kncni), so emulate with avx
631 // instructions
632 Instruction *
633 VROUND_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
634 {
635 SWR_ASSERT(arch == AVX512);
636
637 auto B = pThis->B;
638 auto vf32Src = pCallInst->getOperand(0);
639 auto i8Round = pCallInst->getOperand(1);
640 auto pfnFunc =
641 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);
642
643 if (width == W256)
644 {
645 return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
646 }
647 else if (width == W512)
648 {
649 auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
650 auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);
651
652 auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
653 auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);
654
655 return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
656 }
657 else
658 {
659 SWR_ASSERT(false, "Unimplemented vector width.");
660 }
661
662 return nullptr;
663 }
664
665 // No support for hsub in AVX512
666 Instruction *VHSUB_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
667 {
668 SWR_ASSERT(arch == AVX512);
669
670 auto B = pThis->B;
671 auto src0 = pCallInst->getOperand(0);
672 auto src1 = pCallInst->getOperand(1);
673
674 // 256b hsub can just use avx intrinsic
675 if (width == W256)
676 {
677 auto pX86IntrinFunc =
678 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
679 return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
680 }
681 else if (width == W512)
682 {
683 // 512b hsub can be accomplished with shuf/sub combo
684 auto minuend = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
685 auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
686 return cast<Instruction>(B->SUB(minuend, subtrahend));
687 }
688 else
689 {
690 SWR_ASSERT(false, "Unimplemented vector width.");
691 return nullptr;
692 }
693 }
694
695 // Double pump using the 256-wide intrinsic 'intrin': blindly extract the lower and upper 256 bits
696 // of each vector argument, call the 256-wide intrinsic on each half, then merge the results to 512 wide.
697 Instruction *DOUBLE_EMU(LowerX86 * pThis,
698 TargetArch arch,
699 TargetWidth width,
700 CallInst * pCallInst,
701 Intrinsic::ID intrin)
702 {
703 auto B = pThis->B;
704 SWR_ASSERT(width == W512);
705 Value * result[2];
706 Function *pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
707 for (uint32_t i = 0; i < 2; ++i)
708 {
709 SmallVector<Value *, 8> args;
710 for (auto &arg : pCallInst->arg_operands())
711 {
712 auto argType = arg.get()->getType();
713 if (argType->isVectorTy())
714 {
715 uint32_t vecWidth = argType->getVectorNumElements();
716 Value * lanes = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
717 Value * argToPush = B->VSHUFFLE(
718 arg.get(), B->VUNDEF(argType->getVectorElementType(), vecWidth), lanes);
719 args.push_back(argToPush);
720 }
721 else
722 {
723 args.push_back(arg.get());
724 }
725 }
726 result[i] = B->CALLA(pX86IntrinFunc, args);
727 }
728 uint32_t vecWidth;
729 if (result[0]->getType()->isVectorTy())
730 {
731 assert(result[1]->getType()->isVectorTy());
732 vecWidth = result[0]->getType()->getVectorNumElements() +
733 result[1]->getType()->getVectorNumElements();
734 }
735 else
736 {
737 vecWidth = 2;
738 }
739 Value *lanes = B->CInc<int>(0, vecWidth);
740 return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
741 }
742
743 } // namespace SwrJit
744
745 using namespace SwrJit;
746
747 INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
748 INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)