/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file lower_x86.cpp
 *
 * @brief LLVM pass to lower meta code to x86
 *
 * Notes:
 *
 ******************************************************************************/

#include "jit_pch.hpp"
#include "passes.h"
#include "JitManager.h"

#include <unordered_map>


namespace llvm
{
    // forward declare the initializer
    void initializeLowerX86Pass(PassRegistry&);
}

namespace SwrJit
{
    using namespace llvm;

    enum TargetArch
    {
        AVX    = 0,
        AVX2   = 1,
        AVX512 = 2
    };

    enum TargetWidth
    {
        W256       = 0,
        W512       = 1,
        NUM_WIDTHS = 2
    };

    struct LowerX86;

    typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;

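    // Lowering descriptor for a meta intrinsic: the native x86 intrinsic to use for each
    // target width (Intrinsic::not_intrinsic when none exists), plus an emulation callback
    // used as the fallback.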
    struct X86Intrinsic
    {
        Intrinsic::ID intrin[NUM_WIDTHS];
        EmuFunc       emuFunc;
    };

    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get
    // the previous behavior of mapping directly to avx/avx2 intrinsics.
    static std::map<std::string, Intrinsic::ID> intrinsicMap = {
        {"meta.intrinsic.VGATHERPD",  Intrinsic::x86_avx2_gather_d_pd_256},
        {"meta.intrinsic.VROUND",     Intrinsic::x86_avx_round_ps_256},
        {"meta.intrinsic.BEXTR_32",   Intrinsic::x86_bmi_bextr_32},
        {"meta.intrinsic.VPSHUFB",    Intrinsic::x86_avx2_pshuf_b},
        {"meta.intrinsic.VCVTPD2PS",  Intrinsic::x86_avx_cvt_pd2_ps_256},
        {"meta.intrinsic.VCVTPH2PS",  Intrinsic::x86_vcvtph2ps_256},
        {"meta.intrinsic.VCVTPS2PH",  Intrinsic::x86_vcvtps2ph_256},
        {"meta.intrinsic.VHSUBPS",    Intrinsic::x86_avx_hsub_ps_256},
        {"meta.intrinsic.VPTESTC",    Intrinsic::x86_avx_ptestc_256},
        {"meta.intrinsic.VPTESTZ",    Intrinsic::x86_avx_ptestz_256},
        {"meta.intrinsic.VFMADDPS",   Intrinsic::x86_fma_vfmadd_ps_256},
        {"meta.intrinsic.VMOVMSKPS",  Intrinsic::x86_avx_movmsk_ps_256},
        {"meta.intrinsic.VPHADDD",    Intrinsic::x86_avx2_phadd_d},
        {"meta.intrinsic.PDEP32",     Intrinsic::x86_bmi_pdep_32},
        {"meta.intrinsic.RDTSC",      Intrinsic::x86_rdtsc},
    };

    // Forward decls
    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);

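    // Per-architecture lowering table, indexed by TargetArch. Each entry maps a meta intrinsic
    // name to its 256-wide and 512-wide native intrinsics; widths marked not_intrinsic are
    // routed through the entry's emulation function instead.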
    static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
        //                                  256 wide                                      512 wide
        {   // AVX
            {"meta.intrinsic.VRCPPS",       {{Intrinsic::x86_avx_rcp_ps_256,              Intrinsic::not_intrinsic},                  NO_EMU}},
            {"meta.intrinsic.VPERMPS",      {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VPERM_EMU}},
            {"meta.intrinsic.VPERMD",       {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VPERM_EMU}},
            {"meta.intrinsic.VGATHERPS",    {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS_16", {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD",    {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD_16", {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
        },
        {   // AVX2
            {"meta.intrinsic.VRCPPS",       {{Intrinsic::x86_avx_rcp_ps_256,              Intrinsic::not_intrinsic},                  NO_EMU}},
            {"meta.intrinsic.VPERMPS",      {{Intrinsic::x86_avx2_permps,                 Intrinsic::not_intrinsic},                  VPERM_EMU}},
            {"meta.intrinsic.VPERMD",       {{Intrinsic::x86_avx2_permd,                  Intrinsic::not_intrinsic},                  VPERM_EMU}},
            {"meta.intrinsic.VGATHERPS",    {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS_16", {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD",    {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD_16", {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
        },
        {   // AVX512
            {"meta.intrinsic.VRCPPS",       {{Intrinsic::x86_avx512_rcp14_ps_256,         Intrinsic::x86_avx512_rcp14_ps_512},        NO_EMU}},
            {"meta.intrinsic.VPERMPS",      {{Intrinsic::x86_avx512_mask_permvar_sf_256,  Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
            {"meta.intrinsic.VPERMD",       {{Intrinsic::x86_avx512_mask_permvar_si_256,  Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
            {"meta.intrinsic.VGATHERPS",    {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS_16", {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD",    {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD_16", {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
        }
    };

    struct LowerX86 : public FunctionPass
    {
        LowerX86(JitManager* pJitMgr = nullptr, Builder* b = nullptr)
            : FunctionPass(ID), mpJitMgr(pJitMgr), B(b)
        {
            initializeLowerX86Pass(*PassRegistry::getPassRegistry());

            // Determine target arch
            if (mpJitMgr->mArch.AVX512F())
            {
                mTarget = AVX512;
            }
            else if (mpJitMgr->mArch.AVX2())
            {
                mTarget = AVX2;
            }
            else if (mpJitMgr->mArch.AVX())
            {
                mTarget = AVX;
            }
            else
            {
                SWR_ASSERT(false, "Unsupported AVX architecture.");
                mTarget = AVX;
            }
        }

        // Try to decipher the vector type of the instruction. This does not work properly
        // across all intrinsics, and will have to be rethought. Probably need something
        // similar to llvm's getDeclaration() utility to map a set of inputs to a specific
        // typed intrinsic.
        void GetRequestedWidthAndType(CallInst* pCallInst, TargetWidth* pWidth, Type** pTy)
        {
            Type* pVecTy = pCallInst->getType();
            if (!pVecTy->isVectorTy())
            {
                for (auto& op : pCallInst->arg_operands())
                {
                    if (op.get()->getType()->isVectorTy())
                    {
                        pVecTy = op.get()->getType();
                        break;
                    }
                }
            }
            SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");

            uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
            switch (width)
            {
            case 256: *pWidth = W256; break;
            case 512: *pWidth = W512; break;
            default:
                SWR_ASSERT(false, "Unhandled vector width %d", width);
                *pWidth = W256;
            }

            *pTy = pVecTy->getScalarType();
        }

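        // Helpers for the extra passthrough-source and mask operands expected by the AVX512
        // masked intrinsic forms.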
        Value* GetZeroVec(TargetWidth width, Type* pTy)
        {
            uint32_t numElem = 0;
            switch (width)
            {
            case W256: numElem = 8; break;
            case W512: numElem = 16; break;
            }

            return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
        }

        Value* GetMask(TargetWidth width)
        {
            Value* mask;
            switch (width)
            {
            case W256: mask = B->C((uint8_t)-1); break;
            case W512: mask = B->C((uint16_t)-1); break;
            }
            return mask;
        }

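        // Lower a meta intrinsic that has an entry in intrinsicMap2: call the native intrinsic
        // for the requested width when one exists, otherwise fall back to the per-arch
        // emulation function.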
        Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
        {
            Function* pFunc = pCallInst->getCalledFunction();
            auto& intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
            TargetWidth vecWidth;
            Type* pElemTy;
            GetRequestedWidthAndType(pCallInst, &vecWidth, &pElemTy);

            // Check if there is a native intrinsic for this instruction
            Intrinsic::ID id = intrinsic.intrin[vecWidth];
            if (id != Intrinsic::not_intrinsic)
            {
                Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
                SmallVector<Value*, 8> args;
                for (auto& arg : pCallInst->arg_operands())
                {
                    args.push_back(arg.get());
                }

                // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and full mask for now
                // Assuming the intrinsics are consistent and place the src operand and mask last in the argument list.
                if (mTarget == AVX512)
                {
                    args.push_back(GetZeroVec(vecWidth, pElemTy));
                    args.push_back(GetMask(vecWidth));
                }

                return B->CALLA(pIntrin, args);
            }
            else
            {
                // No native intrinsic, call emulation function
                return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
            }
        }

        Instruction* ProcessIntrinsic(CallInst* pCallInst)
        {
            Function* pFunc = pCallInst->getCalledFunction();

            // Forward to the advanced support if found
            if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
            {
                return ProcessIntrinsicAdvanced(pCallInst);
            }

            SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(), "Unimplemented intrinsic %s.", pFunc->getName());

            Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
            Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);

            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                args.push_back(arg.get());
            }
            return B->CALLA(pX86IntrinFunc, args);
        }

        //////////////////////////////////////////////////////////////////////////
        /// @brief LLVM function pass run method.
        /// @param F - The function we're working on with this pass.
        virtual bool runOnFunction(Function& F)
        {
            std::vector<Instruction*> toRemove;
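            // Replaced calls are queued here and erased after the walk so the instruction
            // iterator is not invalidated mid-traversal.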

            for (auto& BB : F.getBasicBlockList())
            {
                for (auto& I : BB.getInstList())
                {
                    if (CallInst* pCallInst = dyn_cast<CallInst>(&I))
                    {
                        Function* pFunc = pCallInst->getCalledFunction();
                        if (pFunc)
                        {
                            if (pFunc->getName().startswith("meta.intrinsic"))
                            {
                                B->IRB()->SetInsertPoint(&I);
                                Instruction* pReplace = ProcessIntrinsic(pCallInst);
                                SWR_ASSERT(pReplace);
                                toRemove.push_back(pCallInst);
                                pCallInst->replaceAllUsesWith(pReplace);
                            }
                        }
                    }
                }
            }

            for (auto* pInst : toRemove)
            {
                pInst->eraseFromParent();
            }

            JitManager::DumpToFile(&F, "lowerx86");

            return true;
        }

        virtual void getAnalysisUsage(AnalysisUsage& AU) const
        {
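            // This pass requires no other analyses.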
        }

        JitManager* JM() { return mpJitMgr; }

        JitManager* mpJitMgr;
        Builder*    B;

        TargetArch mTarget;

        static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
    };

    char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.

    FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b)
    {
        return new LowerX86(pJitMgr, b);
    }

    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
        return nullptr;
    }

    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        // Only need vperm emulation for AVX
        SWR_ASSERT(arch == AVX);

        Builder* B = pThis->B;
        auto v32A = pCallInst->getArgOperand(0);
        auto vi32Index = pCallInst->getArgOperand(1);

        Value* v32Result;
        if (isa<Constant>(vi32Index))
        {
            // Can use llvm shuffle vector directly with constant shuffle indices
            v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
        }
        else
        {
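            // Variable indices: emulate the permute one lane at a time by extracting each
            // index and inserting the selected source element into the result.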
            v32Result = UndefValue::get(v32A->getType());
            for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
            {
                auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
                auto val = B->VEXTRACT(v32A, i32Index);
                v32Result = B->VINSERT(v32Result, val, B->C(l));
            }
        }
        return cast<Instruction>(v32Result);
    }

    Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        Builder* B = pThis->B;
        auto vSrc = pCallInst->getArgOperand(0);
        auto pBase = pCallInst->getArgOperand(1);
        auto vi32Indices = pCallInst->getArgOperand(2);
        auto vi1Mask = pCallInst->getArgOperand(3);
        auto i8Scale = pCallInst->getArgOperand(4);

        pBase = B->INT_TO_PTR(pBase, PointerType::get(B->mInt8Ty, 0));
        uint32_t numElem = vSrc->getType()->getVectorNumElements();
        auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
        auto srcTy = vSrc->getType()->getVectorElementType();
        Value* v32Gather;
        if (arch == AVX)
        {
            // Full emulation for AVX
            // Store source on stack to provide a valid address to load from inactive lanes
            auto pStack = B->STACKSAVE();
            auto pTmp = B->ALLOCA(vSrc->getType());
            B->STORE(vSrc, pTmp);

            v32Gather = UndefValue::get(vSrc->getType());
            auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
            auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);

            for (uint32_t i = 0; i < numElem; ++i)
            {
                auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
                auto pLoadAddress = B->GEP(pBase, i32Offset);
                pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
                auto pMaskedLoadAddress = B->GEP(pTmp, { 0, i });
                auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
                auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
                auto val = B->LOAD(pValidAddress);
                v32Gather = B->VINSERT(v32Gather, val, B->C(i));
            }

            B->STACKRESTORE(pStack);
        }
        else if (arch == AVX2 || (arch == AVX512 && width == W256))
        {
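            // Use the native AVX2 8-wide gather, selecting the float or dword variant from the
            // source element type. 16-wide requests are double pumped below.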
            Function* pX86IntrinFunc = srcTy == B->mFP32Ty ?
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_ps_256) :
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_d_256);
            if (width == W256)
            {
                auto v32Mask = B->BITCAST(B->VMASK(vi1Mask), vSrc->getType());
                v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, v32Mask, i8Scale });
            }
            else if (width == W512)
            {
                // Double pump 8-wide
                auto v32Mask = B->BITCAST(B->VMASK_16(vi1Mask), vSrc->getType());
                Value* src0 = B->EXTRACT_16(vSrc, 0);
                Value* src1 = B->EXTRACT_16(vSrc, 1);

                Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
                Value* indices1 = B->EXTRACT_16(vi32Indices, 1);

                Value* mask0 = B->EXTRACT_16(v32Mask, 0);
                Value* mask1 = B->EXTRACT_16(v32Mask, 1);

                Value* gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale });
                Value* gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale });

                v32Gather = B->JOIN_16(gather0, gather1);
            }
        }
        else if (arch == AVX512)
        {
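            // Native 16-wide AVX512 gather; the i1 lane mask is reinterpreted as a 16-bit k-mask.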
            auto i16Mask = B->BITCAST(vi1Mask, B->mInt16Ty);

            Function* pX86IntrinFunc = srcTy == B->mFP32Ty ?
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dps_512) :
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpi_512);
            v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, i16Mask, i32Scale });
        }

        return cast<Instruction>(v32Gather);
    }
}

using namespace SwrJit;

INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)