swr: [rasterizer] Slight assert refactoring
src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32
33 #include <cstdarg>
34
35 namespace SwrJit
36 {
37 void __cdecl CallPrint(const char* fmt, ...);
38
39 //////////////////////////////////////////////////////////////////////////
40 /// @brief Convert an IEEE 754 32-bit single precision float to an
41 /// IEEE 754 16-bit half-precision float with 5 exponent bits and
42 /// 10 mantissa bits.
43 /// @param val - 32-bit float
44 /// @todo Maybe move this outside of this file into a header?
45 static uint16_t Convert32To16Float(float val)
46 {
47 uint32_t sign, exp, mant;
48 uint32_t roundBits;
49
50 // Extract the sign, exponent, and mantissa
51 uint32_t uf = *(uint32_t*)&val;
52 sign = (uf & 0x80000000) >> 31;
53 exp = (uf & 0x7F800000) >> 23;
54 mant = uf & 0x007FFFFF;
55
56 // Check for out of range
57 if (std::isnan(val))
58 {
59 exp = 0x1F;
60 mant = 0x200;
61 sign = 1; // set the sign bit for NANs
62 }
63 else if (std::isinf(val))
64 {
65 exp = 0x1f;
66 mant = 0x0;
67 }
68 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
69 {
70 exp = 0x1E;
71 mant = 0x3FF;
72 }
73 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
74 {
75 mant |= 0x00800000;
76 for (; exp <= 0x70; mant >>= 1, exp++)
77 ;
78 exp = 0;
79 mant = mant >> 13;
80 }
81 else if (exp < 0x66) // Too small to represent -> Zero
82 {
83 exp = 0;
84 mant = 0;
85 }
86 else
87 {
88 // Saves bits that will be shifted off for rounding
89 roundBits = mant & 0x1FFFu;
90 // convert exponent and mantissa to 16 bit format
91 exp = exp - 0x70;
92 mant = mant >> 13;
93
94 // Essentially RTZ, but round up if off by only 1 lsb
95 if (roundBits == 0x1FFFu)
96 {
97 mant++;
98 // check for overflow
99 if ((mant & 0xC00u) != 0)
100 exp++;
101 // make sure only the needed bits are used
102 mant &= 0x3FF;
103 }
104 }
105
106 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
107 return (uint16_t)tmpVal;
108 }
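// Illustrative values for the conversion above (these follow directly from the
// IEEE 754 binary16 encoding; the variable names are hypothetical):
//
//     uint16_t one  = Convert32To16Float(1.0f);    // 0x3C00: sign 0, exp 15, mantissa 0
//     uint16_t half = Convert32To16Float(0.5f);    // 0x3800: sign 0, exp 14, mantissa 0
//     uint16_t big  = Convert32To16Float(1.0e9f);  // clamps to 0x7BFF, the largest finite half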
109
110 //////////////////////////////////////////////////////////////////////////
111 /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
112 /// float
113 /// @param val - 16-bit float
114 /// @todo Maybe move this outside of this file into a header?
115 static float ConvertSmallFloatTo32(UINT val)
116 {
117 UINT result;
118 if ((val & 0x7fff) == 0)
119 {
120 result = ((uint32_t)(val & 0x8000)) << 16;
121 }
122 else if ((val & 0x7c00) == 0x7c00)
123 {
124 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
125 result |= ((uint32_t)val & 0x8000) << 16;
126 }
127 else
128 {
129 uint32_t sign = (val & 0x8000) << 16;
130 uint32_t mant = (val & 0x3ff) << 13;
131 uint32_t exp = (val >> 10) & 0x1f;
132 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
133 {
134 mant <<= 1;
135 while (mant < (0x400 << 13))
136 {
137 exp--;
138 mant <<= 1;
139 }
140 mant &= (0x3ff << 13);
141 }
142 exp = ((exp - 15 + 127) & 0xff) << 23;
143 result = sign | exp | mant;
144 }
145
146 return *(float*)&result;
147 }
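// Round-trip sketch for the two helpers above (a hedged example; the bit
// patterns follow from the previous example):
//
//     float f = ConvertSmallFloatTo32(0x3C00);   // 1.0f
//     float g = ConvertSmallFloatTo32(0xC000);   // -2.0f (sign 1, exponent 16, mantissa 0)
//     // For normal values, Convert32To16Float(ConvertSmallFloatTo32(h)) == h.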
148
149 Constant *Builder::C(bool i)
150 {
151 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
152 }
153
154 Constant *Builder::C(char i)
155 {
156 return ConstantInt::get(IRB()->getInt8Ty(), i);
157 }
158
159 Constant *Builder::C(uint8_t i)
160 {
161 return ConstantInt::get(IRB()->getInt8Ty(), i);
162 }
163
164 Constant *Builder::C(int i)
165 {
166 return ConstantInt::get(IRB()->getInt32Ty(), i);
167 }
168
169 Constant *Builder::C(int64_t i)
170 {
171 return ConstantInt::get(IRB()->getInt64Ty(), i);
172 }
173
174 Constant *Builder::C(uint16_t i)
175 {
176 return ConstantInt::get(mInt16Ty,i);
177 }
178
179 Constant *Builder::C(uint32_t i)
180 {
181 return ConstantInt::get(IRB()->getInt32Ty(), i);
182 }
183
184 Constant *Builder::C(float i)
185 {
186 return ConstantFP::get(IRB()->getFloatTy(), i);
187 }
188
189 Constant *Builder::PRED(bool pred)
190 {
191 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
192 }
193
194 Value *Builder::VIMMED1(int i)
195 {
196 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
197 }
198
199 Value *Builder::VIMMED1(uint32_t i)
200 {
201 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
202 }
203
204 Value *Builder::VIMMED1(float i)
205 {
206 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
207 }
208
209 Value *Builder::VIMMED1(bool i)
210 {
211 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
212 }
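// Typical use of the constant helpers above from inside a Builder method (a
// hypothetical snippet; mVWidth is the SIMD width of the target):
//
//     Value* pLaneId  = C(3);            // scalar i32 constant 3
//     Value* vOne     = VIMMED1(1.0f);   // <mVWidth x float> splat of 1.0f
//     Value* vAllOnes = VIMMED1(-1);     // <mVWidth x i32> splat of 0xFFFFFFFF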
213
214 Value *Builder::VUNDEF_IPTR()
215 {
216 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
217 }
218
219 Value *Builder::VUNDEF_I()
220 {
221 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
222 }
223
224 Value *Builder::VUNDEF(Type *ty, uint32_t size)
225 {
226 return UndefValue::get(VectorType::get(ty, size));
227 }
228
229 Value *Builder::VUNDEF_F()
230 {
231 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
232 }
233
234 Value *Builder::VUNDEF(Type* t)
235 {
236 return UndefValue::get(VectorType::get(t, mVWidth));
237 }
238
239 #if HAVE_LLVM == 0x306
240 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
241 {
242 return VINSERT(vec, val, C((int64_t)index));
243 }
244 #endif
245
246 Value *Builder::VBROADCAST(Value *src)
247 {
248 // check if src is already a vector
249 if (src->getType()->isVectorTy())
250 {
251 return src;
252 }
253
254 return VECTOR_SPLAT(mVWidth, src);
255 }
256
257 uint32_t Builder::IMMED(Value* v)
258 {
259 SWR_ASSERT(isa<ConstantInt>(v));
260 ConstantInt *pValConst = cast<ConstantInt>(v);
261 return pValConst->getZExtValue();
262 }
263
264 int32_t Builder::S_IMMED(Value* v)
265 {
266 SWR_ASSERT(isa<ConstantInt>(v));
267 ConstantInt *pValConst = cast<ConstantInt>(v);
268 return pValConst->getSExtValue();
269 }
270
271 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
272 {
273 std::vector<Value*> indices;
274 for (auto i : indexList)
275 indices.push_back(i);
276 return GEPA(ptr, indices);
277 }
278
279 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
280 {
281 std::vector<Value*> indices;
282 for (auto i : indexList)
283 indices.push_back(C(i));
284 return GEPA(ptr, indices);
285 }
286
287 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
288 {
289 std::vector<Value*> valIndices;
290 for (auto i : indices)
291 valIndices.push_back(C(i));
292 return LOAD(GEPA(basePtr, valIndices), name);
293 }
294
295 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
296 {
297 std::vector<Value*> valIndices;
298 for (auto i : indices)
299 valIndices.push_back(i);
300 return LOAD(GEPA(basePtr, valIndices), name);
301 }
302
303 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
304 {
305 std::vector<Value*> valIndices;
306 for (auto i : indices)
307 valIndices.push_back(C(i));
308 return STORE(val, GEPA(basePtr, valIndices));
309 }
310
311 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
312 {
313 std::vector<Value*> valIndices;
314 for (auto i : indices)
315 valIndices.push_back(i);
316 return STORE(val, GEPA(basePtr, valIndices));
317 }
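// The initializer-list overloads above are shorthand for building a GEP and then
// loading or storing through it. A minimal sketch, assuming pState points at a
// struct whose member 1 is itself an aggregate:
//
//     Value* pField = GEP(pState, {0, 1, 2});    // address of pState->member1.member2
//     Value* v      = LOAD(pState, {0, 1, 2});   // load the same field
//     STORE(v, pState, {0, 1, 2});               // write it back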
318
319 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
320 {
321 std::vector<Value*> args;
322 for (auto arg : argsList)
323 args.push_back(arg);
324 return CALLA(Callee, args);
325 }
326
327 #if HAVE_LLVM > 0x306
328 CallInst *Builder::CALL(Value *Callee, Value* arg)
329 {
330 std::vector<Value*> args;
331 args.push_back(arg);
332 return CALLA(Callee, args);
333 }
334
335 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
336 {
337 std::vector<Value*> args;
338 args.push_back(arg1);
339 args.push_back(arg2);
340 return CALLA(Callee, args);
341 }
342
343 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
344 {
345 std::vector<Value*> args;
346 args.push_back(arg1);
347 args.push_back(arg2);
348 args.push_back(arg3);
349 return CALLA(Callee, args);
350 }
351 #endif
352
353 //////////////////////////////////////////////////////////////////////////
354 Value *Builder::DEBUGTRAP()
355 {
356 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
357 return CALL(func);
358 }
359
360 Value *Builder::VRCP(Value *va)
361 {
362 return FDIV(VIMMED1(1.0f), va); // 1 / a
363 }
364
365 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
366 {
367 Value* vOut = FMADDPS(vA, vX, vC);
368 vOut = FMADDPS(vB, vY, vOut);
369 return vOut;
370 }
371
372 //////////////////////////////////////////////////////////////////////////
373 /// @brief Generate an i32 masked load operation in LLVM IR. If not
374 /// supported on the underlying platform, emulate it with float masked load
375 /// @param src - base address pointer for the load
376 /// @param vMask - SIMD wide mask that controls whether to access memory or return 0 for the lane
377 Value *Builder::MASKLOADD(Value* src,Value* mask)
378 {
379 Value* vResult;
380 // use avx2 maskload instruction if available
381 if(JM()->mArch.AVX2())
382 {
383 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
384 vResult = CALL(func,{src,mask});
385 }
386 else
387 {
388 // maskload intrinsic expects integer mask operand in llvm >= 3.8
389 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
390 mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
391 #else
392 mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
393 #endif
394 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
395 vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
396 }
397 return vResult;
398 }
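// Example call (a sketch; pAddr and vMask are assumed to exist in the jitted
// function being built, with the per-lane sign bit set for active lanes):
//
//     Value* vData = MASKLOADD(pAddr, vMask);   // inactive lanes read back as 0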
399
400 //////////////////////////////////////////////////////////////////////////
401 /// @brief insert a JIT call to CallPrint
402 /// - outputs formatted string to both stdout and VS output window
403 /// - DEBUG builds only
404 /// Usage example:
405 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
406 /// where C(lane) creates a constant value to print, and pIndex is the Value*
407 /// result from a GEP, printing out the pointer to memory
408 /// @param printStr - constant string to print, which includes format specifiers
409 /// @param printArgs - initializer list of Value*'s to print to std out
410 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
411 {
412 // push the arguments to CallPrint into a vector
413 std::vector<Value*> printCallArgs;
414 // save room for the format string. we still need to modify it for vectors
415 printCallArgs.resize(1);
416
417 // search through the format string for special processing
418 size_t pos = 0;
419 std::string tempStr(printStr);
420 pos = tempStr.find('%', pos);
421 auto v = printArgs.begin();
422
423 while ((pos != std::string::npos) && (v != printArgs.end()))
424 {
425 Value* pArg = *v;
426 Type* pType = pArg->getType();
427
428 if (pType->isVectorTy())
429 {
430 Type* pContainedType = pType->getContainedType(0);
431
432 if (toupper(tempStr[pos + 1]) == 'X')
433 {
434 tempStr[pos] = '0';
435 tempStr[pos + 1] = 'x';
436 tempStr.insert(pos + 2, "%08X ");
437 pos += 7;
438
439 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
440
441 std::string vectorFormatStr;
442 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
443 {
444 vectorFormatStr += "0x%08X ";
445 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
446 }
447
448 tempStr.insert(pos, vectorFormatStr);
449 pos += vectorFormatStr.size();
450 }
451 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
452 {
453 uint32_t i = 0;
454 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
455 {
456 tempStr.insert(pos, std::string("%f "));
457 pos += 3;
458 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
459 }
460 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
461 }
462 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
463 {
464 uint32_t i = 0;
465 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
466 {
467 tempStr.insert(pos, std::string("%d "));
468 pos += 3;
469 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
470 }
471 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
472 }
473 }
474 else
475 {
476 if (toupper(tempStr[pos + 1]) == 'X')
477 {
478 tempStr[pos] = '0';
479 tempStr.insert(pos + 1, "x%08");
480 printCallArgs.push_back(pArg);
481 pos += 3;
482 }
483 // for %f we need to cast float Values to doubles so that they print out correctly
484 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
485 {
486 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
487 pos++;
488 }
489 else
490 {
491 printCallArgs.push_back(pArg);
492 }
493 }
494
495 // advance to the next argument
496 v++;
497 pos = tempStr.find('%', ++pos);
498 }
499
500 // create global variable constant string
501 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
502 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
503 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
504
505 // get a pointer to the first character in the constant string array
506 std::vector<Constant*> geplist{C(0),C(0)};
507 #if HAVE_LLVM == 0x306
508 Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
509 #else
510 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
511 #endif
512
513 // insert the pointer to the format string in the argument vector
514 printCallArgs[0] = strGEP;
515
516 // get pointer to CallPrint function and insert decl into the module if needed
517 std::vector<Type*> args;
518 args.push_back(PointerType::get(mInt8Ty,0));
519 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
520 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
521
522 // if we haven't yet added the symbol to the symbol table
523 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
524 {
525 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
526 }
527
528 // insert a call to CallPrint
529 return CALLA(callPrintFn,printCallArgs);
530 }
531
532 //////////////////////////////////////////////////////////////////////////
533 /// @brief Wrapper around PRINT with initializer list.
534 CallInst* Builder::PRINT(const std::string &printStr)
535 {
536 return PRINT(printStr, {});
537 }
538
539 //////////////////////////////////////////////////////////////////////////
540 /// @brief Generate a masked gather operation in LLVM IR. If not
541 /// supported on the underlying platform, emulate it with loads
542 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
543 /// @param pBase - Int8* base VB address pointer value
544 /// @param vIndices - SIMD wide value of VB byte offsets
545 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
546 /// @param scale - value to scale indices by
547 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
548 {
549 Value* vGather;
550
551 // use avx2 gather instruction if available
552 if(JM()->mArch.AVX2())
553 {
554 // force mask to <N x float>, required by vgather
555 vMask = BITCAST(vMask, mSimdFP32Ty);
556 vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
557 }
558 else
559 {
560 Value* pStack = STACKSAVE();
561
562 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
563 Value* vSrcPtr = ALLOCA(vSrc->getType());
564 STORE(vSrc, vSrcPtr);
565
566 vGather = VUNDEF_F();
567 Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
568 Value *vOffsets = MUL(vIndices,vScaleVec);
569 Value *mask = MASK(vMask);
570 for(uint32_t i = 0; i < mVWidth; ++i)
571 {
572 // single component byte index
573 Value *offset = VEXTRACT(vOffsets,C(i));
574 // byte pointer to component
575 Value *loadAddress = GEP(pBase,offset);
576 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
577 // pointer to the value to load if we're masking off a component
578 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
579 Value *selMask = VEXTRACT(mask,C(i));
580 // switch in a safe address to load from if this lane is masked off
581 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
582 Value *val = LOAD(validAddress);
583 vGather = VINSERT(vGather,val,C(i));
584 }
585 STACKRESTORE(pStack);
586 }
587
588 return vGather;
589 }
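// Example call (a sketch with hypothetical value names). Offsets are in bytes,
// so the usual pattern is precomputed byte offsets with a scale of 1:
//
//     Value* vDefaults = VIMMED1(0.0f);
//     Value* vResult   = GATHERPS(vDefaults, pBufferBase, vByteOffsets, vMask, C((char)1));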
590
591 //////////////////////////////////////////////////////////////////////////
592 /// @brief Generate a masked gather operation in LLVM IR. If not
593 /// supported on the underlying platform, emulate it with loads
594 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
595 /// @param pBase - Int8* base VB address pointer value
596 /// @param vIndices - SIMD wide value of VB byte offsets
597 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
598 /// @param scale - value to scale indices by
599 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
600 {
601 Value* vGather;
602
603 // use avx2 gather instruction if available
604 if(JM()->mArch.AVX2())
605 {
606 vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
607 }
608 else
609 {
610 Value* pStack = STACKSAVE();
611
612 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
613 Value* vSrcPtr = ALLOCA(vSrc->getType());
614 STORE(vSrc, vSrcPtr);
615
616 vGather = VUNDEF_I();
617 Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
618 Value *vOffsets = MUL(vIndices, vScaleVec);
619 Value *mask = MASK(vMask);
620 for(uint32_t i = 0; i < mVWidth; ++i)
621 {
622 // single component byte index
623 Value *offset = VEXTRACT(vOffsets, C(i));
624 // byte pointer to component
625 Value *loadAddress = GEP(pBase, offset);
626 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
627 // pointer to the value to load if we're masking off a component
628 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
629 Value *selMask = VEXTRACT(mask, C(i));
630 // switch in a safe address to load from if this lane is masked off
631 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
632 Value *val = LOAD(validAddress, C(0));
633 vGather = VINSERT(vGather, val, C(i));
634 }
635
636 STACKRESTORE(pStack);
637 }
638 return vGather;
639 }
640
641 //////////////////////////////////////////////////////////////////////////
642 /// @brief Generate a masked gather operation in LLVM IR. If not
643 /// supported on the underlying platform, emulate it with loads
644 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
645 /// @param pBase - Int8* base VB address pointer value
646 /// @param vIndices - SIMD wide value of VB byte offsets
647 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
648 /// @param scale - value to scale indices by
649 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
650 {
651 Value* vGather;
652
653 // use avx2 gather instruction if available
654 if(JM()->mArch.AVX2())
655 {
656 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, scale);
657 }
658 else
659 {
660 Value* pStack = STACKSAVE();
661
662 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
663 Value* vSrcPtr = ALLOCA(vSrc->getType());
664 STORE(vSrc, vSrcPtr);
665
666 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
667 Value *vScaleVec = VECTOR_SPLAT(4, Z_EXT(scale,mInt32Ty));
668 Value *vOffsets = MUL(vIndices,vScaleVec);
669 Value *mask = MASK(vMask);
670 for(uint32_t i = 0; i < mVWidth/2; ++i)
671 {
672 // single component byte index
673 Value *offset = VEXTRACT(vOffsets,C(i));
674 // byte pointer to component
675 Value *loadAddress = GEP(pBase,offset);
676 loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
677 // pointer to the value to load if we're masking off a component
678 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
679 Value *selMask = VEXTRACT(mask,C(i));
680 // switch in a safe address to load from if this lane is masked off
681 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
682 Value *val = LOAD(validAddress);
683 vGather = VINSERT(vGather,val,C(i));
684 }
685 STACKRESTORE(pStack);
686 }
687 return vGather;
688 }
689
690 //////////////////////////////////////////////////////////////////////////
691 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
692 Value* Builder::MASK(Value* vmask)
693 {
694 Value* src = BITCAST(vmask, mSimdInt32Ty);
695 return ICMP_SLT(src, VIMMED1(0));
696 }
697
698 //////////////////////////////////////////////////////////////////////////
699 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
700 Value* Builder::VMASK(Value* mask)
701 {
702 return S_EXT(mask, mSimdInt32Ty);
703 }
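// MASK and VMASK convert between the x86 convention (sign bit of each 32-bit
// lane marks an active lane) and LLVM's <N x i1> mask form. A minimal round-trip
// sketch:
//
//     Value* vCmp   = ICMP_SGT(vA, vB);   // <N x i1>
//     Value* vXmask = VMASK(vCmp);        // <N x i32>, 0xFFFFFFFF or 0 per lane
//     Value* vBack  = MASK(vXmask);       // back to <N x i1>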
704
705 //////////////////////////////////////////////////////////////////////////
706 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
707 /// supported on the underlying platform, emulate it
708 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
709 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
710 /// Byte masks in the lower 128-bit lane of b select 8-bit values from the
711 /// lower 128 bits of a, and likewise for the upper lanes. If a mask
712 /// value is negative, '0' is inserted.
713 Value *Builder::PSHUFB(Value* a, Value* b)
714 {
715 Value* res;
716 // use avx2 pshufb instruction if available
717 if(JM()->mArch.AVX2())
718 {
719 res = VPSHUFB(a, b);
720 }
721 else
722 {
723 Constant* cB = dyn_cast<Constant>(b);
724 // number of 8 bit elements in b
725 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
726 // output vector
727 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
728
729 // insert an 8 bit value from the high and low lanes of a per loop iteration
730 numElms /= 2;
731 for(uint32_t i = 0; i < numElms; i++)
732 {
733 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
734 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
735
736 // extract values from constant mask
737 char valLow128bLane = (char)(cLow128b->getSExtValue());
738 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
739
740 Value* insertValLow128b;
741 Value* insertValHigh128b;
742
743 // if the mask value is negative, insert a '0' in the respective output position
744 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
745 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
746 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
747
748 vShuf = VINSERT(vShuf, insertValLow128b, i);
749 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
750 }
751 res = vShuf;
752 }
753 return res;
754 }
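// A small illustration of the per-byte semantics (hypothetical mask values; the
// emulation path above requires b to be a constant vector):
//
//     Value* b = C<char>({1, 0, -1, 2, /* ... remaining 28 mask bytes ... */});
//     Value* r = PSHUFB(a, b);   // r[0]=a[1], r[1]=a[0], r[2]=0, r[3]=a[2]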
755
756 //////////////////////////////////////////////////////////////////////////
757 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8-bit values to 32
758 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
759 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
760 /// lower 8 values are used.
761 Value *Builder::PMOVSXBD(Value* a)
762 {
763 // llvm-3.9 removed the pmovsxbd intrinsic
764 #if HAVE_LLVM < 0x309
765 // use avx2 byte sign extend instruction if available
766 if(JM()->mArch.AVX2())
767 {
768 Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
769 return CALL(pmovsxbd, std::initializer_list<Value*>{a});
770 }
771 else
772 #endif
773 {
774 // VPMOVSXBD output type
775 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
776 // Extract 8 values from 128bit lane and sign extend
777 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
778 }
779 }
780
781 //////////////////////////////////////////////////////////////////////////
782 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16-bit values to 32
783 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
784 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
785 Value *Builder::PMOVSXWD(Value* a)
786 {
787 // llvm-3.9 removed the pmovsxwd intrinsic
788 #if HAVE_LLVM < 0x309
789 // use avx2 word sign extend if available
790 if(JM()->mArch.AVX2())
791 {
792 Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
793 return CALL(pmovsxwd, std::initializer_list<Value*>{a});
794 }
795 else
796 #endif
797 {
798 // VPMOVSXWD output type
799 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
800 // Extract 8 values from 128bit lane and sign extend
801 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
802 }
803 }
804
805 //////////////////////////////////////////////////////////////////////////
806 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
807 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
808 /// platform, emulate it
809 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
810 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
811 Value *Builder::PERMD(Value* a, Value* idx)
812 {
813 Value* res;
814 // use avx2 permute instruction if available
815 if(JM()->mArch.AVX2())
816 {
817 res = VPERMD(a, idx);
818 }
819 else
820 {
821 if (isa<Constant>(idx))
822 {
823 res = VSHUFFLE(a, a, idx);
824 }
825 else
826 {
827 res = VUNDEF_I();
828 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
829 {
830 Value* pIndex = VEXTRACT(idx, C(l));
831 Value* pVal = VEXTRACT(a, pIndex);
832 res = VINSERT(res, pVal, C(l));
833 }
834 }
835 }
836 return res;
837 }
838
839 //////////////////////////////////////////////////////////////////////////
840 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
841 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
842 /// platform, emulate it
843 /// @param a - 256bit SIMD lane(8x32bit) of float values.
844 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
845 Value *Builder::PERMPS(Value* a, Value* idx)
846 {
847 Value* res;
848 // use avx2 permute instruction if available
849 if (JM()->mArch.AVX2())
850 {
851 // llvm 3.6.0 swapped the order of the args to vpermd
852 res = VPERMPS(idx, a);
853 }
854 else
855 {
856 if (isa<Constant>(idx))
857 {
858 res = VSHUFFLE(a, a, idx);
859 }
860 else
861 {
862 res = VUNDEF_F();
863 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
864 {
865 Value* pIndex = VEXTRACT(idx, C(l));
866 Value* pVal = VEXTRACT(a, pIndex);
867 res = VINSERT(res, pVal, C(l));
868 }
869 }
870 }
871
872 return res;
873 }
874
875 //////////////////////////////////////////////////////////////////////////
876 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
877 /// in LLVM IR. If not supported on the underlying platform, emulate it
878 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
879 Value *Builder::CVTPH2PS(Value* a)
880 {
881 if (JM()->mArch.F16C())
882 {
883 return VCVTPH2PS(a);
884 }
885 else
886 {
887 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
888 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
889
890 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
891 {
892 sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
893 }
894
895 Value* pResult = UndefValue::get(mSimdFP32Ty);
896 for (uint32_t i = 0; i < mVWidth; ++i)
897 {
898 Value* pSrc = VEXTRACT(a, C(i));
899 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
900 pResult = VINSERT(pResult, pConv, C(i));
901 }
902
903 return pResult;
904 }
905 }
906
907 //////////////////////////////////////////////////////////////////////////
908 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
909 /// in LLVM IR. If not supported on the underlying platform, emulate it
910 /// @param a - SIMD lane of float32 values to convert to float16.
911 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
912 {
913 if (JM()->mArch.F16C())
914 {
915 return VCVTPS2PH(a, rounding);
916 }
917 else
918 {
919 // call scalar C function for now
920 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
921 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
922
923 if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
924 {
925 sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
926 }
927
928 Value* pResult = UndefValue::get(mSimdInt16Ty);
929 for (uint32_t i = 0; i < mVWidth; ++i)
930 {
931 Value* pSrc = VEXTRACT(a, C(i));
932 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
933 pResult = VINSERT(pResult, pConv, C(i));
934 }
935
936 return pResult;
937 }
938 }
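// Round-trip sketch for the two conversions above (hypothetical names; the
// rounding immediate is only consumed by the F16C hardware path, 0 = round to
// nearest even):
//
//     Value* vHalf = CVTPS2PH(vFloats, C(0));
//     Value* vF32  = CVTPH2PS(vHalf);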
939
940 Value *Builder::PMAXSD(Value* a, Value* b)
941 {
942 // llvm-3.9 removed the pmax intrinsics
943 #if HAVE_LLVM >= 0x309
944 Value* cmp = ICMP_SGT(a, b);
945 return SELECT(cmp, a, b);
946 #else
947 if (JM()->mArch.AVX2())
948 {
949 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
950 return CALL(pmaxsd, {a, b});
951 }
952 else
953 {
954 // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
955 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
956
957 // low 128
958 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
959 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
960 Value* resLo = CALL(pmaxsd, {aLo, bLo});
961
962 // high 128
963 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
964 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
965 Value* resHi = CALL(pmaxsd, {aHi, bHi});
966
967 // combine
968 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
969 result = VINSERTI128(result, resHi, C((uint8_t)1));
970
971 return result;
972 }
973 #endif
974 }
975
976 Value *Builder::PMINSD(Value* a, Value* b)
977 {
978 // llvm-3.9 removed the pmin intrinsics
979 #if HAVE_LLVM >= 0x309
980 Value* cmp = ICMP_SLT(a, b);
981 return SELECT(cmp, a, b);
982 #else
983 if (JM()->mArch.AVX2())
984 {
985 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
986 return CALL(pminsd, {a, b});
987 }
988 else
989 {
990 // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
991 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
992
993 // low 128
994 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
995 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
996 Value* resLo = CALL(pminsd, {aLo, bLo});
997
998 // high 128
999 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
1000 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
1001 Value* resHi = CALL(pminsd, {aHi, bHi});
1002
1003 // combine
1004 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
1005 result = VINSERTI128(result, resHi, C((uint8_t)1));
1006
1007 return result;
1008 }
1009 #endif
1010 }
1011
1012 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
1013 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1014 {
1015 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
1016 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
1017 {
1018 // ensure our mask is the correct type
1019 mask = BITCAST(mask, mSimdFP32Ty);
1020 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1021 }
1022 else
1023 {
1024 // ensure our mask is the correct type
1025 mask = BITCAST(mask, mSimdInt32Ty);
1026 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1027 }
1028 }
1029
1030 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1031 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1032 {
1033 switch(info.bpp / info.numComps)
1034 {
1035 case 16:
1036 {
1037 Value* vGatherResult[2];
1038 Value *vMask;
1039
1040 // TODO: vGatherMaskedVal
1041 Value* vGatherMaskedVal = VIMMED1((float)0);
1042
1043 // always have at least one component out of x or y to fetch
1044
1045 // save mask as it is zero'd out after each gather
1046 vMask = mask;
1047
1048 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1049 // e.g. result of first 8x32bit integer gather for 16bit components
1050 // 256i - 0 1 2 3 4 5 6 7
1051 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1052 //
1053
1054 // if we have at least one component out of z or w to fetch
1055 if(info.numComps > 2)
1056 {
1057 // offset base to the next components(zw) in the vertex to gather
1058 pSrcBase = GEP(pSrcBase, C((char)4));
1059 vMask = mask;
1060
1061 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1062 // e.g. result of second 8x32bit integer gather for 16bit components
1063 // 256i - 0 1 2 3 4 5 6 7
1064 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1065 //
1066 }
1067 else
1068 {
1069 vGatherResult[1] = vGatherMaskedVal;
1070 }
1071
1072 // Shuffle gathered components into place, each row is a component
1073 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1074 }
1075 break;
1076 case 32:
1077 {
1078 // apply defaults
1079 for (uint32_t i = 0; i < 4; ++i)
1080 {
1081 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1082 }
1083
1084 for(uint32_t i = 0; i < info.numComps; i++)
1085 {
1086 uint32_t swizzleIndex = info.swizzle[i];
1087
1088 // save mask as it is zero'd out after each gather
1089 Value *vMask = mask;
1090
1091 // Gather a SIMD of components
1092 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1093
1094 // offset base to the next component to gather
1095 pSrcBase = GEP(pSrcBase, C((char)4));
1096 }
1097 }
1098 break;
1099 default:
1100 SWR_INVALID("Invalid float format");
1101 break;
1102 }
1103 }
1104
1105 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1106 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1107 {
1108 switch (info.bpp / info.numComps)
1109 {
1110 case 8:
1111 {
1112 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1113 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
1114 // e.g. result of an 8x32bit integer gather for 8bit components
1115 // 256i - 0 1 2 3 4 5 6 7
1116 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1117
1118 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1119 }
1120 break;
1121 case 16:
1122 {
1123 Value* vGatherResult[2];
1124 Value *vMask;
1125
1126 // TODO: vGatherMaskedVal
1127 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1128
1129 // always have at least one component out of x or y to fetch
1130
1131 // save mask as it is zero'd out after each gather
1132 vMask = mask;
1133
1134 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1135 // e.g. result of first 8x32bit integer gather for 16bit components
1136 // 256i - 0 1 2 3 4 5 6 7
1137 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1138 //
1139
1140 // if we have at least one component out of z or w to fetch
1141 if(info.numComps > 2)
1142 {
1143 // offset base to the next components(zw) in the vertex to gather
1144 pSrcBase = GEP(pSrcBase, C((char)4));
1145 vMask = mask;
1146
1147 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1148 // e.g. result of second 8x32bit integer gather for 16bit components
1149 // 256i - 0 1 2 3 4 5 6 7
1150 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1151 //
1152 }
1153 else
1154 {
1155 vGatherResult[1] = vGatherMaskedVal;
1156 }
1157
1158 // Shuffle gathered components into place, each row is a component
1159 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1160
1161 }
1162 break;
1163 case 32:
1164 {
1165 // apply defaults
1166 for (uint32_t i = 0; i < 4; ++i)
1167 {
1168 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1169 }
1170
1171 for(uint32_t i = 0; i < info.numComps; i++)
1172 {
1173 uint32_t swizzleIndex = info.swizzle[i];
1174
1175 // save mask as it is zero'd out after each gather
1176 Value *vMask = mask;
1177
1178 // Gather a SIMD of components
1179 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1180
1181 // offset base to the next component to gather
1182 pSrcBase = GEP(pSrcBase, C((char)4));
1183 }
1184 }
1185 break;
1186 default:
1187 SWR_INVALID("unsupported format");
1188 break;
1189 }
1190 }
1191
1192 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1193 {
1194 // cast types
1195 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1196 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1197
1198 // input could either be float or int vector; do shuffle work in int
1199 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1200 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1201
1202 if(bPackedOutput)
1203 {
1204 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1205
1206 // shuffle mask
1207 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1208 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1209 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1210 // after pshufb: group components together in each 128bit lane
1211 // 256i - 0 1 2 3 4 5 6 7
1212 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1213
1214 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1215 // after PERMD: move and pack xy components into each 128bit lane
1216 // 256i - 0 1 2 3 4 5 6 7
1217 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1218
1219 // do the same for zw components
1220 Value* vi128ZW = nullptr;
1221 if(info.numComps > 2)
1222 {
1223 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1224 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1225 }
1226
1227 for(uint32_t i = 0; i < 4; i++)
1228 {
1229 uint32_t swizzleIndex = info.swizzle[i];
1230 // todo: fix for packed
1231 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1232 if(i >= info.numComps)
1233 {
1234 // set the default component val
1235 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1236 continue;
1237 }
1238
1239 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1240 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1241 // if x or y, use vi128XY permute result, else use vi128ZW
1242 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1243
1244 // extract packed component 128 bit lanes
1245 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1246 }
1247
1248 }
1249 else
1250 {
1251 // pshufb masks for each component
1252 Value* vConstMask[2];
1253 // x/z shuffle mask
1254 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1255 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1256
1257 // y/w shuffle mask
1258 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1259 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1260
1261
1262 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1263 // apply defaults
1264 for (uint32_t i = 0; i < 4; ++i)
1265 {
1266 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1267 }
1268
1269 for(uint32_t i = 0; i < info.numComps; i++)
1270 {
1271 uint32_t swizzleIndex = info.swizzle[i];
1272
1273 // select correct constMask for x/z or y/w pshufb
1274 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1275 // if x or y, use vi128XY permute result, else use vi128ZW
1276 uint32_t selectedGather = (i < 2) ? 0 : 1;
1277
1278 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1279 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1280 // 256i - 0 1 2 3 4 5 6 7
1281 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1282 }
1283 }
1284 }
1285
1286 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1287 {
1288 // cast types
1289 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1290 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1291
1292 if(bPackedOutput)
1293 {
1294 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1295 // shuffle mask
1296 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1297 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1298 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1299 // after pshufb: group components together in each 128bit lane
1300 // 256i - 0 1 2 3 4 5 6 7
1301 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1302
1303 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1304 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1305 // 256i - 0 1 2 3 4 5 6 7
1306 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1307
1308 // do the same for zw components
1309 Value* vi128ZW = nullptr;
1310 if(info.numComps > 2)
1311 {
1312 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1313 }
1314
1315 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1316 for(uint32_t i = 0; i < 4; i++)
1317 {
1318 uint32_t swizzleIndex = info.swizzle[i];
1319 // todo: fix for packed
1320 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1321 if(i >= info.numComps)
1322 {
1323 // set the default component val
1324 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1325 continue;
1326 }
1327
1328 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1329 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1330 // if x or y, use vi128XY permute result, else use vi128ZW
1331 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1332
1333 // sign extend
1334 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1335 }
1336 }
1337 // else zero extend
1338 else{
1339 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1340 // apply defaults
1341 for (uint32_t i = 0; i < 4; ++i)
1342 {
1343 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1344 }
1345
1346 for(uint32_t i = 0; i < info.numComps; i++){
1347 uint32_t swizzleIndex = info.swizzle[i];
1348
1349 // pshufb masks for each component
1350 Value* vConstMask;
1351 switch(i)
1352 {
1353 case 0:
1354 // x shuffle mask
1355 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1356 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1357 break;
1358 case 1:
1359 // y shuffle mask
1360 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1361 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1362 break;
1363 case 2:
1364 // z shuffle mask
1365 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1366 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1367 break;
1368 case 3:
1369 // w shuffle mask
1370 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1371 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1372 break;
1373 default:
1374 vConstMask = nullptr;
1375 break;
1376 }
1377
1378 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1379 // after pshufb for x channel
1380 // 256i - 0 1 2 3 4 5 6 7
1381 // x000 x000 x000 x000 x000 x000 x000 x000
1382 }
1383 }
1384 }
1385
1386 // Helper function to create alloca in entry block of function
1387 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1388 {
1389 auto saveIP = IRB()->saveIP();
1390 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1391 pFunc->getEntryBlock().begin());
1392 Value* pAlloca = ALLOCA(pType);
1393 IRB()->restoreIP(saveIP);
1394 return pAlloca;
1395 }
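// Allocas created in the entry block can be promoted to registers by mem2reg and
// are not re-created on every loop iteration; SCATTERPS below relies on this to
// reuse a single stack slot across all scatters in a shader. A hedged sketch:
//
//     Value* pTmp = CreateEntryAlloca(pFunc, mSimdInt32Ty);   // hoisted to function entry
//     STORE(vData, pTmp);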
1396
1397 //////////////////////////////////////////////////////////////////////////
1398 /// @brief emulates a scatter operation.
1399 /// @param pDst - pointer to destination
1400 /// @param vSrc - vector of src data to scatter
1401 /// @param vOffsets - vector of byte offsets from pDst
1402 /// @param vMask - mask of valid lanes
1403 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1404 {
1405 /* Scatter algorithm
1406
1407 while(Index = BitScanForward(mask))
1408 srcElem = srcVector[Index]
1409 offsetElem = offsetVector[Index]
1410 *(pDst + offsetElem) = srcElem
1411 Update mask (mask &= ~(1 << Index))
1412
1413 */
1414
1415 BasicBlock* pCurBB = IRB()->GetInsertBlock();
1416 Function* pFunc = pCurBB->getParent();
1417 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1418
1419 // Store vectors on stack
1420 if (pScatterStackSrc == nullptr)
1421 {
1422 // Save off stack allocations and reuse per scatter. Significantly reduces stack
1423 // requirements for shaders with a lot of scatters.
1424 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1425 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1426 }
1427
1428 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1429 Value* pOffsetsArrayPtr = pScatterStackOffsets;
1430 STORE(vSrc, pSrcArrayPtr);
1431 STORE(vOffsets, pOffsetsArrayPtr);
1432
1433 // Cast to pointers for random access
1434 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1435 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1436
1437 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1438
1439 // Get cttz function
1440 Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1441
1442 // Setup loop basic block
1443 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
1444
1445 // compute first set bit
1446 Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1447
1448 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1449
1450 // Split current block
1451 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1452
1453 // Remove unconditional jump created by splitBasicBlock
1454 pCurBB->getTerminator()->eraseFromParent();
1455
1456 // Add terminator to end of original block
1457 IRB()->SetInsertPoint(pCurBB);
1458
1459 // Add conditional branch
1460 COND_BR(pIsUndef, pPostLoop, pLoop);
1461
1462 // Add loop basic block contents
1463 IRB()->SetInsertPoint(pLoop);
1464 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1465 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1466
1467 pIndexPhi->addIncoming(pIndex, pCurBB);
1468 pMaskPhi->addIncoming(pMask, pCurBB);
1469
1470 // Extract elements for this index
1471 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1472 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1473
1474 // GEP to this offset in dst
1475 Value* pCurDst = GEP(pDst, pOffsetElem);
1476 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1477 STORE(pSrcElem, pCurDst);
1478
1479 // Update the mask
1480 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1481
1482 // Terminator
1483 Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1484
1485 pIsUndef = ICMP_EQ(pNewIndex, C(32));
1486 COND_BR(pIsUndef, pPostLoop, pLoop);
1487
1488 // Update phi edges
1489 pIndexPhi->addIncoming(pNewIndex, pLoop);
1490 pMaskPhi->addIncoming(pNewMask, pLoop);
1491
1492 // Move builder to beginning of post loop
1493 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1494 }
1495
1496 Value* Builder::VABSPS(Value* a)
1497 {
1498 Value* asInt = BITCAST(a, mSimdInt32Ty);
1499 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1500 return result;
1501 }
1502
1503 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1504 {
1505 Value *lowCmp = ICMP_SLT(src, low);
1506 Value *ret = SELECT(lowCmp, low, src);
1507
1508 Value *highCmp = ICMP_SGT(ret, high);
1509 ret = SELECT(highCmp, high, ret);
1510
1511 return ret;
1512 }
1513
1514 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1515 {
1516 Value *lowCmp = FCMP_OLT(src, low);
1517 Value *ret = SELECT(lowCmp, low, src);
1518
1519 Value *highCmp = FCMP_OGT(ret, high);
1520 ret = SELECT(highCmp, high, ret);
1521
1522 return ret;
1523 }
1524
1525 Value *Builder::FCLAMP(Value* src, float low, float high)
1526 {
1527 Value* result = VMAXPS(src, VIMMED1(low));
1528 result = VMINPS(result, VIMMED1(high));
1529
1530 return result;
1531 }
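// The float overload above maps directly onto vmaxps/vminps; aside from NaN
// handling it behaves like the compare-and-select version. Example (hypothetical
// value name):
//
//     Value* vSat = FCLAMP(vColor, 0.0f, 1.0f);   // per-lane saturate to [0, 1]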
1532
1533 //////////////////////////////////////////////////////////////////////////
1534 /// @brief save/restore stack, providing ability to push/pop the stack and
1535 /// reduce overall stack requirements for temporary stack use
1536 Value* Builder::STACKSAVE()
1537 {
1538 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1539 #if HAVE_LLVM == 0x306
1540 return CALL(pfnStackSave);
1541 #else
1542 return CALLA(pfnStackSave);
1543 #endif
1544 }
1545
1546 void Builder::STACKRESTORE(Value* pSaved)
1547 {
1548 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1549 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1550 }
1551
1552 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1553 {
1554 Value* vOut;
1555 // use FMADs if available
1556 if(JM()->mArch.AVX2())
1557 {
1558 vOut = VFMADDPS(a, b, c);
1559 }
1560 else
1561 {
1562 vOut = FADD(FMUL(a, b), c);
1563 }
1564 return vOut;
1565 }
1566
1567 Value* Builder::POPCNT(Value* a)
1568 {
1569 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1570 return CALL(pCtPop, std::initializer_list<Value*>{a});
1571 }
1572
1573 //////////////////////////////////////////////////////////////////////////
1574 /// @brief C functions called by LLVM IR
1575 //////////////////////////////////////////////////////////////////////////
1576
1577 //////////////////////////////////////////////////////////////////////////
1578 /// @brief called in JIT code, inserted by PRINT
1579 /// output to both stdout and visual studio debug console
1580 void __cdecl CallPrint(const char* fmt, ...)
1581 {
1582 va_list args;
1583 va_start(args, fmt);
1584 vprintf(fmt, args);
1585
1586 #if defined( _WIN32 )
1587 char strBuf[1024];
1588 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1589 OutputDebugString(strBuf);
1590 #endif
1591
1592 va_end(args);
1593 }
1594
1595 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1596 {
1597 #if HAVE_LLVM == 0x306
1598 Function *func =
1599 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1600 Intrinsic::x86_avx_vextractf128_si_256);
1601 return CALL(func, {a, imm8});
1602 #else
1603 bool flag = !imm8->isZeroValue();
1604 SmallVector<Constant*,8> idx;
1605 for (unsigned i = 0; i < mVWidth / 2; i++) {
1606 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1607 }
1608 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1609 #endif
1610 }
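// For the emulated path with mVWidth == 8 the shuffle indices are {0..3} when
// imm8 is 0 and {4..7} otherwise, i.e. the lower or upper 128-bit half. Usage
// sketch:
//
//     Value* vLo = VEXTRACTI128(vSrc, C((uint8_t)0));
//     Value* vHi = VEXTRACTI128(vSrc, C((uint8_t)1));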
1611
1612 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1613 {
1614 #if HAVE_LLVM == 0x306
1615 Function *func =
1616 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1617 Intrinsic::x86_avx_vinsertf128_si_256);
1618 return CALL(func, {a, b, imm8});
1619 #else
1620 bool flag = !imm8->isZeroValue();
1621 SmallVector<Constant*,8> idx;
1622 for (unsigned i = 0; i < mVWidth; i++) {
1623 idx.push_back(C(i));
1624 }
1625 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1626
1627 SmallVector<Constant*,8> idx2;
1628 for (unsigned i = 0; i < mVWidth / 2; i++) {
1629 idx2.push_back(C(flag ? i : i + mVWidth));
1630 }
1631 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1632 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1633 }
1634 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1635 #endif
1636 }
1637
1638 // rdtsc buckets macros
1639 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1640 {
1641 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1642 // buckets framework when single threaded
1643 if (KNOB_SINGLE_THREADED)
1644 {
1645 std::vector<Type*> args{
1646 PointerType::get(mInt32Ty, 0), // pBucketMgr
1647 mInt32Ty // id
1648 };
1649
1650 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1651 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1652 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1653 {
1654 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1655 }
1656
1657 CALL(pFunc, { pBucketMgr, pId });
1658 }
1659 }
1660
1661 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1662 {
1663 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1664 // buckets framework when single threaded
1665 if (KNOB_SINGLE_THREADED)
1666 {
1667 std::vector<Type*> args{
1668 PointerType::get(mInt32Ty, 0), // pBucketMgr
1669 mInt32Ty // id
1670 };
1671
1672 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1673 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1674 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1675 {
1676 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1677 }
1678
1679 CALL(pFunc, { pBucketMgr, pId });
1680 }
1681 }
1682
1683 }