swr/rast: stop using MSFT types in platform independent code
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / builder_misc.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32
33 #include <cstdarg>
34
35 namespace SwrJit
36 {
37 void __cdecl CallPrint(const char* fmt, ...);
38
39 //////////////////////////////////////////////////////////////////////////
40 /// @brief Convert an IEEE 754 32-bit single precision float to a
41 /// 16-bit float with 5 exponent bits and a variable
42 /// number of mantissa bits.
43 /// @param val - 32-bit float
44 /// @todo Maybe move this outside of this file into a header?
45 static uint16_t Convert32To16Float(float val)
46 {
47 uint32_t sign, exp, mant;
48 uint32_t roundBits;
49
50 // Extract the sign, exponent, and mantissa
51 uint32_t uf = *(uint32_t*)&val;
52 sign = (uf & 0x80000000) >> 31;
53 exp = (uf & 0x7F800000) >> 23;
54 mant = uf & 0x007FFFFF;
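// Note: 0x70 (112) is the difference between the float32 exponent bias (127) and
// the float16 bias (15); biased exponents above 0x70 + 0x1E overflow half precision,
// while exponents at or below 0x70 fall into the half denormal/zero range.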
55
56 // Check for out of range
57 if (std::isnan(val))
58 {
59 exp = 0x1F;
60 mant = 0x200;
61 sign = 1; // set the sign bit for NANs
62 }
63 else if (std::isinf(val))
64 {
65 exp = 0x1f;
66 mant = 0x0;
67 }
68 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
69 {
70 exp = 0x1E;
71 mant = 0x3FF;
72 }
73 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
74 {
75 mant |= 0x00800000;
76 for (; exp <= 0x70; mant >>= 1, exp++)
77 ;
78 exp = 0;
79 mant = mant >> 13;
80 }
81 else if (exp < 0x66) // Too small to represent -> Zero
82 {
83 exp = 0;
84 mant = 0;
85 }
86 else
87 {
88 // Saves bits that will be shifted off for rounding
89 roundBits = mant & 0x1FFFu;
90 // convert exponent and mantissa to 16 bit format
91 exp = exp - 0x70;
92 mant = mant >> 13;
93
94 // Essentially RTZ, but round up if off by only 1 lsb
95 if (roundBits == 0x1FFFu)
96 {
97 mant++;
98 // check for overflow
99 if ((mant & 0xC00u) != 0)
100 exp++;
101 // make sure only the needed bits are used
102 mant &= 0x3FF;
103 }
104 }
105
106 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
107 return (uint16_t)tmpVal;
108 }
109
110 //////////////////////////////////////////////////////////////////////////
111 /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
112 /// float
113 /// @param val - 16-bit float
114 /// @todo Maybe move this outside of this file into a header?
115 static float ConvertSmallFloatTo32(uint32_t val)
116 {
117 uint32_t result;
118 if ((val & 0x7fff) == 0)
119 {
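// +/- zero: only the sign bit carries over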
120 result = ((uint32_t)(val & 0x8000)) << 16;
121 }
122 else if ((val & 0x7c00) == 0x7c00)
123 {
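// Inf or NaN (all exponent bits set): NaN payloads map to a canonical quiet NaN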
124 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
125 result |= ((uint32_t)val & 0x8000) << 16;
126 }
127 else
128 {
129 uint32_t sign = (val & 0x8000) << 16;
130 uint32_t mant = (val & 0x3ff) << 13;
131 uint32_t exp = (val >> 10) & 0x1f;
132 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
133 {
134 mant <<= 1;
135 while (mant < (0x400 << 13))
136 {
137 exp--;
138 mant <<= 1;
139 }
140 mant &= (0x3ff << 13);
141 }
142 exp = ((exp - 15 + 127) & 0xff) << 23;
143 result = sign | exp | mant;
144 }
145
146 return *(float*)&result;
147 }
148
149 Constant *Builder::C(bool i)
150 {
151 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
152 }
153
154 Constant *Builder::C(char i)
155 {
156 return ConstantInt::get(IRB()->getInt8Ty(), i);
157 }
158
159 Constant *Builder::C(uint8_t i)
160 {
161 return ConstantInt::get(IRB()->getInt8Ty(), i);
162 }
163
164 Constant *Builder::C(int i)
165 {
166 return ConstantInt::get(IRB()->getInt32Ty(), i);
167 }
168
169 Constant *Builder::C(int64_t i)
170 {
171 return ConstantInt::get(IRB()->getInt64Ty(), i);
172 }
173
174 Constant *Builder::C(uint16_t i)
175 {
176 return ConstantInt::get(mInt16Ty,i);
177 }
178
179 Constant *Builder::C(uint32_t i)
180 {
181 return ConstantInt::get(IRB()->getInt32Ty(), i);
182 }
183
184 Constant *Builder::C(float i)
185 {
186 return ConstantFP::get(IRB()->getFloatTy(), i);
187 }
188
189 Constant *Builder::PRED(bool pred)
190 {
191 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
192 }
193
194 Value *Builder::VIMMED1(int i)
195 {
196 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
197 }
198
199 Value *Builder::VIMMED1(uint32_t i)
200 {
201 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
202 }
203
204 Value *Builder::VIMMED1(float i)
205 {
206 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
207 }
208
209 Value *Builder::VIMMED1(bool i)
210 {
211 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
212 }
213
214 Value *Builder::VUNDEF_IPTR()
215 {
216 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
217 }
218
219 Value *Builder::VUNDEF_I()
220 {
221 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
222 }
223
224 Value *Builder::VUNDEF(Type *ty, uint32_t size)
225 {
226 return UndefValue::get(VectorType::get(ty, size));
227 }
228
229 Value *Builder::VUNDEF_F()
230 {
231 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
232 }
233
234 Value *Builder::VUNDEF(Type* t)
235 {
236 return UndefValue::get(VectorType::get(t, mVWidth));
237 }
238
239 Value *Builder::VBROADCAST(Value *src)
240 {
241 // check if src is already a vector
242 if (src->getType()->isVectorTy())
243 {
244 return src;
245 }
246
247 return VECTOR_SPLAT(mVWidth, src);
248 }
249
250 uint32_t Builder::IMMED(Value* v)
251 {
252 SWR_ASSERT(isa<ConstantInt>(v));
253 ConstantInt *pValConst = cast<ConstantInt>(v);
254 return pValConst->getZExtValue();
255 }
256
257 int32_t Builder::S_IMMED(Value* v)
258 {
259 SWR_ASSERT(isa<ConstantInt>(v));
260 ConstantInt *pValConst = cast<ConstantInt>(v);
261 return pValConst->getSExtValue();
262 }
263
264 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
265 {
266 std::vector<Value*> indices;
267 for (auto i : indexList)
268 indices.push_back(i);
269 return GEPA(ptr, indices);
270 }
271
272 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
273 {
274 std::vector<Value*> indices;
275 for (auto i : indexList)
276 indices.push_back(C(i));
277 return GEPA(ptr, indices);
278 }
279
280 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
281 {
282 std::vector<Value*> indices;
283 for (auto i : indexList)
284 indices.push_back(i);
285 return IN_BOUNDS_GEP(ptr, indices);
286 }
287
288 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
289 {
290 std::vector<Value*> indices;
291 for (auto i : indexList)
292 indices.push_back(C(i));
293 return IN_BOUNDS_GEP(ptr, indices);
294 }
295
296 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
297 {
298 std::vector<Value*> valIndices;
299 for (auto i : indices)
300 valIndices.push_back(C(i));
301 return LOAD(GEPA(basePtr, valIndices), name);
302 }
303
304 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
305 {
306 std::vector<Value*> valIndices;
307 for (auto i : indices)
308 valIndices.push_back(i);
309 return LOAD(GEPA(basePtr, valIndices), name);
310 }
311
312 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
313 {
314 std::vector<Value*> valIndices;
315 for (auto i : indices)
316 valIndices.push_back(C(i));
317 return STORE(val, GEPA(basePtr, valIndices));
318 }
319
320 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
321 {
322 std::vector<Value*> valIndices;
323 for (auto i : indices)
324 valIndices.push_back(i);
325 return STORE(val, GEPA(basePtr, valIndices));
326 }
327
328 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
329 {
330 std::vector<Value*> args;
331 for (auto arg : argsList)
332 args.push_back(arg);
333 return CALLA(Callee, args);
334 }
335
336 CallInst *Builder::CALL(Value *Callee, Value* arg)
337 {
338 std::vector<Value*> args;
339 args.push_back(arg);
340 return CALLA(Callee, args);
341 }
342
343 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
344 {
345 std::vector<Value*> args;
346 args.push_back(arg1);
347 args.push_back(arg2);
348 return CALLA(Callee, args);
349 }
350
351 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
352 {
353 std::vector<Value*> args;
354 args.push_back(arg1);
355 args.push_back(arg2);
356 args.push_back(arg3);
357 return CALLA(Callee, args);
358 }
359
360 //////////////////////////////////////////////////////////////////////////
361 Value *Builder::DEBUGTRAP()
362 {
363 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
364 return CALL(func);
365 }
366
367 Value *Builder::VRCP(Value *va)
368 {
369 return FDIV(VIMMED1(1.0f), va); // 1 / a
370 }
371
372 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
373 {
374 Value* vOut = FMADDPS(vA, vX, vC);
375 vOut = FMADDPS(vB, vY, vOut);
376 return vOut;
377 }
378
379 //////////////////////////////////////////////////////////////////////////
380 /// @brief Generate an i32 masked load operation in LLVM IR. If not
381 /// supported on the underlying platform, emulate it with float masked load
382 /// @param src - base address pointer for the load
383 /// @param vMask - SIMD wide mask that controls whether to access memory; masked-off lanes load 0
384 Value *Builder::MASKLOADD(Value* src,Value* mask)
385 {
386 Value* vResult;
387 // use avx2 maskload instruction if available
388 if(JM()->mArch.AVX2())
389 {
390 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
391 vResult = CALL(func,{src,mask});
392 }
393 else
394 {
395 // maskload intrinsic expects integer mask operand in llvm >= 3.8
396 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
397 mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
398 #else
399 mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
400 #endif
401 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
402 vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
403 }
404 return vResult;
405 }
406
407 //////////////////////////////////////////////////////////////////////////
408 /// @brief insert a JIT call to CallPrint
409 /// - outputs formatted string to both stdout and VS output window
410 /// - DEBUG builds only
411 /// Usage example:
412 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
413 /// where C(lane) creates a constant value to print, and pIndex is the Value*
414 /// result from a GEP, printing out the pointer to memory
415 /// @param printStr - constant string to print, which includes format specifiers
416 /// @param printArgs - initializer list of Value*'s to print to std out
417 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
418 {
419 // push the arguments to CallPrint into a vector
420 std::vector<Value*> printCallArgs;
421 // save room for the format string. we still need to modify it for vectors
422 printCallArgs.resize(1);
423
424 // search through the format string for special processing
425 size_t pos = 0;
426 std::string tempStr(printStr);
427 pos = tempStr.find('%', pos);
428 auto v = printArgs.begin();
429
430 while ((pos != std::string::npos) && (v != printArgs.end()))
431 {
432 Value* pArg = *v;
433 Type* pType = pArg->getType();
434
435 if (pType->isVectorTy())
436 {
437 Type* pContainedType = pType->getContainedType(0);
438
439 if (toupper(tempStr[pos + 1]) == 'X')
440 {
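// rewrite the single %x specifier as one "0x%08X " per vector lane and push each
// extracted lane as a separate argument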
441 tempStr[pos] = '0';
442 tempStr[pos + 1] = 'x';
443 tempStr.insert(pos + 2, "%08X ");
444 pos += 7;
445
446 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
447
448 std::string vectorFormatStr;
449 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
450 {
451 vectorFormatStr += "0x%08X ";
452 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
453 }
454
455 tempStr.insert(pos, vectorFormatStr);
456 pos += vectorFormatStr.size();
457 }
458 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
459 {
460 uint32_t i = 0;
461 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
462 {
463 tempStr.insert(pos, std::string("%f "));
464 pos += 3;
465 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
466 }
467 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
468 }
469 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
470 {
471 uint32_t i = 0;
472 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
473 {
474 tempStr.insert(pos, std::string("%d "));
475 pos += 3;
476 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
477 }
478 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
479 }
480 }
481 else
482 {
483 if (toupper(tempStr[pos + 1]) == 'X')
484 {
485 tempStr[pos] = '0';
486 tempStr.insert(pos + 1, "x%08");
487 printCallArgs.push_back(pArg);
488 pos += 3;
489 }
490 // for %f we need to cast float Values to doubles so that they print out correctly
491 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
492 {
493 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
494 pos++;
495 }
496 else
497 {
498 printCallArgs.push_back(pArg);
499 }
500 }
501
502 // advance to the next argument
503 v++;
504 pos = tempStr.find('%', ++pos);
505 }
506
507 // create global variable constant string
508 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
509 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
510 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
511
512 // get a pointer to the first character in the constant string array
513 std::vector<Constant*> geplist{C(0),C(0)};
514 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
515
516 // insert the pointer to the format string in the argument vector
517 printCallArgs[0] = strGEP;
518
519 // get pointer to CallPrint function and insert decl into the module if needed
520 std::vector<Type*> args;
521 args.push_back(PointerType::get(mInt8Ty,0));
522 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
523 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
524
525 // if we haven't yet added the symbol to the symbol table
526 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
527 {
528 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
529 }
530
531 // insert a call to CallPrint
532 return CALLA(callPrintFn,printCallArgs);
533 }
534
535 //////////////////////////////////////////////////////////////////////////
536 /// @brief Wrapper around PRINT with initializer list.
537 CallInst* Builder::PRINT(const std::string &printStr)
538 {
539 return PRINT(printStr, {});
540 }
541
542 //////////////////////////////////////////////////////////////////////////
543 /// @brief Generate a masked gather operation in LLVM IR. If not
544 /// supported on the underlying platform, emulate it with loads
545 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
546 /// @param pBase - Int8* base VB address pointer value
547 /// @param vIndices - SIMD wide value of VB byte offsets
548 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
549 /// @param scale - value to scale indices by
550 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
551 {
552 Value* vGather;
553
554 // use avx2 gather instruction if available
555 if(JM()->mArch.AVX2())
556 {
557 // force mask to <N x float>, required by vgather
558 vMask = BITCAST(vMask, mSimdFP32Ty);
559 vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
560 }
561 else
562 {
563 Value* pStack = STACKSAVE();
564
565 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
566 Value* vSrcPtr = ALLOCA(vSrc->getType());
567 STORE(vSrc, vSrcPtr);
568
569 vGather = VUNDEF_F();
570 Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
571 Value *vOffsets = MUL(vIndices,vScaleVec);
572 Value *mask = MASK(vMask);
573 for(uint32_t i = 0; i < mVWidth; ++i)
574 {
575 // single component byte index
576 Value *offset = VEXTRACT(vOffsets,C(i));
577 // byte pointer to component
578 Value *loadAddress = GEP(pBase,offset);
579 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
580 // pointer to the value to load if we're masking off a component
581 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
582 Value *selMask = VEXTRACT(mask,C(i));
583 // switch in a safe address to load from if this lane is masked off
584 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
585 Value *val = LOAD(validAddress);
586 vGather = VINSERT(vGather,val,C(i));
587 }
588 STACKRESTORE(pStack);
589 }
590
591 return vGather;
592 }
593
594 //////////////////////////////////////////////////////////////////////////
595 /// @brief Generate a masked gather operation in LLVM IR. If not
596 /// supported on the underlying platform, emulate it with loads
597 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
598 /// @param pBase - Int8* base VB address pointer value
599 /// @param vIndices - SIMD wide value of VB byte offsets
600 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
601 /// @param scale - value to scale indices by
602 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
603 {
604 Value* vGather;
605
606 // use avx2 gather instruction if available
607 if(JM()->mArch.AVX2())
608 {
609 vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
610 }
611 else
612 {
613 Value* pStack = STACKSAVE();
614
615 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
616 Value* vSrcPtr = ALLOCA(vSrc->getType());
617 STORE(vSrc, vSrcPtr);
618
619 vGather = VUNDEF_I();
620 Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
621 Value *vOffsets = MUL(vIndices, vScaleVec);
622 Value *mask = MASK(vMask);
623 for(uint32_t i = 0; i < mVWidth; ++i)
624 {
625 // single component byte index
626 Value *offset = VEXTRACT(vOffsets, C(i));
627 // byte pointer to component
628 Value *loadAddress = GEP(pBase, offset);
629 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
630 // pointer to the value to load if we're masking off a component
631 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
632 Value *selMask = VEXTRACT(mask, C(i));
633 // switch in a safe address to load from if this lane is masked off
634 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
635 Value *val = LOAD(validAddress, C(0));
636 vGather = VINSERT(vGather, val, C(i));
637 }
638
639 STACKRESTORE(pStack);
640 }
641 return vGather;
642 }
643
644 //////////////////////////////////////////////////////////////////////////
645 /// @brief Generate a masked gather operation in LLVM IR. If not
646 /// supported on the underlying platform, emulate it with loads
647 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
648 /// @param pBase - Int8* base VB address pointer value
649 /// @param vIndices - SIMD wide value of VB byte offsets
650 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
651 /// @param scale - value to scale indices by
652 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
653 {
654 Value* vGather;
655
656 // use avx2 gather instruction if available
657 if(JM()->mArch.AVX2())
658 {
659 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, scale);
660 }
661 else
662 {
663 Value* pStack = STACKSAVE();
664
665 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
666 Value* vSrcPtr = ALLOCA(vSrc->getType());
667 STORE(vSrc, vSrcPtr);
668
669 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
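// doubles are twice as wide, so the emulated gather fills only mVWidth/2 lanes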
670 Value *vScaleVec = VECTOR_SPLAT(4, Z_EXT(scale,mInt32Ty));
671 Value *vOffsets = MUL(vIndices,vScaleVec);
672 Value *mask = MASK(vMask);
673 for(uint32_t i = 0; i < mVWidth/2; ++i)
674 {
675 // single component byte index
676 Value *offset = VEXTRACT(vOffsets,C(i));
677 // byte pointer to component
678 Value *loadAddress = GEP(pBase,offset);
679 loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
680 // pointer to the value to load if we're masking off a component
681 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
682 Value *selMask = VEXTRACT(mask,C(i));
683 // switch in a safe address to load from if this lane is masked off
684 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
685 Value *val = LOAD(validAddress);
686 vGather = VINSERT(vGather,val,C(i));
687 }
688 STACKRESTORE(pStack);
689 }
690 return vGather;
691 }
692
693 //////////////////////////////////////////////////////////////////////////
694 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
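/// (an x86 vector mask encodes 'true' as the sign bit of each lane, so a signed
/// compare of the lanes against zero recovers the <N x i1> form)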
695 Value* Builder::MASK(Value* vmask)
696 {
697 Value* src = BITCAST(vmask, mSimdInt32Ty);
698 return ICMP_SLT(src, VIMMED1(0));
699 }
700
701 //////////////////////////////////////////////////////////////////////////
702 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
703 Value* Builder::VMASK(Value* mask)
704 {
705 return S_EXT(mask, mSimdInt32Ty);
706 }
707
708 //////////////////////////////////////////////////////////////////////////
709 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
710 /// supported on the underlying platform, emulate it
711 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
712 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
713 /// Byte masks in the lower 128-bit lane of b select 8-bit values from the lower
714 /// 128 bits of a, and vice versa for the upper lanes. If a mask
715 /// value is negative, '0' is inserted.
716 Value *Builder::PSHUFB(Value* a, Value* b)
717 {
718 Value* res;
719 // use avx2 pshufb instruction if available
720 if(JM()->mArch.AVX2())
721 {
722 res = VPSHUFB(a, b);
723 }
724 else
725 {
726 Constant* cB = dyn_cast<Constant>(b);
727 // number of 8 bit elements in b
728 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
729 // output vector
730 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
731
732 // insert an 8 bit value from the high and low lanes of a per loop iteration
733 numElms /= 2;
734 for(uint32_t i = 0; i < numElms; i++)
735 {
736 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
737 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
738
739 // extract values from constant mask
740 char valLow128bLane = (char)(cLow128b->getSExtValue());
741 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
742
743 Value* insertValLow128b;
744 Value* insertValHigh128b;
745
746 // if the mask value is negative, insert a '0' in the respective output position
747 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
748 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
749 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
750
751 vShuf = VINSERT(vShuf, insertValLow128b, i);
752 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
753 }
754 res = vShuf;
755 }
756 return res;
757 }
758
759 //////////////////////////////////////////////////////////////////////////
760 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8-bit values to 32
761 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
762 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
763 /// lower 8 values are used.
764 Value *Builder::PMOVSXBD(Value* a)
765 {
766 // llvm-3.9 removed the pmovsxbd intrinsic
767 #if HAVE_LLVM < 0x309
768 // use avx2 byte sign extend instruction if available
769 if(JM()->mArch.AVX2())
770 {
771 Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
772 return CALL(pmovsxbd, std::initializer_list<Value*>{a});
773 }
774 else
775 #endif
776 {
777 // VPMOVSXBD output type
778 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
779 // Extract 8 values from 128bit lane and sign extend
780 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
781 }
782 }
783
784 //////////////////////////////////////////////////////////////////////////
785 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16-bit values to 32
786 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
787 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
788 Value *Builder::PMOVSXWD(Value* a)
789 {
790 // llvm-3.9 removed the pmovsxwd intrinsic
791 #if HAVE_LLVM < 0x309
792 // use avx2 word sign extend if available
793 if(JM()->mArch.AVX2())
794 {
795 Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
796 return CALL(pmovsxwd, std::initializer_list<Value*>{a});
797 }
798 else
799 #endif
800 {
801 // VPMOVSXWD output type
802 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
803 // Extract 8 values from 128bit lane and sign extend
804 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
805 }
806 }
807
808 //////////////////////////////////////////////////////////////////////////
809 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
810 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
811 /// platform, emulate it
812 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
813 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
814 Value *Builder::PERMD(Value* a, Value* idx)
815 {
816 Value* res;
817 // use avx2 permute instruction if available
818 if(JM()->mArch.AVX2())
819 {
820 res = VPERMD(a, idx);
821 }
822 else
823 {
824 if (isa<Constant>(idx))
825 {
826 res = VSHUFFLE(a, a, idx);
827 }
828 else
829 {
830 res = VUNDEF_I();
831 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
832 {
833 Value* pIndex = VEXTRACT(idx, C(l));
834 Value* pVal = VEXTRACT(a, pIndex);
835 res = VINSERT(res, pVal, C(l));
836 }
837 }
838 }
839 return res;
840 }
841
842 //////////////////////////////////////////////////////////////////////////
843 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
844 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
845 /// platform, emulate it
846 /// @param a - 256bit SIMD lane(8x32bit) of float values.
847 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
848 Value *Builder::PERMPS(Value* a, Value* idx)
849 {
850 Value* res;
851 // use avx2 permute instruction if available
852 if (JM()->mArch.AVX2())
853 {
854 // llvm 3.6.0 swapped the order of the args to vpermps
855 res = VPERMPS(idx, a);
856 }
857 else
858 {
859 if (isa<Constant>(idx))
860 {
861 res = VSHUFFLE(a, a, idx);
862 }
863 else
864 {
865 res = VUNDEF_F();
866 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
867 {
868 Value* pIndex = VEXTRACT(idx, C(l));
869 Value* pVal = VEXTRACT(a, pIndex);
870 res = VINSERT(res, pVal, C(l));
871 }
872 }
873 }
874
875 return res;
876 }
877
878 //////////////////////////////////////////////////////////////////////////
879 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
880 /// in LLVM IR. If not supported on the underlying platform, emulate it
881 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
882 Value *Builder::CVTPH2PS(Value* a)
883 {
884 if (JM()->mArch.F16C())
885 {
886 return VCVTPH2PS(a);
887 }
888 else
889 {
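// no F16C support; call the scalar conversion helper once per lane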
890 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
891 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
892
893 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
894 {
895 sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
896 }
897
898 Value* pResult = UndefValue::get(mSimdFP32Ty);
899 for (uint32_t i = 0; i < mVWidth; ++i)
900 {
901 Value* pSrc = VEXTRACT(a, C(i));
902 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
903 pResult = VINSERT(pResult, pConv, C(i));
904 }
905
906 return pResult;
907 }
908 }
909
910 //////////////////////////////////////////////////////////////////////////
911 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
912 /// in LLVM IR. If not supported on the underlying platform, emulate it
913 /// @param a - 256bit SIMD lane(8x32bit) of float32 values.
914 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
915 {
916 if (JM()->mArch.F16C())
917 {
918 return VCVTPS2PH(a, rounding);
919 }
920 else
921 {
922 // call scalar C function for now
923 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
924 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
925
926 if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
927 {
928 sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
929 }
930
931 Value* pResult = UndefValue::get(mSimdInt16Ty);
932 for (uint32_t i = 0; i < mVWidth; ++i)
933 {
934 Value* pSrc = VEXTRACT(a, C(i));
935 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
936 pResult = VINSERT(pResult, pConv, C(i));
937 }
938
939 return pResult;
940 }
941 }
942
943 Value *Builder::PMAXSD(Value* a, Value* b)
944 {
945 // llvm-3.9 removed the pmax intrinsics
946 #if HAVE_LLVM >= 0x309
947 Value* cmp = ICMP_SGT(a, b);
948 return SELECT(cmp, a, b);
949 #else
950 if (JM()->mArch.AVX2())
951 {
952 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
953 return CALL(pmaxsd, {a, b});
954 }
955 else
956 {
957 // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
958 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
959
960 // low 128
961 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
962 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
963 Value* resLo = CALL(pmaxsd, {aLo, bLo});
964
965 // high 128
966 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
967 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
968 Value* resHi = CALL(pmaxsd, {aHi, bHi});
969
970 // combine
971 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
972 result = VINSERTI128(result, resHi, C((uint8_t)1));
973
974 return result;
975 }
976 #endif
977 }
978
979 Value *Builder::PMINSD(Value* a, Value* b)
980 {
981 // llvm-3.9 removed the pmin intrinsics
982 #if HAVE_LLVM >= 0x309
983 Value* cmp = ICMP_SLT(a, b);
984 return SELECT(cmp, a, b);
985 #else
986 if (JM()->mArch.AVX2())
987 {
988 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
989 return CALL(pminsd, {a, b});
990 }
991 else
992 {
993 // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
994 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
995
996 // low 128
997 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
998 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
999 Value* resLo = CALL(pminsd, {aLo, bLo});
1000
1001 // high 128
1002 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
1003 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
1004 Value* resHi = CALL(pminsd, {aHi, bHi});
1005
1006 // combine
1007 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
1008 result = VINSERTI128(result, resHi, C((uint8_t)1));
1009
1010 return result;
1011 }
1012 #endif
1013 }
1014
1015 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
1016 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1017 {
1018 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
1019 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
1020 {
1021 // ensure our mask is the correct type
1022 mask = BITCAST(mask, mSimdFP32Ty);
1023 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1024 }
1025 else
1026 {
1027 // ensure our mask is the correct type
1028 mask = BITCAST(mask, mSimdInt32Ty);
1029 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1030 }
1031 }
1032
1033 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1034 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1035 {
1036 switch(info.bpp / info.numComps)
1037 {
1038 case 16:
1039 {
1040 Value* vGatherResult[2];
1041 Value *vMask;
1042
1043 // TODO: vGatherMaskedVal
1044 Value* vGatherMaskedVal = VIMMED1((float)0);
1045
1046 // always have at least one component out of x or y to fetch
1047
1048 // save mask as it is zero'd out after each gather
1049 vMask = mask;
1050
1051 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1052 // e.g. result of first 8x32bit integer gather for 16bit components
1053 // 256i - 0 1 2 3 4 5 6 7
1054 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1055 //
1056
1057 // if we have at least one component out of z or w to fetch
1058 if(info.numComps > 2)
1059 {
1060 // offset base to the next components(zw) in the vertex to gather
1061 pSrcBase = GEP(pSrcBase, C((char)4));
1062 vMask = mask;
1063
1064 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1065 // e.g. result of second 8x32bit integer gather for 16bit components
1066 // 256i - 0 1 2 3 4 5 6 7
1067 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1068 //
1069 }
1070 else
1071 {
1072 vGatherResult[1] = vGatherMaskedVal;
1073 }
1074
1075 // Shuffle gathered components into place, each row is a component
1076 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1077 }
1078 break;
1079 case 32:
1080 {
1081 // apply defaults
1082 for (uint32_t i = 0; i < 4; ++i)
1083 {
1084 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1085 }
1086
1087 for(uint32_t i = 0; i < info.numComps; i++)
1088 {
1089 uint32_t swizzleIndex = info.swizzle[i];
1090
1091 // save mask as it is zero'd out after each gather
1092 Value *vMask = mask;
1093
1094 // Gather a SIMD of components
1095 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1096
1097 // offset base to the next component to gather
1098 pSrcBase = GEP(pSrcBase, C((char)4));
1099 }
1100 }
1101 break;
1102 default:
1103 SWR_INVALID("Invalid float format");
1104 break;
1105 }
1106 }
1107
1108 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1109 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1110 {
1111 switch (info.bpp / info.numComps)
1112 {
1113 case 8:
1114 {
1115 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1116 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
1117 // e.g. result of an 8x32bit integer gather for 8bit components
1118 // 256i - 0 1 2 3 4 5 6 7
1119 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1120
1121 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1122 }
1123 break;
1124 case 16:
1125 {
1126 Value* vGatherResult[2];
1127 Value *vMask;
1128
1129 // TODO: vGatherMaskedVal
1130 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1131
1132 // always have at least one component out of x or y to fetch
1133
1134 // save mask as it is zero'd out after each gather
1135 vMask = mask;
1136
1137 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1138 // e.g. result of first 8x32bit integer gather for 16bit components
1139 // 256i - 0 1 2 3 4 5 6 7
1140 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1141 //
1142
1143 // if we have at least one component out of z or w to fetch
1144 if(info.numComps > 2)
1145 {
1146 // offset base to the next components(zw) in the vertex to gather
1147 pSrcBase = GEP(pSrcBase, C((char)4));
1148 vMask = mask;
1149
1150 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1151 // e.g. result of second 8x32bit integer gather for 16bit components
1152 // 256i - 0 1 2 3 4 5 6 7
1153 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1154 //
1155 }
1156 else
1157 {
1158 vGatherResult[1] = vGatherMaskedVal;
1159 }
1160
1161 // Shuffle gathered components into place, each row is a component
1162 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1163
1164 }
1165 break;
1166 case 32:
1167 {
1168 // apply defaults
1169 for (uint32_t i = 0; i < 4; ++i)
1170 {
1171 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1172 }
1173
1174 for(uint32_t i = 0; i < info.numComps; i++)
1175 {
1176 uint32_t swizzleIndex = info.swizzle[i];
1177
1178 // save mask as it is zero'd out after each gather
1179 Value *vMask = mask;
1180
1181 // Gather a SIMD of components
1182 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1183
1184 // offset base to the next component to gather
1185 pSrcBase = GEP(pSrcBase, C((char)4));
1186 }
1187 }
1188 break;
1189 default:
1190 SWR_INVALID("unsupported format");
1191 break;
1192 }
1193 }
1194
1195 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1196 {
1197 // cast types
1198 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1199 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1200
1201 // input could either be float or int vector; do shuffle work in int
1202 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1203 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1204
1205 if(bPackedOutput)
1206 {
1207 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1208
1209 // shuffle mask
1210 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1211 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1212 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1213 // after pshufb: group components together in each 128bit lane
1214 // 256i - 0 1 2 3 4 5 6 7
1215 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1216
1217 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1218 // after PERMD: move and pack xy components into each 128bit lane
1219 // 256i - 0 1 2 3 4 5 6 7
1220 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1221
1222 // do the same for zw components
1223 Value* vi128ZW = nullptr;
1224 if(info.numComps > 2)
1225 {
1226 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1227 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1228 }
1229
1230 for(uint32_t i = 0; i < 4; i++)
1231 {
1232 uint32_t swizzleIndex = info.swizzle[i];
1233 // todo: fix for packed
1234 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1235 if(i >= info.numComps)
1236 {
1237 // set the default component val
1238 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1239 continue;
1240 }
1241
1242 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1243 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1244 // if x or y, use vi128XY permute result, else use vi128ZW
1245 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1246
1247 // extract packed component 128 bit lanes
1248 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1249 }
1250
1251 }
1252 else
1253 {
1254 // pshufb masks for each component
1255 Value* vConstMask[2];
1256 // x/z shuffle mask
1257 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1258 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1259
1260 // y/w shuffle mask
1261 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1262 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1263
1264
1265 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1266 // apply defaults
1267 for (uint32_t i = 0; i < 4; ++i)
1268 {
1269 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1270 }
1271
1272 for(uint32_t i = 0; i < info.numComps; i++)
1273 {
1274 uint32_t swizzleIndex = info.swizzle[i];
1275
1276 // select correct constMask for x/z or y/w pshufb
1277 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1278 // if x or y, use the first (xy) gather result, else use the second (zw) gather result
1279 uint32_t selectedGather = (i < 2) ? 0 : 1;
1280
1281 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1282 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1283 // 256i - 0 1 2 3 4 5 6 7
1284 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1285 }
1286 }
1287 }
1288
1289 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1290 {
1291 // cast types
1292 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1293 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1294
1295 if(bPackedOutput)
1296 {
1297 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1298 // shuffle mask
1299 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1300 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1301 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1302 // after pshufb: group components together in each 128bit lane
1303 // 256i - 0 1 2 3 4 5 6 7
1304 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1305
1306 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1307 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1308 // 256i - 0 1 2 3 4 5 6 7
1309 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1310
1311 // do the same for zw components
1312 Value* vi128ZW = nullptr;
1313 if(info.numComps > 2)
1314 {
1315 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1316 }
1317
1318 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1319 for(uint32_t i = 0; i < 4; i++)
1320 {
1321 uint32_t swizzleIndex = info.swizzle[i];
1322 // todo: fix for packed
1323 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1324 if(i >= info.numComps)
1325 {
1326 // set the default component val
1327 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1328 continue;
1329 }
1330
1331 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1332 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1333 // if x or y, use vi128XY permute result, else use vi128ZW
1334 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1335
1336 // sign extend
1337 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1338 }
1339 }
1340 // else zero extend
1341 else{
1342 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1343 // apply defaults
1344 for (uint32_t i = 0; i < 4; ++i)
1345 {
1346 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1347 }
1348
1349 for(uint32_t i = 0; i < info.numComps; i++){
1350 uint32_t swizzleIndex = info.swizzle[i];
1351
1352 // pshufb masks for each component
1353 Value* vConstMask;
1354 switch(i)
1355 {
1356 case 0:
1357 // x shuffle mask
1358 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1359 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1360 break;
1361 case 1:
1362 // y shuffle mask
1363 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1364 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1365 break;
1366 case 2:
1367 // z shuffle mask
1368 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1369 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1370 break;
1371 case 3:
1372 // w shuffle mask
1373 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1374 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1375 break;
1376 default:
1377 vConstMask = nullptr;
1378 break;
1379 }
1380
1381 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1382 // after pshufb for x channel
1383 // 256i - 0 1 2 3 4 5 6 7
1384 // x000 x000 x000 x000 x000 x000 x000 x000
1385 }
1386 }
1387 }
1388
1389 // Helper function to create alloca in entry block of function
1390 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1391 {
1392 auto saveIP = IRB()->saveIP();
1393 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1394 pFunc->getEntryBlock().begin());
1395 Value* pAlloca = ALLOCA(pType);
1396 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1397 return pAlloca;
1398 }
1399
1400 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
1401 {
1402 auto saveIP = IRB()->saveIP();
1403 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1404 pFunc->getEntryBlock().begin());
1405 Value* pAlloca = ALLOCA(pType, pArraySize);
1406 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1407 return pAlloca;
1408 }
1409
1410 //////////////////////////////////////////////////////////////////////////
1411 /// @brief emulates a scatter operation.
1412 /// @param pDst - pointer to destination
1413 /// @param vSrc - vector of src data to scatter
1414 /// @param vOffsets - vector of byte offsets from pDst
1415 /// @param vMask - mask of valid lanes
1416 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1417 {
1418 /* Scatter algorithm
1419
1420 while(Index = BitScanForward(mask))
1421 srcElem = srcVector[Index]
1422 offsetElem = offsetVector[Index]
1423 *(pDst + offsetElem) = srcElem
1424 Update mask (mask &= ~(1 << Index))
1425
1426 */
1427
1428 BasicBlock* pCurBB = IRB()->GetInsertBlock();
1429 Function* pFunc = pCurBB->getParent();
1430 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1431
1432 // Store vectors on stack
1433 if (pScatterStackSrc == nullptr)
1434 {
1435 // Save off stack allocations and reuse per scatter. Significantly reduces stack
1436 // requirements for shaders with a lot of scatters.
1437 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1438 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1439 }
1440
1441 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1442 Value* pOffsetsArrayPtr = pScatterStackOffsets;
1443 STORE(vSrc, pSrcArrayPtr);
1444 STORE(vOffsets, pOffsetsArrayPtr);
1445
1446 // Cast to pointers for random access
1447 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1448 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1449
1450 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1451
1452 // Get cttz function
1453 Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1454
1455 // Setup loop basic block
1456 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
1457
1458 // compute first set bit
1459 Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1460
1461 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1462
1463 // Split current block
1464 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1465
1466 // Remove unconditional jump created by splitBasicBlock
1467 pCurBB->getTerminator()->eraseFromParent();
1468
1469 // Add terminator to end of original block
1470 IRB()->SetInsertPoint(pCurBB);
1471
1472 // Add conditional branch
1473 COND_BR(pIsUndef, pPostLoop, pLoop);
1474
1475 // Add loop basic block contents
1476 IRB()->SetInsertPoint(pLoop);
1477 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1478 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1479
1480 pIndexPhi->addIncoming(pIndex, pCurBB);
1481 pMaskPhi->addIncoming(pMask, pCurBB);
1482
1483 // Extract elements for this index
1484 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1485 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1486
1487 // GEP to this offset in dst
1488 Value* pCurDst = GEP(pDst, pOffsetElem);
1489 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1490 STORE(pSrcElem, pCurDst);
1491
1492 // Update the mask
1493 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1494
1495 // Terminator
1496 Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1497
1498 pIsUndef = ICMP_EQ(pNewIndex, C(32));
1499 COND_BR(pIsUndef, pPostLoop, pLoop);
1500
1501 // Update phi edges
1502 pIndexPhi->addIncoming(pNewIndex, pLoop);
1503 pMaskPhi->addIncoming(pNewMask, pLoop);
1504
1505 // Move builder to beginning of post loop
1506 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1507 }
1508
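//////////////////////////////////////////////////////////////////////////
/// @brief per-lane float absolute value, implemented by clearing the sign bit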
1509 Value* Builder::VABSPS(Value* a)
1510 {
1511 Value* asInt = BITCAST(a, mSimdInt32Ty);
1512 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1513 return result;
1514 }
1515
1516 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1517 {
1518 Value *lowCmp = ICMP_SLT(src, low);
1519 Value *ret = SELECT(lowCmp, low, src);
1520
1521 Value *highCmp = ICMP_SGT(ret, high);
1522 ret = SELECT(highCmp, high, ret);
1523
1524 return ret;
1525 }
1526
1527 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1528 {
1529 Value *lowCmp = FCMP_OLT(src, low);
1530 Value *ret = SELECT(lowCmp, low, src);
1531
1532 Value *highCmp = FCMP_OGT(ret, high);
1533 ret = SELECT(highCmp, high, ret);
1534
1535 return ret;
1536 }
1537
1538 Value *Builder::FCLAMP(Value* src, float low, float high)
1539 {
1540 Value* result = VMAXPS(src, VIMMED1(low));
1541 result = VMINPS(result, VIMMED1(high));
1542
1543 return result;
1544 }
1545
1546 //////////////////////////////////////////////////////////////////////////
1547 /// @brief save/restore stack, providing ability to push/pop the stack and
1548 /// reduce overall stack requirements for temporary stack use
1549 Value* Builder::STACKSAVE()
1550 {
1551 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1552 return CALLA(pfnStackSave);
1553 }
1554
1555 void Builder::STACKRESTORE(Value* pSaved)
1556 {
1557 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1558 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1559 }
1560
1561 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1562 {
1563 Value* vOut;
1564 // use FMADs if available
1565 if(JM()->mArch.AVX2())
1566 {
1567 vOut = VFMADDPS(a, b, c);
1568 }
1569 else
1570 {
1571 vOut = FADD(FMUL(a, b), c);
1572 }
1573 return vOut;
1574 }
1575
1576 Value* Builder::POPCNT(Value* a)
1577 {
1578 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1579 return CALL(pCtPop, std::initializer_list<Value*>{a});
1580 }
1581
1582 //////////////////////////////////////////////////////////////////////////
1583 /// @brief C functions called by LLVM IR
1584 //////////////////////////////////////////////////////////////////////////
1585
1586 //////////////////////////////////////////////////////////////////////////
1587 /// @brief called in JIT code, inserted by PRINT
1588 /// output to both stdout and visual studio debug console
1589 void __cdecl CallPrint(const char* fmt, ...)
1590 {
591 va_list args;
592 va_start(args, fmt);
593 vprintf(fmt, args);
594
595 #if defined( _WIN32 )
596 char strBuf[1024];
    // vprintf has consumed 'args'; restart the va_list before reusing it
    va_end(args);
    va_start(args, fmt);
597 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
598 OutputDebugString(strBuf);
599 #endif
600
601 va_end(args);
1602 }
1603
1604 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1605 {
1606 bool flag = !imm8->isZeroValue();
1607 SmallVector<Constant*,8> idx;
1608 for (unsigned i = 0; i < mVWidth / 2; i++) {
1609 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1610 }
1611 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1612 }
1613
1614 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1615 {
1616 bool flag = !imm8->isZeroValue();
1617 SmallVector<Constant*,8> idx;
1618 for (unsigned i = 0; i < mVWidth; i++) {
1619 idx.push_back(C(i));
1620 }
1621 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1622
1623 SmallVector<Constant*,8> idx2;
1624 for (unsigned i = 0; i < mVWidth / 2; i++) {
1625 idx2.push_back(C(flag ? i : i + mVWidth));
1626 }
1627 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1628 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1629 }
1630 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1631 }
1632
1633 // rdtsc buckets macros
1634 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1635 {
1636 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1637 // buckets framework when single threaded
1638 if (KNOB_SINGLE_THREADED)
1639 {
1640 std::vector<Type*> args{
1641 PointerType::get(mInt32Ty, 0), // pBucketMgr
1642 mInt32Ty // id
1643 };
1644
1645 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1646 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1647 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1648 {
1649 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1650 }
1651
1652 CALL(pFunc, { pBucketMgr, pId });
1653 }
1654 }
1655
1656 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1657 {
1658 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1659 // buckets framework when single threaded
1660 if (KNOB_SINGLE_THREADED)
1661 {
1662 std::vector<Type*> args{
1663 PointerType::get(mInt32Ty, 0), // pBucketMgr
1664 mInt32Ty // id
1665 };
1666
1667 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1668 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1669 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1670 {
1671 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1672 }
1673
1674 CALL(pFunc, { pBucketMgr, pId });
1675 }
1676 }
1677
1678
1679 uint32_t Builder::GetTypeSize(Type* pType)
1680 {
1681 if (pType->isStructTy())
1682 {
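// note: assumes a homogeneous struct with no padding; the size of the first
// element type is applied to every member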
1683 uint32_t numElems = pType->getStructNumElements();
1684 Type* pElemTy = pType->getStructElementType(0);
1685 return numElems * GetTypeSize(pElemTy);
1686 }
1687
1688 if (pType->isArrayTy())
1689 {
1690 uint32_t numElems = pType->getArrayNumElements();
1691 Type* pElemTy = pType->getArrayElementType();
1692 return numElems * GetTypeSize(pElemTy);
1693 }
1694
1695 if (pType->isIntegerTy())
1696 {
1697 uint32_t bitSize = pType->getIntegerBitWidth();
1698 return bitSize / 8;
1699 }
1700
1701 if (pType->isFloatTy())
1702 {
1703 return 4;
1704 }
1705
1706 if (pType->isHalfTy())
1707 {
1708 return 2;
1709 }
1710
1711 if (pType->isDoubleTy())
1712 {
1713 return 8;
1714 }
1715
1716 SWR_ASSERT(false, "Unimplemented type.");
1717 return 0;
1718 }
1719 }