1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32
33 #include <cstdarg>
34
35 namespace SwrJit
36 {
37 void __cdecl CallPrint(const char* fmt, ...);
38
39 //////////////////////////////////////////////////////////////////////////
40 /// @brief Convert an IEEE 754 32-bit single precision float to a
41 /// 16-bit half precision float with 5 exponent bits and
42 /// 10 mantissa bits.
43 /// @param val - 32-bit float
44 /// @todo Maybe move this outside of this file into a header?
45 static uint16_t ConvertFloat32ToFloat16(float val)
46 {
47 uint32_t sign, exp, mant;
48 uint32_t roundBits;
49
50 // Extract the sign, exponent, and mantissa
51 uint32_t uf = *(uint32_t*)&val;
52 sign = (uf & 0x80000000) >> 31;
53 exp = (uf & 0x7F800000) >> 23;
54 mant = uf & 0x007FFFFF;
55
56 // Check for out of range
57 if (std::isnan(val))
58 {
59 exp = 0x1F;
60 mant = 0x200;
61 sign = 1; // set the sign bit for NANs
62 }
63 else if (std::isinf(val))
64 {
65 exp = 0x1f;
66 mant = 0x0;
67 }
68 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
69 {
70 exp = 0x1E;
71 mant = 0x3FF;
72 }
73 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
74 {
75 mant |= 0x00800000;
76 for (; exp <= 0x70; mant >>= 1, exp++)
77 ;
78 exp = 0;
79 mant = mant >> 13;
80 }
81 else if (exp < 0x66) // Too small to represent -> Zero
82 {
83 exp = 0;
84 mant = 0;
85 }
86 else
87 {
88 // Saves bits that will be shifted off for rounding
89 roundBits = mant & 0x1FFFu;
90 // convert exponent and mantissa to 16 bit format
91 exp = exp - 0x70;
92 mant = mant >> 13;
93
94 // Essentially RTZ, but round up if off by only 1 lsb
95 if (roundBits == 0x1FFFu)
96 {
97 mant++;
98 // check for overflow
99 if ((mant & 0xC00u) != 0)
100 exp++;
101 // make sure only the needed bits are used
102 mant &= 0x3FF;
103 }
104 }
105
106 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
107 return (uint16_t)tmpVal;
108 }
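// A quick worked example of the normal path above (a reference sketch, not
// additional logic): 1.0f has bits 0x3F800000, so sign = 0, exp = 0x7F and
// mant = 0. Rebiasing the exponent gives 0x7F - 0x70 = 0x0F, dropping the low
// 13 mantissa bits leaves 0, and the packed result is
// (0 << 15) | (0x0F << 10) | 0 = 0x3C00, the half precision encoding of 1.0.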
109
110 //////////////////////////////////////////////////////////////////////////
111 /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
112 /// float
113 /// @param val - 16-bit float
114 /// @todo Maybe move this outside of this file into a header?
115 static float ConvertFloat16ToFloat32(uint32_t val)
116 {
117 uint32_t result;
118 if ((val & 0x7fff) == 0)
119 {
120 result = ((uint32_t)(val & 0x8000)) << 16;
121 }
122 else if ((val & 0x7c00) == 0x7c00)
123 {
124 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
125 result |= ((uint32_t)val & 0x8000) << 16;
126 }
127 else
128 {
129 uint32_t sign = (val & 0x8000) << 16;
130 uint32_t mant = (val & 0x3ff) << 13;
131 uint32_t exp = (val >> 10) & 0x1f;
132 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
133 {
134 mant <<= 1;
135 while (mant < (0x400 << 13))
136 {
137 exp--;
138 mant <<= 1;
139 }
140 mant &= (0x3ff << 13);
141 }
142 exp = ((exp - 15 + 127) & 0xff) << 23;
143 result = sign | exp | mant;
144 }
145
146 return *(float*)&result;
147 }
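// Worked example for the inverse direction (reference only): half 0x3C00 has
// sign = 0, exp = 15 and mant = 0, so the rebias (15 - 15 + 127) << 23 yields
// 0x3F800000, i.e. 1.0f. Subnormal halves take the extra loop above, which
// shifts the mantissa up and decrements exp until the value is normalized.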
148
149 Constant *Builder::C(bool i)
150 {
151 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
152 }
153
154 Constant *Builder::C(char i)
155 {
156 return ConstantInt::get(IRB()->getInt8Ty(), i);
157 }
158
159 Constant *Builder::C(uint8_t i)
160 {
161 return ConstantInt::get(IRB()->getInt8Ty(), i);
162 }
163
164 Constant *Builder::C(int i)
165 {
166 return ConstantInt::get(IRB()->getInt32Ty(), i);
167 }
168
169 Constant *Builder::C(int64_t i)
170 {
171 return ConstantInt::get(IRB()->getInt64Ty(), i);
172 }
173
174 Constant *Builder::C(uint16_t i)
175 {
176 return ConstantInt::get(mInt16Ty,i);
177 }
178
179 Constant *Builder::C(uint32_t i)
180 {
181 return ConstantInt::get(IRB()->getInt32Ty(), i);
182 }
183
184 Constant *Builder::C(float i)
185 {
186 return ConstantFP::get(IRB()->getFloatTy(), i);
187 }
188
189 Constant *Builder::PRED(bool pred)
190 {
191 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
192 }
193
194 Value *Builder::VIMMED1(int i)
195 {
196 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
197 }
198
199 Value *Builder::VIMMED1_16(int i)
200 {
201 return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
202 }
203
204 Value *Builder::VIMMED1(uint32_t i)
205 {
206 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
207 }
208
209 Value *Builder::VIMMED1_16(uint32_t i)
210 {
211 return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
212 }
213
214 Value *Builder::VIMMED1(float i)
215 {
216 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
217 }
218
219 Value *Builder::VIMMED1_16(float i)
220 {
221 return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i)));
222 }
223
224 Value *Builder::VIMMED1(bool i)
225 {
226 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
227 }
228
229 Value *Builder::VIMMED1_16(bool i)
230 {
231 return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
232 }
233
234 Value *Builder::VUNDEF_IPTR()
235 {
236 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
237 }
238
239 Value *Builder::VUNDEF(Type* t)
240 {
241 return UndefValue::get(VectorType::get(t, mVWidth));
242 }
243
244 Value *Builder::VUNDEF_I()
245 {
246 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
247 }
248
249 Value *Builder::VUNDEF_I_16()
250 {
251 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16));
252 }
253
254 Value *Builder::VUNDEF_F()
255 {
256 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
257 }
258
259 Value *Builder::VUNDEF_F_16()
260 {
261 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16));
262 }
263
264 Value *Builder::VUNDEF(Type *ty, uint32_t size)
265 {
266 return UndefValue::get(VectorType::get(ty, size));
267 }
268
269 Value *Builder::VBROADCAST(Value *src)
270 {
271 // check if src is already a vector
272 if (src->getType()->isVectorTy())
273 {
274 return src;
275 }
276
277 return VECTOR_SPLAT(mVWidth, src);
278 }
279
280 Value *Builder::VBROADCAST_16(Value *src)
281 {
282 // check if src is already a vector
283 if (src->getType()->isVectorTy())
284 {
285 return src;
286 }
287
288 return VECTOR_SPLAT(mVWidth16, src);
289 }
290
291 uint32_t Builder::IMMED(Value* v)
292 {
293 SWR_ASSERT(isa<ConstantInt>(v));
294 ConstantInt *pValConst = cast<ConstantInt>(v);
295 return pValConst->getZExtValue();
296 }
297
298 int32_t Builder::S_IMMED(Value* v)
299 {
300 SWR_ASSERT(isa<ConstantInt>(v));
301 ConstantInt *pValConst = cast<ConstantInt>(v);
302 return pValConst->getSExtValue();
303 }
304
305 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
306 {
307 std::vector<Value*> indices;
308 for (auto i : indexList)
309 indices.push_back(i);
310 return GEPA(ptr, indices);
311 }
312
313 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
314 {
315 std::vector<Value*> indices;
316 for (auto i : indexList)
317 indices.push_back(C(i));
318 return GEPA(ptr, indices);
319 }
320
321 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
322 {
323 std::vector<Value*> indices;
324 for (auto i : indexList)
325 indices.push_back(i);
326 return IN_BOUNDS_GEP(ptr, indices);
327 }
328
329 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
330 {
331 std::vector<Value*> indices;
332 for (auto i : indexList)
333 indices.push_back(C(i));
334 return IN_BOUNDS_GEP(ptr, indices);
335 }
336
337 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
338 {
339 std::vector<Value*> valIndices;
340 for (auto i : indices)
341 valIndices.push_back(C(i));
342 return LOAD(GEPA(basePtr, valIndices), name);
343 }
344
345 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
346 {
347 std::vector<Value*> valIndices;
348 for (auto i : indices)
349 valIndices.push_back(i);
350 return LOAD(GEPA(basePtr, valIndices), name);
351 }
352
353 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
354 {
355 std::vector<Value*> valIndices;
356 for (auto i : indices)
357 valIndices.push_back(C(i));
358 return STORE(val, GEPA(basePtr, valIndices));
359 }
360
361 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
362 {
363 std::vector<Value*> valIndices;
364 for (auto i : indices)
365 valIndices.push_back(i);
366 return STORE(val, GEPA(basePtr, valIndices));
367 }
368
369 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
370 {
371 std::vector<Value*> args;
372 for (auto arg : argsList)
373 args.push_back(arg);
374 return CALLA(Callee, args);
375 }
376
377 CallInst *Builder::CALL(Value *Callee, Value* arg)
378 {
379 std::vector<Value*> args;
380 args.push_back(arg);
381 return CALLA(Callee, args);
382 }
383
384 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
385 {
386 std::vector<Value*> args;
387 args.push_back(arg1);
388 args.push_back(arg2);
389 return CALLA(Callee, args);
390 }
391
392 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
393 {
394 std::vector<Value*> args;
395 args.push_back(arg1);
396 args.push_back(arg2);
397 args.push_back(arg3);
398 return CALLA(Callee, args);
399 }
400
401 //////////////////////////////////////////////////////////////////////////
402 Value *Builder::DEBUGTRAP()
403 {
404 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
405 return CALL(func);
406 }
407
408 Value *Builder::VRCP(Value *va)
409 {
410 return FDIV(VIMMED1(1.0f), va); // 1 / a
411 }
412
413 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
414 {
415 Value* vOut = FMADDPS(vA, vX, vC);
416 vOut = FMADDPS(vB, vY, vOut);
417 return vOut;
418 }
419
420 //////////////////////////////////////////////////////////////////////////
421 /// @brief Generate an i32 masked load operation in LLVM IR. If not
422 /// supported on the underlying platform, emulate it with float masked load
423 /// @param src - base address pointer for the load
424 /// @param vMask - SIMD wide mask that controls whether to access memory or load 0 for the lane
425 Value *Builder::MASKLOADD(Value* src,Value* mask)
426 {
427 Value* vResult;
428 // use avx2 maskload instruction if available
429 if(JM()->mArch.AVX2())
430 {
431 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
432 vResult = CALL(func,{src,mask});
433 }
434 else
435 {
436 // maskload intrinsic expects integer mask operand in llvm >= 3.8
437 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
438 mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
439 #else
440 mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
441 #endif
442 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
443 vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
444 }
445 return vResult;
446 }
447
448 //////////////////////////////////////////////////////////////////////////
449 /// @brief insert a JIT call to CallPrint
450 /// - outputs formatted string to both stdout and VS output window
451 /// - DEBUG builds only
452 /// Usage example:
453 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
454 /// where C(lane) creates a constant value to print, and pIndex is the Value*
455 /// result from a GEP, printing out the pointer to memory
456 /// @param printStr - constant string to print, which includes format specifiers
457 /// @param printArgs - initializer list of Value*'s to print to std out
458 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
459 {
460 // push the arguments to CallPrint into a vector
461 std::vector<Value*> printCallArgs;
462 // save room for the format string. we still need to modify it for vectors
463 printCallArgs.resize(1);
464
465 // search through the format string for special processing
466 size_t pos = 0;
467 std::string tempStr(printStr);
468 pos = tempStr.find('%', pos);
469 auto v = printArgs.begin();
470
471 while ((pos != std::string::npos) && (v != printArgs.end()))
472 {
473 Value* pArg = *v;
474 Type* pType = pArg->getType();
475
476 if (pType->isVectorTy())
477 {
478 Type* pContainedType = pType->getContainedType(0);
479
480 if (toupper(tempStr[pos + 1]) == 'X')
481 {
482 tempStr[pos] = '0';
483 tempStr[pos + 1] = 'x';
484 tempStr.insert(pos + 2, "%08X ");
485 pos += 7;
486
487 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
488
489 std::string vectorFormatStr;
490 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
491 {
492 vectorFormatStr += "0x%08X ";
493 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
494 }
495
496 tempStr.insert(pos, vectorFormatStr);
497 pos += vectorFormatStr.size();
498 }
499 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
500 {
501 uint32_t i = 0;
502 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
503 {
504 tempStr.insert(pos, std::string("%f "));
505 pos += 3;
506 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
507 }
508 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
509 }
510 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
511 {
512 uint32_t i = 0;
513 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
514 {
515 tempStr.insert(pos, std::string("%d "));
516 pos += 3;
517 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
518 }
519 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
520 }
521 }
522 else
523 {
524 if (toupper(tempStr[pos + 1]) == 'X')
525 {
526 tempStr[pos] = '0';
527 tempStr.insert(pos + 1, "x%08");
528 printCallArgs.push_back(pArg);
529 pos += 3;
530 }
531 // for %f we need to cast float Values to doubles so that they print out correctly
532 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
533 {
534 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
535 pos++;
536 }
537 else
538 {
539 printCallArgs.push_back(pArg);
540 }
541 }
542
543 // advance to the next argument
544 v++;
545 pos = tempStr.find('%', ++pos);
546 }
547
548 // create global variable constant string
549 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
550 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
551 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
552
553 // get a pointer to the first character in the constant string array
554 std::vector<Constant*> geplist{C(0),C(0)};
555 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
556
557 // insert the pointer to the format string in the argument vector
558 printCallArgs[0] = strGEP;
559
560 // get pointer to CallPrint function and insert decl into the module if needed
561 std::vector<Type*> args;
562 args.push_back(PointerType::get(mInt8Ty,0));
563 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
564 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
565
566 // if we haven't yet added the symbol to the symbol table
567 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
568 {
569 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
570 }
571
572 // insert a call to CallPrint
573 return CALLA(callPrintFn,printCallArgs);
574 }
575
576 //////////////////////////////////////////////////////////////////////////
577 /// @brief Convenience wrapper around PRINT for format strings with no arguments.
578 CallInst* Builder::PRINT(const std::string &printStr)
579 {
580 return PRINT(printStr, {});
581 }
582
583 //////////////////////////////////////////////////////////////////////////
584 /// @brief Generate a masked gather operation in LLVM IR. If not
585 /// supported on the underlying platform, emulate it with loads
586 /// @param vSrc - SIMD wide value that is returned for lanes where the mask is not set
587 /// @param pBase - Int8* base VB address pointer value
588 /// @param vIndices - SIMD wide value of VB byte offsets
589 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
590 /// @param scale - value to scale indices by
591 Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
592 {
593 Value *vGather;
594
595 // use avx2 gather instruction if available
596 if(JM()->mArch.AVX2())
597 {
598 // force mask to <N x float>, required by vgather
599 Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
600
601 vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
602 }
603 else
604 {
605 Value* pStack = STACKSAVE();
606
607 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
608 Value* vSrcPtr = ALLOCA(vSrc->getType());
609 STORE(vSrc, vSrcPtr);
610
611 vGather = VUNDEF_F();
612 Value *vScaleVec = VIMMED1((uint32_t)scale);
613 Value *vOffsets = MUL(vIndices,vScaleVec);
614 for(uint32_t i = 0; i < mVWidth; ++i)
615 {
616 // single component byte index
617 Value *offset = VEXTRACT(vOffsets,C(i));
618 // byte pointer to component
619 Value *loadAddress = GEP(pBase,offset);
620 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
621 // pointer to the value to load if we're masking off a component
622 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
623 Value *selMask = VEXTRACT(vMask,C(i));
624 // switch in a safe (stack) address to load from when this lane is masked off
625 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
626 Value *val = LOAD(validAddress);
627 vGather = VINSERT(vGather,val,C(i));
628 }
629
630 STACKRESTORE(pStack);
631 }
632
633 return vGather;
634 }
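// Illustrative usage sketch for GATHERPS; the value names below are hypothetical
// and only show the calling pattern used elsewhere in this file:
//
//     Value* vDefault  = VIMMED1(0.0f);   // per-lane fallback for masked-off lanes
//     Value* vGathered = GATHERPS(vDefault, pVertexBuffer, vByteOffsets, vLaneMask);
//
// On AVX2 this lowers to a single vgatherps; otherwise the loop above loads each
// active lane individually and reads the fallback from the stack copy of vSrc.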
635
636 Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
637 {
638 Value *vGather = VUNDEF_F_16();
639
640 // use AVX512F gather instruction if available
641 if (JM()->mArch.AVX512F())
642 {
643 // force mask to <N-bit Integer>, required by vgather2
644 Value *mask = BITCAST(vMask, mInt16Ty);
645
646 vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
647 }
648 else
649 {
650 Value *src0 = EXTRACT_16(vSrc, 0);
651 Value *src1 = EXTRACT_16(vSrc, 1);
652
653 Value *indices0 = EXTRACT_16(vIndices, 0);
654 Value *indices1 = EXTRACT_16(vIndices, 1);
655
656 Value *mask0 = EXTRACT_16(vMask, 0);
657 Value *mask1 = EXTRACT_16(vMask, 1);
658
659 Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
660 Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
661
662 vGather = JOIN_16(gather0, gather1);
663 }
664
665 return vGather;
666 }
667
668 //////////////////////////////////////////////////////////////////////////
669 /// @brief Generate a masked gather operation in LLVM IR. If not
670 /// supported on the underlying platform, emulate it with loads
671 /// @param vSrc - SIMD wide value that is returned for lanes where the mask is not set
672 /// @param pBase - Int8* base VB address pointer value
673 /// @param vIndices - SIMD wide value of VB byte offsets
674 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
675 /// @param scale - value to scale indices by
676 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
677 {
678 Value* vGather;
679
680 // use avx2 gather instruction if available
681 if(JM()->mArch.AVX2())
682 {
683 vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
684 }
685 else
686 {
687 Value* pStack = STACKSAVE();
688
689 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
690 Value* vSrcPtr = ALLOCA(vSrc->getType());
691 STORE(vSrc, vSrcPtr);
692
693 vGather = VUNDEF_I();
694 Value *vScaleVec = VIMMED1((uint32_t)scale);
695 Value *vOffsets = MUL(vIndices, vScaleVec);
696 for(uint32_t i = 0; i < mVWidth; ++i)
697 {
698 // single component byte index
699 Value *offset = VEXTRACT(vOffsets, C(i));
700 // byte pointer to component
701 Value *loadAddress = GEP(pBase, offset);
702 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
703 // pointer to the value to load if we're masking off a component
704 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
705 Value *selMask = VEXTRACT(vMask, C(i));
706 // switch in a safe (stack) address to load from when this lane is masked off
707 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
708 Value *val = LOAD(validAddress, C(0));
709 vGather = VINSERT(vGather, val, C(i));
710 }
711
712 STACKRESTORE(pStack);
713 }
714
715 return vGather;
716 }
717
718 Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
719 {
720 Value *vGather = VUNDEF_I_16();
721
722 // use AVX512F gather instruction if available
723 if (JM()->mArch.AVX512F())
724 {
725 // force mask to <N-bit Integer>, required by vgather2
726 Value *mask = BITCAST(vMask, mInt16Ty);
727
728 vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
729 }
730 else
731 {
732 Value *src0 = EXTRACT_16(vSrc, 0);
733 Value *src1 = EXTRACT_16(vSrc, 1);
734
735 Value *indices0 = EXTRACT_16(vIndices, 0);
736 Value *indices1 = EXTRACT_16(vIndices, 1);
737
738 Value *mask0 = EXTRACT_16(vMask, 0);
739 Value *mask1 = EXTRACT_16(vMask, 1);
740
741 Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
742 Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
743
744 vGather = JOIN_16(gather0, gather1);
745 }
746
747 return vGather;
748 }
749
750 //////////////////////////////////////////////////////////////////////////
751 /// @brief Generate a masked gather operation in LLVM IR. If not
752 /// supported on the underlying platform, emulate it with loads
753 /// @param vSrc - SIMD wide value that is returned for lanes where the mask is not set
754 /// @param pBase - Int8* base VB address pointer value
755 /// @param vIndices - SIMD wide value of VB byte offsets
756 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
757 /// @param scale - value to scale indices by
758 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
759 {
760 Value* vGather;
761
762 // use avx2 gather instruction if available
763 if(JM()->mArch.AVX2())
764 {
765 vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2));
766 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
767 }
768 else
769 {
770 Value* pStack = STACKSAVE();
771
772 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
773 Value* vSrcPtr = ALLOCA(vSrc->getType());
774 STORE(vSrc, vSrcPtr);
775
776 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
777 Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
778 Value *vOffsets = MUL(vIndices,vScaleVec);
779 for(uint32_t i = 0; i < mVWidth/2; ++i)
780 {
781 // single component byte index
782 Value *offset = VEXTRACT(vOffsets,C(i));
783 // byte pointer to component
784 Value *loadAddress = GEP(pBase,offset);
785 loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
786 // pointer to the value to load if we're masking off a component
787 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
788 Value *selMask = VEXTRACT(vMask,C(i));
789 // switch in a safe (stack) address to load from when this lane is masked off
790 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
791 Value *val = LOAD(validAddress);
792 vGather = VINSERT(vGather,val,C(i));
793 }
794 STACKRESTORE(pStack);
795 }
796 return vGather;
797 }
798
799 Value *Builder::EXTRACT_16(Value *x, uint32_t imm)
800 {
801 if (imm == 0)
802 {
803 return VSHUFFLE(x, UndefValue::get(x->getType()), { 0, 1, 2, 3, 4, 5, 6, 7 });
804 }
805 else
806 {
807 return VSHUFFLE(x, UndefValue::get(x->getType()), { 8, 9, 10, 11, 12, 13, 14, 15 });
808 }
809 }
810
811 Value *Builder::JOIN_16(Value *a, Value *b)
812 {
813 return VSHUFFLE(a, b, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
814 }
815
816 //////////////////////////////////////////////////////////////////////////
817 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
818 Value *Builder::MASK(Value *vmask)
819 {
820 Value *src = BITCAST(vmask, mSimdInt32Ty);
821 return ICMP_SLT(src, VIMMED1(0));
822 }
823
824 Value *Builder::MASK_16(Value *vmask)
825 {
826 Value *src = BITCAST(vmask, mSimd16Int32Ty);
827 return ICMP_SLT(src, VIMMED1_16(0));
828 }
829
830 //////////////////////////////////////////////////////////////////////////
831 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
832 Value *Builder::VMASK(Value *mask)
833 {
834 return S_EXT(mask, mSimdInt32Ty);
835 }
836
837 Value *Builder::VMASK_16(Value *mask)
838 {
839 return S_EXT(mask, mSimd16Int32Ty);
840 }
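// Round-trip example for the two mask representations: an active x86-style lane
// holds 0xFFFFFFFF (sign bit set), so the ICMP_SLT in MASK() yields i1 true, and
// the S_EXT in VMASK() widens i1 true back to 0xFFFFFFFF; an inactive lane
// (0x00000000) maps to i1 false and back to 0x00000000.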
841
842 //////////////////////////////////////////////////////////////////////////
843 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
844 /// supported on the underlying platform, emulate it
845 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
846 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
847 /// Byte masks in the lower 128-bit lane of b select 8-bit values from the
848 /// lower 128 bits of a, and likewise for the upper lane. If the mask
849 /// value is negative, '0' is inserted.
850 Value *Builder::PSHUFB(Value* a, Value* b)
851 {
852 Value* res;
853 // use avx2 pshufb instruction if available
854 if(JM()->mArch.AVX2())
855 {
856 res = VPSHUFB(a, b);
857 }
858 else
859 {
860 Constant* cB = dyn_cast<Constant>(b);
861 // number of 8 bit elements in b
862 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
863 // output vector
864 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
865
866 // insert an 8 bit value from the high and low lanes of a per loop iteration
867 numElms /= 2;
868 for(uint32_t i = 0; i < numElms; i++)
869 {
870 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
871 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
872
873 // extract values from constant mask
874 char valLow128bLane = (char)(cLow128b->getSExtValue());
875 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
876
877 Value* insertValLow128b;
878 Value* insertValHigh128b;
879
880 // if the mask value is negative, insert a '0' in the respective output position
881 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
882 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
883 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
884
885 vShuf = VINSERT(vShuf, insertValLow128b, i);
886 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
887 }
888 res = vShuf;
889 }
890 return res;
891 }
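// Example of the byte-select semantics implemented above: within the lower
// 128-bit lane, a mask byte of 0x02 picks byte 2 of a's lower lane for that
// output position, while any mask byte with its sign bit set (e.g. 0x80) writes
// 0 instead; the upper lane behaves the same way using only a's upper lane.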
892
893 //////////////////////////////////////////////////////////////////////////
894 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
895 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
896 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
897 /// lower 8 values are used.
898 Value *Builder::PMOVSXBD(Value* a)
899 {
900 // VPMOVSXBD output type
901 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
902 // Extract 8 values from 128bit lane and sign extend
903 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
904 }
905
906 //////////////////////////////////////////////////////////////////////////
907 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
908 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
909 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
910 Value *Builder::PMOVSXWD(Value* a)
911 {
912 // VPMOVSXWD output type
913 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
914 // Extract 8 values from 128bit lane and sign extend
915 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
916 }
917
918 //////////////////////////////////////////////////////////////////////////
919 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
920 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
921 /// platform, emulate it
922 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
923 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
924 Value *Builder::PERMD(Value* a, Value* idx)
925 {
926 Value* res;
927 // use avx2 permute instruction if available
928 if(JM()->mArch.AVX2())
929 {
930 res = VPERMD(a, idx);
931 }
932 else
933 {
934 if (isa<Constant>(idx))
935 {
936 res = VSHUFFLE(a, a, idx);
937 }
938 else
939 {
940 res = VUNDEF_I();
941 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
942 {
943 Value* pIndex = VEXTRACT(idx, C(l));
944 Value* pVal = VEXTRACT(a, pIndex);
945 res = VINSERT(res, pVal, C(l));
946 }
947 }
948 }
949 return res;
950 }
951
952 //////////////////////////////////////////////////////////////////////////
953 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
954 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
955 /// platform, emulate it
956 /// @param a - 256bit SIMD lane(8x32bit) of float values.
957 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
958 Value *Builder::PERMPS(Value* a, Value* idx)
959 {
960 Value* res;
961 // use avx2 permute instruction if available
962 if (JM()->mArch.AVX2())
963 {
964 // llvm 3.6.0 swapped the order of the args to vpermd
965 res = VPERMPS(idx, a);
966 }
967 else
968 {
969 if (isa<Constant>(idx))
970 {
971 res = VSHUFFLE(a, a, idx);
972 }
973 else
974 {
975 res = VUNDEF_F();
976 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
977 {
978 Value* pIndex = VEXTRACT(idx, C(l));
979 Value* pVal = VEXTRACT(a, pIndex);
980 res = VINSERT(res, pVal, C(l));
981 }
982 }
983 }
984
985 return res;
986 }
987
988 //////////////////////////////////////////////////////////////////////////
989 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
990 /// in LLVM IR. If not supported on the underlying platform, emulate it
991 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
992 Value *Builder::CVTPH2PS(Value* a)
993 {
994 if (JM()->mArch.F16C())
995 {
996 return VCVTPH2PS(a);
997 }
998 else
999 {
1000 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
1001 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));
1002
1003 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
1004 {
1005 sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32);
1006 }
1007
1008 Value* pResult = UndefValue::get(mSimdFP32Ty);
1009 for (uint32_t i = 0; i < mVWidth; ++i)
1010 {
1011 Value* pSrc = VEXTRACT(a, C(i));
1012 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
1013 pResult = VINSERT(pResult, pConv, C(i));
1014 }
1015
1016 return pResult;
1017 }
1018 }
1019
1020 //////////////////////////////////////////////////////////////////////////
1021 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
1022 /// in LLVM IR. If not supported on the underlying platform, emulate it
1023 /// @param a - 256bit SIMD (8x32bit) of float32 values to convert.
1024 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
1025 {
1026 if (JM()->mArch.F16C())
1027 {
1028 return VCVTPS2PH(a, rounding);
1029 }
1030 else
1031 {
1032 // call scalar C function for now
1033 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
1034 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
1035
1036 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
1037 {
1038 sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16);
1039 }
1040
1041 Value* pResult = UndefValue::get(mSimdInt16Ty);
1042 for (uint32_t i = 0; i < mVWidth; ++i)
1043 {
1044 Value* pSrc = VEXTRACT(a, C(i));
1045 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
1046 pResult = VINSERT(pResult, pConv, C(i));
1047 }
1048
1049 return pResult;
1050 }
1051 }
1052
1053 Value *Builder::PMAXSD(Value* a, Value* b)
1054 {
1055 Value* cmp = ICMP_SGT(a, b);
1056 return SELECT(cmp, a, b);
1057 }
1058
1059 Value *Builder::PMINSD(Value* a, Value* b)
1060 {
1061 Value* cmp = ICMP_SLT(a, b);
1062 return SELECT(cmp, a, b);
1063 }
1064
1065 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
1066 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1067 {
1068 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
1069 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
1070 {
1071 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1072 }
1073 else
1074 {
1075 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1076 }
1077 }
1078
1079 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1080 Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
1081 {
1082 switch(info.bpp / info.numComps)
1083 {
1084 case 16:
1085 {
1086 Value* vGatherResult[2];
1087
1088 // TODO: vGatherMaskedVal
1089 Value* vGatherMaskedVal = VIMMED1((float)0);
1090
1091 // always have at least one component out of x or y to fetch
1092
1093 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1094 // e.g. result of first 8x32bit integer gather for 16bit components
1095 // 256i - 0 1 2 3 4 5 6 7
1096 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1097 //
1098
1099 // if we have at least one component out of z or w to fetch
1100 if(info.numComps > 2)
1101 {
1102 // offset base to the next components(zw) in the vertex to gather
1103 pSrcBase = GEP(pSrcBase, C((char)4));
1104
1105 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1106 // e.g. result of second 8x32bit integer gather for 16bit components
1107 // 256i - 0 1 2 3 4 5 6 7
1108 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1109 //
1110 }
1111 else
1112 {
1113 vGatherResult[1] = vGatherMaskedVal;
1114 }
1115
1116 // Shuffle gathered components into place, each row is a component
1117 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1118 }
1119 break;
1120 case 32:
1121 {
1122 // apply defaults
1123 for (uint32_t i = 0; i < 4; ++i)
1124 {
1125 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1126 }
1127
1128 for(uint32_t i = 0; i < info.numComps; i++)
1129 {
1130 uint32_t swizzleIndex = info.swizzle[i];
1131
1132 // Gather a SIMD of components
1133 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1134
1135 // offset base to the next component to gather
1136 pSrcBase = GEP(pSrcBase, C((char)4));
1137 }
1138 }
1139 break;
1140 default:
1141 SWR_INVALID("Invalid float format");
1142 break;
1143 }
1144 }
1145
1146 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1147 Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
1148 {
1149 switch (info.bpp / info.numComps)
1150 {
1151 case 8:
1152 {
1153 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1154 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1155 // e.g. result of an 8x32bit integer gather for 8bit components
1156 // 256i - 0 1 2 3 4 5 6 7
1157 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1158
1159 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1160 }
1161 break;
1162 case 16:
1163 {
1164 Value* vGatherResult[2];
1165
1166 // TODO: vGatherMaskedVal
1167 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1168
1169 // always have at least one component out of x or y to fetch
1170
1171 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1172 // e.g. result of first 8x32bit integer gather for 16bit components
1173 // 256i - 0 1 2 3 4 5 6 7
1174 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1175 //
1176
1177 // if we have at least one component out of z or w to fetch
1178 if(info.numComps > 2)
1179 {
1180 // offset base to the next components(zw) in the vertex to gather
1181 pSrcBase = GEP(pSrcBase, C((char)4));
1182
1183 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1184 // e.g. result of second 8x32bit integer gather for 16bit components
1185 // 256i - 0 1 2 3 4 5 6 7
1186 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1187 //
1188 }
1189 else
1190 {
1191 vGatherResult[1] = vGatherMaskedVal;
1192 }
1193
1194 // Shuffle gathered components into place, each row is a component
1195 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1196
1197 }
1198 break;
1199 case 32:
1200 {
1201 // apply defaults
1202 for (uint32_t i = 0; i < 4; ++i)
1203 {
1204 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1205 }
1206
1207 for(uint32_t i = 0; i < info.numComps; i++)
1208 {
1209 uint32_t swizzleIndex = info.swizzle[i];
1210
1211 // Gather a SIMD of components
1212 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1213
1214 // offset base to the next component to gather
1215 pSrcBase = GEP(pSrcBase, C((char)4));
1216 }
1217 }
1218 break;
1219 default:
1220 SWR_INVALID("unsupported format");
1221 break;
1222 }
1223 }
1224
1225 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1226 {
1227 // cast types
1228 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1229 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1230
1231 // input could either be float or int vector; do shuffle work in int
1232 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1233 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1234
1235 if(bPackedOutput)
1236 {
1237 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1238
1239 // shuffle mask
1240 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1241 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1242 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1243 // after pshufb: group components together in each 128bit lane
1244 // 256i - 0 1 2 3 4 5 6 7
1245 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1246
1247 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1248 // after PERMD: move and pack xy components into each 128bit lane
1249 // 256i - 0 1 2 3 4 5 6 7
1250 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1251
1252 // do the same for zw components
1253 Value* vi128ZW = nullptr;
1254 if(info.numComps > 2)
1255 {
1256 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1257 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1258 }
1259
1260 for(uint32_t i = 0; i < 4; i++)
1261 {
1262 uint32_t swizzleIndex = info.swizzle[i];
1263 // todo: fix for packed
1264 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1265 if(i >= info.numComps)
1266 {
1267 // set the default component val
1268 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1269 continue;
1270 }
1271
1272 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1273 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1274 // if x or y, use vi128XY permute result, else use vi128ZW
1275 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1276
1277 // extract packed component 128 bit lanes
1278 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1279 }
1280
1281 }
1282 else
1283 {
1284 // pshufb masks for each component
1285 Value* vConstMask[2];
1286 // x/z shuffle mask
1287 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1288 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1289
1290 // y/w shuffle mask
1291 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1292 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1293
1294
1295 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1296 // apply defaults
1297 for (uint32_t i = 0; i < 4; ++i)
1298 {
1299 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1300 }
1301
1302 for(uint32_t i = 0; i < info.numComps; i++)
1303 {
1304 uint32_t swizzleIndex = info.swizzle[i];
1305
1306 // select correct constMask for x/z or y/w pshufb
1307 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1308 // if x or y, use vi128XY permute result, else use vi128ZW
1309 uint32_t selectedGather = (i < 2) ? 0 : 1;
1310
1311 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1312 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1313 // 256i - 0 1 2 3 4 5 6 7
1314 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1315 }
1316 }
1317 }
1318
1319 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1320 {
1321 // cast types
1322 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1323 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1324
1325 if(bPackedOutput)
1326 {
1327 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1328 // shuffle mask
1329 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1330 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1331 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1332 // after pshufb: group components together in each 128bit lane
1333 // 256i - 0 1 2 3 4 5 6 7
1334 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1335
1336 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1337 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1338 // 256i - 0 1 2 3 4 5 6 7
1339 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1340
1341 // do the same for zw components
1342 Value* vi128ZW = nullptr;
1343 if(info.numComps > 2)
1344 {
1345 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1346 }
1347
1348 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1349 for(uint32_t i = 0; i < 4; i++)
1350 {
1351 uint32_t swizzleIndex = info.swizzle[i];
1352 // todo: fix for packed
1353 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1354 if(i >= info.numComps)
1355 {
1356 // set the default component val
1357 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1358 continue;
1359 }
1360
1361 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1362 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1363 // if x or y, use vi128XY permute result, else use vi128ZW
1364 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1365
1366 // sign extend
1367 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1368 }
1369 }
1370 // else zero extend
1371 else{
1372 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1373 // apply defaults
1374 for (uint32_t i = 0; i < 4; ++i)
1375 {
1376 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1377 }
1378
1379 for(uint32_t i = 0; i < info.numComps; i++){
1380 uint32_t swizzleIndex = info.swizzle[i];
1381
1382 // pshufb masks for each component
1383 Value* vConstMask;
1384 switch(i)
1385 {
1386 case 0:
1387 // x shuffle mask
1388 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1389 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1390 break;
1391 case 1:
1392 // y shuffle mask
1393 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1394 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1395 break;
1396 case 2:
1397 // z shuffle mask
1398 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1399 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1400 break;
1401 case 3:
1402 // w shuffle mask
1403 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1404 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1405 break;
1406 default:
1407 vConstMask = nullptr;
1408 break;
1409 }
1410
1411 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1412 // after pshufb for x channel
1413 // 256i - 0 1 2 3 4 5 6 7
1414 // x000 x000 x000 x000 x000 x000 x000 x000
1415 }
1416 }
1417 }
1418
1419 // Helper function to create alloca in entry block of function
1420 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1421 {
1422 auto saveIP = IRB()->saveIP();
1423 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1424 pFunc->getEntryBlock().begin());
1425 Value* pAlloca = ALLOCA(pType);
1426 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1427 return pAlloca;
1428 }
1429
1430 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
1431 {
1432 auto saveIP = IRB()->saveIP();
1433 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1434 pFunc->getEntryBlock().begin());
1435 Value* pAlloca = ALLOCA(pType, pArraySize);
1436 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1437 return pAlloca;
1438 }
1439
1440 //////////////////////////////////////////////////////////////////////////
1441 /// @brief emulates a scatter operation.
1442 /// @param pDst - pointer to destination
1443 /// @param vSrc - vector of src data to scatter
1444 /// @param vOffsets - vector of byte offsets from pDst
1445 /// @param vMask - mask of valid lanes
1446 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1447 {
1448 /* Scatter algorithm
1449 
1450 while (Index = BitScanForward(mask))
1451 srcElem = srcVector[Index]
1452 offsetElem = offsetVector[Index]
1453 *(pDst + offsetElem) = srcElem
1454 mask &= ~(1 << Index)
1455 
1456 */
1457
1458 BasicBlock* pCurBB = IRB()->GetInsertBlock();
1459 Function* pFunc = pCurBB->getParent();
1460 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1461
1462 // Store vectors on stack
1463 if (pScatterStackSrc == nullptr)
1464 {
1465 // Save off stack allocations and reuse per scatter. Significantly reduces stack
1466 // requirements for shaders with a lot of scatters.
1467 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1468 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1469 }
1470
1471 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1472 Value* pOffsetsArrayPtr = pScatterStackOffsets;
1473 STORE(vSrc, pSrcArrayPtr);
1474 STORE(vOffsets, pOffsetsArrayPtr);
1475
1476 // Cast to pointers for random access
1477 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1478 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1479
1480 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1481
1482 // Get cttz function
1483 Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1484
1485 // Setup loop basic block
1486 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
1487
1488 // compute first set bit
1489 Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1490
1491 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1492
1493 // Split current block
1494 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1495
1496 // Remove unconditional jump created by splitBasicBlock
1497 pCurBB->getTerminator()->eraseFromParent();
1498
1499 // Add terminator to end of original block
1500 IRB()->SetInsertPoint(pCurBB);
1501
1502 // Add conditional branch
1503 COND_BR(pIsUndef, pPostLoop, pLoop);
1504
1505 // Add loop basic block contents
1506 IRB()->SetInsertPoint(pLoop);
1507 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1508 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1509
1510 pIndexPhi->addIncoming(pIndex, pCurBB);
1511 pMaskPhi->addIncoming(pMask, pCurBB);
1512
1513 // Extract elements for this index
1514 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1515 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1516
1517 // GEP to this offset in dst
1518 Value* pCurDst = GEP(pDst, pOffsetElem);
1519 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1520 STORE(pSrcElem, pCurDst);
1521
1522 // Update the mask
1523 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1524
1525 // Terminator
1526 Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1527
1528 pIsUndef = ICMP_EQ(pNewIndex, C(32));
1529 COND_BR(pIsUndef, pPostLoop, pLoop);
1530
1531 // Update phi edges
1532 pIndexPhi->addIncoming(pNewIndex, pLoop);
1533 pMaskPhi->addIncoming(pNewMask, pLoop);
1534
1535 // Move builder to beginning of post loop
1536 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1537 }
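// Example walk of the scatter loop above for a lane mask of 0b101: cttz(0b101)
// is 0, so lane 0 is stored to pDst plus the lane 0 byte offset and the mask
// becomes 0b100; cttz(0b100) is 2, so lane 2 is stored and the mask becomes 0;
// cttz(0) with the is_zero_undef flag false returns 32, which satisfies
// ICMP_EQ(pIndex, C(32)) and control branches to pPostLoop.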
1538
1539 Value* Builder::VABSPS(Value* a)
1540 {
1541 Value* asInt = BITCAST(a, mSimdInt32Ty);
1542 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1543 return result;
1544 }
1545
1546 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1547 {
1548 Value *lowCmp = ICMP_SLT(src, low);
1549 Value *ret = SELECT(lowCmp, low, src);
1550
1551 Value *highCmp = ICMP_SGT(ret, high);
1552 ret = SELECT(highCmp, high, ret);
1553
1554 return ret;
1555 }
1556
1557 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1558 {
1559 Value *lowCmp = FCMP_OLT(src, low);
1560 Value *ret = SELECT(lowCmp, low, src);
1561
1562 Value *highCmp = FCMP_OGT(ret, high);
1563 ret = SELECT(highCmp, high, ret);
1564
1565 return ret;
1566 }
1567
1568 Value *Builder::FCLAMP(Value* src, float low, float high)
1569 {
1570 Value* result = VMAXPS(src, VIMMED1(low));
1571 result = VMINPS(result, VIMMED1(high));
1572
1573 return result;
1574 }
1575
1576 //////////////////////////////////////////////////////////////////////////
1577 /// @brief save/restore stack, providing ability to push/pop the stack and
1578 /// reduce overall stack requirements for temporary stack use
1579 Value* Builder::STACKSAVE()
1580 {
1581 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1582 return CALLA(pfnStackSave);
1583 }
1584
1585 void Builder::STACKRESTORE(Value* pSaved)
1586 {
1587 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1588 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1589 }
1590
1591 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1592 {
1593 Value* vOut;
1594 // use FMADs if available
1595 if(JM()->mArch.AVX2())
1596 {
1597 vOut = VFMADDPS(a, b, c);
1598 }
1599 else
1600 {
1601 vOut = FADD(FMUL(a, b), c);
1602 }
1603 return vOut;
1604 }
1605
1606 Value* Builder::POPCNT(Value* a)
1607 {
1608 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1609 return CALL(pCtPop, std::initializer_list<Value*>{a});
1610 }
1611
1612 //////////////////////////////////////////////////////////////////////////
1613 /// @brief C functions called by LLVM IR
1614 //////////////////////////////////////////////////////////////////////////
1615
1616 //////////////////////////////////////////////////////////////////////////
1617 /// @brief called in JIT code, inserted by PRINT
1618 /// output to both stdout and visual studio debug console
1619 void __cdecl CallPrint(const char* fmt, ...)
1620 {
1621 va_list args;
1622 va_start(args, fmt);
1623 vprintf(fmt, args);
1624
1625 #if defined( _WIN32 )
1626 char strBuf[1024];
1627 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1628 OutputDebugStringA(strBuf);
1629 #endif
1630
1631 va_end(args);
1632 }
1633
1634 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1635 {
1636 bool flag = !imm8->isZeroValue();
1637 SmallVector<Constant*,8> idx;
1638 for (unsigned i = 0; i < mVWidth / 2; i++) {
1639 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1640 }
1641 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1642 }
1643
1644 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1645 {
1646 bool flag = !imm8->isZeroValue();
1647 SmallVector<Constant*,8> idx;
1648 for (unsigned i = 0; i < mVWidth; i++) {
1649 idx.push_back(C(i));
1650 }
1651 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1652
1653 SmallVector<Constant*,8> idx2;
1654 for (unsigned i = 0; i < mVWidth / 2; i++) {
1655 idx2.push_back(C(flag ? i : i + mVWidth));
1656 }
1657 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1658 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1659 }
1660 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1661 }
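// Example of the shuffle-based 128-bit lane helpers above for an 8-wide SIMD:
// VEXTRACTI128(a, C(1)) returns lanes 4..7 of a, and VINSERTI128(a, b, C(0))
// returns lanes 0..3 of b followed by lanes 4..7 of a, mirroring the immediate
// semantics of the AVX2 vextracti128/vinserti128 instructions.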
1662
1663 // rdtsc buckets macros
1664 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1665 {
1666 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1667 // buckets framework when single threaded
1668 if (KNOB_SINGLE_THREADED)
1669 {
1670 std::vector<Type*> args{
1671 PointerType::get(mInt32Ty, 0), // pBucketMgr
1672 mInt32Ty // id
1673 };
1674
1675 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1676 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1677 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1678 {
1679 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1680 }
1681
1682 CALL(pFunc, { pBucketMgr, pId });
1683 }
1684 }
1685
1686 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1687 {
1688 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1689 // buckets framework when single threaded
1690 if (KNOB_SINGLE_THREADED)
1691 {
1692 std::vector<Type*> args{
1693 PointerType::get(mInt32Ty, 0), // pBucketMgr
1694 mInt32Ty // id
1695 };
1696
1697 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1698 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1699 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1700 {
1701 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1702 }
1703
1704 CALL(pFunc, { pBucketMgr, pId });
1705 }
1706 }
1707
1708
1709 uint32_t Builder::GetTypeSize(Type* pType)
1710 {
1711 if (pType->isStructTy())
1712 {
1713 uint32_t numElems = pType->getStructNumElements();
1714 Type* pElemTy = pType->getStructElementType(0);
1715 return numElems * GetTypeSize(pElemTy);
1716 }
1717
1718 if (pType->isArrayTy())
1719 {
1720 uint32_t numElems = pType->getArrayNumElements();
1721 Type* pElemTy = pType->getArrayElementType();
1722 return numElems * GetTypeSize(pElemTy);
1723 }
1724
1725 if (pType->isIntegerTy())
1726 {
1727 uint32_t bitSize = pType->getIntegerBitWidth();
1728 return bitSize / 8;
1729 }
1730
1731 if (pType->isFloatTy())
1732 {
1733 return 4;
1734 }
1735
1736 if (pType->isHalfTy())
1737 {
1738 return 2;
1739 }
1740
1741 if (pType->isDoubleTy())
1742 {
1743 return 8;
1744 }
1745
1746 SWR_ASSERT(false, "Unimplemented type.");
1747 return 0;
1748 }
1749 }