1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "common/rdtsc_buckets.h"
33
34 #include <cstdarg>
35
36 namespace SwrJit
37 {
38 void __cdecl CallPrint(const char* fmt, ...);
39
40 //////////////////////////////////////////////////////////////////////////
41 /// @brief Convert an IEEE 754 32-bit single precision float to a
42 /// 16-bit float with 5 exponent bits and 10 mantissa bits
43 /// (IEEE 754 binary16).
44 /// @param val - 32-bit float
45 /// @todo Maybe move this outside of this file into a header?
46 static uint16_t ConvertFloat32ToFloat16(float val)
47 {
48 uint32_t sign, exp, mant;
49 uint32_t roundBits;
50
51 // Extract the sign, exponent, and mantissa
52 uint32_t uf = *(uint32_t*)&val;
53 sign = (uf & 0x80000000) >> 31;
54 exp = (uf & 0x7F800000) >> 23;
55 mant = uf & 0x007FFFFF;
56
57 // Check for out of range
58 if (std::isnan(val))
59 {
60 exp = 0x1F;
61 mant = 0x200;
62 sign = 1; // set the sign bit for NANs
63 }
64 else if (std::isinf(val))
65 {
66 exp = 0x1f;
67 mant = 0x0;
68 }
69 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
70 {
71 exp = 0x1E;
72 mant = 0x3FF;
73 }
74 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
75 {
76 mant |= 0x00800000;
77 for (; exp <= 0x70; mant >>= 1, exp++)
78 ;
79 exp = 0;
80 mant = mant >> 13;
81 }
82 else if (exp < 0x66) // Too small to represent -> Zero
83 {
84 exp = 0;
85 mant = 0;
86 }
87 else
88 {
89 // Saves bits that will be shifted off for rounding
90 roundBits = mant & 0x1FFFu;
91 // convert exponent and mantissa to 16 bit format
92 exp = exp - 0x70;
93 mant = mant >> 13;
94
95 // Essentially RTZ, but round up if off by only 1 lsb
96 if (roundBits == 0x1FFFu)
97 {
98 mant++;
99 // check for overflow
100 if ((mant & 0xC00u) != 0)
101 exp++;
102 // make sure only the needed bits are used
103 mant &= 0x3FF;
104 }
105 }
106
107 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
108 return (uint16_t)tmpVal;
109 }
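// Worked example (illustrative only): 1.0f is 0x3F800000, i.e. sign=0,
// exp=127 (0x7F), mant=0. The in-range path computes exp - 0x70 = 0xF and
// mant >> 13 = 0, so the result is (0xF << 10) = 0x3C00, the binary16
// encoding of 1.0.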
110
111 //////////////////////////////////////////////////////////////////////////
112 /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
113 /// float
114 /// @param val - 16-bit float
115 /// @todo Maybe move this outside of this file into a header?
116 static float ConvertFloat16ToFloat32(uint32_t val)
117 {
118 uint32_t result;
119 if ((val & 0x7fff) == 0)
120 {
121 result = ((uint32_t)(val & 0x8000)) << 16;
122 }
123 else if ((val & 0x7c00) == 0x7c00)
124 {
125 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
126 result |= ((uint32_t)val & 0x8000) << 16;
127 }
128 else
129 {
130 uint32_t sign = (val & 0x8000) << 16;
131 uint32_t mant = (val & 0x3ff) << 13;
132 uint32_t exp = (val >> 10) & 0x1f;
133 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
134 {
135 mant <<= 1;
136 while (mant < (0x400 << 13))
137 {
138 exp--;
139 mant <<= 1;
140 }
141 mant &= (0x3ff << 13);
142 }
143 exp = ((exp - 15 + 127) & 0xff) << 23;
144 result = sign | exp | mant;
145 }
146
147 return *(float*)&result;
148 }
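// Worked example (illustrative only): 0x3C00 (1.0 in binary16) has sign=0,
// exp=15, mant=0; the exponent is rebiased to (15 - 15 + 127) = 127, giving
// result bits 0x3F800000, i.e. 1.0f.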
149
150 Constant *Builder::C(bool i)
151 {
152 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
153 }
154
155 Constant *Builder::C(char i)
156 {
157 return ConstantInt::get(IRB()->getInt8Ty(), i);
158 }
159
160 Constant *Builder::C(uint8_t i)
161 {
162 return ConstantInt::get(IRB()->getInt8Ty(), i);
163 }
164
165 Constant *Builder::C(int i)
166 {
167 return ConstantInt::get(IRB()->getInt32Ty(), i);
168 }
169
170 Constant *Builder::C(int64_t i)
171 {
172 return ConstantInt::get(IRB()->getInt64Ty(), i);
173 }
174
175 Constant *Builder::C(uint16_t i)
176 {
177 return ConstantInt::get(mInt16Ty, i);
178 }
179
180 Constant *Builder::C(uint32_t i)
181 {
182 return ConstantInt::get(IRB()->getInt32Ty(), i);
183 }
184
185 Constant *Builder::C(float i)
186 {
187 return ConstantFP::get(IRB()->getFloatTy(), i);
188 }
189
190 Constant *Builder::PRED(bool pred)
191 {
192 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
193 }
194
195 Value *Builder::VIMMED1(int i)
196 {
197 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
198 }
199
200 Value *Builder::VIMMED1_16(int i)
201 {
202 return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
203 }
204
205 Value *Builder::VIMMED1(uint32_t i)
206 {
207 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
208 }
209
210 Value *Builder::VIMMED1_16(uint32_t i)
211 {
212 return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
213 }
214
215 Value *Builder::VIMMED1(float i)
216 {
217 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
218 }
219
220 Value *Builder::VIMMED1_16(float i)
221 {
222 return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i)));
223 }
224
225 Value *Builder::VIMMED1(bool i)
226 {
227 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
228 }
229
230 Value *Builder::VIMMED1_16(bool i)
231 {
232 return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
233 }
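// Usage note: C(x) wraps a scalar immediate as an LLVM Constant of the
// matching type, while VIMMED1/VIMMED1_16 splat that constant across a
// SIMD-width vector, e.g. VIMMED1(1.0f) is a <mVWidth x float> of all 1.0.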
234
235 Value *Builder::VUNDEF_IPTR()
236 {
237 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
238 }
239
240 Value *Builder::VUNDEF(Type* t)
241 {
242 return UndefValue::get(VectorType::get(t, mVWidth));
243 }
244
245 Value *Builder::VUNDEF_I()
246 {
247 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
248 }
249
250 Value *Builder::VUNDEF_I_16()
251 {
252 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16));
253 }
254
255 Value *Builder::VUNDEF_F()
256 {
257 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
258 }
259
260 Value *Builder::VUNDEF_F_16()
261 {
262 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16));
263 }
264
265 Value *Builder::VUNDEF(Type *ty, uint32_t size)
266 {
267 return UndefValue::get(VectorType::get(ty, size));
268 }
269
270 Value *Builder::VBROADCAST(Value *src, const llvm::Twine& name)
271 {
272 // check if src is already a vector
273 if (src->getType()->isVectorTy())
274 {
275 return src;
276 }
277
278 return VECTOR_SPLAT(mVWidth, src, name);
279 }
280
281 Value *Builder::VBROADCAST_16(Value *src)
282 {
283 // check if src is already a vector
284 if (src->getType()->isVectorTy())
285 {
286 return src;
287 }
288
289 return VECTOR_SPLAT(mVWidth16, src);
290 }
291
292 uint32_t Builder::IMMED(Value* v)
293 {
294 SWR_ASSERT(isa<ConstantInt>(v));
295 ConstantInt *pValConst = cast<ConstantInt>(v);
296 return pValConst->getZExtValue();
297 }
298
299 int32_t Builder::S_IMMED(Value* v)
300 {
301 SWR_ASSERT(isa<ConstantInt>(v));
302 ConstantInt *pValConst = cast<ConstantInt>(v);
303 return pValConst->getSExtValue();
304 }
305
306 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
307 {
308 std::vector<Value*> indices;
309 for (auto i : indexList)
310 indices.push_back(i);
311 return GEPA(ptr, indices);
312 }
313
314 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
315 {
316 std::vector<Value*> indices;
317 for (auto i : indexList)
318 indices.push_back(C(i));
319 return GEPA(ptr, indices);
320 }
321
322 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
323 {
324 std::vector<Value*> indices;
325 for (auto i : indexList)
326 indices.push_back(i);
327 return IN_BOUNDS_GEP(ptr, indices);
328 }
329
330 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
331 {
332 std::vector<Value*> indices;
333 for (auto i : indexList)
334 indices.push_back(C(i));
335 return IN_BOUNDS_GEP(ptr, indices);
336 }
337
338 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
339 {
340 std::vector<Value*> valIndices;
341 for (auto i : indices)
342 valIndices.push_back(C(i));
343 return LOAD(GEPA(basePtr, valIndices), name);
344 }
345
346 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
347 {
348 std::vector<Value*> valIndices;
349 for (auto i : indices)
350 valIndices.push_back(i);
351 return LOAD(GEPA(basePtr, valIndices), name);
352 }
353
354 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
355 {
356 std::vector<Value*> valIndices;
357 for (auto i : indices)
358 valIndices.push_back(C(i));
359 return STORE(val, GEPA(basePtr, valIndices));
360 }
361
362 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
363 {
364 std::vector<Value*> valIndices;
365 for (auto i : indices)
366 valIndices.push_back(i);
367 return STORE(val, GEPA(basePtr, valIndices));
368 }
369
370 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList, const llvm::Twine& name)
371 {
372 std::vector<Value*> args;
373 for (auto arg : argsList)
374 args.push_back(arg);
375 return CALLA(Callee, args, name);
376 }
377
378 CallInst *Builder::CALL(Value *Callee, Value* arg)
379 {
380 std::vector<Value*> args;
381 args.push_back(arg);
382 return CALLA(Callee, args);
383 }
384
385 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
386 {
387 std::vector<Value*> args;
388 args.push_back(arg1);
389 args.push_back(arg2);
390 return CALLA(Callee, args);
391 }
392
393 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
394 {
395 std::vector<Value*> args;
396 args.push_back(arg1);
397 args.push_back(arg2);
398 args.push_back(arg3);
399 return CALLA(Callee, args);
400 }
401
402 //////////////////////////////////////////////////////////////////////////
403 Value *Builder::DEBUGTRAP()
404 {
405 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
406 return CALL(func);
407 }
408
409 Value *Builder::VRCP(Value *va, const llvm::Twine& name)
410 {
411 return FDIV(VIMMED1(1.0f), va, name); // 1 / a
412 }
413
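//////////////////////////////////////////////////////////////////////////
/// @brief Evaluate the plane equation a*x + b*y + c using two fused
///        multiply-adds: FMADDPS(vB, vY, FMADDPS(vA, vX, vC)).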
414 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
415 {
416 Value* vOut = FMADDPS(vA, vX, vC);
417 vOut = FMADDPS(vB, vY, vOut);
418 return vOut;
419 }
420
421 //////////////////////////////////////////////////////////////////////////
422 /// @brief Generate an i32 masked load operation in LLVM IR. If not
423 /// supported on the underlying platform, emulate it with float masked load
424 /// @param src - base address pointer for the load
425 /// @param vMask - SIMD wide mask that controls whether to access memory or load 0
426 Value *Builder::MASKLOADD(Value* src,Value* mask)
427 {
428 Value* vResult;
429 // use avx2 maskload instruction if available
430 if(JM()->mArch.AVX2())
431 {
432 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
433 vResult = CALL(func,{src,mask});
434 }
435 else
436 {
437 // maskload intrinsic expects integer mask operand in llvm >= 3.8
438 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
439 mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
440 #else
441 mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
442 #endif
443 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
444 vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
445 }
446 return vResult;
447 }
448
449 //////////////////////////////////////////////////////////////////////////
450 /// @brief insert a JIT call to CallPrint
451 /// - outputs formatted string to both stdout and VS output window
452 /// - DEBUG builds only
453 /// Usage example:
454 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
455 /// where C(lane) creates a constant value to print, and pIndex is the Value*
456 /// result from a GEP, printing out the pointer to memory
457 /// @param printStr - constant string to print, which includes format specifiers
458 /// @param printArgs - initializer list of Value*'s to print to stdout
459 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
460 {
461 // push the arguments to CallPrint into a vector
462 std::vector<Value*> printCallArgs;
463 // save room for the format string. we still need to modify it for vectors
464 printCallArgs.resize(1);
465
466 // search through the format string for special processing
467 size_t pos = 0;
468 std::string tempStr(printStr);
469 pos = tempStr.find('%', pos);
470 auto v = printArgs.begin();
471
472 while ((pos != std::string::npos) && (v != printArgs.end()))
473 {
474 Value* pArg = *v;
475 Type* pType = pArg->getType();
476
477 if (pType->isVectorTy())
478 {
479 Type* pContainedType = pType->getContainedType(0);
480
481 if (toupper(tempStr[pos + 1]) == 'X')
482 {
483 tempStr[pos] = '0';
484 tempStr[pos + 1] = 'x';
485 tempStr.insert(pos + 2, "%08X ");
486 pos += 7;
487
488 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
489
490 std::string vectorFormatStr;
491 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
492 {
493 vectorFormatStr += "0x%08X ";
494 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
495 }
496
497 tempStr.insert(pos, vectorFormatStr);
498 pos += vectorFormatStr.size();
499 }
500 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
501 {
502 uint32_t i = 0;
503 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
504 {
505 tempStr.insert(pos, std::string("%f "));
506 pos += 3;
507 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
508 }
509 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
510 }
511 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
512 {
513 uint32_t i = 0;
514 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
515 {
516 tempStr.insert(pos, std::string("%d "));
517 pos += 3;
518 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
519 }
520 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
521 }
522 }
523 else
524 {
525 if (toupper(tempStr[pos + 1]) == 'X')
526 {
527 tempStr[pos] = '0';
528 tempStr.insert(pos + 1, "x%08");
529 printCallArgs.push_back(pArg);
530 pos += 3;
531 }
532 // for %f we need to cast float Values to doubles so that they print out correctly
533 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
534 {
535 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
536 pos++;
537 }
538 else
539 {
540 printCallArgs.push_back(pArg);
541 }
542 }
543
544 // advance to the next argument
545 v++;
546 pos = tempStr.find('%', ++pos);
547 }
548
549 // create global variable constant string
550 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
551 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
552 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
553
554 // get a pointer to the first character in the constant string array
555 std::vector<Constant*> geplist{C(0),C(0)};
556 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
557
558 // insert the pointer to the format string in the argument vector
559 printCallArgs[0] = strGEP;
560
561 // get pointer to CallPrint function and insert decl into the module if needed
562 std::vector<Type*> args;
563 args.push_back(PointerType::get(mInt8Ty,0));
564 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
565 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
566
567 // if we haven't yet added the symbol to the symbol table
568 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
569 {
570 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
571 }
572
573 // insert a call to CallPrint
574 return CALLA(callPrintFn,printCallArgs);
575 }
576
577 //////////////////////////////////////////////////////////////////////////
578 /// @brief Wrapper around PRINT with initializer list.
579 CallInst* Builder::PRINT(const std::string &printStr)
580 {
581 return PRINT(printStr, {});
582 }
583
584 //////////////////////////////////////////////////////////////////////////
585 /// @brief Generate a masked gather operation in LLVM IR. If not
586 /// supported on the underlying platform, emulate it with loads
587 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
588 /// @param pBase - Int8* base VB address pointer value
589 /// @param vIndices - SIMD wide value of VB byte offsets
590 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
591 /// @param scale - value to scale indices by
592 Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
593 {
594 Value *vGather;
595
596 // use avx2 gather instruction if available
597 if(JM()->mArch.AVX2())
598 {
599 // force mask to <N x float>, required by vgather
600 Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
601
602 vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
603 }
604 else
605 {
606 Value* pStack = STACKSAVE();
607
608 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
609 Value* vSrcPtr = ALLOCA(vSrc->getType());
610 STORE(vSrc, vSrcPtr);
611
612 vGather = VUNDEF_F();
613 Value *vScaleVec = VIMMED1((uint32_t)scale);
614 Value *vOffsets = MUL(vIndices,vScaleVec);
615 for(uint32_t i = 0; i < mVWidth; ++i)
616 {
617 // single component byte index
618 Value *offset = VEXTRACT(vOffsets,C(i));
619 // byte pointer to component
620 Value *loadAddress = GEP(pBase,offset);
621 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
622 // pointer to the value to load if we're masking off a component
623 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
624 Value *selMask = VEXTRACT(vMask,C(i));
625 // switch in a safe address to load from if this lane is masked off
626 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
627 Value *val = LOAD(validAddress);
628 vGather = VINSERT(vGather,val,C(i));
629 }
630
631 STACKRESTORE(pStack);
632 }
633
634 return vGather;
635 }
636
637 Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
638 {
639 Value *vGather = VUNDEF_F_16();
640
641 // use AVX512F gather instruction if available
642 if (JM()->mArch.AVX512F())
643 {
644 // force mask to <N-bit Integer>, required by vgather2
645 Value *mask = BITCAST(vMask, mInt16Ty);
646
647 vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
648 }
649 else
650 {
651 Value *src0 = EXTRACT_16(vSrc, 0);
652 Value *src1 = EXTRACT_16(vSrc, 1);
653
654 Value *indices0 = EXTRACT_16(vIndices, 0);
655 Value *indices1 = EXTRACT_16(vIndices, 1);
656
657 Value *mask0 = EXTRACT_16(vMask, 0);
658 Value *mask1 = EXTRACT_16(vMask, 1);
659
660 Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
661 Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
662
663 vGather = JOIN_16(gather0, gather1);
664 }
665
666 return vGather;
667 }
668
669 //////////////////////////////////////////////////////////////////////////
670 /// @brief Generate a masked gather operation in LLVM IR. If not
671 /// supported on the underlying platform, emulate it with loads
672 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
673 /// @param pBase - Int8* base VB address pointer value
674 /// @param vIndices - SIMD wide value of VB byte offsets
675 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
676 /// @param scale - value to scale indices by
677 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
678 {
679 Value* vGather;
680
681 // use avx2 gather instruction if available
682 if(JM()->mArch.AVX2())
683 {
684 vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
685 }
686 else
687 {
688 Value* pStack = STACKSAVE();
689
690 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
691 Value* vSrcPtr = ALLOCA(vSrc->getType());
692 STORE(vSrc, vSrcPtr);
693
694 vGather = VUNDEF_I();
695 Value *vScaleVec = VIMMED1((uint32_t)scale);
696 Value *vOffsets = MUL(vIndices, vScaleVec);
697 for(uint32_t i = 0; i < mVWidth; ++i)
698 {
699 // single component byte index
700 Value *offset = VEXTRACT(vOffsets, C(i));
701 // byte pointer to component
702 Value *loadAddress = GEP(pBase, offset);
703 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
704 // pointer to the value to load if we're masking off a component
705 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
706 Value *selMask = VEXTRACT(vMask, C(i));
707 // switch in a safe address to load from if this lane is masked off
708 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
709 Value *val = LOAD(validAddress, C(0));
710 vGather = VINSERT(vGather, val, C(i));
711 }
712
713 STACKRESTORE(pStack);
714 }
715
716 return vGather;
717 }
718
719 Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
720 {
721 Value *vGather = VUNDEF_I_16();
722
723 // use AVX512F gather instruction if available
724 if (JM()->mArch.AVX512F())
725 {
726 // force mask to <N-bit Integer>, required by vgather2
727 Value *mask = BITCAST(vMask, mInt16Ty);
728
729 vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
730 }
731 else
732 {
733 Value *src0 = EXTRACT_16(vSrc, 0);
734 Value *src1 = EXTRACT_16(vSrc, 1);
735
736 Value *indices0 = EXTRACT_16(vIndices, 0);
737 Value *indices1 = EXTRACT_16(vIndices, 1);
738
739 Value *mask0 = EXTRACT_16(vMask, 0);
740 Value *mask1 = EXTRACT_16(vMask, 1);
741
742 Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
743 Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
744
745 vGather = JOIN_16(gather0, gather1);
746 }
747
748 return vGather;
749 }
750
751 //////////////////////////////////////////////////////////////////////////
752 /// @brief Generate a masked gather operation in LLVM IR. If not
753 /// supported on the underlying platform, emulate it with loads
754 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
755 /// @param pBase - Int8* base VB address pointer value
756 /// @param vIndices - SIMD wide value of VB byte offsets
757 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
758 /// @param scale - value to scale indices by
759 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
760 {
761 Value* vGather;
762
763 // use avx2 gather instruction if available
764 if(JM()->mArch.AVX2())
765 {
766 vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2));
767 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
768 }
769 else
770 {
771 Value* pStack = STACKSAVE();
772
773 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
774 Value* vSrcPtr = ALLOCA(vSrc->getType());
775 STORE(vSrc, vSrcPtr);
776
777 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
778 Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
779 Value *vOffsets = MUL(vIndices,vScaleVec);
780 for(uint32_t i = 0; i < mVWidth/2; ++i)
781 {
782 // single component byte index
783 Value *offset = VEXTRACT(vOffsets,C(i));
784 // byte pointer to component
785 Value *loadAddress = GEP(pBase,offset);
786 loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
787 // pointer to the value to load if we're masking off a component
788 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
789 Value *selMask = VEXTRACT(vMask,C(i));
790 // switch in a safe address to load from if this lane is masked off
791 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
792 Value *val = LOAD(validAddress);
793 vGather = VINSERT(vGather,val,C(i));
794 }
795 STACKRESTORE(pStack);
796 }
797 return vGather;
798 }
799
800 Value *Builder::EXTRACT_16(Value *x, uint32_t imm)
801 {
802 if (imm == 0)
803 {
804 return VSHUFFLE(x, UndefValue::get(x->getType()), { 0, 1, 2, 3, 4, 5, 6, 7 });
805 }
806 else
807 {
808 return VSHUFFLE(x, UndefValue::get(x->getType()), { 8, 9, 10, 11, 12, 13, 14, 15 });
809 }
810 }
811
812 Value *Builder::JOIN_16(Value *a, Value *b)
813 {
814 return VSHUFFLE(a, b, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
815 }
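// Note: EXTRACT_16(x, 0/1) selects the low/high 8 lanes of a 16-wide vector,
// and JOIN_16 is its inverse, so JOIN_16(EXTRACT_16(x, 0), EXTRACT_16(x, 1))
// reproduces x.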
816
817 //////////////////////////////////////////////////////////////////////////
818 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
819 Value *Builder::MASK(Value *vmask)
820 {
821 Value *src = BITCAST(vmask, mSimdInt32Ty);
822 return ICMP_SLT(src, VIMMED1(0));
823 }
824
825 Value *Builder::MASK_16(Value *vmask)
826 {
827 Value *src = BITCAST(vmask, mSimd16Int32Ty);
828 return ICMP_SLT(src, VIMMED1_16(0));
829 }
830
831 //////////////////////////////////////////////////////////////////////////
832 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
833 Value *Builder::VMASK(Value *mask)
834 {
835 return S_EXT(mask, mSimdInt32Ty);
836 }
837
838 Value *Builder::VMASK_16(Value *mask)
839 {
840 return S_EXT(mask, mSimd16Int32Ty);
841 }
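// Note: MASK and VMASK are inverses for x86-style lane masks: MASK tests the
// per-lane sign bit to produce an <N x i1> mask, and VMASK sign-extends an
// <N x i1> mask back to all-ones/all-zeros i32 lanes.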
842
843 //////////////////////////////////////////////////////////////////////////
844 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
845 /// supported on the underlying platform, emulate it
846 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
847 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
848 /// Byte masks in the lower 128-bit lane of b select 8-bit values from the lower
849 /// 128 bits of a, and vice versa for the upper lanes. If the mask
850 /// value is negative, '0' is inserted.
851 Value *Builder::PSHUFB(Value* a, Value* b)
852 {
853 Value* res;
854 // use avx2 pshufb instruction if available
855 if(JM()->mArch.AVX2())
856 {
857 res = VPSHUFB(a, b);
858 }
859 else
860 {
861 Constant* cB = dyn_cast<Constant>(b);
862 // number of 8 bit elements in b
863 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
864 // output vector
865 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
866
867 // insert an 8 bit value from the high and low lanes of a per loop iteration
868 numElms /= 2;
869 for(uint32_t i = 0; i < numElms; i++)
870 {
871 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
872 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
873
874 // extract values from constant mask
875 char valLow128bLane = (char)(cLow128b->getSExtValue());
876 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
877
878 Value* insertValLow128b;
879 Value* insertValHigh128b;
880
881 // if the mask value is negative, insert a '0' in the respective output position
882 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
883 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
884 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
885
886 vShuf = VINSERT(vShuf, insertValLow128b, i);
887 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
888 }
889 res = vShuf;
890 }
891 return res;
892 }
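// Example (emulation path, illustrative only): if byte 0 of the lower lane of
// b is 0x05, output byte 0 is a[5]; if it is 0x80 (negative), output byte 0
// is 0, matching vpshufb semantics.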
893
894 //////////////////////////////////////////////////////////////////////////
895 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8-bit values to 32
896 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
897 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
898 /// lower 8 values are used.
899 Value *Builder::PMOVSXBD(Value* a)
900 {
901 // VPMOVSXBD output type
902 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
903 // Extract 8 values from 128bit lane and sign extend
904 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
905 }
906
907 //////////////////////////////////////////////////////////////////////////
908 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16-bit values to 32
909 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
910 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
911 Value *Builder::PMOVSXWD(Value* a)
912 {
913 // VPMOVSXWD output type
914 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
915 // Extract 8 values from 128bit lane and sign extend
916 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
917 }
918
919 //////////////////////////////////////////////////////////////////////////
920 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
921 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
922 /// platform, emulate it
923 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
924 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
925 Value *Builder::PERMD(Value* a, Value* idx)
926 {
927 Value* res;
928 // use avx2 permute instruction if available
929 if(JM()->mArch.AVX2())
930 {
931 res = VPERMD(a, idx);
932 }
933 else
934 {
935 if (isa<Constant>(idx))
936 {
937 res = VSHUFFLE(a, a, idx);
938 }
939 else
940 {
941 res = VUNDEF_I();
942 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
943 {
944 Value* pIndex = VEXTRACT(idx, C(l));
945 Value* pVal = VEXTRACT(a, pIndex);
946 res = VINSERT(res, pVal, C(l));
947 }
948 }
949 }
950 return res;
951 }
952
953 //////////////////////////////////////////////////////////////////////////
954 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
955 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
956 /// platform, emulate it
957 /// @param a - 256bit SIMD lane(8x32bit) of float values.
958 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
959 Value *Builder::PERMPS(Value* a, Value* idx)
960 {
961 Value* res;
962 // use avx2 permute instruction if available
963 if (JM()->mArch.AVX2())
964 {
965 // llvm 3.6.0 swapped the order of the args to vpermps
966 res = VPERMPS(idx, a);
967 }
968 else
969 {
970 if (isa<Constant>(idx))
971 {
972 res = VSHUFFLE(a, a, idx);
973 }
974 else
975 {
976 res = VUNDEF_F();
977 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
978 {
979 Value* pIndex = VEXTRACT(idx, C(l));
980 Value* pVal = VEXTRACT(a, pIndex);
981 res = VINSERT(res, pVal, C(l));
982 }
983 }
984 }
985
986 return res;
987 }
988
989 //////////////////////////////////////////////////////////////////////////
990 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
991 /// in LLVM IR. If not supported on the underlying platform, emulate it
992 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
993 Value *Builder::CVTPH2PS(Value* a, const llvm::Twine& name)
994 {
995 if (JM()->mArch.F16C())
996 {
997 return VCVTPH2PS(a, name);
998 }
999 else
1000 {
1001 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
1002 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));
1003
1004 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
1005 {
1006 sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32);
1007 }
1008
1009 Value* pResult = UndefValue::get(mSimdFP32Ty);
1010 for (uint32_t i = 0; i < mVWidth; ++i)
1011 {
1012 Value* pSrc = VEXTRACT(a, C(i));
1013 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
1014 pResult = VINSERT(pResult, pConv, C(i));
1015 }
1016
1017 pResult->setName(name);
1018 return pResult;
1019 }
1020 }
1021
1022 //////////////////////////////////////////////////////////////////////////
1023 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
1024 /// in LLVM IR. If not supported on the underlying platform, emulate it
1025 /// @param a - 256bit SIMD lane(8x32bit) of float32 values.
1026 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
1027 {
1028 if (JM()->mArch.F16C())
1029 {
1030 return VCVTPS2PH(a, rounding);
1031 }
1032 else
1033 {
1034 // call scalar C function for now
1035 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
1036 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
1037
1038 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
1039 {
1040 sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16);
1041 }
1042
1043 Value* pResult = UndefValue::get(mSimdInt16Ty);
1044 for (uint32_t i = 0; i < mVWidth; ++i)
1045 {
1046 Value* pSrc = VEXTRACT(a, C(i));
1047 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
1048 pResult = VINSERT(pResult, pConv, C(i));
1049 }
1050
1051 return pResult;
1052 }
1053 }
1054
1055 Value *Builder::PMAXSD(Value* a, Value* b)
1056 {
1057 Value* cmp = ICMP_SGT(a, b);
1058 return SELECT(cmp, a, b);
1059 }
1060
1061 Value *Builder::PMINSD(Value* a, Value* b)
1062 {
1063 Value* cmp = ICMP_SLT(a, b);
1064 return SELECT(cmp, a, b);
1065 }
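// Note: PMAXSD/PMINSD implement the signed max/min semantics of vpmaxsd/vpminsd
// with a compare + select, so they lower correctly on any target.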
1066
1067 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
1068 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1069 {
1070 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
1071 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
1072 {
1073 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1074 }
1075 else
1076 {
1077 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1078 }
1079 }
1080
1081 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1082 Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
1083 {
1084 switch(info.bpp / info.numComps)
1085 {
1086 case 16:
1087 {
1088 Value* vGatherResult[2];
1089
1090 // TODO: vGatherMaskedVal
1091 Value* vGatherMaskedVal = VIMMED1((float)0);
1092
1093 // always have at least one component out of x or y to fetch
1094
1095 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1096 // e.g. result of first 8x32bit integer gather for 16bit components
1097 // 256i - 0 1 2 3 4 5 6 7
1098 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1099 //
1100
1101 // if we have at least one component out of z or w to fetch
1102 if(info.numComps > 2)
1103 {
1104 // offset base to the next components(zw) in the vertex to gather
1105 pSrcBase = GEP(pSrcBase, C((char)4));
1106
1107 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1108 // e.g. result of second 8x32bit integer gather for 16bit components
1109 // 256i - 0 1 2 3 4 5 6 7
1110 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1111 //
1112 }
1113 else
1114 {
1115 vGatherResult[1] = vGatherMaskedVal;
1116 }
1117
1118 // Shuffle gathered components into place, each row is a component
1119 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1120 }
1121 break;
1122 case 32:
1123 {
1124 // apply defaults
1125 for (uint32_t i = 0; i < 4; ++i)
1126 {
1127 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1128 }
1129
1130 for(uint32_t i = 0; i < info.numComps; i++)
1131 {
1132 uint32_t swizzleIndex = info.swizzle[i];
1133
1134 // Gather a SIMD of components
1135 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1136
1137 // offset base to the next component to gather
1138 pSrcBase = GEP(pSrcBase, C((char)4));
1139 }
1140 }
1141 break;
1142 default:
1143 SWR_INVALID("Invalid float format");
1144 break;
1145 }
1146 }
1147
1148 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1149 Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
1150 {
1151 switch (info.bpp / info.numComps)
1152 {
1153 case 8:
1154 {
1155 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1156 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1157 // e.g. result of an 8x32bit integer gather for 8bit components
1158 // 256i - 0 1 2 3 4 5 6 7
1159 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1160
1161 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1162 }
1163 break;
1164 case 16:
1165 {
1166 Value* vGatherResult[2];
1167
1168 // TODO: vGatherMaskedVal
1169 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1170
1171 // always have at least one component out of x or y to fetch
1172
1173 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1174 // e.g. result of first 8x32bit integer gather for 16bit components
1175 // 256i - 0 1 2 3 4 5 6 7
1176 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1177 //
1178
1179 // if we have at least one component out of z or w to fetch
1180 if(info.numComps > 2)
1181 {
1182 // offset base to the next components(zw) in the vertex to gather
1183 pSrcBase = GEP(pSrcBase, C((char)4));
1184
1185 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1186 // e.g. result of second 8x32bit integer gather for 16bit components
1187 // 256i - 0 1 2 3 4 5 6 7
1188 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1189 //
1190 }
1191 else
1192 {
1193 vGatherResult[1] = vGatherMaskedVal;
1194 }
1195
1196 // Shuffle gathered components into place, each row is a component
1197 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1198
1199 }
1200 break;
1201 case 32:
1202 {
1203 // apply defaults
1204 for (uint32_t i = 0; i < 4; ++i)
1205 {
1206 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1207 }
1208
1209 for(uint32_t i = 0; i < info.numComps; i++)
1210 {
1211 uint32_t swizzleIndex = info.swizzle[i];
1212
1213 // Gather a SIMD of components
1214 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1215
1216 // offset base to the next component to gather
1217 pSrcBase = GEP(pSrcBase, C((char)4));
1218 }
1219 }
1220 break;
1221 default:
1222 SWR_INVALID("unsupported format");
1223 break;
1224 }
1225 }
1226
1227 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1228 {
1229 // cast types
1230 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1231 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1232
1233 // input could either be float or int vector; do shuffle work in int
1234 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1235 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1236
1237 if(bPackedOutput)
1238 {
1239 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1240
1241 // shuffle mask
1242 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1243 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1244 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1245 // after pshufb: group components together in each 128bit lane
1246 // 256i - 0 1 2 3 4 5 6 7
1247 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1248
1249 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1250 // after PERMD: move and pack xy components into each 128bit lane
1251 // 256i - 0 1 2 3 4 5 6 7
1252 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1253
1254 // do the same for zw components
1255 Value* vi128ZW = nullptr;
1256 if(info.numComps > 2)
1257 {
1258 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1259 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1260 }
1261
1262 for(uint32_t i = 0; i < 4; i++)
1263 {
1264 uint32_t swizzleIndex = info.swizzle[i];
1265 // todo: fix for packed
1266 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1267 if(i >= info.numComps)
1268 {
1269 // set the default component val
1270 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1271 continue;
1272 }
1273
1274 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1275 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1276 // if x or y, use vi128XY permute result, else use vi128ZW
1277 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1278
1279 // extract packed component 128 bit lanes
1280 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1281 }
1282
1283 }
1284 else
1285 {
1286 // pshufb masks for each component
1287 Value* vConstMask[2];
1288 // x/z shuffle mask
1289 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1290 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1291
1292 // y/w shuffle mask
1293 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1294 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1295
1296
1297 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1298 // apply defaults
1299 for (uint32_t i = 0; i < 4; ++i)
1300 {
1301 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1302 }
1303
1304 for(uint32_t i = 0; i < info.numComps; i++)
1305 {
1306 uint32_t swizzleIndex = info.swizzle[i];
1307
1308 // select correct constMask for x/z or y/w pshufb
1309 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1310 // if x or y, use vi128XY permute result, else use vi128ZW
1311 uint32_t selectedGather = (i < 2) ? 0 : 1;
1312
1313 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1314 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1315 // 256i - 0 1 2 3 4 5 6 7
1316 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1317 }
1318 }
1319 }
1320
1321 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1322 {
1323 // cast types
1324 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1325 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1326
1327 if(bPackedOutput)
1328 {
1329 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1330 // shuffle mask
1331 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1332 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1333 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1334 // after pshufb: group components together in each 128bit lane
1335 // 256i - 0 1 2 3 4 5 6 7
1336 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1337
1338 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1339 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1340 // 256i - 0 1 2 3 4 5 6 7
1341 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1342
1343 // do the same for zw components
1344 Value* vi128ZW = nullptr;
1345 if(info.numComps > 2)
1346 {
1347 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1348 }
1349
1350 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1351 for(uint32_t i = 0; i < 4; i++)
1352 {
1353 uint32_t swizzleIndex = info.swizzle[i];
1354 // todo: fix for packed
1355 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1356 if(i >= info.numComps)
1357 {
1358 // set the default component val
1359 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1360 continue;
1361 }
1362
1363 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1364 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1365 // if x or y, use vi128XY permute result, else use vi128ZW
1366 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1367
1368 // sign extend
1369 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1370 }
1371 }
1372 // else zero extend
1373 else{
1374 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1375 // apply defaults
1376 for (uint32_t i = 0; i < 4; ++i)
1377 {
1378 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1379 }
1380
1381 for(uint32_t i = 0; i < info.numComps; i++){
1382 uint32_t swizzleIndex = info.swizzle[i];
1383
1384 // pshufb masks for each component
1385 Value* vConstMask;
1386 switch(i)
1387 {
1388 case 0:
1389 // x shuffle mask
1390 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1391 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1392 break;
1393 case 1:
1394 // y shuffle mask
1395 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1396 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1397 break;
1398 case 2:
1399 // z shuffle mask
1400 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1401 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1402 break;
1403 case 3:
1404 // w shuffle mask
1405 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1406 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1407 break;
1408 default:
1409 vConstMask = nullptr;
1410 break;
1411 }
1412
1413 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1414 // after pshufb for x channel
1415 // 256i - 0 1 2 3 4 5 6 7
1416 // x000 x000 x000 x000 x000 x000 x000 x000
1417 }
1418 }
1419 }
1420
1421 // Helper function to create alloca in entry block of function
1422 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1423 {
1424 auto saveIP = IRB()->saveIP();
1425 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1426 pFunc->getEntryBlock().begin());
1427 Value* pAlloca = ALLOCA(pType);
1428 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1429 return pAlloca;
1430 }
1431
1432 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
1433 {
1434 auto saveIP = IRB()->saveIP();
1435 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1436 pFunc->getEntryBlock().begin());
1437 Value* pAlloca = ALLOCA(pType, pArraySize);
1438 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1439 return pAlloca;
1440 }
1441
1442 //////////////////////////////////////////////////////////////////////////
1443 /// @brief emulates a scatter operation.
1444 /// @param pDst - pointer to destination
1445 /// @param vSrc - vector of src data to scatter
1446 /// @param vOffsets - vector of byte offsets from pDst
1447 /// @param vMask - mask of valid lanes
1448 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1449 {
1450 /* Scatter algorithm
1451
1452 while(Index = BitScanForward(mask))
1453 srcElem = srcVector[Index]
1454 offsetElem = offsetVector[Index]
1455 *(pDst + offsetElem) = srcElem
1456 Update mask (mask &= ~(1 << Index))
1457
1458 */
1459
1460 BasicBlock* pCurBB = IRB()->GetInsertBlock();
1461 Function* pFunc = pCurBB->getParent();
1462 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1463
1464 // Store vectors on stack
1465 if (pScatterStackSrc == nullptr)
1466 {
1467 // Save off stack allocations and reuse per scatter. Significantly reduces stack
1468 // requirements for shaders with a lot of scatters.
1469 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1470 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1471 }
1472
1473 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1474 Value* pOffsetsArrayPtr = pScatterStackOffsets;
1475 STORE(vSrc, pSrcArrayPtr);
1476 STORE(vOffsets, pOffsetsArrayPtr);
1477
1478 // Cast to pointers for random access
1479 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1480 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1481
1482 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1483
1484 // Get cttz function
1485 Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1486
1487 // Setup loop basic block
1488 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
1489
1490 // compute first set bit
1491 Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1492
1493 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1494
1495 // Split current block
1496 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1497
1498 // Remove unconditional jump created by splitBasicBlock
1499 pCurBB->getTerminator()->eraseFromParent();
1500
1501 // Add terminator to end of original block
1502 IRB()->SetInsertPoint(pCurBB);
1503
1504 // Add conditional branch
1505 COND_BR(pIsUndef, pPostLoop, pLoop);
1506
1507 // Add loop basic block contents
1508 IRB()->SetInsertPoint(pLoop);
1509 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1510 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1511
1512 pIndexPhi->addIncoming(pIndex, pCurBB);
1513 pMaskPhi->addIncoming(pMask, pCurBB);
1514
1515 // Extract elements for this index
1516 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1517 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1518
1519 // GEP to this offset in dst
1520 Value* pCurDst = GEP(pDst, pOffsetElem);
1521 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1522 STORE(pSrcElem, pCurDst);
1523
1524 // Update the mask
1525 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1526
1527 // Terminator
1528 Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1529
1530 pIsUndef = ICMP_EQ(pNewIndex, C(32));
1531 COND_BR(pIsUndef, pPostLoop, pLoop);
1532
1533 // Update phi edges
1534 pIndexPhi->addIncoming(pNewIndex, pLoop);
1535 pMaskPhi->addIncoming(pNewMask, pLoop);
1536
1537 // Move builder to beginning of post loop
1538 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1539 }
1540
1541 Value* Builder::VABSPS(Value* a)
1542 {
1543 Value* asInt = BITCAST(a, mSimdInt32Ty);
1544 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1545 return result;
1546 }
1547
1548 Value *Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name)
1549 {
1550 Value *lowCmp = ICMP_SLT(src, low);
1551 Value *ret = SELECT(lowCmp, low, src);
1552
1553 Value *highCmp = ICMP_SGT(ret, high);
1554 ret = SELECT(highCmp, high, ret, name);
1555
1556 return ret;
1557 }
1558
1559 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1560 {
1561 Value *lowCmp = FCMP_OLT(src, low);
1562 Value *ret = SELECT(lowCmp, low, src);
1563
1564 Value *highCmp = FCMP_OGT(ret, high);
1565 ret = SELECT(highCmp, high, ret);
1566
1567 return ret;
1568 }
1569
1570 Value *Builder::FCLAMP(Value* src, float low, float high)
1571 {
1572 Value* result = VMAXPS(src, VIMMED1(low));
1573 result = VMINPS(result, VIMMED1(high));
1574
1575 return result;
1576 }
1577
1578 //////////////////////////////////////////////////////////////////////////
1579 /// @brief save/restore stack, providing ability to push/pop the stack and
1580 /// reduce overall stack requirements for temporary stack use
1581 Value* Builder::STACKSAVE()
1582 {
1583 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1584 return CALLA(pfnStackSave);
1585 }
1586
1587 void Builder::STACKRESTORE(Value* pSaved)
1588 {
1589 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1590 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1591 }
1592
1593 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1594 {
1595 Value* vOut;
1596 // use FMADs if available
1597 if(JM()->mArch.AVX2())
1598 {
1599 vOut = VFMADDPS(a, b, c);
1600 }
1601 else
1602 {
1603 vOut = FADD(FMUL(a, b), c);
1604 }
1605 return vOut;
1606 }
1607
1608 Value* Builder::POPCNT(Value* a)
1609 {
1610 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1611 return CALL(pCtPop, std::initializer_list<Value*>{a});
1612 }
1613
1614 //////////////////////////////////////////////////////////////////////////
1615 /// @brief C functions called by LLVM IR
1616 //////////////////////////////////////////////////////////////////////////
1617
1618 //////////////////////////////////////////////////////////////////////////
1619 /// @brief called in JIT code, inserted by PRINT
1620 /// output to both stdout and visual studio debug console
1621 void __cdecl CallPrint(const char* fmt, ...)
1622 {
1623 va_list args;
1624 va_start(args, fmt);
1625 vprintf(fmt, args);
1626
1627 #if defined( _WIN32 )
1628 char strBuf[1024];
1629 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1630 OutputDebugStringA(strBuf);
1631 #endif
1632
1633 va_end(args);
1634 }
1635
1636 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1637 {
1638 bool flag = !imm8->isZeroValue();
1639 SmallVector<Constant*,8> idx;
1640 for (unsigned i = 0; i < mVWidth / 2; i++) {
1641 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1642 }
1643 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1644 }
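// Example (illustrative only): with mVWidth == 8, VEXTRACTI128(a, C(1))
// shuffles out lanes 4..7 of a, i.e. the upper 128 bits of a 256-bit vector.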
1645
1646 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1647 {
1648 bool flag = !imm8->isZeroValue();
1649 SmallVector<Constant*,8> idx;
1650 for (unsigned i = 0; i < mVWidth; i++) {
1651 idx.push_back(C(i));
1652 }
1653 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1654
1655 SmallVector<Constant*,8> idx2;
1656 for (unsigned i = 0; i < mVWidth / 2; i++) {
1657 idx2.push_back(C(flag ? i : i + mVWidth));
1658 }
1659 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1660 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1661 }
1662 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1663 }
1664
1665 // rdtsc buckets macros
1666 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1667 {
1668 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1669 // buckets framework when single threaded
1670 if (KNOB_SINGLE_THREADED)
1671 {
1672 std::vector<Type*> args{
1673 PointerType::get(mInt32Ty, 0), // pBucketMgr
1674 mInt32Ty // id
1675 };
1676
1677 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1678 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1679 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1680 {
1681 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1682 }
1683
1684 CALL(pFunc, { pBucketMgr, pId });
1685 }
1686 }
1687
1688 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1689 {
1690 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1691 // buckets framework when single threaded
1692 if (KNOB_SINGLE_THREADED)
1693 {
1694 std::vector<Type*> args{
1695 PointerType::get(mInt32Ty, 0), // pBucketMgr
1696 mInt32Ty // id
1697 };
1698
1699 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1700 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1701 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1702 {
1703 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1704 }
1705
1706 CALL(pFunc, { pBucketMgr, pId });
1707 }
1708 }
1709
1710
1711 uint32_t Builder::GetTypeSize(Type* pType)
1712 {
1713 if (pType->isStructTy())
1714 {
1715 uint32_t numElems = pType->getStructNumElements();
1716 Type* pElemTy = pType->getStructElementType(0);
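// note: assumes a homogeneous struct with no padding; the size of the
// first element is multiplied by the element count.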
1717 return numElems * GetTypeSize(pElemTy);
1718 }
1719
1720 if (pType->isArrayTy())
1721 {
1722 uint32_t numElems = pType->getArrayNumElements();
1723 Type* pElemTy = pType->getArrayElementType();
1724 return numElems * GetTypeSize(pElemTy);
1725 }
1726
1727 if (pType->isIntegerTy())
1728 {
1729 uint32_t bitSize = pType->getIntegerBitWidth();
1730 return bitSize / 8;
1731 }
1732
1733 if (pType->isFloatTy())
1734 {
1735 return 4;
1736 }
1737
1738 if (pType->isHalfTy())
1739 {
1740 return 2;
1741 }
1742
1743 if (pType->isDoubleTy())
1744 {
1745 return 8;
1746 }
1747
1748 SWR_ASSERT(false, "Unimplemented type.");
1749 return 0;
1750 }
1751 }