src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file builder_misc.cpp
  24 *
  25 * @brief Implementation for miscellaneous builder functions
  26 *
  27 * Notes:
  28 *
  29 ******************************************************************************/
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>
#include <cstring>
  34
  35 namespace SwrJit
  36 {
  37     void __cdecl CallPrint(const char* fmt, ...);
  38
  39     //////////////////////////////////////////////////////////////////////////
  40     /// @brief Convert an IEEE 754 32-bit single precision float to an
  41     ///        16 bit float with 5 exponent bits and a variable
  42     ///        number of mantissa bits.
  43     /// @param val - 32-bit float
  44     /// @todo Maybe move this outside of this file into a header?
  45     static uint16_t ConvertFloat32ToFloat16(float val)
  46     {
  47         uint32_t sign, exp, mant;
  48         uint32_t roundBits;
  49
  50         // Extract the sign, exponent, and mantissa
  51         uint32_t uf = *(uint32_t*)&val;
  52         sign = (uf & 0x80000000) >> 31;
  53         exp = (uf & 0x7F800000) >> 23;
  54         mant = uf & 0x007FFFFF;
  55
  56         // Check for out of range
  57         if (std::isnan(val))
  58         {
  59             exp = 0x1F;
  60             mant = 0x200;
  61             sign = 1;                     // set the sign bit for NANs
  62         }
  63         else if (std::isinf(val))
  64         {
  65             exp = 0x1f;
  66             mant = 0x0;
  67         }
  68         else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
  69         {
  70             exp = 0x1E;
  71             mant = 0x3FF;
  72         }
  73         else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
  74         {
  75             mant |= 0x00800000;
  76             for (; exp <= 0x70; mant >>= 1, exp++)
  77                 ;
  78             exp = 0;
  79             mant = mant >> 13;
  80         }
  81         else if (exp < 0x66) // Too small to represent -> Zero
  82         {
  83             exp = 0;
  84             mant = 0;
  85         }
  86         else
  87         {
  88             // Saves bits that will be shifted off for rounding
  89             roundBits = mant & 0x1FFFu;
  90             // convert exponent and mantissa to 16 bit format
  91             exp = exp - 0x70;
  92             mant = mant >> 13;
  93
  94             // Essentially RTZ, but round up if off by only 1 lsb
  95             if (roundBits == 0x1FFFu)
  96             {
  97                 mant++;
  98                 // check for overflow
  99                 if ((mant & 0xC00u) != 0)
 100                     exp++;
 101                 // make sure only the needed bits are used
 102                 mant &= 0x3FF;
 103             }
 104         }
 105
 106         uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
 107         return (uint16_t)tmpVal;
 108     }
 109
 110     //////////////////////////////////////////////////////////////////////////
 111     /// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision
 112     ///        float
 113     /// @param val - 16-bit float
 114     /// @todo Maybe move this outside of this file into a header?
 115     static float ConvertFloat16ToFloat32(uint32_t val)
 116     {
 117         uint32_t result;
 118         if ((val & 0x7fff) == 0)
 119         {
 120             result = ((uint32_t)(val & 0x8000)) << 16;
 121         }
 122         else if ((val & 0x7c00) == 0x7c00)
 123         {
 124             result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
 125             result |= ((uint32_t)val & 0x8000) << 16;
 126         }
 127         else
 128         {
 129             uint32_t sign = (val & 0x8000) << 16;
 130             uint32_t mant = (val & 0x3ff) << 13;
 131             uint32_t exp = (val >> 10) & 0x1f;
 132             if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
 133             {
 134                 mant <<= 1;
 135                 while (mant < (0x400 << 13))
 136                 {
 137                     exp--;
 138                     mant <<= 1;
 139                 }
 140                 mant &= (0x3ff << 13);
 141             }
 142             exp = ((exp - 15 + 127) & 0xff) << 23;
 143             result = sign | exp | mant;
 144         }
 145
 146         return *(float*)&result;
 147     }
 148
 149     Constant *Builder::C(bool i)
 150     {
 151         return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
 152     }
 153
 154     Constant *Builder::C(char i)
 155     {
 156         return ConstantInt::get(IRB()->getInt8Ty(), i);
 157     }
 158
 159     Constant *Builder::C(uint8_t i)
 160     {
 161         return ConstantInt::get(IRB()->getInt8Ty(), i);
 162     }
 163
 164     Constant *Builder::C(int i)
 165     {
 166         return ConstantInt::get(IRB()->getInt32Ty(), i);
 167     }
 168
 169     Constant *Builder::C(int64_t i)
 170     {
 171         return ConstantInt::get(IRB()->getInt64Ty(), i);
 172     }
 173
 174     Constant *Builder::C(uint16_t i)
 175     {
 176         return ConstantInt::get(mInt16Ty,i);
 177     }
 178
 179     Constant *Builder::C(uint32_t i)
 180     {
 181         return ConstantInt::get(IRB()->getInt32Ty(), i);
 182     }
 183
 184     Constant *Builder::C(float i)
 185     {
 186         return ConstantFP::get(IRB()->getFloatTy(), i);
 187     }
 188
 189     Constant *Builder::PRED(bool pred)
 190     {
 191         return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
 192     }
 193
 194     Value *Builder::VIMMED1(int i)
 195     {
 196         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 197     }
 198
 199     Value *Builder::VIMMED1(uint32_t i)
 200     {
 201         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 202     }
 203
 204     Value *Builder::VIMMED1(float i)
 205     {
 206         return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
 207     }
 208
 209     Value *Builder::VIMMED1(bool i)
 210     {
 211         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 212     }
 213
 214 #if USE_SIMD16_BUILDER
 215     Value *Builder::VIMMED2_1(int i)
 216     {
 217         return ConstantVector::getSplat(mVWidth2, cast<ConstantInt>(C(i)));
 218     }
 219
 220     Value *Builder::VIMMED2_1(uint32_t i)
 221     {
 222         return ConstantVector::getSplat(mVWidth2, cast<ConstantInt>(C(i)));
 223     }
 224
 225     Value *Builder::VIMMED2_1(float i)
 226     {
 227         return ConstantVector::getSplat(mVWidth2, cast<ConstantFP>(C(i)));
 228     }
 229
 230     Value *Builder::VIMMED2_1(bool i)
 231     {
 232         return ConstantVector::getSplat(mVWidth2, cast<ConstantInt>(C(i)));
 233     }
 234
 235 #endif
 236     Value *Builder::VUNDEF_IPTR()
 237     {
 238         return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
 239     }
 240
 241     Value *Builder::VUNDEF_I()
 242     {
 243         return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
 244     }
 245
 246     Value *Builder::VUNDEF(Type *ty, uint32_t size)
 247     {
 248         return UndefValue::get(VectorType::get(ty, size));
 249     }
 250
 251     Value *Builder::VUNDEF_F()
 252     {
 253         return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
 254     }
 255
 256 #if USE_SIMD16_BUILDER
 257     Value *Builder::VUNDEF2_F()
 258     {
 259         return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2));
 260     }
 261
 262     Value *Builder::VUNDEF2_I()
 263     {
 264         return UndefValue::get(VectorType::get(mInt32Ty, mVWidth2));
 265     }
 266
 267 #endif
 268     Value *Builder::VUNDEF(Type* t)
 269     {
 270         return UndefValue::get(VectorType::get(t, mVWidth));
 271     }
 272
 273     Value *Builder::VBROADCAST(Value *src)
 274     {
 275         // check if src is already a vector
 276         if (src->getType()->isVectorTy())
 277         {
 278             return src;
 279         }
 280
 281         return VECTOR_SPLAT(mVWidth, src);
 282     }
 283
 284 #if USE_SIMD16_BUILDER
 285     Value *Builder::VBROADCAST2(Value *src)
 286     {
 287         // check if src is already a vector
 288         if (src->getType()->isVectorTy())
 289         {
 290             return src;
 291         }
 292
 293         return VECTOR_SPLAT(mVWidth2, src);
 294     }
 295
 296 #endif
 297     uint32_t Builder::IMMED(Value* v)
 298     {
 299         SWR_ASSERT(isa<ConstantInt>(v));
 300         ConstantInt *pValConst = cast<ConstantInt>(v);
 301         return pValConst->getZExtValue();
 302     }
 303
 304     int32_t Builder::S_IMMED(Value* v)
 305     {
 306         SWR_ASSERT(isa<ConstantInt>(v));
 307         ConstantInt *pValConst = cast<ConstantInt>(v);
 308         return pValConst->getSExtValue();
 309     }
 310
 311     Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
 312     {
 313         std::vector<Value*> indices;
 314         for (auto i : indexList)
 315             indices.push_back(i);
 316         return GEPA(ptr, indices);
 317     }
 318
 319     Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
 320     {
 321         std::vector<Value*> indices;
 322         for (auto i : indexList)
 323             indices.push_back(C(i));
 324         return GEPA(ptr, indices);
 325     }
 326
 327     Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
 328     {
 329         std::vector<Value*> indices;
 330         for (auto i : indexList)
 331             indices.push_back(i);
 332         return IN_BOUNDS_GEP(ptr, indices);
 333     }
 334
 335     Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
 336     {
 337         std::vector<Value*> indices;
 338         for (auto i : indexList)
 339             indices.push_back(C(i));
 340         return IN_BOUNDS_GEP(ptr, indices);
 341     }
 342
 343     LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
 344     {
 345         std::vector<Value*> valIndices;
 346         for (auto i : indices)
 347             valIndices.push_back(C(i));
 348         return LOAD(GEPA(basePtr, valIndices), name);
 349     }
 350
 351     LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
 352     {
 353         std::vector<Value*> valIndices;
 354         for (auto i : indices)
 355             valIndices.push_back(i);
 356         return LOAD(GEPA(basePtr, valIndices), name);
 357     }
 358
 359     StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
 360     {
 361         std::vector<Value*> valIndices;
 362         for (auto i : indices)
 363             valIndices.push_back(C(i));
 364         return STORE(val, GEPA(basePtr, valIndices));
 365     }
 366
 367     StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
 368     {
 369         std::vector<Value*> valIndices;
 370         for (auto i : indices)
 371             valIndices.push_back(i);
 372         return STORE(val, GEPA(basePtr, valIndices));
 373     }
 374
 375     CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
 376     {
 377         std::vector<Value*> args;
 378         for (auto arg : argsList)
 379             args.push_back(arg);
 380         return CALLA(Callee, args);
 381     }
 382
 383     CallInst *Builder::CALL(Value *Callee, Value* arg)
 384     {
 385         std::vector<Value*> args;
 386         args.push_back(arg);
 387         return CALLA(Callee, args);
 388     }
 389
 390     CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
 391     {
 392         std::vector<Value*> args;
 393         args.push_back(arg1);
 394         args.push_back(arg2);
 395         return CALLA(Callee, args);
 396     }
 397
 398     CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
 399     {
 400         std::vector<Value*> args;
 401         args.push_back(arg1);
 402         args.push_back(arg2);
 403         args.push_back(arg3);
 404         return CALLA(Callee, args);
 405     }
 406
 407     //////////////////////////////////////////////////////////////////////////
 408     Value *Builder::DEBUGTRAP()
 409     {
 410         Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
 411         return CALL(func);
 412     }
 413
 414     Value *Builder::VRCP(Value *va)
 415     {
 416         return FDIV(VIMMED1(1.0f), va);  // 1 / a
 417     }
 418
 419     Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
 420     {
 421         Value* vOut = FMADDPS(vA, vX, vC);
 422         vOut = FMADDPS(vB, vY, vOut);
 423         return vOut;
 424     }
 425
 426     //////////////////////////////////////////////////////////////////////////
 427     /// @brief Generate an i32 masked load operation in LLVM IR.  If not
 428     /// supported on the underlying platform, emulate it with float masked load
 429     /// @param src - base address pointer for the load
 430     /// @param vMask - SIMD wide mask that controls whether to access memory load 0
 431     Value *Builder::MASKLOADD(Value* src,Value* mask)
 432     {
 433         Value* vResult;
 434         // use avx2 gather instruction is available
 435         if(JM()->mArch.AVX2())
 436         {
 437             Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
 438             vResult = CALL(func,{src,mask});
 439         }
 440         else
 441         {
 442             // maskload intrinsic expects integer mask operand in llvm >= 3.8
 443     #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
 444             mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
 445     #else
 446             mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
 447     #endif
 448             Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
 449             vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
 450         }
 451         return vResult;
 452     }
 453
 454     //////////////////////////////////////////////////////////////////////////
 455     /// @brief insert a JIT call to CallPrint
 456     /// - outputs formatted string to both stdout and VS output window
 457     /// - DEBUG builds only
 458     /// Usage example:
 459     ///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
 460     ///   where C(lane) creates a constant value to print, and pIndex is the Value*
 461     ///   result from a GEP, printing out the pointer to memory
 462     /// @param printStr - constant string to print, which includes format specifiers
 463     /// @param printArgs - initializer list of Value*'s to print to std out
 464     CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
 465     {
 466         // push the arguments to CallPrint into a vector
 467         std::vector<Value*> printCallArgs;
 468         // save room for the format string.  we still need to modify it for vectors
 469         printCallArgs.resize(1);
 470
 471         // search through the format string for special processing
 472         size_t pos = 0;
 473         std::string tempStr(printStr);
 474         pos = tempStr.find('%', pos);
 475         auto v = printArgs.begin();
 476
 477         while ((pos != std::string::npos) && (v != printArgs.end()))
 478         {
 479             Value* pArg = *v;
 480             Type* pType = pArg->getType();
 481
 482             if (pType->isVectorTy())
 483             {
 484                 Type* pContainedType = pType->getContainedType(0);
 485
 486                 if (toupper(tempStr[pos + 1]) == 'X')
 487                 {
 488                     tempStr[pos] = '0';
 489                     tempStr[pos + 1] = 'x';
 490                     tempStr.insert(pos + 2, "%08X ");
 491                     pos += 7;
 492
 493                     printCallArgs.push_back(VEXTRACT(pArg, C(0)));
 494
 495                     std::string vectorFormatStr;
 496                     for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
 497                     {
 498                         vectorFormatStr += "0x%08X ";
 499                         printCallArgs.push_back(VEXTRACT(pArg, C(i)));
 500                     }
 501
 502                     tempStr.insert(pos, vectorFormatStr);
 503                     pos += vectorFormatStr.size();
 504                 }
 505                 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
 506                 {
 507                     uint32_t i = 0;
 508                     for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
 509                     {
 510                         tempStr.insert(pos, std::string("%f "));
 511                         pos += 3;
 512                         printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
 513                     }
 514                     printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
 515                 }
 516                 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
 517                 {
 518                     uint32_t i = 0;
 519                     for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
 520                     {
 521                         tempStr.insert(pos, std::string("%d "));
 522                         pos += 3;
 523                         printCallArgs.push_back(VEXTRACT(pArg, C(i)));
 524                     }
 525                     printCallArgs.push_back(VEXTRACT(pArg, C(i)));
 526                 }
 527             }
 528             else
 529             {
 530                 if (toupper(tempStr[pos + 1]) == 'X')
 531                 {
 532                     tempStr[pos] = '0';
 533                     tempStr.insert(pos + 1, "x%08");
 534                     printCallArgs.push_back(pArg);
 535                     pos += 3;
 536                 }
 537                 // for %f we need to cast float Values to doubles so that they print out correctly
 538                 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
 539                 {
 540                     printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
 541                     pos++;
 542                 }
 543                 else
 544                 {
 545                     printCallArgs.push_back(pArg);
 546                 }
 547             }
 548
 549             // advance to the next arguement
 550             v++;
 551             pos = tempStr.find('%', ++pos);
 552         }
 553
 554         // create global variable constant string
 555         Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
 556         GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
 557         JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
 558
 559         // get a pointer to the first character in the constant string array
 560         std::vector<Constant*> geplist{C(0),C(0)};
 561         Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
 562
 563         // insert the pointer to the format string in the argument vector
 564         printCallArgs[0] = strGEP;
 565
 566         // get pointer to CallPrint function and insert decl into the module if needed
 567         std::vector<Type*> args;
 568         args.push_back(PointerType::get(mInt8Ty,0));
 569         FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
 570         Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
 571
 572         // if we haven't yet added the symbol to the symbol table
 573         if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
 574         {
 575             sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
 576         }
 577
 578         // insert a call to CallPrint
 579         return CALLA(callPrintFn,printCallArgs);
 580     }
 581
 582     //////////////////////////////////////////////////////////////////////////
 583     /// @brief Wrapper around PRINT with initializer list.
 584     CallInst* Builder::PRINT(const std::string &printStr)
 585     {
 586         return PRINT(printStr, {});
 587     }
 588
 589     //////////////////////////////////////////////////////////////////////////
 590     /// @brief Generate a masked gather operation in LLVM IR.  If not
 591     /// supported on the underlying platform, emulate it with loads
 592     /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
 593     /// @param pBase - Int8* base VB address pointer value
 594     /// @param vIndices - SIMD wide value of VB byte offsets
 595     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
 596     /// @param scale - value to scale indices by
 597     Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
 598     {
 599         Value *vGather;
 600
 601         // use avx2 gather instruction if available
 602         if(JM()->mArch.AVX2())
 603         {
 604             // force mask to <N x float>, required by vgather
 605             Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
 606
 607             vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
 608         }
 609         else
 610         {
 611             Value* pStack = STACKSAVE();
 612
 613             // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
 614             Value* vSrcPtr = ALLOCA(vSrc->getType());
 615             STORE(vSrc, vSrcPtr);
 616
 617             vGather = VUNDEF_F();
 618             Value *vScaleVec = VIMMED1((uint32_t)scale);
 619             Value *vOffsets = MUL(vIndices,vScaleVec);
 620             for(uint32_t i = 0; i < mVWidth; ++i)
 621             {
 622                 // single component byte index
 623                 Value *offset = VEXTRACT(vOffsets,C(i));
 624                 // byte pointer to component
 625                 Value *loadAddress = GEP(pBase,offset);
 626                 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
 627                 // pointer to the value to load if we're masking off a component
 628                 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
 629                 Value *selMask = VEXTRACT(vMask,C(i));
 630                 // switch in a safe address to load if we're trying to access a vertex
 631                 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
 632                 Value *val = LOAD(validAddress);
 633                 vGather = VINSERT(vGather,val,C(i));
 634             }
 635             STACKRESTORE(pStack);
 636         }
 637
 638         return vGather;
 639     }
 640
 641 #if USE_SIMD16_BUILDER
 642     Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
 643     {
 644         Value *vGather = VUNDEF2_F();
 645
 646         // use avx512 gather instruction if available
 647         if (JM()->mArch.AVX512F())
 648         {
 649             // force mask to <N-bit Integer>, required by vgather2
 650             Value *mask = BITCAST(vMask, mInt16Ty);
 651
 652             vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
 653         }
 654         else
 655         {
 656             Value *src0 = EXTRACT2_F(vSrc, 0);
 657             Value *src1 = EXTRACT2_F(vSrc, 1);
 658
 659             Value *indices0 = EXTRACT2_I(vIndices, 0);
 660             Value *indices1 = EXTRACT2_I(vIndices, 1);
 661
 662             Value *vmask16 = VMASK2(vMask);
 663
 664             Value *mask0 = MASK(EXTRACT2_I(vmask16, 0));  // TODO: do this better..
 665             Value *mask1 = MASK(EXTRACT2_I(vmask16, 1));
 666
 667             Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
 668             Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
 669
 670             vGather = INSERT2_F(vGather, gather0, 0);
 671             vGather = INSERT2_F(vGather, gather1, 1);
 672         }
 673
 674         return vGather;
 675     }
 676
 677 #endif
 678     //////////////////////////////////////////////////////////////////////////
 679     /// @brief Generate a masked gather operation in LLVM IR.  If not
 680     /// supported on the underlying platform, emulate it with loads
 681     /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
 682     /// @param pBase - Int8* base VB address pointer value
 683     /// @param vIndices - SIMD wide value of VB byte offsets
 684     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
 685     /// @param scale - value to scale indices by
 686     Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
 687     {
 688         Value* vGather;
 689
 690         // use avx2 gather instruction if available
 691         if(JM()->mArch.AVX2())
 692         {
 693             vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
 694         }
 695         else
 696         {
 697             Value* pStack = STACKSAVE();
 698
 699             // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
 700             Value* vSrcPtr = ALLOCA(vSrc->getType());
 701             STORE(vSrc, vSrcPtr);
 702
 703             vGather = VUNDEF_I();
 704             Value *vScaleVec = VIMMED1((uint32_t)scale);
 705             Value *vOffsets = MUL(vIndices, vScaleVec);
 706             for(uint32_t i = 0; i < mVWidth; ++i)
 707             {
 708                 // single component byte index
 709                 Value *offset = VEXTRACT(vOffsets, C(i));
 710                 // byte pointer to component
 711                 Value *loadAddress = GEP(pBase, offset);
 712                 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
 713                 // pointer to the value to load if we're masking off a component
 714                 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
 715                 Value *selMask = VEXTRACT(vMask, C(i));
 716                 // switch in a safe address to load if we're trying to access a vertex
 717                 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
 718                 Value *val = LOAD(validAddress, C(0));
 719                 vGather = VINSERT(vGather, val, C(i));
 720             }
 721
 722             STACKRESTORE(pStack);
 723         }
 724         return vGather;
 725     }
 726
 727     //////////////////////////////////////////////////////////////////////////
 728     /// @brief Generate a masked gather operation in LLVM IR.  If not
 729     /// supported on the underlying platform, emulate it with loads
 730     /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
 731     /// @param pBase - Int8* base VB address pointer value
 732     /// @param vIndices - SIMD wide value of VB byte offsets
 733     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
 734     /// @param scale - value to scale indices by
 735     Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
 736     {
 737         Value* vGather;
 738
 739         // use avx2 gather instruction if available
 740         if(JM()->mArch.AVX2())
 741         {
 742             vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2));
 743             vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
 744         }
 745         else
 746         {
 747             Value* pStack = STACKSAVE();
 748
 749             // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
 750             Value* vSrcPtr = ALLOCA(vSrc->getType());
 751             STORE(vSrc, vSrcPtr);
 752
 753             vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
 754             Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
 755             Value *vOffsets = MUL(vIndices,vScaleVec);
 756             for(uint32_t i = 0; i < mVWidth/2; ++i)
 757             {
 758                 // single component byte index
 759                 Value *offset = VEXTRACT(vOffsets,C(i));
 760                 // byte pointer to component
 761                 Value *loadAddress = GEP(pBase,offset);
 762                 loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
 763                 // pointer to the value to load if we're masking off a component
 764                 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
 765                 Value *selMask = VEXTRACT(vMask,C(i));
 766                 // switch in a safe address to load if we're trying to access a vertex
 767                 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
 768                 Value *val = LOAD(validAddress);
 769                 vGather = VINSERT(vGather,val,C(i));
 770             }
 771             STACKRESTORE(pStack);
 772         }
 773         return vGather;
 774     }
 775
 776 #if USE_SIMD16_BUILDER
 777     Value *Builder::PSRLI(Value *a, Value *imm)
 778     {
 779         return VPSRLI(a, imm);
 780     }
 781
 782     Value *Builder::PSRLI_16(Value *a, Value *imm)
 783     {
 784         Value *result = VUNDEF2_I();
 785
 786         // use avx512 shift right instruction if available
 787         if (JM()->mArch.AVX512F())
 788         {
 789             result = VPSRLI_16(a, imm);
 790         }
 791         else
 792         {
 793             Value *a0 = EXTRACT2_I(a, 0);
 794             Value *a1 = EXTRACT2_I(a, 1);
 795
 796             Value *result0 = PSRLI(a0, imm);
 797             Value *result1 = PSRLI(a1, imm);
 798
 799             result = INSERT2_I(result, result0, 0);
 800             result = INSERT2_I(result, result1, 1);
 801         }
 802
 803         return result;
 804     }
 805
 806 #endif
 807 #if USE_SIMD16_BUILDER
 808     //////////////////////////////////////////////////////////////////////////
 809     /// @brief
 810     Value *Builder::EXTRACT2_F(Value *a2, uint32_t imm)
 811     {
 812         const uint32_t i0 = (imm > 0) ? mVWidth : 0;
 813
 814         Value *result = VUNDEF_F();
 815
 816         for (uint32_t i = 0; i < mVWidth; i += 1)
 817         {
 818 #if 1
 819             if (!a2->getType()->getScalarType()->isFloatTy())
 820             {
 821                 a2 = BITCAST(a2, mSimd2FP32Ty);
 822             }
 823
 824 #endif
 825             Value *temp = VEXTRACT(a2, C(i0 + i));
 826
 827             result = VINSERT(result, temp, C(i));
 828         }
 829
 830         return result;
 831     }
 832
 833     Value *Builder::EXTRACT2_I(Value *a2, uint32_t imm)
 834     {
 835         return BITCAST(EXTRACT2_F(a2, imm), mSimdInt32Ty);
 836     }
 837
 838     //////////////////////////////////////////////////////////////////////////
 839     /// @brief
 840     Value *Builder::INSERT2_F(Value *a2, Value *b, uint32_t imm)
 841     {
 842         const uint32_t i0 = (imm > 0) ? mVWidth : 0;
 843
 844         Value *result = BITCAST(a2, mSimd2FP32Ty);
 845
 846         for (uint32_t i = 0; i < mVWidth; i += 1)
 847         {
 848 #if 1
 849             if (!b->getType()->getScalarType()->isFloatTy())
 850             {
 851                 b = BITCAST(b, mSimdFP32Ty);
 852             }
 853
 854 #endif
 855             Value *temp = VEXTRACT(b, C(i));
 856
 857             result = VINSERT(result, temp, C(i0 + i));
 858         }
 859
 860         return result;
 861     }
 862
 863     Value *Builder::INSERT2_I(Value *a2, Value *b, uint32_t imm)
 864     {
 865         return BITCAST(INSERT2_F(a2, b, imm), mSimd2Int32Ty);
 866     }
 867
 868 #endif
 869     //////////////////////////////////////////////////////////////////////////
 870     /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
 871     Value *Builder::MASK(Value *vmask)
 872     {
 873         Value *src = BITCAST(vmask, mSimdInt32Ty);
 874         return ICMP_SLT(src, VIMMED1(0));
 875     }
 876
 877 #if USE_SIMD16_BUILDER
 878     Value *Builder::MASK2(Value *vmask)
 879     {
 880         Value *src = BITCAST(vmask, mSimd2Int32Ty);
 881         return ICMP_SLT(src, VIMMED2_1(0));
 882     }
 883
 884 #endif
 885     //////////////////////////////////////////////////////////////////////////
 886     /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
 887     Value *Builder::VMASK(Value *mask)
 888     {
 889         return S_EXT(mask, mSimdInt32Ty);
 890     }
 891
 892 #if USE_SIMD16_BUILDER
 893     Value *Builder::VMASK2(Value *mask)
 894     {
 895         return S_EXT(mask, mSimd2Int32Ty);
 896     }
 897
 898 #endif
 899     //////////////////////////////////////////////////////////////////////////
 900     /// @brief Generate a VPSHUFB operation in LLVM IR.  If not
 901     /// supported on the underlying platform, emulate it
 902     /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
 903     /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
 904     /// Byte masks in lower 128 lane of b selects 8 bit values from lower
 905     /// 128bits of a, and vice versa for the upper lanes.  If the mask
 906     /// value is negative, '0' is inserted.
 907     Value *Builder::PSHUFB(Value* a, Value* b)
 908     {
 909         Value* res;
 910         // use avx2 pshufb instruction if available
 911         if(JM()->mArch.AVX2())
 912         {
 913             res = VPSHUFB(a, b);
 914         }
 915         else
 916         {
 917             Constant* cB = dyn_cast<Constant>(b);
 918             // number of 8 bit elements in b
 919             uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
 920             // output vector
 921             Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
 922
 923             // insert an 8 bit value from the high and low lanes of a per loop iteration
 924             numElms /= 2;
 925             for(uint32_t i = 0; i < numElms; i++)
 926             {
 927                 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
 928                 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
 929
 930                 // extract values from constant mask
 931                 char valLow128bLane =  (char)(cLow128b->getSExtValue());
 932                 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
 933
 934                 Value* insertValLow128b;
 935                 Value* insertValHigh128b;
 936
 937                 // if the mask value is negative, insert a '0' in the respective output position
 938                 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
 939                 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
 940                 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
 941
 942                 vShuf = VINSERT(vShuf, insertValLow128b, i);
 943                 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
 944             }
 945             res = vShuf;
 946         }
 947         return res;
 948     }
 949
 950     //////////////////////////////////////////////////////////////////////////
 951     /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32
 952     /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
 953     /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
 954     /// lower 8 values are used.
 955     Value *Builder::PMOVSXBD(Value* a)
 956     {
 957         // VPMOVSXBD output type
 958         Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
 959         // Extract 8 values from 128bit lane and sign extend
 960         return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
 961     }
 962
 963     //////////////////////////////////////////////////////////////////////////
 964     /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32
 965     /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
 966     /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
 967     Value *Builder::PMOVSXWD(Value* a)
 968     {
 969         // VPMOVSXWD output type
 970         Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
 971         // Extract 8 values from 128bit lane and sign extend
 972         return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
 973     }
 974
 975     //////////////////////////////////////////////////////////////////////////
 976     /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
 977     /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
 978     /// platform, emulate it
 979     /// @param a - 256bit SIMD lane(8x32bit) of integer values.
 980     /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
 981     Value *Builder::PERMD(Value* a, Value* idx)
 982     {
 983         Value* res;
 984         // use avx2 permute instruction if available
 985         if(JM()->mArch.AVX2())
 986         {
 987             res = VPERMD(a, idx);
 988         }
 989         else
 990         {
 991             if (isa<Constant>(idx))
 992             {
 993                 res = VSHUFFLE(a, a, idx);
 994             }
 995             else
 996             {
 997                 res = VUNDEF_I();
 998                 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
 999                 {
1000                     Value* pIndex = VEXTRACT(idx, C(l));
1001                     Value* pVal = VEXTRACT(a, pIndex);
1002                     res = VINSERT(res, pVal, C(l));
1003                 }
1004             }
1005         }
1006         return res;
1007     }
1008
1009     //////////////////////////////////////////////////////////////////////////
1010     /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
1011     /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
1012     /// platform, emulate it
1013     /// @param a - 256bit SIMD lane(8x32bit) of float values.
1014     /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
1015     Value *Builder::PERMPS(Value* a, Value* idx)
1016     {
1017         Value* res;
1018         // use avx2 permute instruction if available
1019         if (JM()->mArch.AVX2())
1020         {
1021             // llvm 3.6.0 swapped the order of the args to vpermd
1022             res = VPERMPS(idx, a);
1023         }
1024         else
1025         {
1026             if (isa<Constant>(idx))
1027             {
1028                 res = VSHUFFLE(a, a, idx);
1029             }
1030             else
1031             {
1032                 res = VUNDEF_F();
1033                 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
1034                 {
1035                     Value* pIndex = VEXTRACT(idx, C(l));
1036                     Value* pVal = VEXTRACT(a, pIndex);
1037                     res = VINSERT(res, pVal, C(l));
1038                 }
1039             }
1040         }
1041
1042         return res;
1043     }
1044
1045     //////////////////////////////////////////////////////////////////////////
1046     /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
1047     /// in LLVM IR.  If not supported on the underlying platform, emulate it
1048     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
1049     Value *Builder::CVTPH2PS(Value* a)
1050     {
1051         if (JM()->mArch.F16C())
1052         {
1053             return VCVTPH2PS(a);
1054         }
1055         else
1056         {
1057             FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
1058             Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));
1059
1060             if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
1061             {
1062                 sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32);
1063             }
1064
1065             Value* pResult = UndefValue::get(mSimdFP32Ty);
1066             for (uint32_t i = 0; i < mVWidth; ++i)
1067             {
1068                 Value* pSrc = VEXTRACT(a, C(i));
1069                 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
1070                 pResult = VINSERT(pResult, pConv, C(i));
1071             }
1072
1073             return pResult;
1074         }
1075     }
1076
1077     //////////////////////////////////////////////////////////////////////////
1078     /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
1079     /// in LLVM IR.  If not supported on the underlying platform, emulate it
1080     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
1081     Value *Builder::CVTPS2PH(Value* a, Value* rounding)
1082     {
1083         if (JM()->mArch.F16C())
1084         {
1085             return VCVTPS2PH(a, rounding);
1086         }
1087         else
1088         {
1089             // call scalar C function for now
1090             FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
1091             Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
1092
1093             if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
1094             {
1095                 sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16);
1096             }
1097
1098             Value* pResult = UndefValue::get(mSimdInt16Ty);
1099             for (uint32_t i = 0; i < mVWidth; ++i)
1100             {
1101                 Value* pSrc = VEXTRACT(a, C(i));
1102                 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
1103                 pResult = VINSERT(pResult, pConv, C(i));
1104             }
1105
1106             return pResult;
1107         }
1108     }
1109
1110     Value *Builder::PMAXSD(Value* a, Value* b)
1111     {
1112         Value* cmp = ICMP_SGT(a, b);
1113         return SELECT(cmp, a, b);
1114     }
1115
1116     Value *Builder::PMINSD(Value* a, Value* b)
1117     {
1118         Value* cmp = ICMP_SLT(a, b);
1119         return SELECT(cmp, a, b);
1120     }
1121
1122     void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
1123                           Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1124     {
1125         const SWR_FORMAT_INFO &info = GetFormatInfo(format);
1126         if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
1127         {
1128             GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1129         }
1130         else
1131         {
1132             GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1133         }
1134     }
1135
1136     void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1137                             Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
1138     {
1139         switch(info.bpp / info.numComps)
1140         {
1141             case 16:
1142             {
1143                     Value* vGatherResult[2];
1144
1145                     // TODO: vGatherMaskedVal
1146                     Value* vGatherMaskedVal = VIMMED1((float)0);
1147
1148                     // always have at least one component out of x or y to fetch
1149
1150                     vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1151                     // e.g. result of first 8x32bit integer gather for 16bit components
1152                     // 256i - 0    1    2    3    4    5    6    7
1153                     //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1154                     //
1155
1156                     // if we have at least one component out of x or y to fetch
1157                     if(info.numComps > 2)
1158                     {
1159                         // offset base to the next components(zw) in the vertex to gather
1160                         pSrcBase = GEP(pSrcBase, C((char)4));
1161
1162                         vGatherResult[1] =  GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1163                         // e.g. result of second 8x32bit integer gather for 16bit components
1164                         // 256i - 0    1    2    3    4    5    6    7
1165                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1166                         //
1167                     }
1168                     else
1169                     {
1170                         vGatherResult[1] =  vGatherMaskedVal;
1171                     }
1172
1173                     // Shuffle gathered components into place, each row is a component
1174                     Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1175             }
1176                 break;
1177             case 32:
1178             {
1179                 // apply defaults
1180                 for (uint32_t i = 0; i < 4; ++i)
1181                 {
1182                     vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1183                 }
1184
1185                 for(uint32_t i = 0; i < info.numComps; i++)
1186                 {
1187                     uint32_t swizzleIndex = info.swizzle[i];
1188
1189                     // Gather a SIMD of components
1190                     vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1191
1192                     // offset base to the next component to gather
1193                     pSrcBase = GEP(pSrcBase, C((char)4));
1194                 }
1195             }
1196                 break;
1197             default:
1198                 SWR_INVALID("Invalid float format");
1199                 break;
1200         }
1201     }
1202
1203     void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1204                             Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
1205     {
1206         switch (info.bpp / info.numComps)
1207         {
1208             case 8:
1209             {
1210                 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1211                 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1212                 // e.g. result of an 8x32bit integer gather for 8bit components
1213                 // 256i - 0    1    2    3    4    5    6    7
1214                 //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1215
1216                 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1217             }
1218                 break;
1219             case 16:
1220             {
1221                 Value* vGatherResult[2];
1222
1223                 // TODO: vGatherMaskedVal
1224                 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1225
1226                 // always have at least one component out of x or y to fetch
1227
1228                 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1229                 // e.g. result of first 8x32bit integer gather for 16bit components
1230                 // 256i - 0    1    2    3    4    5    6    7
1231                 //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1232                 //
1233
1234                 // if we have at least one component out of x or y to fetch
1235                 if(info.numComps > 2)
1236                 {
1237                     // offset base to the next components(zw) in the vertex to gather
1238                     pSrcBase = GEP(pSrcBase, C((char)4));
1239
1240                     vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1241                     // e.g. result of second 8x32bit integer gather for 16bit components
1242                     // 256i - 0    1    2    3    4    5    6    7
1243                     //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1244                     //
1245                 }
1246                 else
1247                 {
1248                     vGatherResult[1] = vGatherMaskedVal;
1249                 }
1250
1251                 // Shuffle gathered components into place, each row is a component
1252                 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1253
1254             }
1255                 break;
1256             case 32:
1257             {
1258                 // apply defaults
1259                 for (uint32_t i = 0; i < 4; ++i)
1260                 {
1261                     vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1262                 }
1263
1264                 for(uint32_t i = 0; i < info.numComps; i++)
1265                 {
1266                     uint32_t swizzleIndex = info.swizzle[i];
1267
1268                     // Gather a SIMD of components
1269                     vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1270
1271                     // offset base to the next component to gather
1272                     pSrcBase = GEP(pSrcBase, C((char)4));
1273                 }
1274             }
1275                 break;
1276             default:
1277                 SWR_INVALID("unsupported format");
1278             break;
1279         }
1280     }
1281
    //////////////////////////////////////////////////////////////////////////
    /// @brief Separate the results of the 16 bit per component gathers into
    ///        one output vector per component.
    /// @param info - format description; numComps, swizzle and defaults are
    ///        read
    /// @param vGatherInput - [0] holds the gathered xy pairs, [1] the zw
    ///        pairs.  Both entries are overwritten in place with their
    ///        mSimdInt32Ty bitcast.
    /// @param vGatherOutput - [out] one entry per component, indexed through
    ///        info.swizzle on the gathered components
    /// @param bPackedOutput - true: each output is a 128 bit element holding
    ///        the component values packed together; false: each output is a
    ///        vGatherTy vector with the component in the low 16 bits of every
    ///        32 bit lane and the upper bytes zeroed by the shuffle mask
    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        // (note: this writes back into the caller's array)
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if(bPackedOutput)
        {
            // vector of 128 bit integers used to pull out whole 128 bit lanes
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask: within each 128 bit lane, first the low words of
            // every dword, then the high words
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components; stays null when the format has
            // no z/w, in which case the loop below never selects it
            Value* vi128ZW = nullptr;
            if(info.numComps > 2)
            {
                // (shadows the outer vShufResult on purpose-built zw data)
                Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for(uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fixed for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if(i >= info.numComps)
                {
                    // component not present in the format: set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }

        }
        else
        {
            // pshufb masks for each component; -1 entries make PSHUFB insert 0
            Value* vConstMask[2];
            // x/z shuffle mask: low word of every dword
            vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                                     0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });

            // y/w shuffle mask: high word of every dword
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});


            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for(uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first (xy) gather result, else use the second (zw)
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }
1375
    //////////////////////////////////////////////////////////////////////////
    /// @brief Separate the result of the 8 bit per component gather into one
    ///        output vector per component.
    /// @param info - format description; numComps, swizzle and defaults are
    ///        read
    /// @param vGatherInput - gathered dwords, each holding the xyzw bytes of
    ///        one element
    /// @param vGatherOutput - [out] one entry per component, indexed through
    ///        info.swizzle on the gathered components
    /// @param bPackedOutput - true: each output is a 128 bit element whose
    ///        low 64 bits hold the packed component bytes; false: each output
    ///        is a vGatherTy vector with the component in the low byte of
    ///        every 32 bit lane and the upper bytes zeroed by the shuffle mask
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits

        if(bPackedOutput)
        {
            // vector of 128 bit integers used to pull out whole 128 bit lanes
            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask: within each 128 bit lane, byte 0 of every dword,
            // then byte 1, byte 2 and byte 3
            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components; stays null when the format has
            // no z/w, in which case the loop below never selects it
            Value* vi128ZW = nullptr;
            if(info.numComps > 2)
            {
                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // write every output: packed data for components the format has,
            // the format default for the rest
            for(uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if(i >= info.numComps)
                {
                    // component not present in the format: set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract the packed component's 128 bit lane (no extension is done here)
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // unpacked output: one zero extended component per 32 bit lane
        else{
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for(uint32_t i = 0; i < info.numComps; i++){
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component; -1 entries make PSHUFB insert 0
                Value* vConstMask;
                switch(i)
                {
                    case 0:
                        // x shuffle mask
                        vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                              0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                        break;
                    case 1:
                        // y shuffle mask
                        vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                              1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                        break;
                    case 2:
                        // z shuffle mask
                        vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                              2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                        break;
                    case 3:
                        // w shuffle mask
                        vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                              3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                        break;
                    default:
                        // NOTE(review): only reachable if numComps exceeds 4;
                        // PSHUFB below would then be handed a null mask
                        vConstMask = nullptr;
                        break;
                }

                    vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                    // after pshufb for x channel
                    // 256i - 0    1    2    3    4    5    6    7
                    //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }
1475
1476     // Helper function to create alloca in entry block of function
1477     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1478     {
1479         auto saveIP = IRB()->saveIP();
1480         IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1481                               pFunc->getEntryBlock().begin());
1482         Value* pAlloca = ALLOCA(pType);
1483         if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1484         return pAlloca;
1485     }
1486
1487     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
1488     {
1489         auto saveIP = IRB()->saveIP();
1490         IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1491             pFunc->getEntryBlock().begin());
1492         Value* pAlloca = ALLOCA(pType, pArraySize);
1493         if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1494         return pAlloca;
1495     }
1496
1497     //////////////////////////////////////////////////////////////////////////
1498     /// @brief emulates a scatter operation.
1499     /// @param pDst - pointer to destination
1500     /// @param vSrc - vector of src data to scatter
1501     /// @param vOffsets - vector of byte offsets from pDst
1502     /// @param vMask - mask of valid lanes
1503     void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1504     {
1505         /* Scatter algorithm
1506
1507            while(Index = BitScanForward(mask))
1508                 srcElem = srcVector[Index]
1509                 offsetElem = offsetVector[Index]
1510                 *(pDst + offsetElem) = srcElem
1511                 Update mask (&= ~(1<<Index)
1512
1513         */
1514
1515         BasicBlock* pCurBB = IRB()->GetInsertBlock();
1516         Function* pFunc = pCurBB->getParent();
1517         Type* pSrcTy = vSrc->getType()->getVectorElementType();
1518
1519         // Store vectors on stack
1520         if (pScatterStackSrc == nullptr)
1521         {
1522             // Save off stack allocations and reuse per scatter. Significantly reduces stack
1523             // requirements for shaders with a lot of scatters.
1524             pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1525             pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1526         }
1527
1528         Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1529         Value* pOffsetsArrayPtr = pScatterStackOffsets;
1530         STORE(vSrc, pSrcArrayPtr);
1531         STORE(vOffsets, pOffsetsArrayPtr);
1532
1533         // Cast to pointers for random access
1534         pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1535         pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1536
1537         Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1538
1539         // Get cttz function
1540         Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1541
1542         // Setup loop basic block
1543         BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
1544
1545         // compute first set bit
1546         Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1547
1548         Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1549
1550         // Split current block
1551         BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1552
1553         // Remove unconditional jump created by splitBasicBlock
1554         pCurBB->getTerminator()->eraseFromParent();
1555
1556         // Add terminator to end of original block
1557         IRB()->SetInsertPoint(pCurBB);
1558
1559         // Add conditional branch
1560         COND_BR(pIsUndef, pPostLoop, pLoop);
1561
1562         // Add loop basic block contents
1563         IRB()->SetInsertPoint(pLoop);
1564         PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1565         PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1566
1567         pIndexPhi->addIncoming(pIndex, pCurBB);
1568         pMaskPhi->addIncoming(pMask, pCurBB);
1569
1570         // Extract elements for this index
1571         Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1572         Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1573
1574         // GEP to this offset in dst
1575         Value* pCurDst = GEP(pDst, pOffsetElem);
1576         pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1577         STORE(pSrcElem, pCurDst);
1578
1579         // Update the mask
1580         Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1581
1582         // Terminator
1583         Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1584
1585         pIsUndef = ICMP_EQ(pNewIndex, C(32));
1586         COND_BR(pIsUndef, pPostLoop, pLoop);
1587
1588         // Update phi edges
1589         pIndexPhi->addIncoming(pNewIndex, pLoop);
1590         pMaskPhi->addIncoming(pNewMask, pLoop);
1591
1592         // Move builder to beginning of post loop
1593         IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1594     }
1595
1596     Value* Builder::VABSPS(Value* a)
1597     {
1598         Value* asInt = BITCAST(a, mSimdInt32Ty);
1599         Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1600         return result;
1601     }
1602
1603     Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1604     {
1605         Value *lowCmp = ICMP_SLT(src, low);
1606         Value *ret = SELECT(lowCmp, low, src);
1607
1608         Value *highCmp = ICMP_SGT(ret, high);
1609         ret = SELECT(highCmp, high, ret);
1610
1611         return ret;
1612     }
1613
1614     Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1615     {
1616         Value *lowCmp = FCMP_OLT(src, low);
1617         Value *ret = SELECT(lowCmp, low, src);
1618
1619         Value *highCmp = FCMP_OGT(ret, high);
1620         ret = SELECT(highCmp, high, ret);
1621
1622         return ret;
1623     }
1624
1625     Value *Builder::FCLAMP(Value* src, float low, float high)
1626     {
1627         Value* result = VMAXPS(src, VIMMED1(low));
1628         result = VMINPS(result, VIMMED1(high));
1629
1630         return result;
1631     }
1632
1633     //////////////////////////////////////////////////////////////////////////
1634     /// @brief save/restore stack, providing ability to push/pop the stack and
1635     ///        reduce overall stack requirements for temporary stack use
1636     Value* Builder::STACKSAVE()
1637     {
1638         Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1639         return CALLA(pfnStackSave);
1640     }
1641
1642     void Builder::STACKRESTORE(Value* pSaved)
1643     {
1644         Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1645         CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1646     }
1647
1648     Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1649     {
1650         Value* vOut;
1651         // use FMADs if available
1652         if(JM()->mArch.AVX2())
1653         {
1654             vOut = VFMADDPS(a, b, c);
1655         }
1656         else
1657         {
1658             vOut = FADD(FMUL(a, b), c);
1659         }
1660         return vOut;
1661     }
1662
1663     Value* Builder::POPCNT(Value* a)
1664     {
1665         Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1666         return CALL(pCtPop, std::initializer_list<Value*>{a});
1667     }
1668
1669     //////////////////////////////////////////////////////////////////////////
1670     /// @brief C functions called by LLVM IR
1671     //////////////////////////////////////////////////////////////////////////
1672
1673     //////////////////////////////////////////////////////////////////////////
1674     /// @brief called in JIT code, inserted by PRINT
1675     /// output to both stdout and visual studio debug console
1676     void __cdecl CallPrint(const char* fmt, ...)
1677     {
1678         va_list args;
1679         va_start(args, fmt);
1680         vprintf(fmt, args);
1681
1682     #if defined( _WIN32 )
1683         char strBuf[1024];
1684         vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1685         OutputDebugStringA(strBuf);
1686     #endif
1687
1688         va_end(args);
1689     }
1690
1691     Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1692     {
1693         bool flag = !imm8->isZeroValue();
1694         SmallVector<Constant*,8> idx;
1695         for (unsigned i = 0; i < mVWidth / 2; i++) {
1696             idx.push_back(C(flag ? i + mVWidth / 2 : i));
1697         }
1698         return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1699     }
1700
1701     Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1702     {
1703         bool flag = !imm8->isZeroValue();
1704         SmallVector<Constant*,8> idx;
1705         for (unsigned i = 0; i < mVWidth; i++) {
1706             idx.push_back(C(i));
1707         }
1708         Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1709
1710         SmallVector<Constant*,8> idx2;
1711         for (unsigned i = 0; i < mVWidth / 2; i++) {
1712             idx2.push_back(C(flag ? i : i + mVWidth));
1713         }
1714         for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1715             idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1716         }
1717         return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1718     }
1719
1720     // rdtsc buckets macros
1721     void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1722     {
1723         // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1724         // buckets framework when single threaded
1725         if (KNOB_SINGLE_THREADED)
1726         {
1727             std::vector<Type*> args{
1728                 PointerType::get(mInt32Ty, 0),   // pBucketMgr
1729                 mInt32Ty                        // id
1730             };
1731
1732             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1733             Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1734             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1735             {
1736                 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1737             }
1738
1739             CALL(pFunc, { pBucketMgr, pId });
1740         }
1741     }
1742
1743     void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1744     {
1745         // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1746         // buckets framework when single threaded
1747         if (KNOB_SINGLE_THREADED)
1748         {
1749             std::vector<Type*> args{
1750                 PointerType::get(mInt32Ty, 0),   // pBucketMgr
1751                 mInt32Ty                        // id
1752             };
1753
1754             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1755             Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1756             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1757             {
1758                 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1759             }
1760
1761             CALL(pFunc, { pBucketMgr, pId });
1762         }
1763     }
1764
1765
1766     uint32_t Builder::GetTypeSize(Type* pType)
1767     {
1768         if (pType->isStructTy())
1769         {
1770             uint32_t numElems = pType->getStructNumElements();
1771             Type* pElemTy = pType->getStructElementType(0);
1772             return numElems * GetTypeSize(pElemTy);
1773         }
1774
1775         if (pType->isArrayTy())
1776         {
1777             uint32_t numElems = pType->getArrayNumElements();
1778             Type* pElemTy = pType->getArrayElementType();
1779             return numElems * GetTypeSize(pElemTy);
1780         }
1781
1782         if (pType->isIntegerTy())
1783         {
1784             uint32_t bitSize = pType->getIntegerBitWidth();
1785             return bitSize / 8;
1786         }
1787
1788         if (pType->isFloatTy())
1789         {
1790             return 4;
1791         }
1792
1793         if (pType->isHalfTy())
1794         {
1795             return 2;
1796         }
1797
1798         if (pType->isDoubleTy())
1799         {
1800             return 8;
1801         }
1802
1803         SWR_ASSERT(false, "Unimplemented type.");
1804         return 0;
1805     }
1806 }