swr/rast: Simplify GATHER* jit builder api
src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32
33 #include <cstdarg>
34
35 namespace SwrJit
36 {
37 void __cdecl CallPrint(const char* fmt, ...);
38
39 //////////////////////////////////////////////////////////////////////////
40 /// @brief Convert an IEEE 754 32-bit single precision float to an
41 /// 16 bit float with 5 exponent bits and a variable
42 /// number of mantissa bits.
43 /// @param val - 32-bit float
44 /// @todo Maybe move this outside of this file into a header?
45 static uint16_t ConvertFloat32ToFloat16(float val)
46 {
47 uint32_t sign, exp, mant;
48 uint32_t roundBits;
49
50 // Extract the sign, exponent, and mantissa
51 uint32_t uf = *(uint32_t*)&val;
52 sign = (uf & 0x80000000) >> 31;
53 exp = (uf & 0x7F800000) >> 23;
54 mant = uf & 0x007FFFFF;
55
56 // Check for out of range
57 if (std::isnan(val))
58 {
59 exp = 0x1F;
60 mant = 0x200;
61 sign = 1; // set the sign bit for NANs
62 }
63 else if (std::isinf(val))
64 {
65 exp = 0x1f;
66 mant = 0x0;
67 }
68 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
69 {
70 exp = 0x1E;
71 mant = 0x3FF;
72 }
73 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
74 {
75 mant |= 0x00800000;
76 for (; exp <= 0x70; mant >>= 1, exp++)
77 ;
78 exp = 0;
79 mant = mant >> 13;
80 }
81 else if (exp < 0x66) // Too small to represent -> Zero
82 {
83 exp = 0;
84 mant = 0;
85 }
86 else
87 {
88 // Saves bits that will be shifted off for rounding
89 roundBits = mant & 0x1FFFu;
90 // convert exponent and mantissa to 16 bit format
91 exp = exp - 0x70;
92 mant = mant >> 13;
93
94 // Essentially RTZ, but round up if off by only 1 lsb
95 if (roundBits == 0x1FFFu)
96 {
97 mant++;
98 // check for overflow
99 if ((mant & 0xC00u) != 0)
100 exp++;
101 // make sure only the needed bits are used
102 mant &= 0x3FF;
103 }
104 }
105
106 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
107 return (uint16_t)tmpVal;
108 }
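// A quick worked example of the conversion above (illustrative values only):
// converting 1.0f, whose IEEE-754 bits are 0x3F800000:
//   sign = 0, exp = 0x7F, mant = 0, roundBits = 0
//   exp falls in the normal range, so exp -> 0x7F - 0x70 = 0xF and mant -> 0
//   result = (0 << 15) | (0xF << 10) | 0 = 0x3C00, the half-precision encoding of 1.0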
109
110 //////////////////////////////////////////////////////////////////////////
111 /// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision
112 /// float
113 /// @param val - 16-bit float
114 /// @todo Maybe move this outside of this file into a header?
115 static float ConvertFloat16ToFloat32(uint32_t val)
116 {
117 uint32_t result;
118 if ((val & 0x7fff) == 0)
119 {
120 result = ((uint32_t)(val & 0x8000)) << 16;
121 }
122 else if ((val & 0x7c00) == 0x7c00)
123 {
124 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
125 result |= ((uint32_t)val & 0x8000) << 16;
126 }
127 else
128 {
129 uint32_t sign = (val & 0x8000) << 16;
130 uint32_t mant = (val & 0x3ff) << 13;
131 uint32_t exp = (val >> 10) & 0x1f;
132 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
133 {
134 mant <<= 1;
135 while (mant < (0x400 << 13))
136 {
137 exp--;
138 mant <<= 1;
139 }
140 mant &= (0x3ff << 13);
141 }
142 exp = ((exp - 15 + 127) & 0xff) << 23;
143 result = sign | exp | mant;
144 }
145
146 return *(float*)&result;
147 }
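// The reverse direction on the same value (illustrative only): for 0x3C00,
// sign = 0, mant = 0 and exp = 0xF, so the rebuilt exponent is
// ((0xF - 15 + 127) & 0xff) << 23 = 0x3F800000, giving back exactly 1.0f.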
148
149 Constant *Builder::C(bool i)
150 {
151 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
152 }
153
154 Constant *Builder::C(char i)
155 {
156 return ConstantInt::get(IRB()->getInt8Ty(), i);
157 }
158
159 Constant *Builder::C(uint8_t i)
160 {
161 return ConstantInt::get(IRB()->getInt8Ty(), i);
162 }
163
164 Constant *Builder::C(int i)
165 {
166 return ConstantInt::get(IRB()->getInt32Ty(), i);
167 }
168
169 Constant *Builder::C(int64_t i)
170 {
171 return ConstantInt::get(IRB()->getInt64Ty(), i);
172 }
173
174 Constant *Builder::C(uint16_t i)
175 {
176 return ConstantInt::get(mInt16Ty,i);
177 }
178
179 Constant *Builder::C(uint32_t i)
180 {
181 return ConstantInt::get(IRB()->getInt32Ty(), i);
182 }
183
184 Constant *Builder::C(float i)
185 {
186 return ConstantFP::get(IRB()->getFloatTy(), i);
187 }
188
189 Constant *Builder::PRED(bool pred)
190 {
191 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
192 }
193
194 Value *Builder::VIMMED1(int i)
195 {
196 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
197 }
198
199 Value *Builder::VIMMED1(uint32_t i)
200 {
201 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
202 }
203
204 Value *Builder::VIMMED1(float i)
205 {
206 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
207 }
208
209 Value *Builder::VIMMED1(bool i)
210 {
211 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
212 }
213
214 Value *Builder::VUNDEF_IPTR()
215 {
216 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
217 }
218
219 Value *Builder::VUNDEF_I()
220 {
221 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
222 }
223
224 Value *Builder::VUNDEF(Type *ty, uint32_t size)
225 {
226 return UndefValue::get(VectorType::get(ty, size));
227 }
228
229 Value *Builder::VUNDEF_F()
230 {
231 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
232 }
233
234 #if USE_SIMD16_BUILDER
235 Value *Builder::VUNDEF2_F()
236 {
237 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2));
238 }
239
240 #endif
241 Value *Builder::VUNDEF(Type* t)
242 {
243 return UndefValue::get(VectorType::get(t, mVWidth));
244 }
245
246 Value *Builder::VBROADCAST(Value *src)
247 {
248 // check if src is already a vector
249 if (src->getType()->isVectorTy())
250 {
251 return src;
252 }
253
254 return VECTOR_SPLAT(mVWidth, src);
255 }
256
257 uint32_t Builder::IMMED(Value* v)
258 {
259 SWR_ASSERT(isa<ConstantInt>(v));
260 ConstantInt *pValConst = cast<ConstantInt>(v);
261 return pValConst->getZExtValue();
262 }
263
264 int32_t Builder::S_IMMED(Value* v)
265 {
266 SWR_ASSERT(isa<ConstantInt>(v));
267 ConstantInt *pValConst = cast<ConstantInt>(v);
268 return pValConst->getSExtValue();
269 }
270
271 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
272 {
273 std::vector<Value*> indices;
274 for (auto i : indexList)
275 indices.push_back(i);
276 return GEPA(ptr, indices);
277 }
278
279 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
280 {
281 std::vector<Value*> indices;
282 for (auto i : indexList)
283 indices.push_back(C(i));
284 return GEPA(ptr, indices);
285 }
286
287 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
288 {
289 std::vector<Value*> indices;
290 for (auto i : indexList)
291 indices.push_back(i);
292 return IN_BOUNDS_GEP(ptr, indices);
293 }
294
295 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
296 {
297 std::vector<Value*> indices;
298 for (auto i : indexList)
299 indices.push_back(C(i));
300 return IN_BOUNDS_GEP(ptr, indices);
301 }
302
303 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
304 {
305 std::vector<Value*> valIndices;
306 for (auto i : indices)
307 valIndices.push_back(C(i));
308 return LOAD(GEPA(basePtr, valIndices), name);
309 }
310
311 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
312 {
313 std::vector<Value*> valIndices;
314 for (auto i : indices)
315 valIndices.push_back(i);
316 return LOAD(GEPA(basePtr, valIndices), name);
317 }
318
319 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
320 {
321 std::vector<Value*> valIndices;
322 for (auto i : indices)
323 valIndices.push_back(C(i));
324 return STORE(val, GEPA(basePtr, valIndices));
325 }
326
327 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
328 {
329 std::vector<Value*> valIndices;
330 for (auto i : indices)
331 valIndices.push_back(i);
332 return STORE(val, GEPA(basePtr, valIndices));
333 }
334
335 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
336 {
337 std::vector<Value*> args;
338 for (auto arg : argsList)
339 args.push_back(arg);
340 return CALLA(Callee, args);
341 }
342
343 CallInst *Builder::CALL(Value *Callee, Value* arg)
344 {
345 std::vector<Value*> args;
346 args.push_back(arg);
347 return CALLA(Callee, args);
348 }
349
350 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
351 {
352 std::vector<Value*> args;
353 args.push_back(arg1);
354 args.push_back(arg2);
355 return CALLA(Callee, args);
356 }
357
358 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
359 {
360 std::vector<Value*> args;
361 args.push_back(arg1);
362 args.push_back(arg2);
363 args.push_back(arg3);
364 return CALLA(Callee, args);
365 }
366
367 //////////////////////////////////////////////////////////////////////////
368 Value *Builder::DEBUGTRAP()
369 {
370 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
371 return CALL(func);
372 }
373
374 Value *Builder::VRCP(Value *va)
375 {
376 return FDIV(VIMMED1(1.0f), va); // 1 / a
377 }
378
379 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
380 {
381 Value* vOut = FMADDPS(vA, vX, vC);
382 vOut = FMADDPS(vB, vY, vOut);
383 return vOut;
384 }
385
386 //////////////////////////////////////////////////////////////////////////
387 /// @brief Generate an i32 masked load operation in LLVM IR. If not
388 /// supported on the underlying platform, emulate it with float masked load
389 /// @param src - base address pointer for the load
390 /// @param vMask - SIMD wide mask that controls whether to access memory or load 0
391 Value *Builder::MASKLOADD(Value* src,Value* mask)
392 {
393 Value* vResult;
394 // use avx2 masked load instruction if available
395 if(JM()->mArch.AVX2())
396 {
397 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
398 vResult = CALL(func,{src,mask});
399 }
400 else
401 {
402 // maskload intrinsic expects integer mask operand in llvm >= 3.8
403 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
404 mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
405 #else
406 mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
407 #endif
408 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
409 vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
410 }
411 return vResult;
412 }
413
414 //////////////////////////////////////////////////////////////////////////
415 /// @brief insert a JIT call to CallPrint
416 /// - outputs formatted string to both stdout and VS output window
417 /// - DEBUG builds only
418 /// Usage example:
419 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
420 /// where C(lane) creates a constant value to print, and pIndex is the Value*
421 /// result from a GEP, printing out the pointer to memory
422 /// @param printStr - constant string to print, which includes format specifiers
423 /// @param printArgs - initializer list of Value*'s to print to std out
424 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
425 {
426 // push the arguments to CallPrint into a vector
427 std::vector<Value*> printCallArgs;
428 // save room for the format string. we still need to modify it for vectors
429 printCallArgs.resize(1);
430
431 // search through the format string for special processing
432 size_t pos = 0;
433 std::string tempStr(printStr);
434 pos = tempStr.find('%', pos);
435 auto v = printArgs.begin();
436
437 while ((pos != std::string::npos) && (v != printArgs.end()))
438 {
439 Value* pArg = *v;
440 Type* pType = pArg->getType();
441
442 if (pType->isVectorTy())
443 {
444 Type* pContainedType = pType->getContainedType(0);
445
446 if (toupper(tempStr[pos + 1]) == 'X')
447 {
448 tempStr[pos] = '0';
449 tempStr[pos + 1] = 'x';
450 tempStr.insert(pos + 2, "%08X ");
451 pos += 7;
452
453 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
454
455 std::string vectorFormatStr;
456 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
457 {
458 vectorFormatStr += "0x%08X ";
459 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
460 }
461
462 tempStr.insert(pos, vectorFormatStr);
463 pos += vectorFormatStr.size();
464 }
465 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
466 {
467 uint32_t i = 0;
468 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
469 {
470 tempStr.insert(pos, std::string("%f "));
471 pos += 3;
472 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
473 }
474 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
475 }
476 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
477 {
478 uint32_t i = 0;
479 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
480 {
481 tempStr.insert(pos, std::string("%d "));
482 pos += 3;
483 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
484 }
485 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
486 }
487 }
488 else
489 {
490 if (toupper(tempStr[pos + 1]) == 'X')
491 {
492 tempStr[pos] = '0';
493 tempStr.insert(pos + 1, "x%08");
494 printCallArgs.push_back(pArg);
495 pos += 3;
496 }
497 // for %f we need to cast float Values to doubles so that they print out correctly
498 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
499 {
500 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
501 pos++;
502 }
503 else
504 {
505 printCallArgs.push_back(pArg);
506 }
507 }
508
509 // advance to the next argument
510 v++;
511 pos = tempStr.find('%', ++pos);
512 }
513
514 // create global variable constant string
515 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
516 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
517 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
518
519 // get a pointer to the first character in the constant string array
520 std::vector<Constant*> geplist{C(0),C(0)};
521 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
522
523 // insert the pointer to the format string in the argument vector
524 printCallArgs[0] = strGEP;
525
526 // get pointer to CallPrint function and insert decl into the module if needed
527 std::vector<Type*> args;
528 args.push_back(PointerType::get(mInt8Ty,0));
529 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
530 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
531
532 // if we haven't yet added the symbol to the symbol table
533 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
534 {
535 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
536 }
537
538 // insert a call to CallPrint
539 return CALLA(callPrintFn,printCallArgs);
540 }
541
542 //////////////////////////////////////////////////////////////////////////
543 /// @brief Wrapper around PRINT with initializer list.
544 CallInst* Builder::PRINT(const std::string &printStr)
545 {
546 return PRINT(printStr, {});
547 }
548
549 //////////////////////////////////////////////////////////////////////////
550 /// @brief Generate a masked gather operation in LLVM IR. If not
551 /// supported on the underlying platform, emulate it with loads
552 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
553 /// @param pBase - Int8* base VB address pointer value
554 /// @param vIndices - SIMD wide value of VB byte offsets
555 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
556 /// @param scale - value to scale indices by
557 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
558 {
559 Value* vGather;
560
561 // use avx2 gather instruction if available
562 if(JM()->mArch.AVX2())
563 {
564 // force mask to <N x float>, required by vgather
565 vMask = BITCAST(vMask, mSimdFP32Ty);
566 vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,C(scale));
567 }
568 else
569 {
570 Value* pStack = STACKSAVE();
571
572 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
573 Value* vSrcPtr = ALLOCA(vSrc->getType());
574 STORE(vSrc, vSrcPtr);
575
576 vGather = VUNDEF_F();
577 Value *vScaleVec = VIMMED1((uint32_t)scale);
578 Value *vOffsets = MUL(vIndices,vScaleVec);
579 Value *mask = MASK(vMask);
580 for(uint32_t i = 0; i < mVWidth; ++i)
581 {
582 // single component byte index
583 Value *offset = VEXTRACT(vOffsets,C(i));
584 // byte pointer to component
585 Value *loadAddress = GEP(pBase,offset);
586 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
587 // pointer to the value to load if we're masking off a component
588 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
589 Value *selMask = VEXTRACT(mask,C(i));
590 // switch in a safe address to load from if this lane is masked off
591 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
592 Value *val = LOAD(validAddress);
593 vGather = VINSERT(vGather,val,C(i));
594 }
595 STACKRESTORE(pStack);
596 }
597
598 return vGather;
599 }
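// Minimal usage sketch for GATHERPS (caller-side names are hypothetical; real callers
// such as GATHER4PS below follow the same pattern). vByteOffsets and vLaneMask are
// assumed to already be SIMD-wide values:
//   Value* vDefaults = VIMMED1(0.0f);  // returned in lanes that are masked off
//   Value* vResult   = GATHERPS(vDefaults, pBufferBase, vByteOffsets, vLaneMask);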
600
601 //////////////////////////////////////////////////////////////////////////
602 /// @brief Generate a masked gather operation in LLVM IR. If not
603 /// supported on the underlying platform, emulate it with loads
604 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
605 /// @param pBase - Int8* base VB address pointer value
606 /// @param vIndices - SIMD wide value of VB byte offsets
607 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
608 /// @param scale - value to scale indices by
609 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
610 {
611 Value* vGather;
612
613 // use avx2 gather instruction if available
614 if(JM()->mArch.AVX2())
615 {
616 vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
617 }
618 else
619 {
620 Value* pStack = STACKSAVE();
621
622 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
623 Value* vSrcPtr = ALLOCA(vSrc->getType());
624 STORE(vSrc, vSrcPtr);
625
626 vGather = VUNDEF_I();
627 Value *vScaleVec = VIMMED1((uint32_t)scale);
628 Value *vOffsets = MUL(vIndices, vScaleVec);
629 Value *mask = MASK(vMask);
630 for(uint32_t i = 0; i < mVWidth; ++i)
631 {
632 // single component byte index
633 Value *offset = VEXTRACT(vOffsets, C(i));
634 // byte pointer to component
635 Value *loadAddress = GEP(pBase, offset);
636 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
637 // pointer to the value to load if we're masking off a component
638 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
639 Value *selMask = VEXTRACT(mask, C(i));
640 // switch in a safe address to load from if this lane is masked off
641 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
642 Value *val = LOAD(validAddress, C(0));
643 vGather = VINSERT(vGather, val, C(i));
644 }
645
646 STACKRESTORE(pStack);
647 }
648 return vGather;
649 }
650
651 //////////////////////////////////////////////////////////////////////////
652 /// @brief Generate a masked gather operation in LLVM IR. If not
653 /// supported on the underlying platform, emulate it with loads
654 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
655 /// @param pBase - Int8* base VB address pointer value
656 /// @param vIndices - SIMD wide value of VB byte offsets
657 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
658 /// @param scale - value to scale indices by
659 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
660 {
661 Value* vGather;
662
663 // use avx2 gather instruction if available
664 if(JM()->mArch.AVX2())
665 {
666 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
667 }
668 else
669 {
670 Value* pStack = STACKSAVE();
671
672 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
673 Value* vSrcPtr = ALLOCA(vSrc->getType());
674 STORE(vSrc, vSrcPtr);
675
676 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
677 Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
678 Value *vOffsets = MUL(vIndices,vScaleVec);
679 Value *mask = MASK(vMask);
680 for(uint32_t i = 0; i < mVWidth/2; ++i)
681 {
682 // single component byte index
683 Value *offset = VEXTRACT(vOffsets,C(i));
684 // byte pointer to component
685 Value *loadAddress = GEP(pBase,offset);
686 loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
687 // pointer to the value to load if we're masking off a component
688 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
689 Value *selMask = VEXTRACT(mask,C(i));
690 // switch in a safe address to load from if this lane is masked off
691 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
692 Value *val = LOAD(validAddress);
693 vGather = VINSERT(vGather,val,C(i));
694 }
695 STACKRESTORE(pStack);
696 }
697 return vGather;
698 }
699
700 #if USE_SIMD16_BUILDER
701 //////////////////////////////////////////////////////////////////////////
702 /// @brief Extract the low (imm == 0) or high (imm != 0) simd-width half of a double-width (simd16) vector
703 Value *Builder::EXTRACT(Value *a2, uint32_t imm)
704 {
705 const uint32_t i0 = (imm > 0) ? mVWidth : 0;
706
707 Value *result = VUNDEF_F();
708
709 for (uint32_t i = 0; i < mVWidth; i += 1)
710 {
711 Value *temp = VEXTRACT(a2, C(i0 + i));
712
713 result = VINSERT(result, temp, C(i));
714 }
715
716 return result;
717 }
718
719 //////////////////////////////////////////////////////////////////////////
720 /// @brief Insert the simd-width vector b into the low (imm == 0) or high (imm != 0) half of the double-width (simd16) vector a2
721 Value *Builder::INSERT(Value *a2, Value * b, uint32_t imm)
722 {
723 const uint32_t i0 = (imm > 0) ? mVWidth : 0;
724
725 Value *result = BITCAST(a2, mSimd2FP32Ty);
726
727 for (uint32_t i = 0; i < mVWidth; i += 1)
728 {
729 #if 1
730 if (!b->getType()->getScalarType()->isFloatTy())
731 {
732 b = BITCAST(b, mSimdFP32Ty);
733 }
734
735 #endif
736 Value *temp = VEXTRACT(b, C(i));
737
738 result = VINSERT(result, temp, C(i0 + i));
739 }
740
741 return result;
742 }
743
744 #endif
745 //////////////////////////////////////////////////////////////////////////
746 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
747 Value* Builder::MASK(Value* vmask)
748 {
749 Value* src = BITCAST(vmask, mSimdInt32Ty);
750 return ICMP_SLT(src, VIMMED1(0));
751 }
752
753 //////////////////////////////////////////////////////////////////////////
754 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
755 Value* Builder::VMASK(Value* mask)
756 {
757 return S_EXT(mask, mSimdInt32Ty);
758 }
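// Note on the two mask forms used throughout this file: VMASK sign-extends an
// <N x i1> mask so a true lane becomes 0xFFFFFFFF (sign bit set), and MASK recovers
// the <N x i1> form by testing that sign bit. Illustrative round trip:
//   Value* x86Mask  = VMASK(cmpResult);  // i1 true -> 0xFFFFFFFF, false -> 0x00000000
//   Value* llvmMask = MASK(x86Mask);     // sign bit set -> i1 true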
759
760 //////////////////////////////////////////////////////////////////////////
761 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
762 /// supported on the underlying platform, emulate it
763 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
764 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
765 /// Byte masks in the lower 128-bit lane of b select 8-bit values from the lower
766 /// 128 bits of a, and vice versa for the upper lane. If a mask
767 /// value is negative, '0' is inserted.
768 Value *Builder::PSHUFB(Value* a, Value* b)
769 {
770 Value* res;
771 // use avx2 pshufb instruction if available
772 if(JM()->mArch.AVX2())
773 {
774 res = VPSHUFB(a, b);
775 }
776 else
777 {
778 Constant* cB = dyn_cast<Constant>(b);
779 // number of 8 bit elements in b
780 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
781 // output vector
782 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
783
784 // insert an 8 bit value from the high and low lanes of a per loop iteration
785 numElms /= 2;
786 for(uint32_t i = 0; i < numElms; i++)
787 {
788 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
789 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
790
791 // extract values from constant mask
792 char valLow128bLane = (char)(cLow128b->getSExtValue());
793 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
794
795 Value* insertValLow128b;
796 Value* insertValHigh128b;
797
798 // if the mask value is negative, insert a '0' in the respective output position
799 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
800 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
801 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
802
803 vShuf = VINSERT(vShuf, insertValLow128b, i);
804 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
805 }
806 res = vShuf;
807 }
808 return res;
809 }
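// Small illustration of the PSHUFB semantics above (example values only): within each
// 128-bit lane, a mask byte of 0x05 selects byte 5 of that lane of a, while any
// negative mask byte (e.g. 0x80) produces a zero byte in the output.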
810
811 //////////////////////////////////////////////////////////////////////////
812 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
813 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
814 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
815 /// lower 8 values are used.
816 Value *Builder::PMOVSXBD(Value* a)
817 {
818 // VPMOVSXBD output type
819 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
820 // Extract 8 values from 128bit lane and sign extend
821 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
822 }
823
824 //////////////////////////////////////////////////////////////////////////
825 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
826 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
827 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
828 Value *Builder::PMOVSXWD(Value* a)
829 {
830 // VPMOVSXWD output type
831 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
832 // Extract 8 values from 128bit lane and sign extend
833 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
834 }
835
836 //////////////////////////////////////////////////////////////////////////
837 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
838 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
839 /// platform, emulate it
840 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
841 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
842 Value *Builder::PERMD(Value* a, Value* idx)
843 {
844 Value* res;
845 // use avx2 permute instruction if available
846 if(JM()->mArch.AVX2())
847 {
848 res = VPERMD(a, idx);
849 }
850 else
851 {
852 if (isa<Constant>(idx))
853 {
854 res = VSHUFFLE(a, a, idx);
855 }
856 else
857 {
858 res = VUNDEF_I();
859 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
860 {
861 Value* pIndex = VEXTRACT(idx, C(l));
862 Value* pVal = VEXTRACT(a, pIndex);
863 res = VINSERT(res, pVal, C(l));
864 }
865 }
866 }
867 return res;
868 }
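// In effect PERMD computes res[l] = a[idx[l]] for each of the 8 lanes across the full
// 256-bit register; for example (illustrative), idx = {7, 6, 5, 4, 3, 2, 1, 0} reverses a.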
869
870 //////////////////////////////////////////////////////////////////////////
871 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
872 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
873 /// platform, emulate it
874 /// @param a - 256bit SIMD lane(8x32bit) of float values.
875 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
876 Value *Builder::PERMPS(Value* a, Value* idx)
877 {
878 Value* res;
879 // use avx2 permute instruction if available
880 if (JM()->mArch.AVX2())
881 {
882 // llvm 3.6.0 swapped the order of the args to vpermps
883 res = VPERMPS(idx, a);
884 }
885 else
886 {
887 if (isa<Constant>(idx))
888 {
889 res = VSHUFFLE(a, a, idx);
890 }
891 else
892 {
893 res = VUNDEF_F();
894 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
895 {
896 Value* pIndex = VEXTRACT(idx, C(l));
897 Value* pVal = VEXTRACT(a, pIndex);
898 res = VINSERT(res, pVal, C(l));
899 }
900 }
901 }
902
903 return res;
904 }
905
906 //////////////////////////////////////////////////////////////////////////
907 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
908 /// in LLVM IR. If not supported on the underlying platform, emulate it
909 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
910 Value *Builder::CVTPH2PS(Value* a)
911 {
912 if (JM()->mArch.F16C())
913 {
914 return VCVTPH2PS(a);
915 }
916 else
917 {
918 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
919 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));
920
921 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
922 {
923 sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32);
924 }
925
926 Value* pResult = UndefValue::get(mSimdFP32Ty);
927 for (uint32_t i = 0; i < mVWidth; ++i)
928 {
929 Value* pSrc = VEXTRACT(a, C(i));
930 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
931 pResult = VINSERT(pResult, pConv, C(i));
932 }
933
934 return pResult;
935 }
936 }
937
938 //////////////////////////////////////////////////////////////////////////
939 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
940 /// in LLVM IR. If not supported on the underlying platform, emulate it
941 /// @param a - 256bit SIMD lane(8x32bit) of float32 values
942 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
943 {
944 if (JM()->mArch.F16C())
945 {
946 return VCVTPS2PH(a, rounding);
947 }
948 else
949 {
950 // call scalar C function for now
951 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
952 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
953
954 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
955 {
956 sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16);
957 }
958
959 Value* pResult = UndefValue::get(mSimdInt16Ty);
960 for (uint32_t i = 0; i < mVWidth; ++i)
961 {
962 Value* pSrc = VEXTRACT(a, C(i));
963 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
964 pResult = VINSERT(pResult, pConv, C(i));
965 }
966
967 return pResult;
968 }
969 }
970
971 Value *Builder::PMAXSD(Value* a, Value* b)
972 {
973 Value* cmp = ICMP_SGT(a, b);
974 return SELECT(cmp, a, b);
975 }
976
977 Value *Builder::PMINSD(Value* a, Value* b)
978 {
979 Value* cmp = ICMP_SLT(a, b);
980 return SELECT(cmp, a, b);
981 }
982
983 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
984 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
985 {
986 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
987 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
988 {
989 // ensure our mask is the correct type
990 mask = BITCAST(mask, mSimdFP32Ty);
991 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
992 }
993 else
994 {
995 // ensure our mask is the correct type
996 mask = BITCAST(mask, mSimdInt32Ty);
997 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
998 }
999 }
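// Sketch of a typical Gather4 call (the variable names and format are assumptions for
// illustration; any 4-component 32-bit float format takes the same path):
//   Value* vComponents[4];
//   Gather4(R32G32B32A32_FLOAT, pSurfaceBase, vByteOffsets, vFetchMask, vComponents, false);
//   // vComponents[0..3] now hold the swizzled x/y/z/w SIMD vectors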
1000
1001 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1002 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1003 {
1004 switch(info.bpp / info.numComps)
1005 {
1006 case 16:
1007 {
1008 Value* vGatherResult[2];
1009 Value *vMask;
1010
1011 // TODO: vGatherMaskedVal
1012 Value* vGatherMaskedVal = VIMMED1((float)0);
1013
1014 // always have at least one component out of x or y to fetch
1015
1016 // save mask as it is zero'd out after each gather
1017 vMask = mask;
1018
1019 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1020 // e.g. result of first 8x32bit integer gather for 16bit components
1021 // 256i - 0 1 2 3 4 5 6 7
1022 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1023 //
1024
1025 // if we have at least one component out of z or w to fetch
1026 if(info.numComps > 2)
1027 {
1028 // offset base to the next components(zw) in the vertex to gather
1029 pSrcBase = GEP(pSrcBase, C((char)4));
1030 vMask = mask;
1031
1032 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1033 // e.g. result of second 8x32bit integer gather for 16bit components
1034 // 256i - 0 1 2 3 4 5 6 7
1035 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1036 //
1037 }
1038 else
1039 {
1040 vGatherResult[1] = vGatherMaskedVal;
1041 }
1042
1043 // Shuffle gathered components into place, each row is a component
1044 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1045 }
1046 break;
1047 case 32:
1048 {
1049 // apply defaults
1050 for (uint32_t i = 0; i < 4; ++i)
1051 {
1052 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1053 }
1054
1055 for(uint32_t i = 0; i < info.numComps; i++)
1056 {
1057 uint32_t swizzleIndex = info.swizzle[i];
1058
1059 // save mask as it is zero'd out after each gather
1060 Value *vMask = mask;
1061
1062 // Gather a SIMD of components
1063 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1064
1065 // offset base to the next component to gather
1066 pSrcBase = GEP(pSrcBase, C((char)4));
1067 }
1068 }
1069 break;
1070 default:
1071 SWR_INVALID("Invalid float format");
1072 break;
1073 }
1074 }
1075
1076 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1077 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1078 {
1079 switch (info.bpp / info.numComps)
1080 {
1081 case 8:
1082 {
1083 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1084 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask);
1085 // e.g. result of an 8x32bit integer gather for 8bit components
1086 // 256i - 0 1 2 3 4 5 6 7
1087 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1088
1089 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1090 }
1091 break;
1092 case 16:
1093 {
1094 Value* vGatherResult[2];
1095 Value *vMask;
1096
1097 // TODO: vGatherMaskedVal
1098 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1099
1100 // always have at least one component out of x or y to fetch
1101
1102 // save mask as it is zero'd out after each gather
1103 vMask = mask;
1104
1105 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1106 // e.g. result of first 8x32bit integer gather for 16bit components
1107 // 256i - 0 1 2 3 4 5 6 7
1108 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1109 //
1110
1111 // if we have at least one component out of z or w to fetch
1112 if(info.numComps > 2)
1113 {
1114 // offset base to the next components(zw) in the vertex to gather
1115 pSrcBase = GEP(pSrcBase, C((char)4));
1116 vMask = mask;
1117
1118 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1119 // e.g. result of second 8x32bit integer gather for 16bit components
1120 // 256i - 0 1 2 3 4 5 6 7
1121 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1122 //
1123 }
1124 else
1125 {
1126 vGatherResult[1] = vGatherMaskedVal;
1127 }
1128
1129 // Shuffle gathered components into place, each row is a component
1130 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1131
1132 }
1133 break;
1134 case 32:
1135 {
1136 // apply defaults
1137 for (uint32_t i = 0; i < 4; ++i)
1138 {
1139 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1140 }
1141
1142 for(uint32_t i = 0; i < info.numComps; i++)
1143 {
1144 uint32_t swizzleIndex = info.swizzle[i];
1145
1146 // save mask as it is zero'd out after each gather
1147 Value *vMask = mask;
1148
1149 // Gather a SIMD of components
1150 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1151
1152 // offset base to the next component to gather
1153 pSrcBase = GEP(pSrcBase, C((char)4));
1154 }
1155 }
1156 break;
1157 default:
1158 SWR_INVALID("unsupported format");
1159 break;
1160 }
1161 }
1162
1163 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1164 {
1165 // cast types
1166 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1167 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1168
1169 // input could either be float or int vector; do shuffle work in int
1170 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1171 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1172
1173 if(bPackedOutput)
1174 {
1175 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1176
1177 // shuffle mask
1178 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1179 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1180 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1181 // after pshufb: group components together in each 128bit lane
1182 // 256i - 0 1 2 3 4 5 6 7
1183 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1184
1185 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1186 // after PERMD: move and pack xy components into each 128bit lane
1187 // 256i - 0 1 2 3 4 5 6 7
1188 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1189
1190 // do the same for zw components
1191 Value* vi128ZW = nullptr;
1192 if(info.numComps > 2)
1193 {
1194 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1195 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1196 }
1197
1198 for(uint32_t i = 0; i < 4; i++)
1199 {
1200 uint32_t swizzleIndex = info.swizzle[i];
1201 // todo: fix for packed
1202 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1203 if(i >= info.numComps)
1204 {
1205 // set the default component val
1206 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1207 continue;
1208 }
1209
1210 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1211 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1212 // if x or y, use vi128XY permute result, else use vi128ZW
1213 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1214
1215 // extract packed component 128 bit lanes
1216 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1217 }
1218
1219 }
1220 else
1221 {
1222 // pshufb masks for each component
1223 Value* vConstMask[2];
1224 // x/z shuffle mask
1225 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1226 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1227
1228 // y/w shuffle mask
1229 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1230 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1231
1232
1233 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1234 // apply defaults
1235 for (uint32_t i = 0; i < 4; ++i)
1236 {
1237 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1238 }
1239
1240 for(uint32_t i = 0; i < info.numComps; i++)
1241 {
1242 uint32_t swizzleIndex = info.swizzle[i];
1243
1244 // select correct constMask for x/z or y/w pshufb
1245 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1246 // if x or y, use the first gather result (xy), else use the second (zw)
1247 uint32_t selectedGather = (i < 2) ? 0 : 1;
1248
1249 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1250 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1251 // 256i - 0 1 2 3 4 5 6 7
1252 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1253 }
1254 }
1255 }
1256
1257 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1258 {
1259 // cast types
1260 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1261 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1262
1263 if(bPackedOutput)
1264 {
1265 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1266 // shuffle mask
1267 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1268 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1269 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1270 // after pshufb: group components together in each 128bit lane
1271 // 256i - 0 1 2 3 4 5 6 7
1272 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1273
1274 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1275 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1276 // 256i - 0 1 2 3 4 5 6 7
1277 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1278
1279 // do the same for zw components
1280 Value* vi128ZW = nullptr;
1281 if(info.numComps > 2)
1282 {
1283 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1284 }
1285
1286 // for each component, either select the default value or extract the packed 128bit lane
1287 for(uint32_t i = 0; i < 4; i++)
1288 {
1289 uint32_t swizzleIndex = info.swizzle[i];
1290 // todo: fix for packed
1291 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1292 if(i >= info.numComps)
1293 {
1294 // set the default component val
1295 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1296 continue;
1297 }
1298
1299 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1300 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1301 // if x or y, use vi128XY permute result, else use vi128ZW
1302 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1303
1304 // extract packed component 128 bit lanes
1305 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1306 }
1307 }
1308 // else zero extend
1309 else{
1310 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1311 // apply defaults
1312 for (uint32_t i = 0; i < 4; ++i)
1313 {
1314 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1315 }
1316
1317 for(uint32_t i = 0; i < info.numComps; i++){
1318 uint32_t swizzleIndex = info.swizzle[i];
1319
1320 // pshufb masks for each component
1321 Value* vConstMask;
1322 switch(i)
1323 {
1324 case 0:
1325 // x shuffle mask
1326 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1327 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1328 break;
1329 case 1:
1330 // y shuffle mask
1331 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1332 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1333 break;
1334 case 2:
1335 // z shuffle mask
1336 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1337 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1338 break;
1339 case 3:
1340 // w shuffle mask
1341 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1342 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1343 break;
1344 default:
1345 vConstMask = nullptr;
1346 break;
1347 }
1348
1349 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1350 // after pshufb for x channel
1351 // 256i - 0 1 2 3 4 5 6 7
1352 // x000 x000 x000 x000 x000 x000 x000 x000
1353 }
1354 }
1355 }
1356
1357 // Helper function to create alloca in entry block of function
1358 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1359 {
1360 auto saveIP = IRB()->saveIP();
1361 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1362 pFunc->getEntryBlock().begin());
1363 Value* pAlloca = ALLOCA(pType);
1364 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1365 return pAlloca;
1366 }
1367
1368 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
1369 {
1370 auto saveIP = IRB()->saveIP();
1371 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1372 pFunc->getEntryBlock().begin());
1373 Value* pAlloca = ALLOCA(pType, pArraySize);
1374 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1375 return pAlloca;
1376 }
1377
1378 //////////////////////////////////////////////////////////////////////////
1379 /// @brief emulates a scatter operation.
1380 /// @param pDst - pointer to destination
1381 /// @param vSrc - vector of src data to scatter
1382 /// @param vOffsets - vector of byte offsets from pDst
1383 /// @param vMask - mask of valid lanes
1384 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1385 {
1386 /* Scatter algorithm
1387
1388 while(Index = BitScanForward(mask))
1389 srcElem = srcVector[Index]
1390 offsetElem = offsetVector[Index]
1391 *(pDst + offsetElem) = srcElem
1392 Update mask (mask &= ~(1 << Index))
1393
1394 */
1395
1396 BasicBlock* pCurBB = IRB()->GetInsertBlock();
1397 Function* pFunc = pCurBB->getParent();
1398 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1399
1400 // Store vectors on stack
1401 if (pScatterStackSrc == nullptr)
1402 {
1403 // Save off stack allocations and reuse per scatter. Significantly reduces stack
1404 // requirements for shaders with a lot of scatters.
1405 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1406 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1407 }
1408
1409 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1410 Value* pOffsetsArrayPtr = pScatterStackOffsets;
1411 STORE(vSrc, pSrcArrayPtr);
1412 STORE(vOffsets, pOffsetsArrayPtr);
1413
1414 // Cast to pointers for random access
1415 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1416 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1417
1418 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1419
1420 // Get cttz function
1421 Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1422
1423 // Setup loop basic block
1424 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
1425
1426 // compute first set bit
1427 Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1428
1429 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1430
1431 // Split current block
1432 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1433
1434 // Remove unconditional jump created by splitBasicBlock
1435 pCurBB->getTerminator()->eraseFromParent();
1436
1437 // Add terminator to end of original block
1438 IRB()->SetInsertPoint(pCurBB);
1439
1440 // Add conditional branch
1441 COND_BR(pIsUndef, pPostLoop, pLoop);
1442
1443 // Add loop basic block contents
1444 IRB()->SetInsertPoint(pLoop);
1445 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1446 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1447
1448 pIndexPhi->addIncoming(pIndex, pCurBB);
1449 pMaskPhi->addIncoming(pMask, pCurBB);
1450
1451 // Extract elements for this index
1452 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1453 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1454
1455 // GEP to this offset in dst
1456 Value* pCurDst = GEP(pDst, pOffsetElem);
1457 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1458 STORE(pSrcElem, pCurDst);
1459
1460 // Update the mask
1461 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1462
1463 // Terminator
1464 Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1465
1466 pIsUndef = ICMP_EQ(pNewIndex, C(32));
1467 COND_BR(pIsUndef, pPostLoop, pLoop);
1468
1469 // Update phi edges
1470 pIndexPhi->addIncoming(pNewIndex, pLoop);
1471 pMaskPhi->addIncoming(pNewMask, pLoop);
1472
1473 // Move builder to beginning of post loop
1474 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1475 }
1476
1477 Value* Builder::VABSPS(Value* a)
1478 {
1479 Value* asInt = BITCAST(a, mSimdInt32Ty);
1480 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1481 return result;
1482 }
1483
1484 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1485 {
1486 Value *lowCmp = ICMP_SLT(src, low);
1487 Value *ret = SELECT(lowCmp, low, src);
1488
1489 Value *highCmp = ICMP_SGT(ret, high);
1490 ret = SELECT(highCmp, high, ret);
1491
1492 return ret;
1493 }
1494
1495 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1496 {
1497 Value *lowCmp = FCMP_OLT(src, low);
1498 Value *ret = SELECT(lowCmp, low, src);
1499
1500 Value *highCmp = FCMP_OGT(ret, high);
1501 ret = SELECT(highCmp, high, ret);
1502
1503 return ret;
1504 }
1505
1506 Value *Builder::FCLAMP(Value* src, float low, float high)
1507 {
1508 Value* result = VMAXPS(src, VIMMED1(low));
1509 result = VMINPS(result, VIMMED1(high));
1510
1511 return result;
1512 }
1513
1514 //////////////////////////////////////////////////////////////////////////
1515 /// @brief save/restore stack, providing ability to push/pop the stack and
1516 /// reduce overall stack requirements for temporary stack use
1517 Value* Builder::STACKSAVE()
1518 {
1519 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1520 return CALLA(pfnStackSave);
1521 }
1522
1523 void Builder::STACKRESTORE(Value* pSaved)
1524 {
1525 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1526 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1527 }
1528
1529 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1530 {
1531 Value* vOut;
1532 // use FMADs if available
1533 if(JM()->mArch.AVX2())
1534 {
1535 vOut = VFMADDPS(a, b, c);
1536 }
1537 else
1538 {
1539 vOut = FADD(FMUL(a, b), c);
1540 }
1541 return vOut;
1542 }
1543
1544 Value* Builder::POPCNT(Value* a)
1545 {
1546 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1547 return CALL(pCtPop, std::initializer_list<Value*>{a});
1548 }
1549
1550 //////////////////////////////////////////////////////////////////////////
1551 /// @brief C functions called by LLVM IR
1552 //////////////////////////////////////////////////////////////////////////
1553
1554 //////////////////////////////////////////////////////////////////////////
1555 /// @brief called in JIT code, inserted by PRINT
1556 /// output to both stdout and visual studio debug console
1557 void __cdecl CallPrint(const char* fmt, ...)
1558 {
1559 va_list args;
1560 va_start(args, fmt);
1561 vprintf(fmt, args);
1562
1563 #if defined( _WIN32 )
1564 char strBuf[1024];
1565 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1566 OutputDebugStringA(strBuf);
1567 #endif
1568
1569 va_end(args);
1570 }
1571
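//////////////////////////////////////////////////////////////////////////
/// @brief Emulate VEXTRACTI128 with a shuffle: returns the lower (imm8 == 0)
/// or upper (imm8 != 0) half of vector a.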
1572 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1573 {
1574 bool flag = !imm8->isZeroValue();
1575 SmallVector<Constant*,8> idx;
1576 for (unsigned i = 0; i < mVWidth / 2; i++) {
1577 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1578 }
1579 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1580 }
1581
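//////////////////////////////////////////////////////////////////////////
/// @brief Emulate VINSERTI128 with shuffles: replaces the lower (imm8 == 0)
/// or upper (imm8 != 0) half of a with the first mVWidth/2 elements of b.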
1582 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1583 {
1584 bool flag = !imm8->isZeroValue();
1585 SmallVector<Constant*,8> idx;
1586 for (unsigned i = 0; i < mVWidth; i++) {
1587 idx.push_back(C(i));
1588 }
1589 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1590
1591 SmallVector<Constant*,8> idx2;
1592 for (unsigned i = 0; i < mVWidth / 2; i++) {
1593 idx2.push_back(C(flag ? i : i + mVWidth));
1594 }
1595 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1596 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1597 }
1598 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1599 }
1600
1601 // rdtsc buckets macros
1602 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1603 {
1604 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1605 // buckets framework when single threaded
1606 if (KNOB_SINGLE_THREADED)
1607 {
1608 std::vector<Type*> args{
1609 PointerType::get(mInt32Ty, 0), // pBucketMgr
1610 mInt32Ty // id
1611 };
1612
1613 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1614 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1615 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1616 {
1617 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1618 }
1619
1620 CALL(pFunc, { pBucketMgr, pId });
1621 }
1622 }
1623
1624 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1625 {
1626 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1627 // buckets framework when single threaded
1628 if (KNOB_SINGLE_THREADED)
1629 {
1630 std::vector<Type*> args{
1631 PointerType::get(mInt32Ty, 0), // pBucketMgr
1632 mInt32Ty // id
1633 };
1634
1635 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1636 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1637 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1638 {
1639 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1640 }
1641
1642 CALL(pFunc, { pBucketMgr, pId });
1643 }
1644 }
1645
1646
1647 uint32_t Builder::GetTypeSize(Type* pType)
1648 {
1649 if (pType->isStructTy())
1650 {
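// NOTE: this sizes the struct as numElems copies of the first element's type;
// it assumes a homogeneous struct and ignores padding.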
1651 uint32_t numElems = pType->getStructNumElements();
1652 Type* pElemTy = pType->getStructElementType(0);
1653 return numElems * GetTypeSize(pElemTy);
1654 }
1655
1656 if (pType->isArrayTy())
1657 {
1658 uint32_t numElems = pType->getArrayNumElements();
1659 Type* pElemTy = pType->getArrayElementType();
1660 return numElems * GetTypeSize(pElemTy);
1661 }
1662
1663 if (pType->isIntegerTy())
1664 {
1665 uint32_t bitSize = pType->getIntegerBitWidth();
1666 return bitSize / 8;
1667 }
1668
1669 if (pType->isFloatTy())
1670 {
1671 return 4;
1672 }
1673
1674 if (pType->isHalfTy())
1675 {
1676 return 2;
1677 }
1678
1679 if (pType->isDoubleTy())
1680 {
1681 return 8;
1682 }
1683
1684 SWR_ASSERT(false, "Unimplemented type.");
1685 return 0;
1686 }
1687 }