swr: [rasterizer jitter] fix logic op to work with unorm/snorm
[mesa.git] / src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32
33
34 namespace SwrJit
35 {
36 void __cdecl CallPrint(const char* fmt, ...);
37
38 //////////////////////////////////////////////////////////////////////////
39 /// @brief Convert an IEEE 754 32-bit single precision float to a
40 /// 16-bit half-precision float with 5 exponent bits and
41 /// 10 mantissa bits.
42 /// @param val - 32-bit float
43 /// @todo Maybe move this outside of this file into a header?
44 static uint16_t Convert32To16Float(float val)
45 {
46 uint32_t sign, exp, mant;
47 uint32_t roundBits;
48
49 // Extract the sign, exponent, and mantissa
50 uint32_t uf = *(uint32_t*)&val;
51 sign = (uf & 0x80000000) >> 31;
52 exp = (uf & 0x7F800000) >> 23;
53 mant = uf & 0x007FFFFF;
54
55 // Check for out of range
56 if (std::isnan(val))
57 {
58 exp = 0x1F;
59 mant = 0x200;
60 sign = 1; // set the sign bit for NANs
61 }
62 else if (std::isinf(val))
63 {
64 exp = 0x1f;
65 mant = 0x0;
66 }
67 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
68 {
69 exp = 0x1E;
70 mant = 0x3FF;
71 }
72 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
73 {
74 mant |= 0x00800000;
75 for (; exp <= 0x70; mant >>= 1, exp++)
76 ;
77 exp = 0;
78 mant = mant >> 13;
79 }
80 else if (exp < 0x66) // Too small to represent -> Zero
81 {
82 exp = 0;
83 mant = 0;
84 }
85 else
86 {
87 // Saves bits that will be shifted off for rounding
88 roundBits = mant & 0x1FFFu;
89 // convert exponent and mantissa to 16 bit format
90 exp = exp - 0x70;
91 mant = mant >> 13;
92
93 // Essentially RTZ, but round up if off by only 1 lsb
94 if (roundBits == 0x1FFFu)
95 {
96 mant++;
97 // check for overflow
98 if ((mant & 0xC00u) != 0)
99 exp++;
100 // make sure only the needed bits are used
101 mant &= 0x3FF;
102 }
103 }
104
105 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
106 return (uint16_t)tmpVal;
107 }
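// Worked example: 1.0f (0x3F800000; sign 0, exp 0x7F, mant 0) rebiases to
// exp 0x7F - 0x70 = 0x0F with mant 0, i.e. the half encoding 0x3C00. Inputs
// above the half range (biased exp > 0x8E) clamp to 0x7BFF (65504.0).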
108
109 //////////////////////////////////////////////////////////////////////////
110 /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
111 /// float
112 /// @param val - 16-bit float
113 /// @todo Maybe move this outside of this file into a header?
114 static float ConvertSmallFloatTo32(UINT val)
115 {
116 UINT result;
117 if ((val & 0x7fff) == 0)
118 {
119 result = ((uint32_t)(val & 0x8000)) << 16;
120 }
121 else if ((val & 0x7c00) == 0x7c00)
122 {
123 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
124 result |= ((uint32_t)val & 0x8000) << 16;
125 }
126 else
127 {
128 uint32_t sign = (val & 0x8000) << 16;
129 uint32_t mant = (val & 0x3ff) << 13;
130 uint32_t exp = (val >> 10) & 0x1f;
131 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
132 {
133 mant <<= 1;
134 while (mant < (0x400 << 13))
135 {
136 exp--;
137 mant <<= 1;
138 }
139 mant &= (0x3ff << 13);
140 }
141 exp = ((exp - 15 + 127) & 0xff) << 23;
142 result = sign | exp | mant;
143 }
144
145 return *(float*)&result;
146 }
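// Worked example: half 0x3C00 (sign 0, exp 15, mant 0) rebiases to
// exp 15 - 15 + 127 = 127, giving 0x3F800000 == 1.0f. 0x7C00 maps to +infinity
// and any half NaN maps to the canonical float NaN 0x7FC00000.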
147
148 Constant *Builder::C(bool i)
149 {
150 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
151 }
152
153 Constant *Builder::C(char i)
154 {
155 return ConstantInt::get(IRB()->getInt8Ty(), i);
156 }
157
158 Constant *Builder::C(uint8_t i)
159 {
160 return ConstantInt::get(IRB()->getInt8Ty(), i);
161 }
162
163 Constant *Builder::C(int i)
164 {
165 return ConstantInt::get(IRB()->getInt32Ty(), i);
166 }
167
168 Constant *Builder::C(int64_t i)
169 {
170 return ConstantInt::get(IRB()->getInt64Ty(), i);
171 }
172
173 Constant *Builder::C(uint16_t i)
174 {
175 return ConstantInt::get(mInt16Ty,i);
176 }
177
178 Constant *Builder::C(uint32_t i)
179 {
180 return ConstantInt::get(IRB()->getInt32Ty(), i);
181 }
182
183 Constant *Builder::C(float i)
184 {
185 return ConstantFP::get(IRB()->getFloatTy(), i);
186 }
187
188 Constant *Builder::PRED(bool pred)
189 {
190 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
191 }
192
193 Value *Builder::VIMMED1(int i)
194 {
195 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
196 }
197
198 Value *Builder::VIMMED1(uint32_t i)
199 {
200 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
201 }
202
203 Value *Builder::VIMMED1(float i)
204 {
205 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
206 }
207
208 Value *Builder::VIMMED1(bool i)
209 {
210 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
211 }
212
213 Value *Builder::VUNDEF_IPTR()
214 {
215 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
216 }
217
218 Value *Builder::VUNDEF_I()
219 {
220 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
221 }
222
223 Value *Builder::VUNDEF(Type *ty, uint32_t size)
224 {
225 return UndefValue::get(VectorType::get(ty, size));
226 }
227
228 Value *Builder::VUNDEF_F()
229 {
230 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
231 }
232
233 Value *Builder::VUNDEF(Type* t)
234 {
235 return UndefValue::get(VectorType::get(t, mVWidth));
236 }
237
238 #if HAVE_LLVM == 0x306
239 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
240 {
241 return VINSERT(vec, val, C((int64_t)index));
242 }
243 #endif
244
245 Value *Builder::VBROADCAST(Value *src)
246 {
247 // check if src is already a vector
248 if (src->getType()->isVectorTy())
249 {
250 return src;
251 }
252
253 return VECTOR_SPLAT(mVWidth, src);
254 }
255
256 uint32_t Builder::IMMED(Value* v)
257 {
258 SWR_ASSERT(isa<ConstantInt>(v));
259 ConstantInt *pValConst = cast<ConstantInt>(v);
260 return pValConst->getZExtValue();
261 }
262
263 int32_t Builder::S_IMMED(Value* v)
264 {
265 SWR_ASSERT(isa<ConstantInt>(v));
266 ConstantInt *pValConst = cast<ConstantInt>(v);
267 return pValConst->getSExtValue();
268 }
269
270 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
271 {
272 std::vector<Value*> indices;
273 for (auto i : indexList)
274 indices.push_back(i);
275 return GEPA(ptr, indices);
276 }
277
278 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
279 {
280 std::vector<Value*> indices;
281 for (auto i : indexList)
282 indices.push_back(C(i));
283 return GEPA(ptr, indices);
284 }
285
286 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
287 {
288 std::vector<Value*> valIndices;
289 for (auto i : indices)
290 valIndices.push_back(C(i));
291 return LOAD(GEPA(basePtr, valIndices), name);
292 }
293
294 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
295 {
296 std::vector<Value*> valIndices;
297 for (auto i : indices)
298 valIndices.push_back(i);
299 return LOAD(GEPA(basePtr, valIndices), name);
300 }
301
302 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
303 {
304 std::vector<Value*> valIndices;
305 for (auto i : indices)
306 valIndices.push_back(C(i));
307 return STORE(val, GEPA(basePtr, valIndices));
308 }
309
310 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
311 {
312 std::vector<Value*> valIndices;
313 for (auto i : indices)
314 valIndices.push_back(i);
315 return STORE(val, GEPA(basePtr, valIndices));
316 }
317
318 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
319 {
320 std::vector<Value*> args;
321 for (auto arg : argsList)
322 args.push_back(arg);
323 return CALLA(Callee, args);
324 }
325
326 #if HAVE_LLVM > 0x306
327 CallInst *Builder::CALL(Value *Callee, Value* arg)
328 {
329 std::vector<Value*> args;
330 args.push_back(arg);
331 return CALLA(Callee, args);
332 }
333
334 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
335 {
336 std::vector<Value*> args;
337 args.push_back(arg1);
338 args.push_back(arg2);
339 return CALLA(Callee, args);
340 }
341
342 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
343 {
344 std::vector<Value*> args;
345 args.push_back(arg1);
346 args.push_back(arg2);
347 args.push_back(arg3);
348 return CALLA(Callee, args);
349 }
350 #endif
351
352 Value *Builder::VRCP(Value *va)
353 {
354 return FDIV(VIMMED1(1.0f), va); // 1 / a
355 }
356
357 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
358 {
359 Value* vOut = FMADDPS(vA, vX, vC);
360 vOut = FMADDPS(vB, vY, vOut);
361 return vOut;
362 }
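// VPLANEPS evaluates the plane equation vA * vX + vB * vY + vC per SIMD lane,
// e.g. for interpolating a triangle attribute at per-sample (x, y) positions.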
363
364 //////////////////////////////////////////////////////////////////////////
365 /// @brief Generate an i32 masked load operation in LLVM IR. If not
366 /// supported on the underlying platform, emulate it with float masked load
367 /// @param src - base address pointer for the load
368 /// @param vMask - SIMD wide mask that controls whether to access memory or load 0
369 Value *Builder::MASKLOADD(Value* src,Value* mask)
370 {
371 Value* vResult;
372 // use avx2 maskload instruction if available
373 if(JM()->mArch.AVX2())
374 {
375 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
376 vResult = CALL(func,{src,mask});
377 }
378 else
379 {
380 // maskload intrinsic expects integer mask operand in llvm >= 3.8
381 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
382 mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
383 #else
384 mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
385 #endif
386 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
387 vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
388 }
389 return vResult;
390 }
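// Illustrative usage sketch (pBufferAddr and vLaneEnable are hypothetical values):
//   Value* vData = MASKLOADD(pBufferAddr, VMASK(vLaneEnable));
// Lanes whose 32-bit mask element has the sign bit set read from memory;
// masked-off lanes return 0.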
391
392 //////////////////////////////////////////////////////////////////////////
393 /// @brief insert a JIT call to CallPrint
394 /// - outputs formatted string to both stdout and VS output window
395 /// - DEBUG builds only
396 /// Usage example:
397 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
398 /// where C(lane) creates a constant value to print, and pIndex is the Value*
399 /// result from a GEP, printing out the pointer to memory
400 /// @param printStr - constant string to print, which includes format specifiers
401 /// @param printArgs - initializer list of Value*'s to print to std out
402 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
403 {
404 // push the arguments to CallPrint into a vector
405 std::vector<Value*> printCallArgs;
406 // save room for the format string. we still need to modify it for vectors
407 printCallArgs.resize(1);
408
409 // search through the format string for special processing
410 size_t pos = 0;
411 std::string tempStr(printStr);
412 pos = tempStr.find('%', pos);
413 auto v = printArgs.begin();
414
415 while ((pos != std::string::npos) && (v != printArgs.end()))
416 {
417 Value* pArg = *v;
418 Type* pType = pArg->getType();
419
420 if (pType->isVectorTy())
421 {
422 Type* pContainedType = pType->getContainedType(0);
423
424 if (toupper(tempStr[pos + 1]) == 'X')
425 {
426 tempStr[pos] = '0';
427 tempStr[pos + 1] = 'x';
428 tempStr.insert(pos + 2, "%08X ");
429 pos += 7;
430
431 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
432
433 std::string vectorFormatStr;
434 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
435 {
436 vectorFormatStr += "0x%08X ";
437 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
438 }
439
440 tempStr.insert(pos, vectorFormatStr);
441 pos += vectorFormatStr.size();
442 }
443 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
444 {
445 uint32_t i = 0;
446 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
447 {
448 tempStr.insert(pos, std::string("%f "));
449 pos += 3;
450 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
451 }
452 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
453 }
454 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
455 {
456 uint32_t i = 0;
457 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
458 {
459 tempStr.insert(pos, std::string("%d "));
460 pos += 3;
461 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
462 }
463 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
464 }
465 }
466 else
467 {
468 if (toupper(tempStr[pos + 1]) == 'X')
469 {
470 tempStr[pos] = '0';
471 tempStr.insert(pos + 1, "x%08");
472 printCallArgs.push_back(pArg);
473 pos += 3;
474 }
475 // for %f we need to cast float Values to doubles so that they print out correctly
476 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
477 {
478 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
479 pos++;
480 }
481 else
482 {
483 printCallArgs.push_back(pArg);
484 }
485 }
486
487 // advance to the next argument
488 v++;
489 pos = tempStr.find('%', ++pos);
490 }
491
492 // create global variable constant string
493 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
494 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
495 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
496
497 // get a pointer to the first character in the constant string array
498 std::vector<Constant*> geplist{C(0),C(0)};
499 #if HAVE_LLVM == 0x306
500 Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
501 #else
502 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
503 #endif
504
505 // insert the pointer to the format string in the argument vector
506 printCallArgs[0] = strGEP;
507
508 // get pointer to CallPrint function and insert decl into the module if needed
509 std::vector<Type*> args;
510 args.push_back(PointerType::get(mInt8Ty,0));
511 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
512 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
513
514 // if we haven't yet added the symbol to the symbol table
515 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
516 {
517 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
518 }
519
520 // insert a call to CallPrint
521 return CALLA(callPrintFn,printCallArgs);
522 }
523
524 //////////////////////////////////////////////////////////////////////////
525 /// @brief Wrapper around PRINT with initializer list.
526 CallInst* Builder::PRINT(const std::string &printStr)
527 {
528 return PRINT(printStr, {});
529 }
530
531 //////////////////////////////////////////////////////////////////////////
532 /// @brief Generate a masked gather operation in LLVM IR. If not
533 /// supported on the underlying platform, emulate it with loads
534 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
535 /// @param pBase - Int8* base VB address pointer value
536 /// @param vIndices - SIMD wide value of VB byte offsets
537 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
538 /// @param scale - value to scale indices by
539 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
540 {
541 Value* vGather;
542
543 // use avx2 gather instruction if available
544 if(JM()->mArch.AVX2())
545 {
546 // force mask to <N x float>, required by vgather
547 vMask = BITCAST(vMask, mSimdFP32Ty);
548 vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
549 }
550 else
551 {
552 Value* pStack = STACKSAVE();
553
554 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
555 Value* vSrcPtr = ALLOCA(vSrc->getType());
556 STORE(vSrc, vSrcPtr);
557
558 vGather = VUNDEF_F();
559 Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
560 Value *vOffsets = MUL(vIndices,vScaleVec);
561 Value *mask = MASK(vMask);
562 for(uint32_t i = 0; i < mVWidth; ++i)
563 {
564 // single component byte index
565 Value *offset = VEXTRACT(vOffsets,C(i));
566 // byte pointer to component
567 Value *loadAddress = GEP(pBase,offset);
568 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
569 // pointer to the value to load if we're masking off a component
570 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
571 Value *selMask = VEXTRACT(mask,C(i));
572 // switch in a safe stack address to load from if the lane is masked off
573 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
574 Value *val = LOAD(validAddress);
575 vGather = VINSERT(vGather,val,C(i));
576 }
577 STACKRESTORE(pStack);
578 }
579
580 return vGather;
581 }
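// Per lane the gather above is equivalent to (illustrative pseudocode):
//   vGather[i] = mask[i] ? *(float*)(pBase + vIndices[i] * scale) : vSrc[i];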
582
583 //////////////////////////////////////////////////////////////////////////
584 /// @brief Generate a masked gather operation in LLVM IR. If not
585 /// supported on the underlying platform, emulate it with loads
586 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
587 /// @param pBase - Int8* base VB address pointer value
588 /// @param vIndices - SIMD wide value of VB byte offsets
589 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
590 /// @param scale - value to scale indices by
591 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
592 {
593 Value* vGather;
594
595 // use avx2 gather instruction if available
596 if(JM()->mArch.AVX2())
597 {
598 vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
599 }
600 else
601 {
602 Value* pStack = STACKSAVE();
603
604 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
605 Value* vSrcPtr = ALLOCA(vSrc->getType());
606 STORE(vSrc, vSrcPtr);
607
608 vGather = VUNDEF_I();
609 Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
610 Value *vOffsets = MUL(vIndices, vScaleVec);
611 Value *mask = MASK(vMask);
612 for(uint32_t i = 0; i < mVWidth; ++i)
613 {
614 // single component byte index
615 Value *offset = VEXTRACT(vOffsets, C(i));
616 // byte pointer to component
617 Value *loadAddress = GEP(pBase, offset);
618 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
619 // pointer to the value to load if we're masking off a component
620 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
621 Value *selMask = VEXTRACT(mask, C(i));
622 // switch in a safe stack address to load from if the lane is masked off
623 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
624 Value *val = LOAD(validAddress, C(0));
625 vGather = VINSERT(vGather, val, C(i));
626 }
627
628 STACKRESTORE(pStack);
629 }
630 return vGather;
631 }
632
633 //////////////////////////////////////////////////////////////////////////
634 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
635 Value* Builder::MASK(Value* vmask)
636 {
637 Value* src = BITCAST(vmask, mSimdInt32Ty);
638 return ICMP_SLT(src, VIMMED1(0));
639 }
640
641 //////////////////////////////////////////////////////////////////////////
642 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
643 Value* Builder::VMASK(Value* mask)
644 {
645 return S_EXT(mask, mSimdInt32Ty);
646 }
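// Example: an <8 x i1> mask <1,1,0,...> becomes the x86-style <8 x i32> mask
// <-1,-1,0,...> via VMASK (sign extension); MASK() recovers the <8 x i1> form
// by testing the sign bit of each 32-bit lane.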
647
648 //////////////////////////////////////////////////////////////////////////
649 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
650 /// supported on the underlying platform, emulate it
651 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
652 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
653 /// Byte masks in the lower 128-bit lane of b select 8-bit values from the
654 /// lower 128 bits of a, and likewise for the upper lane. If the mask
655 /// value is negative, '0' is inserted.
656 Value *Builder::PSHUFB(Value* a, Value* b)
657 {
658 Value* res;
659 // use avx2 pshufb instruction if available
660 if(JM()->mArch.AVX2())
661 {
662 res = VPSHUFB(a, b);
663 }
664 else
665 {
666 Constant* cB = dyn_cast<Constant>(b);
667 // number of 8 bit elements in b
668 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
669 // output vector
670 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
671
672 // insert an 8 bit value from the high and low lanes of a per loop iteration
673 numElms /= 2;
674 for(uint32_t i = 0; i < numElms; i++)
675 {
676 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
677 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
678
679 // extract values from constant mask
680 char valLow128bLane = (char)(cLow128b->getSExtValue());
681 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
682
683 Value* insertValLow128b;
684 Value* insertValHigh128b;
685
686 // if the mask value is negative, insert a '0' in the respective output position
687 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
688 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
689 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
690
691 vShuf = VINSERT(vShuf, insertValLow128b, i);
692 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
693 }
694 res = vShuf;
695 }
696 return res;
697 }
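// Per output byte i the shuffle above computes (illustrative pseudocode):
//   res[i] = (b[i] < 0) ? 0 : a[(b[i] & 0xF) + (i < 16 ? 0 : 16)];
// i.e. selections never cross the 128-bit halves of a.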
698
699 //////////////////////////////////////////////////////////////////////////
700 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
701 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
702 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
703 /// lower 8 values are used.
704 Value *Builder::PMOVSXBD(Value* a)
705 {
706 // llvm-3.9 removed the pmovsxbd intrinsic
707 #if HAVE_LLVM < 0x309
708 // use avx2 byte sign extend instruction if available
709 if(JM()->mArch.AVX2())
710 {
711 Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
712 return CALL(pmovsxbd, std::initializer_list<Value*>{a});
713 }
714 else
715 #endif
716 {
717 // VPMOVSXBD output type
718 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
719 // Extract 8 values from 128bit lane and sign extend
720 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
721 }
722 }
723
724 //////////////////////////////////////////////////////////////////////////
725 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
726 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
727 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
728 Value *Builder::PMOVSXWD(Value* a)
729 {
730 // llvm-3.9 removed the pmovsxwd intrinsic
731 #if HAVE_LLVM < 0x309
732 // use avx2 word sign extend if available
733 if(JM()->mArch.AVX2())
734 {
735 Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
736 return CALL(pmovsxwd, std::initializer_list<Value*>{a});
737 }
738 else
739 #endif
740 {
741 // VPMOVSXWD output type
742 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
743 // Extract 8 values from 128bit lane and sign extend
744 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
745 }
746 }
747
748 //////////////////////////////////////////////////////////////////////////
749 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
750 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
751 /// platform, emulate it
752 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
753 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
754 Value *Builder::PERMD(Value* a, Value* idx)
755 {
756 Value* res;
757 // use avx2 permute instruction if available
758 if(JM()->mArch.AVX2())
759 {
760 res = VPERMD(a, idx);
761 }
762 else
763 {
764 if (isa<Constant>(idx))
765 {
766 res = VSHUFFLE(a, a, idx);
767 }
768 else
769 {
770 res = VUNDEF_I();
771 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
772 {
773 Value* pIndex = VEXTRACT(idx, C(l));
774 Value* pVal = VEXTRACT(a, pIndex);
775 res = VINSERT(res, pVal, C(l));
776 }
777 }
778 }
779 return res;
780 }
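// Example: PERMD(a, C<int32_t>({7, 6, 5, 4, 3, 2, 1, 0})) reverses the eight
// 32-bit elements of a, including moves across the 128-bit lane boundary.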
781
782 //////////////////////////////////////////////////////////////////////////
783 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
784 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
785 /// platform, emulate it
786 /// @param a - 256bit SIMD lane(8x32bit) of float values.
787 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
788 Value *Builder::PERMPS(Value* a, Value* idx)
789 {
790 Value* res;
791 // use avx2 permute instruction if available
792 if (JM()->mArch.AVX2())
793 {
794 // llvm 3.6.0 swapped the order of the args to vpermd
795 res = VPERMPS(idx, a);
796 }
797 else
798 {
799 if (isa<Constant>(idx))
800 {
801 res = VSHUFFLE(a, a, idx);
802 }
803 else
804 {
805 res = VUNDEF_F();
806 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
807 {
808 Value* pIndex = VEXTRACT(idx, C(l));
809 Value* pVal = VEXTRACT(a, pIndex);
810 res = VINSERT(res, pVal, C(l));
811 }
812 }
813 }
814
815 return res;
816 }
817
818 //////////////////////////////////////////////////////////////////////////
819 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
820 /// in LLVM IR. If not supported on the underlying platform, emulate it
821 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
822 Value *Builder::CVTPH2PS(Value* a)
823 {
824 if (JM()->mArch.F16C())
825 {
826 return VCVTPH2PS(a);
827 }
828 else
829 {
830 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
831 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
832
833 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
834 {
835 sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
836 }
837
838 Value* pResult = UndefValue::get(mSimdFP32Ty);
839 for (uint32_t i = 0; i < mVWidth; ++i)
840 {
841 Value* pSrc = VEXTRACT(a, C(i));
842 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
843 pResult = VINSERT(pResult, pConv, C(i));
844 }
845
846 return pResult;
847 }
848 }
849
850 //////////////////////////////////////////////////////////////////////////
851 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
852 /// in LLVM IR. If not supported on the underlying platform, emulate it
853 /// @param a - 256bit SIMD lane(8x32bit) of float32 values.
854 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
855 {
856 if (JM()->mArch.F16C())
857 {
858 return VCVTPS2PH(a, rounding);
859 }
860 else
861 {
862 // call scalar C function for now
863 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
864 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
865
866 if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
867 {
868 sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
869 }
870
871 Value* pResult = UndefValue::get(mSimdInt16Ty);
872 for (uint32_t i = 0; i < mVWidth; ++i)
873 {
874 Value* pSrc = VEXTRACT(a, C(i));
875 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
876 pResult = VINSERT(pResult, pConv, C(i));
877 }
878
879 return pResult;
880 }
881 }
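// Round-trip example: CVTPH2PS(CVTPS2PH(x, rounding)) returns x for any value
// exactly representable in half precision, e.g. 1.0f (half 0x3C00) or 0.5f (0x3800).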
882
883 Value *Builder::PMAXSD(Value* a, Value* b)
884 {
885 // llvm-3.9 removed the pmax intrinsics
886 #if HAVE_LLVM >= 0x309
887 Value* cmp = ICMP_SGT(a, b);
888 return SELECT(cmp, a, b);
889 #else
890 if (JM()->mArch.AVX2())
891 {
892 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
893 return CALL(pmaxsd, {a, b});
894 }
895 else
896 {
897 // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
898 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
899
900 // low 128
901 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
902 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
903 Value* resLo = CALL(pmaxsd, {aLo, bLo});
904
905 // high 128
906 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
907 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
908 Value* resHi = CALL(pmaxsd, {aHi, bHi});
909
910 // combine
911 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
912 result = VINSERTI128(result, resHi, C((uint8_t)1));
913
914 return result;
915 }
916 #endif
917 }
918
919 Value *Builder::PMINSD(Value* a, Value* b)
920 {
921 // llvm-3.9 removed the pmin intrinsics
922 #if HAVE_LLVM >= 0x309
923 Value* cmp = ICMP_SLT(a, b);
924 return SELECT(cmp, a, b);
925 #else
926 if (JM()->mArch.AVX2())
927 {
928 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
929 return CALL(pminsd, {a, b});
930 }
931 else
932 {
933 // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
934 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
935
936 // low 128
937 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
938 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
939 Value* resLo = CALL(pminsd, {aLo, bLo});
940
941 // high 128
942 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
943 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
944 Value* resHi = CALL(pminsd, {aHi, bHi});
945
946 // combine
947 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
948 result = VINSERTI128(result, resHi, C((uint8_t)1));
949
950 return result;
951 }
952 #endif
953 }
954
955 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
956 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
957 {
958 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
959 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
960 {
961 // ensure our mask is the correct type
962 mask = BITCAST(mask, mSimdFP32Ty);
963 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
964 }
965 else
966 {
967 // ensure our mask is the correct type
968 mask = BITCAST(mask, mSimdInt32Ty);
969 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
970 }
971 }
972
973 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
974 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
975 {
976 switch(info.bpp / info.numComps)
977 {
978 case 16:
979 {
980 Value* vGatherResult[2];
981 Value *vMask;
982
983 // TODO: vGatherMaskedVal
984 Value* vGatherMaskedVal = VIMMED1((float)0);
985
986 // always have at least one component out of x or y to fetch
987
988 // save mask as it is zero'd out after each gather
989 vMask = mask;
990
991 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
992 // e.g. result of first 8x32bit integer gather for 16bit components
993 // 256i - 0 1 2 3 4 5 6 7
994 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
995 //
996
997 // if we have at least one component out of z or w to fetch
998 if(info.numComps > 2)
999 {
1000 // offset base to the next components(zw) in the vertex to gather
1001 pSrcBase = GEP(pSrcBase, C((char)4));
1002 vMask = mask;
1003
1004 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1005 // e.g. result of second 8x32bit integer gather for 16bit components
1006 // 256i - 0 1 2 3 4 5 6 7
1007 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1008 //
1009 }
1010 else
1011 {
1012 vGatherResult[1] = vGatherMaskedVal;
1013 }
1014
1015 // Shuffle gathered components into place, each row is a component
1016 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1017 }
1018 break;
1019 case 32:
1020 {
1021 // apply defaults
1022 for (uint32_t i = 0; i < 4; ++i)
1023 {
1024 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1025 }
1026
1027 for(uint32_t i = 0; i < info.numComps; i++)
1028 {
1029 uint32_t swizzleIndex = info.swizzle[i];
1030
1031 // save mask as it is zero'd out after each gather
1032 Value *vMask = mask;
1033
1034 // Gather a SIMD of components
1035 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1036
1037 // offset base to the next component to gather
1038 pSrcBase = GEP(pSrcBase, C((char)4));
1039 }
1040 }
1041 break;
1042 default:
1043 SWR_ASSERT(0, "Invalid float format");
1044 break;
1045 }
1046 }
1047
1048 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1049 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1050 {
1051 switch (info.bpp / info.numComps)
1052 {
1053 case 8:
1054 {
1055 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1056 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
1057 // e.g. result of an 8x32bit integer gather for 8bit components
1058 // 256i - 0 1 2 3 4 5 6 7
1059 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1060
1061 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1062 }
1063 break;
1064 case 16:
1065 {
1066 Value* vGatherResult[2];
1067 Value *vMask;
1068
1069 // TODO: vGatherMaskedVal
1070 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1071
1072 // always have at least one component out of x or y to fetch
1073
1074 // save mask as it is zero'd out after each gather
1075 vMask = mask;
1076
1077 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1078 // e.g. result of first 8x32bit integer gather for 16bit components
1079 // 256i - 0 1 2 3 4 5 6 7
1080 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1081 //
1082
1083 // if we have at least one component out of z or w to fetch
1084 if(info.numComps > 2)
1085 {
1086 // offset base to the next components(zw) in the vertex to gather
1087 pSrcBase = GEP(pSrcBase, C((char)4));
1088 vMask = mask;
1089
1090 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1091 // e.g. result of second 8x32bit integer gather for 16bit components
1092 // 256i - 0 1 2 3 4 5 6 7
1093 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1094 //
1095 }
1096 else
1097 {
1098 vGatherResult[1] = vGatherMaskedVal;
1099 }
1100
1101 // Shuffle gathered components into place, each row is a component
1102 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1103
1104 }
1105 break;
1106 case 32:
1107 {
1108 // apply defaults
1109 for (uint32_t i = 0; i < 4; ++i)
1110 {
1111 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1112 }
1113
1114 for(uint32_t i = 0; i < info.numComps; i++)
1115 {
1116 uint32_t swizzleIndex = info.swizzle[i];
1117
1118 // save mask as it is zero'd out after each gather
1119 Value *vMask = mask;
1120
1121 // Gather a SIMD of components
1122 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1123
1124 // offset base to the next component to gather
1125 pSrcBase = GEP(pSrcBase, C((char)4));
1126 }
1127 }
1128 break;
1129 default:
1130 SWR_ASSERT(0, "unsupported format");
1131 break;
1132 }
1133 }
1134
1135 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1136 {
1137 // cast types
1138 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1139 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1140
1141 // input could either be float or int vector; do shuffle work in int
1142 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1143 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1144
1145 if(bPackedOutput)
1146 {
1147 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1148
1149 // shuffle mask
1150 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1151 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1152 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1153 // after pshufb: group components together in each 128bit lane
1154 // 256i - 0 1 2 3 4 5 6 7
1155 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1156
1157 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1158 // after PERMD: move and pack xy components into each 128bit lane
1159 // 256i - 0 1 2 3 4 5 6 7
1160 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1161
1162 // do the same for zw components
1163 Value* vi128ZW = nullptr;
1164 if(info.numComps > 2)
1165 {
1166 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1167 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1168 }
1169
1170 for(uint32_t i = 0; i < 4; i++)
1171 {
1172 uint32_t swizzleIndex = info.swizzle[i];
1173 // todo: fix for packed
1174 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1175 if(i >= info.numComps)
1176 {
1177 // set the default component val
1178 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1179 continue;
1180 }
1181
1182 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1183 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1184 // if x or y, use vi128XY permute result, else use vi128ZW
1185 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1186
1187 // extract packed component 128 bit lanes
1188 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1189 }
1190
1191 }
1192 else
1193 {
1194 // pshufb masks for each component
1195 Value* vConstMask[2];
1196 // x/z shuffle mask
1197 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1198 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1199
1200 // y/w shuffle mask
1201 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1202 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1203
1204
1205 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1206 // apply defaults
1207 for (uint32_t i = 0; i < 4; ++i)
1208 {
1209 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1210 }
1211
1212 for(uint32_t i = 0; i < info.numComps; i++)
1213 {
1214 uint32_t swizzleIndex = info.swizzle[i];
1215
1216 // select correct constMask for x/z or y/w pshufb
1217 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1218 // if x or y, use vi128XY permute result, else use vi128ZW
1219 uint32_t selectedGather = (i < 2) ? 0 : 1;
1220
1221 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1222 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1223 // 256i - 0 1 2 3 4 5 6 7
1224 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1225 }
1226 }
1227 }
1228
1229 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1230 {
1231 // cast types
1232 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1233 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1234
1235 if(bPackedOutput)
1236 {
1237 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1238 // shuffle mask
1239 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1240 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1241 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1242 // after pshufb: group components together in each 128bit lane
1243 // 256i - 0 1 2 3 4 5 6 7
1244 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1245
1246 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1247 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1248 // 256i - 0 1 2 3 4 5 6 7
1249 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1250
1251 // do the same for zw components
1252 Value* vi128ZW = nullptr;
1253 if(info.numComps > 2)
1254 {
1255 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1256 }
1257
1258 // extract each enabled component, applying defaults for any that are missing
1259 for(uint32_t i = 0; i < 4; i++)
1260 {
1261 uint32_t swizzleIndex = info.swizzle[i];
1262 // todo: fix for packed
1263 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1264 if(i >= info.numComps)
1265 {
1266 // set the default component val
1267 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1268 continue;
1269 }
1270
1271 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1272 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1273 // if x or y, use vi128XY permute result, else use vi128ZW
1274 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1275
1276 // extract packed component 128 bit lanes
1277 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1278 }
1279 }
1280 // else zero extend
1281 else{
1282 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1283 // apply defaults
1284 for (uint32_t i = 0; i < 4; ++i)
1285 {
1286 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1287 }
1288
1289 for(uint32_t i = 0; i < info.numComps; i++){
1290 uint32_t swizzleIndex = info.swizzle[i];
1291
1292 // pshufb masks for each component
1293 Value* vConstMask;
1294 switch(i)
1295 {
1296 case 0:
1297 // x shuffle mask
1298 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1299 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1300 break;
1301 case 1:
1302 // y shuffle mask
1303 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1304 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1305 break;
1306 case 2:
1307 // z shuffle mask
1308 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1309 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1310 break;
1311 case 3:
1312 // w shuffle mask
1313 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1314 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1315 break;
1316 default:
1317 vConstMask = nullptr;
1318 break;
1319 }
1320
1321 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1322 // after pshufb for x channel
1323 // 256i - 0 1 2 3 4 5 6 7
1324 // x000 x000 x000 x000 x000 x000 x000 x000
1325 }
1326 }
1327 }
1328
1329 // Helper function to create alloca in entry block of function
1330 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1331 {
1332 auto saveIP = IRB()->saveIP();
1333 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1334 pFunc->getEntryBlock().begin());
1335 Value* pAlloca = ALLOCA(pType);
1336 IRB()->restoreIP(saveIP);
1337 return pAlloca;
1338 }
1339
1340 //////////////////////////////////////////////////////////////////////////
1341 /// @brief emulates a scatter operation.
1342 /// @param pDst - pointer to destination
1343 /// @param vSrc - vector of src data to scatter
1344 /// @param vOffsets - vector of byte offsets from pDst
1345 /// @param vMask - mask of valid lanes
1346 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1347 {
1348 /* Scatter algorithm
1349
1350 while(Index = BitScanForward(mask))
1351 srcElem = srcVector[Index]
1352 offsetElem = offsetVector[Index]
1353 *(pDst + offsetElem) = srcElem
1354 Update mask (mask &= ~(1 << Index))
1355
1356 */
1357
1358 BasicBlock* pCurBB = IRB()->GetInsertBlock();
1359 Function* pFunc = pCurBB->getParent();
1360 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1361
1362 // Store vectors on stack
1363 if (pScatterStackSrc == nullptr)
1364 {
1365 // Save off stack allocations and reuse per scatter. Significantly reduces stack
1366 // requirements for shaders with a lot of scatters.
1367 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1368 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1369 }
1370
1371 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1372 Value* pOffsetsArrayPtr = pScatterStackOffsets;
1373 STORE(vSrc, pSrcArrayPtr);
1374 STORE(vOffsets, pOffsetsArrayPtr);
1375
1376 // Cast to pointers for random access
1377 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1378 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1379
1380 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1381
1382 // Get cttz function
1383 Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1384
1385 // Setup loop basic block
1386 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
1387
1388 // compute first set bit
1389 Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1390
1391 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1392
1393 // Split current block
1394 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1395
1396 // Remove unconditional jump created by splitBasicBlock
1397 pCurBB->getTerminator()->eraseFromParent();
1398
1399 // Add terminator to end of original block
1400 IRB()->SetInsertPoint(pCurBB);
1401
1402 // Add conditional branch
1403 COND_BR(pIsUndef, pPostLoop, pLoop);
1404
1405 // Add loop basic block contents
1406 IRB()->SetInsertPoint(pLoop);
1407 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1408 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1409
1410 pIndexPhi->addIncoming(pIndex, pCurBB);
1411 pMaskPhi->addIncoming(pMask, pCurBB);
1412
1413 // Extract elements for this index
1414 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1415 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1416
1417 // GEP to this offset in dst
1418 Value* pCurDst = GEP(pDst, pOffsetElem);
1419 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1420 STORE(pSrcElem, pCurDst);
1421
1422 // Update the mask
1423 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1424
1425 // Terminator
1426 Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1427
1428 pIsUndef = ICMP_EQ(pNewIndex, C(32));
1429 COND_BR(pIsUndef, pPostLoop, pLoop);
1430
1431 // Update phi edges
1432 pIndexPhi->addIncoming(pNewIndex, pLoop);
1433 pMaskPhi->addIncoming(pNewMask, pLoop);
1434
1435 // Move builder to beginning of post loop
1436 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1437 }
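// Per set mask bit i the loop above performs (illustrative pseudocode):
//   *(ElemTy*)((uint8_t*)pDst + vOffsets[i]) = vSrc[i];
// where ElemTy is the element type of vSrc.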
1438
1439 Value* Builder::VABSPS(Value* a)
1440 {
1441 Value* asInt = BITCAST(a, mSimdInt32Ty);
1442 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1443 return result;
1444 }
1445
1446 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1447 {
1448 Value *lowCmp = ICMP_SLT(src, low);
1449 Value *ret = SELECT(lowCmp, low, src);
1450
1451 Value *highCmp = ICMP_SGT(ret, high);
1452 ret = SELECT(highCmp, high, ret);
1453
1454 return ret;
1455 }
1456
1457 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1458 {
1459 Value *lowCmp = FCMP_OLT(src, low);
1460 Value *ret = SELECT(lowCmp, low, src);
1461
1462 Value *highCmp = FCMP_OGT(ret, high);
1463 ret = SELECT(highCmp, high, ret);
1464
1465 return ret;
1466 }
1467
1468 Value *Builder::FCLAMP(Value* src, float low, float high)
1469 {
1470 Value* result = VMAXPS(src, VIMMED1(low));
1471 result = VMINPS(result, VIMMED1(high));
1472
1473 return result;
1474 }
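// Example: FCLAMP(vValue, 0.0f, 1.0f) saturates each lane to [0, 1], and
// ICLAMP(vValue, C(0), C(255)) does the signed-integer equivalent
// (vValue is a hypothetical SIMD operand).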
1475
1476 //////////////////////////////////////////////////////////////////////////
1477 /// @brief save/restore stack, providing ability to push/pop the stack and
1478 /// reduce overall stack requirements for temporary stack use
1479 Value* Builder::STACKSAVE()
1480 {
1481 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1482 #if HAVE_LLVM == 0x306
1483 return CALL(pfnStackSave);
1484 #else
1485 return CALLA(pfnStackSave);
1486 #endif
1487 }
1488
1489 void Builder::STACKRESTORE(Value* pSaved)
1490 {
1491 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1492 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1493 }
1494
1495 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1496 {
1497 Value* vOut;
1498 // use FMADs if available
1499 if(JM()->mArch.AVX2())
1500 {
1501 vOut = VFMADDPS(a, b, c);
1502 }
1503 else
1504 {
1505 vOut = FADD(FMUL(a, b), c);
1506 }
1507 return vOut;
1508 }
1509
1510 Value* Builder::POPCNT(Value* a)
1511 {
1512 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1513 return CALL(pCtPop, std::initializer_list<Value*>{a});
1514 }
1515
1516 //////////////////////////////////////////////////////////////////////////
1517 /// @brief C functions called by LLVM IR
1518 //////////////////////////////////////////////////////////////////////////
1519
1520 //////////////////////////////////////////////////////////////////////////
1521 /// @brief called in JIT code, inserted by PRINT
1522 /// output to both stdout and visual studio debug console
1523 void __cdecl CallPrint(const char* fmt, ...)
1524 {
1525 va_list args;
1526 va_start(args, fmt);
1527 vprintf(fmt, args);
1528
1529 #if defined( _WIN32 )
1530 char strBuf[1024];
1531 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1532 OutputDebugString(strBuf);
1533 #endif
1534
1535 va_end(args);
1536 }
1537
1538 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1539 {
1540 #if HAVE_LLVM == 0x306
1541 Function *func =
1542 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1543 Intrinsic::x86_avx_vextractf128_si_256);
1544 return CALL(func, {a, imm8});
1545 #else
1546 bool flag = !imm8->isZeroValue();
1547 SmallVector<Constant*,8> idx;
1548 for (unsigned i = 0; i < mVWidth / 2; i++) {
1549 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1550 }
1551 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1552 #endif
1553 }
1554
1555 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1556 {
1557 #if HAVE_LLVM == 0x306
1558 Function *func =
1559 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1560 Intrinsic::x86_avx_vinsertf128_si_256);
1561 return CALL(func, {a, b, imm8});
1562 #else
1563 bool flag = !imm8->isZeroValue();
1564 SmallVector<Constant*,8> idx;
1565 for (unsigned i = 0; i < mVWidth; i++) {
1566 idx.push_back(C(i));
1567 }
1568 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1569
1570 SmallVector<Constant*,8> idx2;
1571 for (unsigned i = 0; i < mVWidth / 2; i++) {
1572 idx2.push_back(C(flag ? i : i + mVWidth));
1573 }
1574 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1575 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1576 }
1577 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1578 #endif
1579 }
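// Example (8-wide SIMD): VEXTRACTI128(a, C((uint8_t)1)) yields the upper four
// 32-bit elements of a as a 128-bit half, and VINSERTI128(a, b, C((uint8_t)0))
// places the four elements of b into the lower half of a.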
1580
1581 // rdtsc buckets macros
1582 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1583 {
1584 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1585 // buckets framework when single threaded
1586 if (KNOB_SINGLE_THREADED)
1587 {
1588 std::vector<Type*> args{
1589 PointerType::get(mInt32Ty, 0), // pBucketMgr
1590 mInt32Ty // id
1591 };
1592
1593 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1594 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1595 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1596 {
1597 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1598 }
1599
1600 CALL(pFunc, { pBucketMgr, pId });
1601 }
1602 }
1603
1604 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1605 {
1606 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1607 // buckets framework when single threaded
1608 if (KNOB_SINGLE_THREADED)
1609 {
1610 std::vector<Type*> args{
1611 PointerType::get(mInt32Ty, 0), // pBucketMgr
1612 mInt32Ty // id
1613 };
1614
1615 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1616 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1617 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1618 {
1619 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1620 }
1621
1622 CALL(pFunc, { pBucketMgr, pId });
1623 }
1624 }
1625
1626 }