src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file builder_misc.cpp
  24 *
  25 * @brief Implementation for miscellaneous builder functions
  26 *
  27 * Notes:
  28 *
  29 ******************************************************************************/
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstring>
  32
  33 void __cdecl CallPrint(const char* fmt, ...);
  34
  35 //////////////////////////////////////////////////////////////////////////
  36 /// @brief Convert an IEEE 754 32-bit single precision float to an
  37 ///        16 bit float with 5 exponent bits and a variable
  38 ///        number of mantissa bits.
  39 /// @param val - 32-bit float
  40 /// @todo Maybe move this outside of this file into a header?
  41 static uint16_t Convert32To16Float(float val)
  42 {
  43     uint32_t sign, exp, mant;
  44     uint32_t roundBits;
  45
  46     // Extract the sign, exponent, and mantissa
  47     uint32_t uf = *(uint32_t*)&val;
  48     sign = (uf & 0x80000000) >> 31;
  49     exp = (uf & 0x7F800000) >> 23;
  50     mant = uf & 0x007FFFFF;
  51
  52     // Check for out of range
  53     if (std::isnan(val))
  54     {
  55         exp = 0x1F;
  56         mant = 0x200;
  57         sign = 1;                     // set the sign bit for NANs
  58     }
  59     else if (std::isinf(val))
  60     {
  61         exp = 0x1f;
  62         mant = 0x0;
  63     }
  64     else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
  65     {
  66         exp = 0x1E;
  67         mant = 0x3FF;
  68     }
  69     else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
  70     {
  71         mant |= 0x00800000;
  72         for (; exp <= 0x70; mant >>= 1, exp++)
  73             ;
  74         exp = 0;
  75         mant = mant >> 13;
  76     }
  77     else if (exp < 0x66) // Too small to represent -> Zero
  78     {
  79         exp = 0;
  80         mant = 0;
  81     }
  82     else
  83     {
  84         // Saves bits that will be shifted off for rounding
  85         roundBits = mant & 0x1FFFu;
  86         // convert exponent and mantissa to 16 bit format
  87         exp = exp - 0x70;
  88         mant = mant >> 13;
  89
  90         // Essentially RTZ, but round up if off by only 1 lsb
  91         if (roundBits == 0x1FFFu)
  92         {
  93             mant++;
  94             // check for overflow
  95             if ((mant & 0xC00u) != 0)
  96                 exp++;
  97             // make sure only the needed bits are used
  98             mant &= 0x3FF;
  99         }
 100     }
 101
 102     uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
 103     return (uint16_t)tmpVal;
 104 }
 105
 106 //////////////////////////////////////////////////////////////////////////
 107 /// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision
 108 ///        float
 109 /// @param val - 16-bit float
 110 /// @todo Maybe move this outside of this file into a header?
 111 static float ConvertSmallFloatTo32(UINT val)
 112 {
 113     UINT result;
 114     if ((val & 0x7fff) == 0)
 115     {
 116         result = ((uint32_t)(val & 0x8000)) << 16;
 117     }
 118     else if ((val & 0x7c00) == 0x7c00)
 119     {
 120         result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
 121         result |= ((uint32_t)val & 0x8000) << 16;
 122     }
 123     else
 124     {
 125         uint32_t sign = (val & 0x8000) << 16;
 126         uint32_t mant = (val & 0x3ff) << 13;
 127         uint32_t exp = (val >> 10) & 0x1f;
 128         if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
 129         {
 130             mant <<= 1;
 131             while (mant < (0x400 << 13))
 132             {
 133                 exp--;
 134                 mant <<= 1;
 135             }
 136             mant &= (0x3ff << 13);
 137         }
 138         exp = ((exp - 15 + 127) & 0xff) << 23;
 139         result = sign | exp | mant;
 140     }
 141
 142     return *(float*)&result;
 143 }
 144
 145 Constant *Builder::C(bool i)
 146 {
 147     return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
 148 }
 149
 150 Constant *Builder::C(char i)
 151 {
 152     return ConstantInt::get(IRB()->getInt8Ty(), i);
 153 }
 154
 155 Constant *Builder::C(uint8_t i)
 156 {
 157     return ConstantInt::get(IRB()->getInt8Ty(), i);
 158 }
 159
 160 Constant *Builder::C(int i)
 161 {
 162     return ConstantInt::get(IRB()->getInt32Ty(), i);
 163 }
 164
 165 Constant *Builder::C(int64_t i)
 166 {
 167     return ConstantInt::get(IRB()->getInt64Ty(), i);
 168 }
 169
 170 Constant *Builder::C(uint16_t i)
 171 {
 172     return ConstantInt::get(mInt16Ty,i);
 173 }
 174
 175 Constant *Builder::C(uint32_t i)
 176 {
 177     return ConstantInt::get(IRB()->getInt32Ty(), i);
 178 }
 179
 180 Constant *Builder::C(float i)
 181 {
 182     return ConstantFP::get(IRB()->getFloatTy(), i);
 183 }
 184
 185 Constant *Builder::PRED(bool pred)
 186 {
 187     return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
 188 }
 189
 190 Value *Builder::VIMMED1(int i)
 191 {
 192     return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 193 }
 194
 195 Value *Builder::VIMMED1(uint32_t i)
 196 {
 197     return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 198 }
 199
 200 Value *Builder::VIMMED1(float i)
 201 {
 202     return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
 203 }
 204
 205 Value *Builder::VIMMED1(bool i)
 206 {
 207     return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 208 }
 209
 210 Value *Builder::VUNDEF_IPTR()
 211 {
 212     return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
 213 }
 214
 215 Value *Builder::VUNDEF_I()
 216 {
 217     return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
 218 }
 219
 220 Value *Builder::VUNDEF(Type *ty, uint32_t size)
 221 {
 222     return UndefValue::get(VectorType::get(ty, size));
 223 }
 224
 225 Value *Builder::VUNDEF_F()
 226 {
 227     return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
 228 }
 229
 230 Value *Builder::VUNDEF(Type* t)
 231 {
 232     return UndefValue::get(VectorType::get(t, mVWidth));
 233 }
 234
 235 #if HAVE_LLVM == 0x306
 236 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
 237 {
 238     return VINSERT(vec, val, C((int64_t)index));
 239 }
 240 #endif
 241
 242 Value *Builder::VBROADCAST(Value *src)
 243 {
 244     // check if src is already a vector
 245     if (src->getType()->isVectorTy())
 246     {
 247         return src;
 248     }
 249
 250     return VECTOR_SPLAT(mVWidth, src);
 251 }
 252
 253 uint32_t Builder::IMMED(Value* v)
 254 {
 255     SWR_ASSERT(isa<ConstantInt>(v));
 256     ConstantInt *pValConst = cast<ConstantInt>(v);
 257     return pValConst->getZExtValue();
 258 }
 259
 260 int32_t Builder::S_IMMED(Value* v)
 261 {
 262     SWR_ASSERT(isa<ConstantInt>(v));
 263     ConstantInt *pValConst = cast<ConstantInt>(v);
 264     return pValConst->getSExtValue();
 265 }
 266
 267 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
 268 {
 269     std::vector<Value*> indices;
 270     for (auto i : indexList)
 271         indices.push_back(i);
 272     return GEPA(ptr, indices);
 273 }
 274
 275 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
 276 {
 277     std::vector<Value*> indices;
 278     for (auto i : indexList)
 279         indices.push_back(C(i));
 280     return GEPA(ptr, indices);
 281 }
 282
 283 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
 284 {
 285     std::vector<Value*> valIndices;
 286     for (auto i : indices)
 287         valIndices.push_back(C(i));
 288     return LOAD(GEPA(basePtr, valIndices), name);
 289 }
 290
 291 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
 292 {
 293     std::vector<Value*> valIndices;
 294     for (auto i : indices)
 295         valIndices.push_back(i);
 296     return LOAD(GEPA(basePtr, valIndices), name);
 297 }
 298
 299 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
 300 {
 301     std::vector<Value*> valIndices;
 302     for (auto i : indices)
 303         valIndices.push_back(C(i));
 304     return STORE(val, GEPA(basePtr, valIndices));
 305 }
 306
 307 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
 308 {
 309     std::vector<Value*> valIndices;
 310     for (auto i : indices)
 311         valIndices.push_back(i);
 312     return STORE(val, GEPA(basePtr, valIndices));
 313 }
 314
 315 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
 316 {
 317     std::vector<Value*> args;
 318     for (auto arg : argsList)
 319         args.push_back(arg);
 320     return CALLA(Callee, args);
 321 }
 322
 323 #if HAVE_LLVM > 0x306
 324 CallInst *Builder::CALL(Value *Callee, Value* arg)
 325 {
 326     std::vector<Value*> args;
 327     args.push_back(arg);
 328     return CALLA(Callee, args);
 329 }
 330
 331 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
 332 {
 333     std::vector<Value*> args;
 334     args.push_back(arg1);
 335     args.push_back(arg2);
 336     return CALLA(Callee, args);
 337 }
 338
 339 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
 340 {
 341     std::vector<Value*> args;
 342     args.push_back(arg1);
 343     args.push_back(arg2);
 344     args.push_back(arg3);
 345     return CALLA(Callee, args);
 346 }
 347 #endif
 348
 349 Value *Builder::VRCP(Value *va)
 350 {
 351     return FDIV(VIMMED1(1.0f), va);  // 1 / a
 352 }
 353
 354 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
 355 {
 356     Value* vOut = FMADDPS(vA, vX, vC);
 357     vOut = FMADDPS(vB, vY, vOut);
 358     return vOut;
 359 }
 360
 361 //////////////////////////////////////////////////////////////////////////
 362 /// @brief Generate an i32 masked load operation in LLVM IR.  If not
 363 /// supported on the underlying platform, emulate it with float masked load
 364 /// @param src - base address pointer for the load
 365 /// @param vMask - SIMD wide mask that controls whether to access memory load 0
 366 Value *Builder::MASKLOADD(Value* src,Value* mask)
 367 {
 368     Value* vResult;
 369     // use avx2 gather instruction is available
 370     if(JM()->mArch.AVX2())
 371     {
 372         Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
 373         vResult = CALL(func,{src,mask});
 374     }
 375     else
 376     {
 377         // maskload intrinsic expects integer mask operand in llvm >= 3.8
 378 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
 379         mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
 380 #else
 381         mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
 382 #endif
 383         Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
 384         vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
 385     }
 386     return vResult;
 387 }
 388
//////////////////////////////////////////////////////////////////////////
/// @brief insert a JIT call to CallPrint
/// - outputs formatted string to both stdout and VS output window
/// - DEBUG builds only
/// Usage example:
///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
///   where C(lane) creates a constant value to print, and pIndex is the Value*
///   result from a GEP, printing out the pointer to memory
///
/// Each '%' found in the format string is paired, in order, with the next
/// entry of printArgs.  Vector arguments are expanded into one specifier and
/// one extracted element per lane; float arguments printed with %f are
/// extended to double first.
/// @param printStr - constant string to print, which includes format specifiers
/// @param printArgs - initializer list of Value*'s to print to std out
/// @return the emitted call instruction
CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
{
    // push the arguments to CallPrint into a vector
    std::vector<Value*> printCallArgs;
    // save room for the format string.  we still need to modify it for vectors
    printCallArgs.resize(1);

    // search through the format string for special processing
    size_t pos = 0;
    std::string tempStr(printStr);
    pos = tempStr.find('%', pos);
    auto v = printArgs.begin();

    // NOTE(review): every '%' is treated as the start of a specifier, so a
    // literal "%%" in printStr would consume arguments - confirm callers
    // never rely on it.
    while ((pos != std::string::npos) && (v != printArgs.end()))
    {
        Value* pArg = *v;
        Type* pType = pArg->getType();

        if (pType->isVectorTy())
        {
            Type* pContainedType = pType->getContainedType(0);

            // NOTE(review): a vector argument whose specifier matches none of
            // the branches below pushes no call arguments at all.
            if (toupper(tempStr[pos + 1]) == 'X')
            {
                // rewrite "%x" in place into "0x%08X " for element 0 ...
                tempStr[pos] = '0';
                tempStr[pos + 1] = 'x';
                tempStr.insert(pos + 2, "%08X ");
                pos += 7;

                printCallArgs.push_back(VEXTRACT(pArg, C(0)));

                // ... then append one "0x%08X " per remaining element
                std::string vectorFormatStr;
                for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
                {
                    vectorFormatStr += "0x%08X ";
                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                }

                tempStr.insert(pos, vectorFormatStr);
                // pos now indexes the first character after the expanded text.
                // NOTE(review): the ++pos at the bottom of the loop steps over
                // that character, so a '%' directly following a vector %x
                // would be missed.
                pos += vectorFormatStr.size();
            }
            else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
            {
                // insert a "%f " in front of the original specifier for all
                // but the last element; the original "%f" prints the last one
                uint32_t i = 0;
                for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
                {
                    tempStr.insert(pos, std::string("%f "));
                    pos += 3;
                    printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
                }
                printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
            }
            else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
            {
                // same expansion as for %f, without the extension to double
                uint32_t i = 0;
                for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
                {
                    tempStr.insert(pos, std::string("%d "));
                    pos += 3;
                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                }
                printCallArgs.push_back(VEXTRACT(pArg, C(i)));
            }
        }
        else
        {
            if (toupper(tempStr[pos + 1]) == 'X')
            {
                // turn "%x" / "%X" into "0x%08x" / "0x%08X"
                tempStr[pos] = '0';
                tempStr.insert(pos + 1, "x%08");
                printCallArgs.push_back(pArg);
                pos += 3;
            }
            // for %f we need to cast float Values to doubles so that they print out correctly
            else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
            {
                printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
                pos++;
            }
            else
            {
                // any other specifier: pass the argument through unchanged
                printCallArgs.push_back(pArg);
            }
        }

        // advance to the next argument
        v++;
        pos = tempStr.find('%', ++pos);
    }

    // create global variable constant string
    Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
    GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
    JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);

    // get a pointer to the first character in the constant string array
    std::vector<Constant*> geplist{C(0),C(0)};
#if HAVE_LLVM == 0x306
    Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
#else
    Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
#endif

    // insert the pointer to the format string in the argument vector
    printCallArgs[0] = strGEP;

    // get pointer to CallPrint function and insert decl into the module if needed
    // (declared as a variadic function returning void, with one fixed pointer-to-mInt8Ty parameter)
    std::vector<Type*> args;
    args.push_back(PointerType::get(mInt8Ty,0));
    FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
    Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));

    // if we haven't yet added the symbol to the symbol table
    if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
    {
        sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
    }

    // insert a call to CallPrint
    return CALLA(callPrintFn,printCallArgs);
}
 520
 521 //////////////////////////////////////////////////////////////////////////
 522 /// @brief Wrapper around PRINT with initializer list.
 523 CallInst* Builder::PRINT(const std::string &printStr)
 524 {
 525     return PRINT(printStr, {});
 526 }
 527
 528 //////////////////////////////////////////////////////////////////////////
 529 /// @brief Generate a masked gather operation in LLVM IR.  If not
 530 /// supported on the underlying platform, emulate it with loads
 531 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
 532 /// @param pBase - Int8* base VB address pointer value
 533 /// @param vIndices - SIMD wide value of VB byte offsets
 534 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
 535 /// @param scale - value to scale indices by
 536 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
 537 {
 538     Value* vGather;
 539
 540     // use avx2 gather instruction if available
 541     if(JM()->mArch.AVX2())
 542     {
 543         // force mask to <N x float>, required by vgather
 544         vMask = BITCAST(vMask, mSimdFP32Ty);
 545         vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
 546     }
 547     else
 548     {
 549         Value* pStack = STACKSAVE();
 550
 551         // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
 552         Value* vSrcPtr = ALLOCA(vSrc->getType());
 553         STORE(vSrc, vSrcPtr);
 554
 555         vGather = VUNDEF_F();
 556         Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
 557         Value *vOffsets = MUL(vIndices,vScaleVec);
 558         Value *mask = MASK(vMask);
 559         for(uint32_t i = 0; i < mVWidth; ++i)
 560         {
 561             // single component byte index
 562             Value *offset = VEXTRACT(vOffsets,C(i));
 563             // byte pointer to component
 564             Value *loadAddress = GEP(pBase,offset);
 565             loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
 566             // pointer to the value to load if we're masking off a component
 567             Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
 568             Value *selMask = VEXTRACT(mask,C(i));
 569             // switch in a safe address to load if we're trying to access a vertex
 570             Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
 571             Value *val = LOAD(validAddress);
 572             vGather = VINSERT(vGather,val,C(i));
 573         }
 574         STACKRESTORE(pStack);
 575     }
 576
 577     return vGather;
 578 }
 579
 580 //////////////////////////////////////////////////////////////////////////
 581 /// @brief Generate a masked gather operation in LLVM IR.  If not
 582 /// supported on the underlying platform, emulate it with loads
 583 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
 584 /// @param pBase - Int8* base VB address pointer value
 585 /// @param vIndices - SIMD wide value of VB byte offsets
 586 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
 587 /// @param scale - value to scale indices by
 588 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
 589 {
 590     Value* vGather;
 591
 592     // use avx2 gather instruction if available
 593     if(JM()->mArch.AVX2())
 594     {
 595         vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
 596     }
 597     else
 598     {
 599         Value* pStack = STACKSAVE();
 600
 601         // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
 602         Value* vSrcPtr = ALLOCA(vSrc->getType());
 603         STORE(vSrc, vSrcPtr);
 604
 605         vGather = VUNDEF_I();
 606         Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
 607         Value *vOffsets = MUL(vIndices, vScaleVec);
 608         Value *mask = MASK(vMask);
 609         for(uint32_t i = 0; i < mVWidth; ++i)
 610         {
 611             // single component byte index
 612             Value *offset = VEXTRACT(vOffsets, C(i));
 613             // byte pointer to component
 614             Value *loadAddress = GEP(pBase, offset);
 615             loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
 616             // pointer to the value to load if we're masking off a component
 617             Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
 618             Value *selMask = VEXTRACT(mask, C(i));
 619             // switch in a safe address to load if we're trying to access a vertex
 620             Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
 621             Value *val = LOAD(validAddress, C(0));
 622             vGather = VINSERT(vGather, val, C(i));
 623         }
 624
 625         STACKRESTORE(pStack);
 626     }
 627     return vGather;
 628 }
 629
 630 //////////////////////////////////////////////////////////////////////////
 631 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
 632 Value* Builder::MASK(Value* vmask)
 633 {
 634     Value* src = BITCAST(vmask, mSimdInt32Ty);
 635     return ICMP_SLT(src, VIMMED1(0));
 636 }
 637
 638 //////////////////////////////////////////////////////////////////////////
 639 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
 640 Value* Builder::VMASK(Value* mask)
 641 {
 642     return S_EXT(mask, mSimdInt32Ty);
 643 }
 644
 645 //////////////////////////////////////////////////////////////////////////
 646 /// @brief Generate a VPSHUFB operation in LLVM IR.  If not
 647 /// supported on the underlying platform, emulate it
 648 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
 649 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
 650 /// Byte masks in lower 128 lane of b selects 8 bit values from lower
 651 /// 128bits of a, and vice versa for the upper lanes.  If the mask
 652 /// value is negative, '0' is inserted.
 653 Value *Builder::PSHUFB(Value* a, Value* b)
 654 {
 655     Value* res;
 656     // use avx2 pshufb instruction if available
 657     if(JM()->mArch.AVX2())
 658     {
 659         res = VPSHUFB(a, b);
 660     }
 661     else
 662     {
 663         Constant* cB = dyn_cast<Constant>(b);
 664         // number of 8 bit elements in b
 665         uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
 666         // output vector
 667         Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
 668
 669         // insert an 8 bit value from the high and low lanes of a per loop iteration
 670         numElms /= 2;
 671         for(uint32_t i = 0; i < numElms; i++)
 672         {
 673             ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
 674             ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
 675
 676             // extract values from constant mask
 677             char valLow128bLane =  (char)(cLow128b->getSExtValue());
 678             char valHigh128bLane = (char)(cHigh128b->getSExtValue());
 679
 680             Value* insertValLow128b;
 681             Value* insertValHigh128b;
 682
 683             // if the mask value is negative, insert a '0' in the respective output position
 684             // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
 685             insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
 686             insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
 687
 688             vShuf = VINSERT(vShuf, insertValLow128b, i);
 689             vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
 690         }
 691         res = vShuf;
 692     }
 693     return res;
 694 }
 695
 696 //////////////////////////////////////////////////////////////////////////
 697 /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32
 698 /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
 699 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
 700 /// lower 8 values are used.
 701 Value *Builder::PMOVSXBD(Value* a)
 702 {
 703     // llvm-3.9 removed the pmovsxbd intrinsic
 704 #if HAVE_LLVM < 0x309
 705     // use avx2 byte sign extend instruction if available
 706     if(JM()->mArch.AVX2())
 707     {
 708         Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
 709         return CALL(pmovsxbd, std::initializer_list<Value*>{a});
 710     }
 711     else
 712 #endif
 713     {
 714         // VPMOVSXBD output type
 715         Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
 716         // Extract 8 values from 128bit lane and sign extend
 717         return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
 718     }
 719 }
 720
 721 //////////////////////////////////////////////////////////////////////////
 722 /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32
 723 /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
 724 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
 725 Value *Builder::PMOVSXWD(Value* a)
 726 {
 727     // llvm-3.9 removed the pmovsxwd intrinsic
 728 #if HAVE_LLVM < 0x309
 729     // use avx2 word sign extend if available
 730     if(JM()->mArch.AVX2())
 731     {
 732         Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
 733         return CALL(pmovsxwd, std::initializer_list<Value*>{a});
 734     }
 735     else
 736 #endif
 737     {
 738         // VPMOVSXWD output type
 739         Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
 740         // Extract 8 values from 128bit lane and sign extend
 741         return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
 742     }
 743 }
 744
 745 //////////////////////////////////////////////////////////////////////////
 746 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
 747 /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
 748 /// platform, emulate it
 749 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
 750 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
 751 Value *Builder::PERMD(Value* a, Value* idx)
 752 {
 753     Value* res;
 754     // use avx2 permute instruction if available
 755     if(JM()->mArch.AVX2())
 756     {
 757         res = VPERMD(a, idx);
 758     }
 759     else
 760     {
 761         if (isa<Constant>(idx))
 762         {
 763             res = VSHUFFLE(a, a, idx);
 764         }
 765         else
 766         {
 767             res = VUNDEF_I();
 768             for (uint32_t l = 0; l < JM()->mVWidth; ++l)
 769             {
 770                 Value* pIndex = VEXTRACT(idx, C(l));
 771                 Value* pVal = VEXTRACT(a, pIndex);
 772                 res = VINSERT(res, pVal, C(l));
 773             }
 774         }
 775     }
 776     return res;
 777 }
 778
 779 //////////////////////////////////////////////////////////////////////////
 780 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
 781 /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
 782 /// platform, emulate it
 783 /// @param a - 256bit SIMD lane(8x32bit) of float values.
 784 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
 785 Value *Builder::PERMPS(Value* a, Value* idx)
 786 {
 787     Value* res;
 788     // use avx2 permute instruction if available
 789     if (JM()->mArch.AVX2())
 790     {
 791         // llvm 3.6.0 swapped the order of the args to vpermd
 792         res = VPERMPS(idx, a);
 793     }
 794     else
 795     {
 796         if (isa<Constant>(idx))
 797         {
 798             res = VSHUFFLE(a, a, idx);
 799         }
 800         else
 801         {
 802             res = VUNDEF_F();
 803             for (uint32_t l = 0; l < JM()->mVWidth; ++l)
 804             {
 805                 Value* pIndex = VEXTRACT(idx, C(l));
 806                 Value* pVal = VEXTRACT(a, pIndex);
 807                 res = VINSERT(res, pVal, C(l));
 808             }
 809         }
 810     }
 811
 812     return res;
 813 }
 814
 815 //////////////////////////////////////////////////////////////////////////
 816 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
 817 /// in LLVM IR.  If not supported on the underlying platform, emulate it
 818 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
 819 Value *Builder::CVTPH2PS(Value* a)
 820 {
 821     if (JM()->mArch.F16C())
 822     {
 823         return VCVTPH2PS(a);
 824     }
 825     else
 826     {
 827         FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
 828         Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
 829
 830         if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
 831         {
 832             sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
 833         }
 834
 835         Value* pResult = UndefValue::get(mSimdFP32Ty);
 836         for (uint32_t i = 0; i < mVWidth; ++i)
 837         {
 838             Value* pSrc = VEXTRACT(a, C(i));
 839             Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
 840             pResult = VINSERT(pResult, pConv, C(i));
 841         }
 842
 843         return pResult;
 844     }
 845 }
 846
 847 //////////////////////////////////////////////////////////////////////////
 848 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
 849 /// in LLVM IR.  If not supported on the underlying platform, emulate it
 850 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
 851 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
 852 {
 853     if (JM()->mArch.F16C())
 854     {
 855         return VCVTPS2PH(a, rounding);
 856     }
 857     else
 858     {
 859         // call scalar C function for now
 860         FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
 861         Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
 862
 863         if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
 864         {
 865             sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
 866         }
 867
 868         Value* pResult = UndefValue::get(mSimdInt16Ty);
 869         for (uint32_t i = 0; i < mVWidth; ++i)
 870         {
 871             Value* pSrc = VEXTRACT(a, C(i));
 872             Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
 873             pResult = VINSERT(pResult, pConv, C(i));
 874         }
 875
 876         return pResult;
 877     }
 878 }
 879
 880 Value *Builder::PMAXSD(Value* a, Value* b)
 881 {
 882     // llvm-3.9 removed the pmax intrinsics
 883 #if HAVE_LLVM >= 0x309
 884     Value* cmp = ICMP_SGT(a, b);
 885     return SELECT(cmp, a, b);
 886 #else
 887     if (JM()->mArch.AVX2())
 888     {
 889         Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
 890         return CALL(pmaxsd, {a, b});
 891     }
 892     else
 893     {
 894         // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
 895         Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
 896
 897         // low 128
 898         Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
 899         Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
 900         Value* resLo = CALL(pmaxsd, {aLo, bLo});
 901
 902         // high 128
 903         Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
 904         Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
 905         Value* resHi = CALL(pmaxsd, {aHi, bHi});
 906
 907         // combine
 908         Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
 909         result = VINSERTI128(result, resHi, C((uint8_t)1));
 910
 911         return result;
 912     }
 913 #endif
 914 }
 915
 916 Value *Builder::PMINSD(Value* a, Value* b)
 917 {
 918     // llvm-3.9 removed the pmin intrinsics
 919 #if HAVE_LLVM >= 0x309
 920     Value* cmp = ICMP_SLT(a, b);
 921     return SELECT(cmp, a, b);
 922 #else
 923     if (JM()->mArch.AVX2())
 924     {
 925         Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
 926         return CALL(pminsd, {a, b});
 927     }
 928     else
 929     {
 930         // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
 931         Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
 932
 933         // low 128
 934         Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
 935         Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
 936         Value* resLo = CALL(pminsd, {aLo, bLo});
 937
 938         // high 128
 939         Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
 940         Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
 941         Value* resHi = CALL(pminsd, {aHi, bHi});
 942
 943         // combine
 944         Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
 945         result = VINSERTI128(result, resHi, C((uint8_t)1));
 946
 947         return result;
 948     }
 949 #endif
 950 }
 951
 952 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
 953                       Value* mask, Value* vGatherComponents[], bool bPackedOutput)
 954 {
 955     const SWR_FORMAT_INFO &info = GetFormatInfo(format);
 956     if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
 957     {
 958         // ensure our mask is the correct type
 959         mask = BITCAST(mask, mSimdFP32Ty);
 960         GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
 961     }
 962     else
 963     {
 964         // ensure our mask is the correct type
 965         mask = BITCAST(mask, mSimdInt32Ty);
 966         GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
 967     }
 968 }
 969
 970 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
 971                         Value* mask, Value* vGatherComponents[], bool bPackedOutput)
 972 {
 973     switch(info.bpp / info.numComps)
 974     {
 975         case 16:
 976         {
 977                 Value* vGatherResult[2];
 978                 Value *vMask;
 979
 980                 // TODO: vGatherMaskedVal
 981                 Value* vGatherMaskedVal = VIMMED1((float)0);
 982
 983                 // always have at least one component out of x or y to fetch
 984
 985                 // save mask as it is zero'd out after each gather
 986                 vMask = mask;
 987
 988                 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
 989                 // e.g. result of first 8x32bit integer gather for 16bit components
 990                 // 256i - 0    1    2    3    4    5    6    7
 991                 //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
 992                 //
 993
 994                 // if we have at least one component out of x or y to fetch
 995                 if(info.numComps > 2)
 996                 {
 997                     // offset base to the next components(zw) in the vertex to gather
 998                     pSrcBase = GEP(pSrcBase, C((char)4));
 999                     vMask = mask;
1000
1001                     vGatherResult[1] =  GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1002                     // e.g. result of second 8x32bit integer gather for 16bit components
1003                     // 256i - 0    1    2    3    4    5    6    7
1004                     //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1005                     //
1006                 }
1007                 else
1008                 {
1009                     vGatherResult[1] =  vGatherMaskedVal;
1010                 }
1011
1012                 // Shuffle gathered components into place, each row is a component
1013                 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1014         }
1015             break;
1016         case 32:
1017         {
1018             // apply defaults
1019             for (uint32_t i = 0; i < 4; ++i)
1020             {
1021                 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1022             }
1023
1024             for(uint32_t i = 0; i < info.numComps; i++)
1025             {
1026                 uint32_t swizzleIndex = info.swizzle[i];
1027
1028                 // save mask as it is zero'd out after each gather
1029                 Value *vMask = mask;
1030
1031                 // Gather a SIMD of components
1032                 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1033
1034                 // offset base to the next component to gather
1035                 pSrcBase = GEP(pSrcBase, C((char)4));
1036             }
1037         }
1038             break;
1039         default:
1040             SWR_ASSERT(0, "Invalid float format");
1041             break;
1042     }
1043 }
1044
1045 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1046                         Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1047 {
1048     switch (info.bpp / info.numComps)
1049     {
1050         case 8:
1051         {
1052             Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1053             Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
1054             // e.g. result of an 8x32bit integer gather for 8bit components
1055             // 256i - 0    1    2    3    4    5    6    7
1056             //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1057
1058             Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1059         }
1060             break;
1061         case 16:
1062         {
1063             Value* vGatherResult[2];
1064             Value *vMask;
1065
1066             // TODO: vGatherMaskedVal
1067             Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1068
1069             // always have at least one component out of x or y to fetch
1070
1071             // save mask as it is zero'd out after each gather
1072             vMask = mask;
1073
1074             vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1075             // e.g. result of first 8x32bit integer gather for 16bit components
1076             // 256i - 0    1    2    3    4    5    6    7
1077             //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1078             //
1079
1080             // if we have at least one component out of x or y to fetch
1081             if(info.numComps > 2)
1082             {
1083                 // offset base to the next components(zw) in the vertex to gather
1084                 pSrcBase = GEP(pSrcBase, C((char)4));
1085                 vMask = mask;
1086
1087                 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1088                 // e.g. result of second 8x32bit integer gather for 16bit components
1089                 // 256i - 0    1    2    3    4    5    6    7
1090                 //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1091                 //
1092             }
1093             else
1094             {
1095                 vGatherResult[1] = vGatherMaskedVal;
1096             }
1097
1098             // Shuffle gathered components into place, each row is a component
1099             Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1100
1101         }
1102             break;
1103         case 32:
1104         {
1105             // apply defaults
1106             for (uint32_t i = 0; i < 4; ++i)
1107             {
1108                 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1109             }
1110
1111             for(uint32_t i = 0; i < info.numComps; i++)
1112             {
1113                 uint32_t swizzleIndex = info.swizzle[i];
1114
1115                 // save mask as it is zero'd out after each gather
1116                 Value *vMask = mask;
1117
1118                 // Gather a SIMD of components
1119                 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1120
1121                 // offset base to the next component to gather
1122                 pSrcBase = GEP(pSrcBase, C((char)4));
1123             }
1124         }
1125             break;
1126         default:
1127             SWR_ASSERT(0, "unsupported format");
1128         break;
1129     }
1130 }
1131
1132 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1133 {
1134     // cast types
1135     Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1136     Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1137
1138     // input could either be float or int vector; do shuffle work in int
1139     vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1140     vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1141
1142     if(bPackedOutput)
1143     {
1144         Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1145
1146         // shuffle mask
1147         Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1148                                      0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1149         Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1150         // after pshufb: group components together in each 128bit lane
1151         // 256i - 0    1    2    3    4    5    6    7
1152         //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1153
1154         Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1155         // after PERMD: move and pack xy components into each 128bit lane
1156         // 256i - 0    1    2    3    4    5    6    7
1157         //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1158
1159         // do the same for zw components
1160         Value* vi128ZW = nullptr;
1161         if(info.numComps > 2)
1162         {
1163             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1164             vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1165         }
1166
1167         for(uint32_t i = 0; i < 4; i++)
1168         {
1169             uint32_t swizzleIndex = info.swizzle[i];
1170             // todo: fixed for packed
1171             Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1172             if(i >= info.numComps)
1173             {
1174                 // set the default component val
1175                 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1176                 continue;
1177             }
1178
1179             // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1180             uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1181             // if x or y, use vi128XY permute result, else use vi128ZW
1182             Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1183
1184             // extract packed component 128 bit lanes
1185             vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1186         }
1187
1188     }
1189     else
1190     {
1191         // pshufb masks for each component
1192         Value* vConstMask[2];
1193         // x/z shuffle mask
1194         vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1195                                  0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1196
1197         // y/w shuffle mask
1198         vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1199                                  2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1200
1201
1202         // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1203         // apply defaults
1204         for (uint32_t i = 0; i < 4; ++i)
1205         {
1206             vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1207         }
1208
1209         for(uint32_t i = 0; i < info.numComps; i++)
1210         {
1211             uint32_t swizzleIndex = info.swizzle[i];
1212
1213             // select correct constMask for x/z or y/w pshufb
1214             uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1215             // if x or y, use vi128XY permute result, else use vi128ZW
1216             uint32_t selectedGather = (i < 2) ? 0 : 1;
1217
1218             vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1219             // after pshufb mask for x channel; z uses the same shuffle from the second gather
1220             // 256i - 0    1    2    3    4    5    6    7
1221             //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1222         }
1223     }
1224 }
1225
1226 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1227 {
1228     // cast types
1229     Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1230     Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1231
1232     if(bPackedOutput)
1233     {
1234         Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1235         // shuffle mask
1236         Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1237                                      0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1238         Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1239         // after pshufb: group components together in each 128bit lane
1240         // 256i - 0    1    2    3    4    5    6    7
1241         //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1242
1243         Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1244         // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1245         // 256i - 0    1    2    3    4    5    6    7
1246         //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1247
1248         // do the same for zw components
1249         Value* vi128ZW = nullptr;
1250         if(info.numComps > 2)
1251         {
1252             vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1253         }
1254
1255         // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
1256         for(uint32_t i = 0; i < 4; i++)
1257         {
1258             uint32_t swizzleIndex = info.swizzle[i];
1259             // todo: fix for packed
1260             Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1261             if(i >= info.numComps)
1262             {
1263                 // set the default component val
1264                 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1265                 continue;
1266             }
1267
1268             // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1269             uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1270             // if x or y, use vi128XY permute result, else use vi128ZW
1271             Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1272
1273             // sign extend
1274             vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1275         }
1276     }
1277     // else zero extend
1278     else{
1279         // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1280         // apply defaults
1281         for (uint32_t i = 0; i < 4; ++i)
1282         {
1283             vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1284         }
1285
1286         for(uint32_t i = 0; i < info.numComps; i++){
1287             uint32_t swizzleIndex = info.swizzle[i];
1288
1289             // pshufb masks for each component
1290             Value* vConstMask;
1291             switch(i)
1292             {
1293                 case 0:
1294                     // x shuffle mask
1295                     vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1296                                           0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1297                     break;
1298                 case 1:
1299                     // y shuffle mask
1300                     vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1301                                           1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1302                     break;
1303                 case 2:
1304                     // z shuffle mask
1305                     vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1306                                           2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1307                     break;
1308                 case 3:
1309                     // w shuffle mask
1310                     vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1311                                           3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1312                     break;
1313                 default:
1314                     vConstMask = nullptr;
1315                     break;
1316             }
1317
1318                 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1319                 // after pshufb for x channel
1320                 // 256i - 0    1    2    3    4    5    6    7
1321                 //        x000 x000 x000 x000 x000 x000 x000 x000
1322         }
1323     }
1324 }
1325
1326 //////////////////////////////////////////////////////////////////////////
1327 /// @brief emulates a scatter operation.
1328 /// @param pDst - pointer to destination
1329 /// @param vSrc - vector of src data to scatter
1330 /// @param vOffsets - vector of byte offsets from pDst
1331 /// @param vMask - mask of valid lanes
1332 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1333 {
1334     Value* pStack = STACKSAVE();
1335
1336     Type* pSrcTy = vSrc->getType()->getVectorElementType();
1337
1338     // allocate tmp stack for masked off lanes
1339     Value* vTmpPtr = ALLOCA(pSrcTy);
1340
1341     Value *mask = MASK(vMask);
1342     for (uint32_t i = 0; i < mVWidth; ++i)
1343     {
1344         Value *offset = VEXTRACT(vOffsets, C(i));
1345         // byte pointer to component
1346         Value *storeAddress = GEP(pDst, offset);
1347         storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0));
1348         Value *selMask = VEXTRACT(mask, C(i));
1349         Value *srcElem = VEXTRACT(vSrc, C(i));
1350         // switch in a safe address to load if we're trying to access a vertex
1351         Value *validAddress = SELECT(selMask, storeAddress, vTmpPtr);
1352         STORE(srcElem, validAddress);
1353     }
1354
1355     STACKRESTORE(pStack);
1356 }
1357
1358 Value* Builder::VABSPS(Value* a)
1359 {
1360     Value* asInt = BITCAST(a, mSimdInt32Ty);
1361     Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1362     return result;
1363 }
1364
1365 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1366 {
1367     Value *lowCmp = ICMP_SLT(src, low);
1368     Value *ret = SELECT(lowCmp, low, src);
1369
1370     Value *highCmp = ICMP_SGT(ret, high);
1371     ret = SELECT(highCmp, high, ret);
1372
1373     return ret;
1374 }
1375
1376 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1377 {
1378     Value *lowCmp = FCMP_OLT(src, low);
1379     Value *ret = SELECT(lowCmp, low, src);
1380
1381     Value *highCmp = FCMP_OGT(ret, high);
1382     ret = SELECT(highCmp, high, ret);
1383
1384     return ret;
1385 }
1386
1387 Value *Builder::FCLAMP(Value* src, float low, float high)
1388 {
1389     Value* result = VMAXPS(src, VIMMED1(low));
1390     result = VMINPS(result, VIMMED1(high));
1391
1392     return result;
1393 }
1394
1395 //////////////////////////////////////////////////////////////////////////
1396 /// @brief save/restore stack, providing ability to push/pop the stack and
1397 ///        reduce overall stack requirements for temporary stack use
1398 Value* Builder::STACKSAVE()
1399 {
1400     Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1401 #if HAVE_LLVM == 0x306
1402     return CALL(pfnStackSave);
1403 #else
1404     return CALLA(pfnStackSave);
1405 #endif
1406 }
1407
1408 void Builder::STACKRESTORE(Value* pSaved)
1409 {
1410     Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1411     CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1412 }
1413
1414 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1415 {
1416     Value* vOut;
1417     // use FMADs if available
1418     if(JM()->mArch.AVX2())
1419     {
1420         vOut = VFMADDPS(a, b, c);
1421     }
1422     else
1423     {
1424         vOut = FADD(FMUL(a, b), c);
1425     }
1426     return vOut;
1427 }
1428
1429 Value* Builder::POPCNT(Value* a)
1430 {
1431     Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1432     return CALL(pCtPop, std::initializer_list<Value*>{a});
1433 }
1434
1435 //////////////////////////////////////////////////////////////////////////
1436 /// @brief C functions called by LLVM IR
1437 //////////////////////////////////////////////////////////////////////////
1438
1439 //////////////////////////////////////////////////////////////////////////
1440 /// @brief called in JIT code, inserted by PRINT
1441 /// output to both stdout and visual studio debug console
1442 void __cdecl CallPrint(const char* fmt, ...)
1443 {
1444     va_list args;
1445     va_start(args, fmt);
1446     vprintf(fmt, args);
1447
1448 #if defined( _WIN32 )
1449     char strBuf[1024];
1450     vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1451     OutputDebugString(strBuf);
1452 #endif
1453
1454     va_end(args);
1455 }
1456
1457 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1458 {
1459 #if HAVE_LLVM == 0x306
1460     Function *func =
1461         Intrinsic::getDeclaration(JM()->mpCurrentModule,
1462                                   Intrinsic::x86_avx_vextractf128_si_256);
1463     return CALL(func, {a, imm8});
1464 #else
1465     bool flag = !imm8->isZeroValue();
1466     SmallVector<Constant*,8> idx;
1467     for (unsigned i = 0; i < mVWidth / 2; i++) {
1468         idx.push_back(C(flag ? i + mVWidth / 2 : i));
1469     }
1470     return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1471 #endif
1472 }
1473
1474 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1475 {
1476 #if HAVE_LLVM == 0x306
1477     Function *func =
1478         Intrinsic::getDeclaration(JM()->mpCurrentModule,
1479                                   Intrinsic::x86_avx_vinsertf128_si_256);
1480     return CALL(func, {a, b, imm8});
1481 #else
1482     bool flag = !imm8->isZeroValue();
1483     SmallVector<Constant*,8> idx;
1484     for (unsigned i = 0; i < mVWidth; i++) {
1485         idx.push_back(C(i));
1486     }
1487     Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1488
1489     SmallVector<Constant*,8> idx2;
1490     for (unsigned i = 0; i < mVWidth / 2; i++) {
1491         idx2.push_back(C(flag ? i : i + mVWidth));
1492     }
1493     for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1494         idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1495     }
1496     return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1497 #endif
1498 }
1499
1500 // rdtsc buckets macros
1501 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1502 {
1503     // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1504     // buckets framework when single threaded
1505     if (KNOB_SINGLE_THREADED)
1506     {
1507         std::vector<Type*> args{
1508             PointerType::get(mInt32Ty, 0),   // pBucketMgr
1509             mInt32Ty                        // id
1510         };
1511
1512         FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1513         Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1514         if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1515         {
1516             sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1517         }
1518
1519         CALL(pFunc, { pBucketMgr, pId });
1520     }
1521 }
1522
1523 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1524 {
1525     // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1526     // buckets framework when single threaded
1527     if (KNOB_SINGLE_THREADED)
1528     {
1529         std::vector<Type*> args{
1530             PointerType::get(mInt32Ty, 0),   // pBucketMgr
1531             mInt32Ty                        // id
1532         };
1533
1534         FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1535         Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1536         if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1537         {
1538             sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1539         }
1540
1541         CALL(pFunc, { pBucketMgr, pId });
1542     }
1543 }
1544