src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file builder_misc.cpp
  24 *
  25 * @brief Implementation for miscellaneous builder functions
  26 *
  27 * Notes:
  28 *
  29 ******************************************************************************/
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include "llvm/Support/DynamicLibrary.h"

#include <cstring>
  34
  35 void __cdecl CallPrint(const char* fmt, ...);
  36
  37 //////////////////////////////////////////////////////////////////////////
  38 /// @brief Convert an IEEE 754 32-bit single precision float to an
  39 ///        16 bit float with 5 exponent bits and a variable
  40 ///        number of mantissa bits.
  41 /// @param val - 32-bit float
  42 /// @todo Maybe move this outside of this file into a header?
  43 static uint16_t Convert32To16Float(float val)
  44 {
  45     uint32_t sign, exp, mant;
  46     uint32_t roundBits;
  47
  48     // Extract the sign, exponent, and mantissa
  49     uint32_t uf = *(uint32_t*)&val;
  50     sign = (uf & 0x80000000) >> 31;
  51     exp = (uf & 0x7F800000) >> 23;
  52     mant = uf & 0x007FFFFF;
  53
  54     // Check for out of range
  55     if (std::isnan(val))
  56     {
  57         exp = 0x1F;
  58         mant = 0x200;
  59         sign = 1;                     // set the sign bit for NANs
  60     }
  61     else if (std::isinf(val))
  62     {
  63         exp = 0x1f;
  64         mant = 0x0;
  65     }
  66     else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
  67     {
  68         exp = 0x1E;
  69         mant = 0x3FF;
  70     }
  71     else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
  72     {
  73         mant |= 0x00800000;
  74         for (; exp <= 0x70; mant >>= 1, exp++)
  75             ;
  76         exp = 0;
  77         mant = mant >> 13;
  78     }
  79     else if (exp < 0x66) // Too small to represent -> Zero
  80     {
  81         exp = 0;
  82         mant = 0;
  83     }
  84     else
  85     {
  86         // Saves bits that will be shifted off for rounding
  87         roundBits = mant & 0x1FFFu;
  88         // convert exponent and mantissa to 16 bit format
  89         exp = exp - 0x70;
  90         mant = mant >> 13;
  91
  92         // Essentially RTZ, but round up if off by only 1 lsb
  93         if (roundBits == 0x1FFFu)
  94         {
  95             mant++;
  96             // check for overflow
  97             if ((mant & 0xC00u) != 0)
  98                 exp++;
  99             // make sure only the needed bits are used
 100             mant &= 0x3FF;
 101         }
 102     }
 103
 104     uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
 105     return (uint16_t)tmpVal;
 106 }
 107
 108 //////////////////////////////////////////////////////////////////////////
 109 /// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision
 110 ///        float
 111 /// @param val - 16-bit float
 112 /// @todo Maybe move this outside of this file into a header?
 113 static float ConvertSmallFloatTo32(UINT val)
 114 {
 115     UINT result;
 116     if ((val & 0x7fff) == 0)
 117     {
 118         result = ((uint32_t)(val & 0x8000)) << 16;
 119     }
 120     else if ((val & 0x7c00) == 0x7c00)
 121     {
 122         result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
 123         result |= ((uint32_t)val & 0x8000) << 16;
 124     }
 125     else
 126     {
 127         uint32_t sign = (val & 0x8000) << 16;
 128         uint32_t mant = (val & 0x3ff) << 13;
 129         uint32_t exp = (val >> 10) & 0x1f;
 130         if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
 131         {
 132             mant <<= 1;
 133             while (mant < (0x400 << 13))
 134             {
 135                 exp--;
 136                 mant <<= 1;
 137             }
 138             mant &= (0x3ff << 13);
 139         }
 140         exp = ((exp - 15 + 127) & 0xff) << 23;
 141         result = sign | exp | mant;
 142     }
 143
 144     return *(float*)&result;
 145 }
 146
 147 Constant *Builder::C(bool i)
 148 {
 149     return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
 150 }
 151
 152 Constant *Builder::C(char i)
 153 {
 154     return ConstantInt::get(IRB()->getInt8Ty(), i);
 155 }
 156
 157 Constant *Builder::C(uint8_t i)
 158 {
 159     return ConstantInt::get(IRB()->getInt8Ty(), i);
 160 }
 161
 162 Constant *Builder::C(int i)
 163 {
 164     return ConstantInt::get(IRB()->getInt32Ty(), i);
 165 }
 166
 167 Constant *Builder::C(int64_t i)
 168 {
 169     return ConstantInt::get(IRB()->getInt64Ty(), i);
 170 }
 171
 172 Constant *Builder::C(uint16_t i)
 173 {
 174     return ConstantInt::get(mInt16Ty,i);
 175 }
 176
 177 Constant *Builder::C(uint32_t i)
 178 {
 179     return ConstantInt::get(IRB()->getInt32Ty(), i);
 180 }
 181
 182 Constant *Builder::C(float i)
 183 {
 184     return ConstantFP::get(IRB()->getFloatTy(), i);
 185 }
 186
 187 Constant *Builder::PRED(bool pred)
 188 {
 189     return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
 190 }
 191
 192 Value *Builder::VIMMED1(int i)
 193 {
 194     return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 195 }
 196
 197 Value *Builder::VIMMED1(uint32_t i)
 198 {
 199     return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 200 }
 201
 202 Value *Builder::VIMMED1(float i)
 203 {
 204     return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
 205 }
 206
 207 Value *Builder::VIMMED1(bool i)
 208 {
 209     return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 210 }
 211
 212 Value *Builder::VUNDEF_IPTR()
 213 {
 214     return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
 215 }
 216
 217 Value *Builder::VUNDEF_I()
 218 {
 219     return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
 220 }
 221
 222 Value *Builder::VUNDEF(Type *ty, uint32_t size)
 223 {
 224     return UndefValue::get(VectorType::get(ty, size));
 225 }
 226
 227 Value *Builder::VUNDEF_F()
 228 {
 229     return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
 230 }
 231
 232 Value *Builder::VUNDEF(Type* t)
 233 {
 234     return UndefValue::get(VectorType::get(t, mVWidth));
 235 }
 236
 237 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
 238 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
 239 {
 240     return VINSERT(vec, val, C((int64_t)index));
 241 }
 242 #endif
 243
 244 Value *Builder::VBROADCAST(Value *src)
 245 {
 246     // check if src is already a vector
 247     if (src->getType()->isVectorTy())
 248     {
 249         return src;
 250     }
 251
 252     return VECTOR_SPLAT(mVWidth, src);
 253 }
 254
 255 uint32_t Builder::IMMED(Value* v)
 256 {
 257     SWR_ASSERT(isa<ConstantInt>(v));
 258     ConstantInt *pValConst = cast<ConstantInt>(v);
 259     return pValConst->getZExtValue();
 260 }
 261
 262 int32_t Builder::S_IMMED(Value* v)
 263 {
 264     SWR_ASSERT(isa<ConstantInt>(v));
 265     ConstantInt *pValConst = cast<ConstantInt>(v);
 266     return pValConst->getSExtValue();
 267 }
 268
 269 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
 270 {
 271     std::vector<Value*> indices;
 272     for (auto i : indexList)
 273         indices.push_back(i);
 274     return GEPA(ptr, indices);
 275 }
 276
 277 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
 278 {
 279     std::vector<Value*> indices;
 280     for (auto i : indexList)
 281         indices.push_back(C(i));
 282     return GEPA(ptr, indices);
 283 }
 284
 285 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
 286 {
 287     std::vector<Value*> valIndices;
 288     for (auto i : indices)
 289         valIndices.push_back(C(i));
 290     return LOAD(GEPA(basePtr, valIndices), name);
 291 }
 292
 293 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
 294 {
 295     std::vector<Value*> valIndices;
 296     for (auto i : indices)
 297         valIndices.push_back(i);
 298     return LOAD(GEPA(basePtr, valIndices), name);
 299 }
 300
 301 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
 302 {
 303     std::vector<Value*> valIndices;
 304     for (auto i : indices)
 305         valIndices.push_back(C(i));
 306     return STORE(val, GEPA(basePtr, valIndices));
 307 }
 308
 309 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
 310 {
 311     std::vector<Value*> valIndices;
 312     for (auto i : indices)
 313         valIndices.push_back(i);
 314     return STORE(val, GEPA(basePtr, valIndices));
 315 }
 316
 317 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
 318 {
 319     std::vector<Value*> args;
 320     for (auto arg : argsList)
 321         args.push_back(arg);
 322     return CALLA(Callee, args);
 323 }
 324
 325 Value *Builder::VRCP(Value *va)
 326 {
 327     return FDIV(VIMMED1(1.0f), va);  // 1 / a
 328 }
 329
 330 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
 331 {
 332     Value* vOut = FMADDPS(vA, vX, vC);
 333     vOut = FMADDPS(vB, vY, vOut);
 334     return vOut;
 335 }
 336
 337 //////////////////////////////////////////////////////////////////////////
 338 /// @brief Generate an i32 masked load operation in LLVM IR.  If not
 339 /// supported on the underlying platform, emulate it with float masked load
 340 /// @param src - base address pointer for the load
 341 /// @param vMask - SIMD wide mask that controls whether to access memory load 0
 342 Value *Builder::MASKLOADD(Value* src,Value* mask)
 343 {
 344     Value* vResult;
 345     // use avx2 gather instruction is available
 346     if(JM()->mArch.AVX2())
 347     {
 348         Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
 349         vResult = CALL(func,{src,mask});
 350     }
 351     else
 352     {
 353         Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
 354         Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
 355         vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,mVWidth));
 356     }
 357     return vResult;
 358 }
 359
 360 //////////////////////////////////////////////////////////////////////////
 361 /// @brief insert a JIT call to CallPrint
 362 /// - outputs formatted string to both stdout and VS output window
 363 /// - DEBUG builds only
 364 /// Usage example:
 365 ///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
 366 ///   where C(lane) creates a constant value to print, and pIndex is the Value*
 367 ///   result from a GEP, printing out the pointer to memory
 368 /// @param printStr - constant string to print, which includes format specifiers
 369 /// @param printArgs - initializer list of Value*'s to print to std out
 370 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
 371 {
 372     // push the arguments to CallPrint into a vector
 373     std::vector<Value*> printCallArgs;
 374     // save room for the format string.  we still need to modify it for vectors
 375     printCallArgs.resize(1);
 376
 377     // search through the format string for special processing
 378     size_t pos = 0;
 379     std::string tempStr(printStr);
 380     pos = tempStr.find('%', pos);
 381     auto v = printArgs.begin();
 382
 383     while ((pos != std::string::npos) && (v != printArgs.end()))
 384     {
 385         Value* pArg = *v;
 386         Type* pType = pArg->getType();
 387
 388         if (tempStr[pos + 1] == 't')
 389         {
 390             if (pType->isVectorTy())
 391             {
 392                 Type* pContainedType = pType->getContainedType(0);
 393
 394                 std::string vectorFormatStr;
 395
 396                 if (pContainedType->isFloatTy())
 397                 {
 398                     tempStr[pos + 1] = 'f';  // Ensure its %f
 399                     printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(0)), mDoubleTy));
 400
 401                     for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
 402                     {
 403                         vectorFormatStr += "%f ";
 404                         printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), mDoubleTy));
 405                     }
 406                 }
 407                 else if (pContainedType->isIntegerTy())
 408                 {
 409                     tempStr[pos + 1] = 'd';  // Ensure its %d
 410                     printCallArgs.push_back(VEXTRACT(pArg, C(0)));
 411
 412                     for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
 413                     {
 414                         vectorFormatStr += "%d ";
 415                         printCallArgs.push_back(VEXTRACT(pArg, C(i)));
 416                     }
 417                 }
 418                 else
 419                 {
 420                     SWR_ASSERT(0, "Unsupported tyep");
 421                 }
 422
 423                 tempStr.insert(pos, vectorFormatStr);
 424                 pos += vectorFormatStr.size();
 425             }
 426             else
 427             {
 428                 if (pType->isFloatTy())
 429                 {
 430                     tempStr[pos + 1] = 'f';  // Ensure its %f
 431                     printCallArgs.push_back(FP_EXT(pArg, mDoubleTy));
 432                 }
 433                 else if (pType->isIntegerTy())
 434                 {
 435                     tempStr[pos + 1] = 'd';  // Ensure its %d
 436                     printCallArgs.push_back(pArg);
 437                 }
 438             }
 439         }
 440         else if (toupper(tempStr[pos + 1]) == 'X')
 441         {
 442             if (pType->isVectorTy())
 443             {
 444                 tempStr[pos] = '0';
 445                 tempStr.insert(pos + 1, "x%08");
 446
 447                 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
 448
 449                 std::string vectorFormatStr;
 450                 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
 451                 {
 452                     vectorFormatStr += "0x%08X ";
 453                     printCallArgs.push_back(VEXTRACT(pArg, C(i)));
 454                 }
 455
 456                 tempStr.insert(pos, vectorFormatStr);
 457                 pos += vectorFormatStr.size();
 458             }
 459             else
 460             {
 461                 tempStr[pos] = '0';
 462                 tempStr.insert(pos + 1, "x%08");
 463                 printCallArgs.push_back(pArg);
 464                 pos += 3;
 465             }
 466         }
 467         // for %f we need to cast float Values to doubles so that they print out correctly
 468         else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
 469         {
 470             printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
 471             pos++;
 472         }
 473         // add special handling for %f and %d format specifiers to make printing llvm vector types easier
 474         else if (pType->isVectorTy())
 475         {
 476             Type* pContainedType = pType->getContainedType(0);
 477
 478             if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
 479             {
 480                 uint32_t i = 0;
 481                 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
 482                 {
 483                     tempStr.insert(pos, std::string("%f "));
 484                     pos += 3;
 485                     printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
 486                 }
 487                 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
 488             }
 489             else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
 490             {
 491                 uint32_t i = 0;
 492                 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
 493                 {
 494                     tempStr.insert(pos, std::string("%d "));
 495                     pos += 3;
 496                     printCallArgs.push_back(VEXTRACT(pArg, C(i)));
 497                 }
 498                 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
 499             }
 500             else
 501             {
 502                 /// not a supported vector to print
 503                 /// @todo pointer types too
 504                 SWR_ASSERT(0);
 505             }
 506         }
 507         else
 508         {
 509             printCallArgs.push_back(pArg);
 510         }
 511
 512         // advance to the next arguement
 513         v++;
 514         pos = tempStr.find('%', ++pos);
 515     }
 516
 517     // create global variable constant string
 518     Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
 519     GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
 520     JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
 521
 522     // get a pointer to the first character in the constant string array
 523     std::vector<Constant*> geplist{C(0),C(0)};
 524 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
 525     Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
 526 #else
 527     Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
 528 #endif
 529
 530     // insert the pointer to the format string in the argument vector
 531     printCallArgs[0] = strGEP;
 532
 533     // get pointer to CallPrint function and insert decl into the module if needed
 534     std::vector<Type*> args;
 535     args.push_back(PointerType::get(mInt8Ty,0));
 536     FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
 537     Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
 538
 539     // if we haven't yet added the symbol to the symbol table
 540     if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
 541     {
 542         sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
 543     }
 544
 545     // insert a call to CallPrint
 546     return CALLA(callPrintFn,printCallArgs);
 547 }
 548
 549 //////////////////////////////////////////////////////////////////////////
 550 /// @brief Wrapper around PRINT with initializer list.
 551 CallInst* Builder::PRINT(const std::string &printStr)
 552 {
 553     return PRINT(printStr, {});
 554 }
 555
 556 //////////////////////////////////////////////////////////////////////////
 557 /// @brief Generate a masked gather operation in LLVM IR.  If not
 558 /// supported on the underlying platform, emulate it with loads
 559 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
 560 /// @param pBase - Int8* base VB address pointer value
 561 /// @param vIndices - SIMD wide value of VB byte offsets
 562 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
 563 /// @param scale - value to scale indices by
 564 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
 565 {
 566     Value* vGather;
 567
 568     // use avx2 gather instruction if available
 569     if(JM()->mArch.AVX2())
 570     {
 571         // force mask to <N x float>, required by vgather
 572         vMask = BITCAST(vMask, mSimdFP32Ty);
 573         vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
 574     }
 575     else
 576     {
 577         Value* pStack = STACKSAVE();
 578
 579         // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
 580         Value* vSrcPtr = ALLOCA(vSrc->getType());
 581         STORE(vSrc, vSrcPtr);
 582
 583         vGather = VUNDEF_F();
 584         Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
 585         Value *vOffsets = MUL(vIndices,vScaleVec);
 586         Value *mask = MASK(vMask);
 587         for(uint32_t i = 0; i < mVWidth; ++i)
 588         {
 589             // single component byte index
 590             Value *offset = VEXTRACT(vOffsets,C(i));
 591             // byte pointer to component
 592             Value *loadAddress = GEP(pBase,offset);
 593             loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
 594             // pointer to the value to load if we're masking off a component
 595             Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
 596             Value *selMask = VEXTRACT(mask,C(i));
 597             // switch in a safe address to load if we're trying to access a vertex
 598             Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
 599             Value *val = LOAD(validAddress);
 600             vGather = VINSERT(vGather,val,C(i));
 601         }
 602         STACKRESTORE(pStack);
 603     }
 604
 605     return vGather;
 606 }
 607
 608 //////////////////////////////////////////////////////////////////////////
 609 /// @brief Generate a masked gather operation in LLVM IR.  If not
 610 /// supported on the underlying platform, emulate it with loads
 611 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
 612 /// @param pBase - Int8* base VB address pointer value
 613 /// @param vIndices - SIMD wide value of VB byte offsets
 614 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
 615 /// @param scale - value to scale indices by
 616 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
 617 {
 618     Value* vGather;
 619
 620     // use avx2 gather instruction if available
 621     if(JM()->mArch.AVX2())
 622     {
 623         vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
 624     }
 625     else
 626     {
 627         Value* pStack = STACKSAVE();
 628
 629         // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
 630         Value* vSrcPtr = ALLOCA(vSrc->getType());
 631         STORE(vSrc, vSrcPtr);
 632
 633         vGather = VUNDEF_I();
 634         Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
 635         Value *vOffsets = MUL(vIndices, vScaleVec);
 636         Value *mask = MASK(vMask);
 637         for(uint32_t i = 0; i < mVWidth; ++i)
 638         {
 639             // single component byte index
 640             Value *offset = VEXTRACT(vOffsets, C(i));
 641             // byte pointer to component
 642             Value *loadAddress = GEP(pBase, offset);
 643             loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
 644             // pointer to the value to load if we're masking off a component
 645             Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
 646             Value *selMask = VEXTRACT(mask, C(i));
 647             // switch in a safe address to load if we're trying to access a vertex
 648             Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
 649             Value *val = LOAD(validAddress, C(0));
 650             vGather = VINSERT(vGather, val, C(i));
 651         }
 652
 653         STACKRESTORE(pStack);
 654     }
 655     return vGather;
 656 }
 657
 658 //////////////////////////////////////////////////////////////////////////
 659 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
 660 Value* Builder::MASK(Value* vmask)
 661 {
 662     Value* src = BITCAST(vmask, mSimdInt32Ty);
 663     return ICMP_SLT(src, VIMMED1(0));
 664 }
 665
 666 //////////////////////////////////////////////////////////////////////////
 667 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
 668 Value* Builder::VMASK(Value* mask)
 669 {
 670     return S_EXT(mask, mSimdInt32Ty);
 671 }
 672
 673 //////////////////////////////////////////////////////////////////////////
 674 /// @brief Generate a VPSHUFB operation in LLVM IR.  If not
 675 /// supported on the underlying platform, emulate it
 676 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
 677 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
 678 /// Byte masks in lower 128 lane of b selects 8 bit values from lower
 679 /// 128bits of a, and vice versa for the upper lanes.  If the mask
 680 /// value is negative, '0' is inserted.
 681 Value *Builder::PSHUFB(Value* a, Value* b)
 682 {
 683     Value* res;
 684     // use avx2 pshufb instruction if available
 685     if(JM()->mArch.AVX2())
 686     {
 687         res = VPSHUFB(a, b);
 688     }
 689     else
 690     {
 691         Constant* cB = dyn_cast<Constant>(b);
 692         // number of 8 bit elements in b
 693         uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
 694         // output vector
 695         Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
 696
 697         // insert an 8 bit value from the high and low lanes of a per loop iteration
 698         numElms /= 2;
 699         for(uint32_t i = 0; i < numElms; i++)
 700         {
 701             ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
 702             ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
 703
 704             // extract values from constant mask
 705             char valLow128bLane =  (char)(cLow128b->getSExtValue());
 706             char valHigh128bLane = (char)(cHigh128b->getSExtValue());
 707
 708             Value* insertValLow128b;
 709             Value* insertValHigh128b;
 710
 711             // if the mask value is negative, insert a '0' in the respective output position
 712             // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
 713             insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
 714             insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
 715
 716             vShuf = VINSERT(vShuf, insertValLow128b, i);
 717             vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
 718         }
 719         res = vShuf;
 720     }
 721     return res;
 722 }
 723
 724 //////////////////////////////////////////////////////////////////////////
 725 /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32
 726 /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
 727 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
 728 /// lower 8 values are used.
 729 Value *Builder::PMOVSXBD(Value* a)
 730 {
 731     Value* res;
 732     // use avx2 byte sign extend instruction if available
 733     if(JM()->mArch.AVX2())
 734     {
 735         res = VPMOVSXBD(a);
 736     }
 737     else
 738     {
 739         // VPMOVSXBD output type
 740         Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
 741         // Extract 8 values from 128bit lane and sign extend
 742         res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
 743     }
 744     return res;
 745 }
 746
 747 //////////////////////////////////////////////////////////////////////////
 748 /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32
 749 /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
 750 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
 751 Value *Builder::PMOVSXWD(Value* a)
 752 {
 753     Value* res;
 754     // use avx2 word sign extend if available
 755     if(JM()->mArch.AVX2())
 756     {
 757         res = VPMOVSXWD(a);
 758     }
 759     else
 760     {
 761         // VPMOVSXWD output type
 762         Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
 763         // Extract 8 values from 128bit lane and sign extend
 764         res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
 765     }
 766     return res;
 767 }
 768
 769 //////////////////////////////////////////////////////////////////////////
 770 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
 771 /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
 772 /// platform, emulate it
 773 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
 774 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
 775 Value *Builder::PERMD(Value* a, Value* idx)
 776 {
 777     Value* res;
 778     // use avx2 permute instruction if available
 779     if(JM()->mArch.AVX2())
 780     {
 781         // llvm 3.6.0 swapped the order of the args to vpermd
 782         res = VPERMD(idx, a);
 783     }
 784     else
 785     {
 786         if (isa<Constant>(idx))
 787         {
 788             res = VSHUFFLE(a, a, idx);
 789         }
 790         else
 791         {
 792             res = VUNDEF_I();
 793             for (uint32_t l = 0; l < JM()->mVWidth; ++l)
 794             {
 795                 Value* pIndex = VEXTRACT(idx, C(l));
 796                 Value* pVal = VEXTRACT(a, pIndex);
 797                 res = VINSERT(res, pVal, C(l));
 798             }
 799         }
 800     }
 801     return res;
 802 }
 803
 804 //////////////////////////////////////////////////////////////////////////
 805 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
 806 /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
 807 /// platform, emulate it
 808 /// @param a - 256bit SIMD lane(8x32bit) of float values.
 809 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
 810 Value *Builder::PERMPS(Value* a, Value* idx)
 811 {
 812     Value* res;
 813     // use avx2 permute instruction if available
 814     if (JM()->mArch.AVX2())
 815     {
 816         // llvm 3.6.0 swapped the order of the args to vpermd
 817         res = VPERMPS(idx, a);
 818     }
 819     else
 820     {
 821         if (isa<Constant>(idx))
 822         {
 823             res = VSHUFFLE(a, a, idx);
 824         }
 825         else
 826         {
 827             res = VUNDEF_F();
 828             for (uint32_t l = 0; l < JM()->mVWidth; ++l)
 829             {
 830                 Value* pIndex = VEXTRACT(idx, C(l));
 831                 Value* pVal = VEXTRACT(a, pIndex);
 832                 res = VINSERT(res, pVal, C(l));
 833             }
 834         }
 835     }
 836
 837     return res;
 838 }
 839
 840 //////////////////////////////////////////////////////////////////////////
 841 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
 842 /// in LLVM IR.  If not supported on the underlying platform, emulate it
 843 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
 844 Value *Builder::CVTPH2PS(Value* a)
 845 {
 846     if (JM()->mArch.F16C())
 847     {
 848         return VCVTPH2PS(a);
 849     }
 850     else
 851     {
 852         FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
 853         Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
 854
 855         if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
 856         {
 857             sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
 858         }
 859
 860         Value* pResult = UndefValue::get(mSimdFP32Ty);
 861         for (uint32_t i = 0; i < mVWidth; ++i)
 862         {
 863             Value* pSrc = VEXTRACT(a, C(i));
 864             Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
 865             pResult = VINSERT(pResult, pConv, C(i));
 866         }
 867
 868         return pResult;
 869     }
 870 }
 871
 872 //////////////////////////////////////////////////////////////////////////
 873 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
 874 /// in LLVM IR.  If not supported on the underlying platform, emulate it
 875 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
 876 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
 877 {
 878     if (JM()->mArch.F16C())
 879     {
 880         return VCVTPS2PH(a, rounding);
 881     }
 882     else
 883     {
 884         // call scalar C function for now
 885         FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
 886         Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
 887
 888         if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
 889         {
 890             sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
 891         }
 892
 893         Value* pResult = UndefValue::get(mSimdInt16Ty);
 894         for (uint32_t i = 0; i < mVWidth; ++i)
 895         {
 896             Value* pSrc = VEXTRACT(a, C(i));
 897             Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
 898             pResult = VINSERT(pResult, pConv, C(i));
 899         }
 900
 901         return pResult;
 902     }
 903 }
 904
 905 Value *Builder::PMAXSD(Value* a, Value* b)
 906 {
 907     if (JM()->mArch.AVX2())
 908     {
 909         return VPMAXSD(a, b);
 910     }
 911     else
 912     {
 913         // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
 914         Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
 915
 916         // low 128
 917         Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
 918         Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
 919         Value* resLo = CALL(pmaxsd, {aLo, bLo});
 920
 921         // high 128
 922         Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
 923         Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
 924         Value* resHi = CALL(pmaxsd, {aHi, bHi});
 925
 926         // combine
 927         Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
 928         result = VINSERTI128(result, resHi, C((uint8_t)1));
 929
 930         return result;
 931     }
 932 }
 933
 934 Value *Builder::PMINSD(Value* a, Value* b)
 935 {
 936     if (JM()->mArch.AVX2())
 937     {
 938         return VPMINSD(a, b);
 939     }
 940     else
 941     {
 942         // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
 943         Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
 944
 945         // low 128
 946         Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
 947         Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
 948         Value* resLo = CALL(pminsd, {aLo, bLo});
 949
 950         // high 128
 951         Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
 952         Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
 953         Value* resHi = CALL(pminsd, {aHi, bHi});
 954
 955         // combine
 956         Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
 957         result = VINSERTI128(result, resHi, C((uint8_t)1));
 958
 959         return result;
 960     }
 961 }
 962
 963 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
 964                       Value* mask, Value* vGatherComponents[], bool bPackedOutput)
 965 {
 966     const SWR_FORMAT_INFO &info = GetFormatInfo(format);
 967     if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
 968     {
 969         // ensure our mask is the correct type
 970         mask = BITCAST(mask, mSimdFP32Ty);
 971         GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
 972     }
 973     else
 974     {
 975         // ensure our mask is the correct type
 976         mask = BITCAST(mask, mSimdInt32Ty);
 977         GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
 978     }
 979 }
 980
 981 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
 982                         Value* mask, Value* vGatherComponents[], bool bPackedOutput)
 983 {
 984     switch(info.bpp / info.numComps)
 985     {
 986         case 16:
 987         {
 988                 Value* vGatherResult[2];
 989                 Value *vMask;
 990
 991                 // TODO: vGatherMaskedVal
 992                 Value* vGatherMaskedVal = VIMMED1((float)0);
 993
 994                 // always have at least one component out of x or y to fetch
 995
 996                 // save mask as it is zero'd out after each gather
 997                 vMask = mask;
 998
 999                 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1000                 // e.g. result of first 8x32bit integer gather for 16bit components
1001                 // 256i - 0    1    2    3    4    5    6    7
1002                 //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1003                 //
1004
1005                 // if we have at least one component out of x or y to fetch
1006                 if(info.numComps > 2)
1007                 {
1008                     // offset base to the next components(zw) in the vertex to gather
1009                     pSrcBase = GEP(pSrcBase, C((char)4));
1010                     vMask = mask;
1011
1012                     vGatherResult[1] =  GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1013                     // e.g. result of second 8x32bit integer gather for 16bit components
1014                     // 256i - 0    1    2    3    4    5    6    7
1015                     //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1016                     //
1017                 }
1018                 else
1019                 {
1020                     vGatherResult[1] =  vGatherMaskedVal;
1021                 }
1022
1023                 // Shuffle gathered components into place, each row is a component
1024                 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1025         }
1026             break;
1027         case 32:
1028         {
1029             // apply defaults
1030             for (uint32_t i = 0; i < 4; ++i)
1031             {
1032                 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1033             }
1034
1035             for(uint32_t i = 0; i < info.numComps; i++)
1036             {
1037                 uint32_t swizzleIndex = info.swizzle[i];
1038
1039                 // save mask as it is zero'd out after each gather
1040                 Value *vMask = mask;
1041
1042                 // Gather a SIMD of components
1043                 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1044
1045                 // offset base to the next component to gather
1046                 pSrcBase = GEP(pSrcBase, C((char)4));
1047             }
1048         }
1049             break;
1050         default:
1051             SWR_ASSERT(0, "Invalid float format");
1052             break;
1053     }
1054 }
1055
1056 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1057                         Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1058 {
1059     switch (info.bpp / info.numComps)
1060     {
1061         case 8:
1062         {
1063             Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1064             Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
1065             // e.g. result of an 8x32bit integer gather for 8bit components
1066             // 256i - 0    1    2    3    4    5    6    7
1067             //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1068
1069             Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1070         }
1071             break;
1072         case 16:
1073         {
1074             Value* vGatherResult[2];
1075             Value *vMask;
1076
1077             // TODO: vGatherMaskedVal
1078             Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1079
1080             // always have at least one component out of x or y to fetch
1081
1082             // save mask as it is zero'd out after each gather
1083             vMask = mask;
1084
1085             vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1086             // e.g. result of first 8x32bit integer gather for 16bit components
1087             // 256i - 0    1    2    3    4    5    6    7
1088             //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1089             //
1090
1091             // if we have at least one component out of x or y to fetch
1092             if(info.numComps > 2)
1093             {
1094                 // offset base to the next components(zw) in the vertex to gather
1095                 pSrcBase = GEP(pSrcBase, C((char)4));
1096                 vMask = mask;
1097
1098                 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1099                 // e.g. result of second 8x32bit integer gather for 16bit components
1100                 // 256i - 0    1    2    3    4    5    6    7
1101                 //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1102                 //
1103             }
1104             else
1105             {
1106                 vGatherResult[1] = vGatherMaskedVal;
1107             }
1108
1109             // Shuffle gathered components into place, each row is a component
1110             Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1111
1112         }
1113             break;
1114         case 32:
1115         {
1116             // apply defaults
1117             for (uint32_t i = 0; i < 4; ++i)
1118             {
1119                 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1120             }
1121
1122             for(uint32_t i = 0; i < info.numComps; i++)
1123             {
1124                 uint32_t swizzleIndex = info.swizzle[i];
1125
1126                 // save mask as it is zero'd out after each gather
1127                 Value *vMask = mask;
1128
1129                 // Gather a SIMD of components
1130                 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1131
1132                 // offset base to the next component to gather
1133                 pSrcBase = GEP(pSrcBase, C((char)4));
1134             }
1135         }
1136             break;
1137         default:
1138             SWR_ASSERT(0, "unsupported format");
1139         break;
1140     }
1141 }
1142
1143 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1144 {
1145     // cast types
1146     Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1147     Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1148
1149     // input could either be float or int vector; do shuffle work in int
1150     vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1151     vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1152
1153     if(bPackedOutput)
1154     {
1155         Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1156
1157         // shuffle mask
1158         Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1159                                      0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1160         Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1161         // after pshufb: group components together in each 128bit lane
1162         // 256i - 0    1    2    3    4    5    6    7
1163         //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1164
1165         Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1166         // after PERMD: move and pack xy components into each 128bit lane
1167         // 256i - 0    1    2    3    4    5    6    7
1168         //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1169
1170         // do the same for zw components
1171         Value* vi128ZW = nullptr;
1172         if(info.numComps > 2)
1173         {
1174             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1175             vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1176         }
1177
1178         for(uint32_t i = 0; i < 4; i++)
1179         {
1180             uint32_t swizzleIndex = info.swizzle[i];
1181             // todo: fixed for packed
1182             Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1183             if(i >= info.numComps)
1184             {
1185                 // set the default component val
1186                 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1187                 continue;
1188             }
1189
1190             // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1191             uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1192             // if x or y, use vi128XY permute result, else use vi128ZW
1193             Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1194
1195             // extract packed component 128 bit lanes
1196             vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1197         }
1198
1199     }
1200     else
1201     {
1202         // pshufb masks for each component
1203         Value* vConstMask[2];
1204         // x/z shuffle mask
1205         vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1206                                  0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1207
1208         // y/w shuffle mask
1209         vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1210                                  2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1211
1212
1213         // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1214         // apply defaults
1215         for (uint32_t i = 0; i < 4; ++i)
1216         {
1217             vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1218         }
1219
1220         for(uint32_t i = 0; i < info.numComps; i++)
1221         {
1222             uint32_t swizzleIndex = info.swizzle[i];
1223
1224             // select correct constMask for x/z or y/w pshufb
1225             uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1226             // if x or y, use vi128XY permute result, else use vi128ZW
1227             uint32_t selectedGather = (i < 2) ? 0 : 1;
1228
1229             vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1230             // after pshufb mask for x channel; z uses the same shuffle from the second gather
1231             // 256i - 0    1    2    3    4    5    6    7
1232             //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1233         }
1234     }
1235 }
1236
1237 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1238 {
1239     // cast types
1240     Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1241     Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1242
1243     if(bPackedOutput)
1244     {
1245         Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1246         // shuffle mask
1247         Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1248                                      0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1249         Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1250         // after pshufb: group components together in each 128bit lane
1251         // 256i - 0    1    2    3    4    5    6    7
1252         //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1253
1254         Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1255         // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1256         // 256i - 0    1    2    3    4    5    6    7
1257         //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1258
1259         // do the same for zw components
1260         Value* vi128ZW = nullptr;
1261         if(info.numComps > 2)
1262         {
1263             vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1264         }
1265
1266         // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
1267         for(uint32_t i = 0; i < 4; i++)
1268         {
1269             uint32_t swizzleIndex = info.swizzle[i];
1270             // todo: fix for packed
1271             Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1272             if(i >= info.numComps)
1273             {
1274                 // set the default component val
1275                 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1276                 continue;
1277             }
1278
1279             // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1280             uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1281             // if x or y, use vi128XY permute result, else use vi128ZW
1282             Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1283
1284             // sign extend
1285             vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1286         }
1287     }
1288     // else zero extend
1289     else{
1290         // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1291         // apply defaults
1292         for (uint32_t i = 0; i < 4; ++i)
1293         {
1294             vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1295         }
1296
1297         for(uint32_t i = 0; i < info.numComps; i++){
1298             uint32_t swizzleIndex = info.swizzle[i];
1299
1300             // pshufb masks for each component
1301             Value* vConstMask;
1302             switch(i)
1303             {
1304                 case 0:
1305                     // x shuffle mask
1306                     vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1307                                           0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1308                     break;
1309                 case 1:
1310                     // y shuffle mask
1311                     vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1312                                           1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1313                     break;
1314                 case 2:
1315                     // z shuffle mask
1316                     vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1317                                           2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1318                     break;
1319                 case 3:
1320                     // w shuffle mask
1321                     vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1322                                           3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1323                     break;
1324                 default:
1325                     vConstMask = nullptr;
1326                     break;
1327             }
1328
1329                 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1330                 // after pshufb for x channel
1331                 // 256i - 0    1    2    3    4    5    6    7
1332                 //        x000 x000 x000 x000 x000 x000 x000 x000
1333         }
1334     }
1335 }
1336
1337 //////////////////////////////////////////////////////////////////////////
1338 /// @brief emulates a scatter operation.
1339 /// @param pDst - pointer to destination
1340 /// @param vSrc - vector of src data to scatter
1341 /// @param vOffsets - vector of byte offsets from pDst
1342 /// @param vMask - mask of valid lanes
1343 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1344 {
1345     Value* pStack = STACKSAVE();
1346
1347     Type* pSrcTy = vSrc->getType()->getVectorElementType();
1348
1349     // allocate tmp stack for masked off lanes
1350     Value* vTmpPtr = ALLOCA(pSrcTy);
1351
1352     Value *mask = MASK(vMask);
1353     for (uint32_t i = 0; i < mVWidth; ++i)
1354     {
1355         Value *offset = VEXTRACT(vOffsets, C(i));
1356         // byte pointer to component
1357         Value *storeAddress = GEP(pDst, offset);
1358         storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0));
1359         Value *selMask = VEXTRACT(mask, C(i));
1360         Value *srcElem = VEXTRACT(vSrc, C(i));
1361         // switch in a safe address to load if we're trying to access a vertex
1362         Value *validAddress = SELECT(selMask, storeAddress, vTmpPtr);
1363         STORE(srcElem, validAddress);
1364     }
1365
1366     STACKRESTORE(pStack);
1367 }
1368
1369 Value* Builder::VABSPS(Value* a)
1370 {
1371     Value* asInt = BITCAST(a, mSimdInt32Ty);
1372     Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1373     return result;
1374 }
1375
1376 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1377 {
1378     Value *lowCmp = ICMP_SLT(src, low);
1379     Value *ret = SELECT(lowCmp, low, src);
1380
1381     Value *highCmp = ICMP_SGT(ret, high);
1382     ret = SELECT(highCmp, high, ret);
1383
1384     return ret;
1385 }
1386
1387 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1388 {
1389     Value *lowCmp = FCMP_OLT(src, low);
1390     Value *ret = SELECT(lowCmp, low, src);
1391
1392     Value *highCmp = FCMP_OGT(ret, high);
1393     ret = SELECT(highCmp, high, ret);
1394
1395     return ret;
1396 }
1397
1398 Value *Builder::FCLAMP(Value* src, float low, float high)
1399 {
1400     Value* result = VMAXPS(src, VIMMED1(low));
1401     result = VMINPS(result, VIMMED1(high));
1402
1403     return result;
1404 }
1405
1406 //////////////////////////////////////////////////////////////////////////
1407 /// @brief save/restore stack, providing ability to push/pop the stack and
1408 ///        reduce overall stack requirements for temporary stack use
1409 Value* Builder::STACKSAVE()
1410 {
1411     Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1412 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1413     return CALL(pfnStackSave);
1414 #else
1415     return CALLA(pfnStackSave);
1416 #endif
1417 }
1418
1419 void Builder::STACKRESTORE(Value* pSaved)
1420 {
1421     Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1422     CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1423 }
1424
1425 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1426 {
1427     Value* vOut;
1428     // use FMADs if available
1429     if(JM()->mArch.AVX2())
1430     {
1431         vOut = VFMADDPS(a, b, c);
1432     }
1433     else
1434     {
1435         vOut = FADD(FMUL(a, b), c);
1436     }
1437     return vOut;
1438 }
1439
1440 Value* Builder::POPCNT(Value* a)
1441 {
1442     Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1443     return CALL(pCtPop, std::initializer_list<Value*>{a});
1444 }
1445
1446 //////////////////////////////////////////////////////////////////////////
1447 /// @brief C functions called by LLVM IR
1448 //////////////////////////////////////////////////////////////////////////
1449
1450 //////////////////////////////////////////////////////////////////////////
1451 /// @brief called in JIT code, inserted by PRINT
1452 /// output to both stdout and visual studio debug console
1453 void __cdecl CallPrint(const char* fmt, ...)
1454 {
1455     va_list args;
1456     va_start(args, fmt);
1457     vprintf(fmt, args);
1458
1459 #if defined( _WIN32 )
1460     char strBuf[1024];
1461     vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1462     OutputDebugString(strBuf);
1463 #endif
1464
1465     va_end(args);
1466 }
1467
1468 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1469 {
1470 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1471     Function *func =
1472         Intrinsic::getDeclaration(JM()->mpCurrentModule,
1473                                   Intrinsic::x86_avx_vextractf128_si_256);
1474     return CALL(func, {a, imm8});
1475 #else
1476     bool flag = !imm8->isZeroValue();
1477     SmallVector<Constant*,8> idx;
1478     for (unsigned i = 0; i < mVWidth / 2; i++) {
1479         idx.push_back(C(flag ? i + mVWidth / 2 : i));
1480     }
1481     return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1482 #endif
1483 }
1484
1485 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1486 {
1487 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1488     Function *func =
1489         Intrinsic::getDeclaration(JM()->mpCurrentModule,
1490                                   Intrinsic::x86_avx_vinsertf128_si_256);
1491     return CALL(func, {a, b, imm8});
1492 #else
1493     bool flag = !imm8->isZeroValue();
1494     SmallVector<Constant*,8> idx;
1495     for (unsigned i = 0; i < mVWidth; i++) {
1496         idx.push_back(C(i));
1497     }
1498     Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1499
1500     SmallVector<Constant*,8> idx2;
1501     for (unsigned i = 0; i < mVWidth / 2; i++) {
1502         idx2.push_back(C(flag ? i : i + mVWidth));
1503     }
1504     for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1505         idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1506     }
1507     return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1508 #endif
1509 }
1510
1511 // rdtsc buckets macros
1512 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1513 {
1514     std::vector<Type*> args{
1515         PointerType::get(mInt32Ty, 0),   // pBucketMgr
1516         mInt32Ty                        // id
1517     };
1518
1519     FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1520     Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1521     if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1522     {
1523         sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1524     }
1525
1526     CALL(pFunc, { pBucketMgr, pId });
1527 }
1528
1529 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1530 {
1531     std::vector<Type*> args{
1532         PointerType::get(mInt32Ty, 0),   // pBucketMgr
1533         mInt32Ty                        // id
1534     };
1535
1536     FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1537     Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1538     if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1539     {
1540         sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1541     }
1542
1543     CALL(pFunc, { pBucketMgr, pId });
1544 }
1545