swr: [rasterizer jitter] vpermps support
[mesa.git] src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32
33 #include "llvm/Support/DynamicLibrary.h"
34
35 void __cdecl CallPrint(const char* fmt, ...);
36
37 //////////////////////////////////////////////////////////////////////////
38 /// @brief Convert an IEEE 754 32-bit single precision float to an
39 /// 16 bit float with 5 exponent bits and a variable
40 /// number of mantissa bits.
41 /// @param val - 32-bit float
42 /// @todo Maybe move this outside of this file into a header?
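/// Worked example (illustrative): 1.0f is 0x3F800000 (sign 0, biased exponent
/// 127, mantissa 0); rebiasing the exponent (127 - 0x70 = 15) and dropping the
/// low 13 mantissa bits gives (15 << 10) = 0x3C00, the half-precision 1.0.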
43 static uint16_t Convert32To16Float(float val)
44 {
45 uint32_t sign, exp, mant;
46 uint32_t roundBits;
47
48 // Extract the sign, exponent, and mantissa
49 uint32_t uf = *(uint32_t*)&val;
50 sign = (uf & 0x80000000) >> 31;
51 exp = (uf & 0x7F800000) >> 23;
52 mant = uf & 0x007FFFFF;
53
54 // Check for out of range
55 if (std::isnan(val))
56 {
57 exp = 0x1F;
58 mant = 0x200;
59 sign = 1; // set the sign bit for NANs
60 }
61 else if (std::isinf(val))
62 {
63 exp = 0x1f;
64 mant = 0x0;
65 }
66 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
67 {
68 exp = 0x1E;
69 mant = 0x3FF;
70 }
71 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
72 {
73 mant |= 0x00800000;
74 for (; exp <= 0x70; mant >>= 1, exp++)
75 ;
76 exp = 0;
77 mant = mant >> 13;
78 }
79 else if (exp < 0x66) // Too small to represent -> Zero
80 {
81 exp = 0;
82 mant = 0;
83 }
84 else
85 {
86 // Saves bits that will be shifted off for rounding
87 roundBits = mant & 0x1FFFu;
88 // convert exponent and mantissa to 16 bit format
89 exp = exp - 0x70;
90 mant = mant >> 13;
91
92 // Essentially RTZ, but round up if off by only 1 lsb
93 if (roundBits == 0x1FFFu)
94 {
95 mant++;
96 // check for overflow
97 if ((mant & 0xC00u) != 0)
98 exp++;
99 // make sure only the needed bits are used
100 mant &= 0x3FF;
101 }
102 }
103
104 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
105 return (uint16_t)tmpVal;
106 }
107
108 //////////////////////////////////////////////////////////////////////////
109 /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
110 /// float
111 /// @param val - 16-bit float
112 /// @todo Maybe move this outside of this file into a header?
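/// Worked example (illustrative): 0x3C00 has sign 0, a biased exponent of 15
/// and a zero mantissa; rebiasing the exponent (15 - 15 + 127 = 127) gives
/// 0x3F800000, i.e. 1.0f.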
113 static float ConvertSmallFloatTo32(UINT val)
114 {
115 UINT result;
116 if ((val & 0x7fff) == 0)
117 {
118 result = ((uint32_t)(val & 0x8000)) << 16;
119 }
120 else if ((val & 0x7c00) == 0x7c00)
121 {
122 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
123 result |= ((uint32_t)val & 0x8000) << 16;
124 }
125 else
126 {
127 uint32_t sign = (val & 0x8000) << 16;
128 uint32_t mant = (val & 0x3ff) << 13;
129 uint32_t exp = (val >> 10) & 0x1f;
130 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
131 {
132 mant <<= 1;
133 while (mant < (0x400 << 13))
134 {
135 exp--;
136 mant <<= 1;
137 }
138 mant &= (0x3ff << 13);
139 }
140 exp = ((exp - 15 + 127) & 0xff) << 23;
141 result = sign | exp | mant;
142 }
143
144 return *(float*)&result;
145 }
146
147 Constant *Builder::C(bool i)
148 {
149 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
150 }
151
152 Constant *Builder::C(char i)
153 {
154 return ConstantInt::get(IRB()->getInt8Ty(), i);
155 }
156
157 Constant *Builder::C(uint8_t i)
158 {
159 return ConstantInt::get(IRB()->getInt8Ty(), i);
160 }
161
162 Constant *Builder::C(int i)
163 {
164 return ConstantInt::get(IRB()->getInt32Ty(), i);
165 }
166
167 Constant *Builder::C(int64_t i)
168 {
169 return ConstantInt::get(IRB()->getInt64Ty(), i);
170 }
171
172 Constant *Builder::C(uint16_t i)
173 {
174 return ConstantInt::get(mInt16Ty,i);
175 }
176
177 Constant *Builder::C(uint32_t i)
178 {
179 return ConstantInt::get(IRB()->getInt32Ty(), i);
180 }
181
182 Constant *Builder::C(float i)
183 {
184 return ConstantFP::get(IRB()->getFloatTy(), i);
185 }
186
187 Constant *Builder::PRED(bool pred)
188 {
189 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
190 }
191
192 Value *Builder::VIMMED1(int i)
193 {
194 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
195 }
196
197 Value *Builder::VIMMED1(uint32_t i)
198 {
199 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
200 }
201
202 Value *Builder::VIMMED1(float i)
203 {
204 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
205 }
206
207 Value *Builder::VIMMED1(bool i)
208 {
209 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
210 }
211
212 Value *Builder::VUNDEF_IPTR()
213 {
214 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
215 }
216
217 Value *Builder::VUNDEF_I()
218 {
219 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
220 }
221
222 Value *Builder::VUNDEF(Type *ty, uint32_t size)
223 {
224 return UndefValue::get(VectorType::get(ty, size));
225 }
226
227 Value *Builder::VUNDEF_F()
228 {
229 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
230 }
231
232 Value *Builder::VUNDEF(Type* t)
233 {
234 return UndefValue::get(VectorType::get(t, mVWidth));
235 }
236
237 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
238 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
239 {
240 return VINSERT(vec, val, C((int64_t)index));
241 }
242 #endif
243
244 Value *Builder::VBROADCAST(Value *src)
245 {
246 // check if src is already a vector
247 if (src->getType()->isVectorTy())
248 {
249 return src;
250 }
251
252 return VECTOR_SPLAT(mVWidth, src);
253 }
254
255 uint32_t Builder::IMMED(Value* v)
256 {
257 SWR_ASSERT(isa<ConstantInt>(v));
258 ConstantInt *pValConst = cast<ConstantInt>(v);
259 return pValConst->getZExtValue();
260 }
261
262 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
263 {
264 std::vector<Value*> indices;
265 for (auto i : indexList)
266 indices.push_back(i);
267 return GEPA(ptr, indices);
268 }
269
270 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
271 {
272 std::vector<Value*> indices;
273 for (auto i : indexList)
274 indices.push_back(C(i));
275 return GEPA(ptr, indices);
276 }
277
278 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
279 {
280 std::vector<Value*> valIndices;
281 for (auto i : indices)
282 valIndices.push_back(C(i));
283 return LOAD(GEPA(basePtr, valIndices), name);
284 }
285
286 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
287 {
288 std::vector<Value*> valIndices;
289 for (auto i : indices)
290 valIndices.push_back(i);
291 return LOAD(GEPA(basePtr, valIndices), name);
292 }
293
294 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
295 {
296 std::vector<Value*> valIndices;
297 for (auto i : indices)
298 valIndices.push_back(C(i));
299 return STORE(val, GEPA(basePtr, valIndices));
300 }
301
302 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
303 {
304 std::vector<Value*> valIndices;
305 for (auto i : indices)
306 valIndices.push_back(i);
307 return STORE(val, GEPA(basePtr, valIndices));
308 }
309
310 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
311 {
312 std::vector<Value*> args;
313 for (auto arg : argsList)
314 args.push_back(arg);
315 return CALLA(Callee, args);
316 }
317
318 Value *Builder::VRCP(Value *va)
319 {
320 return FDIV(VIMMED1(1.0f), va); // 1 / a
321 }
322
323 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
324 {
325 Value* vOut = FMADDPS(vA, vX, vC);
326 vOut = FMADDPS(vB, vY, vOut);
327 return vOut;
328 }
329
330 //////////////////////////////////////////////////////////////////////////
331 /// @brief Generate an i32 masked load operation in LLVM IR. If not
332 /// supported on the underlying platform, emulate it with float masked load
333 /// @param src - base address pointer for the load
334 /// @param vMask - SIMD wide mask that controls whether to access memory or load 0 for the lane
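/// Usage sketch (illustrative; pBuf and vLaneMask are hypothetical values):
///     Value* vData = MASKLOADD(pBuf, vLaneMask);
/// lanes whose mask sign bit is clear are not read and yield 0.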
335 Value *Builder::MASKLOADD(Value* src,Value* mask)
336 {
337 Value* vResult;
338     // use avx2 masked load instruction if available
339 if(JM()->mArch.AVX2())
340 {
341 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
342 vResult = CALL(func,{src,mask});
343 }
344 else
345 {
346 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
347 Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
348 vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,mVWidth));
349 }
350 return vResult;
351 }
352
353 //////////////////////////////////////////////////////////////////////////
354 /// @brief insert a JIT call to CallPrint
355 /// - outputs formatted string to both stdout and VS output window
356 /// - DEBUG builds only
357 /// Usage example:
358 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
359 /// where C(lane) creates a constant value to print, and pIndex is the Value*
360 /// result from a GEP, printing out the pointer to memory
361 /// @param printStr - constant string to print, which includes format specifiers
362 /// @param printArgs - initializer list of Value*'s to print to stdout
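/// A '%t' specifier is expanded per lane for vector Values, e.g. (vOffsets
/// being a hypothetical simd Value*):
///     PRINT("lane offsets: %t\n", {vOffsets});
/// the format string is rewritten to one %d (or %f) per element before the
/// call is emitted.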
363 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
364 {
365 // push the arguments to CallPrint into a vector
366 std::vector<Value*> printCallArgs;
367 // save room for the format string. we still need to modify it for vectors
368 printCallArgs.resize(1);
369
370 // search through the format string for special processing
371 size_t pos = 0;
372 std::string tempStr(printStr);
373 pos = tempStr.find('%', pos);
374 auto v = printArgs.begin();
375
376 while ((pos != std::string::npos) && (v != printArgs.end()))
377 {
378 Value* pArg = *v;
379 Type* pType = pArg->getType();
380
381 if (tempStr[pos + 1] == 't')
382 {
383 if (pType->isVectorTy())
384 {
385 Type* pContainedType = pType->getContainedType(0);
386
387 std::string vectorFormatStr;
388
389 if (pContainedType->isFloatTy())
390 {
391                     tempStr[pos + 1] = 'f'; // Ensure it's %f
392 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(0)), mDoubleTy));
393
394 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
395 {
396 vectorFormatStr += "%f ";
397 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), mDoubleTy));
398 }
399 }
400 else if (pContainedType->isIntegerTy())
401 {
402                     tempStr[pos + 1] = 'd'; // Ensure it's %d
403 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
404
405 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
406 {
407 vectorFormatStr += "%d ";
408 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
409 }
410 }
411 else
412 {
413                     SWR_ASSERT(0, "Unsupported type");
414 }
415
416 tempStr.insert(pos, vectorFormatStr);
417 pos += vectorFormatStr.size();
418 }
419 else
420 {
421 if (pType->isFloatTy())
422 {
423                     tempStr[pos + 1] = 'f'; // Ensure it's %f
424 printCallArgs.push_back(FP_EXT(pArg, mDoubleTy));
425 }
426 else if (pType->isIntegerTy())
427 {
428                     tempStr[pos + 1] = 'd'; // Ensure it's %d
429 printCallArgs.push_back(pArg);
430 }
431 }
432 }
433 else if (toupper(tempStr[pos + 1]) == 'X')
434 {
435 if (pType->isVectorTy())
436 {
437 tempStr[pos] = '0';
438 tempStr.insert(pos + 1, "x%08");
439
440 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
441
442 std::string vectorFormatStr;
443 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
444 {
445 vectorFormatStr += "0x%08X ";
446 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
447 }
448
449 tempStr.insert(pos, vectorFormatStr);
450 pos += vectorFormatStr.size();
451 }
452 else
453 {
454 tempStr[pos] = '0';
455 tempStr.insert(pos + 1, "x%08");
456 printCallArgs.push_back(pArg);
457 pos += 3;
458 }
459 }
460 // for %f we need to cast float Values to doubles so that they print out correctly
461 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
462 {
463 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
464 pos++;
465 }
466 // add special handling for %f and %d format specifiers to make printing llvm vector types easier
467 else if (pType->isVectorTy())
468 {
469 Type* pContainedType = pType->getContainedType(0);
470
471 if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
472 {
473 uint32_t i = 0;
474 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
475 {
476 tempStr.insert(pos, std::string("%f "));
477 pos += 3;
478 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
479 }
480 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
481 }
482 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
483 {
484 uint32_t i = 0;
485 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
486 {
487 tempStr.insert(pos, std::string("%d "));
488 pos += 3;
489 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
490 }
491 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
492 }
493 else
494 {
495 /// not a supported vector to print
496 /// @todo pointer types too
497 SWR_ASSERT(0);
498 }
499 }
500 else
501 {
502 printCallArgs.push_back(pArg);
503 }
504
505         // advance to the next argument
506 v++;
507 pos = tempStr.find('%', ++pos);
508 }
509
510 // create global variable constant string
511 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
512 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
513 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
514
515 // get a pointer to the first character in the constant string array
516 std::vector<Constant*> geplist{C(0),C(0)};
517 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
518 Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
519 #else
520 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
521 #endif
522
523 // insert the pointer to the format string in the argument vector
524 printCallArgs[0] = strGEP;
525
526 // get pointer to CallPrint function and insert decl into the module if needed
527 std::vector<Type*> args;
528 args.push_back(PointerType::get(mInt8Ty,0));
529 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
530 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
531
532 // if we haven't yet added the symbol to the symbol table
533 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
534 {
535 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
536 }
537
538 // insert a call to CallPrint
539 return CALLA(callPrintFn,printCallArgs);
540 }
541
542 //////////////////////////////////////////////////////////////////////////
543 /// @brief Wrapper around PRINT with initializer list.
544 CallInst* Builder::PRINT(const std::string &printStr)
545 {
546 return PRINT(printStr, {});
547 }
548
549 //////////////////////////////////////////////////////////////////////////
550 /// @brief Generate a masked gather operation in LLVM IR. If not
551 /// supported on the underlying platform, emulate it with loads
552 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
553 /// @param pBase - Int8* base VB address pointer value
554 /// @param vIndices - SIMD wide value of VB byte offsets
555 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
556 /// @param scale - value to scale indices by
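/// Usage sketch (illustrative; the names are hypothetical):
///     Value* vResult = GATHERPS(vDefaults, pVertexBase, vByteOffsets, vLaneMask, C((char)1));
/// each active lane loads a float from pVertexBase + vByteOffsets[lane] * scale;
/// masked-off lanes keep the corresponding element of vDefaults.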
557 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
558 {
559 Value* vGather;
560
561 // use avx2 gather instruction if available
562 if(JM()->mArch.AVX2())
563 {
564 // force mask to <N x float>, required by vgather
565 vMask = BITCAST(vMask, mSimdFP32Ty);
566 vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
567 }
568 else
569 {
570 Value* pStack = STACKSAVE();
571
572 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
573 Value* vSrcPtr = ALLOCA(vSrc->getType());
574 STORE(vSrc, vSrcPtr);
575
576 vGather = VUNDEF_F();
577 Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
578 Value *vOffsets = MUL(vIndices,vScaleVec);
579 Value *mask = MASK(vMask);
580 for(uint32_t i = 0; i < mVWidth; ++i)
581 {
582 // single component byte index
583 Value *offset = VEXTRACT(vOffsets,C(i));
584 // byte pointer to component
585 Value *loadAddress = GEP(pBase,offset);
586 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
587 // pointer to the value to load if we're masking off a component
588 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
589 Value *selMask = VEXTRACT(mask,C(i));
590             // switch in a safe address to load from if this lane is masked off
591 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
592 Value *val = LOAD(validAddress);
593 vGather = VINSERT(vGather,val,C(i));
594 }
595 STACKRESTORE(pStack);
596 }
597
598 return vGather;
599 }
600
601 //////////////////////////////////////////////////////////////////////////
602 /// @brief Generate a masked gather operation in LLVM IR. If not
603 /// supported on the underlying platform, emulate it with loads
604 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
605 /// @param pBase - Int8* base VB address pointer value
606 /// @param vIndices - SIMD wide value of VB byte offsets
607 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
608 /// @param scale - value to scale indices by
609 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
610 {
611 Value* vGather;
612
613 // use avx2 gather instruction if available
614 if(JM()->mArch.AVX2())
615 {
616 vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
617 }
618 else
619 {
620 Value* pStack = STACKSAVE();
621
622 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
623 Value* vSrcPtr = ALLOCA(vSrc->getType());
624 STORE(vSrc, vSrcPtr);
625
626 vGather = VUNDEF_I();
627 Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
628 Value *vOffsets = MUL(vIndices, vScaleVec);
629 Value *mask = MASK(vMask);
630 for(uint32_t i = 0; i < mVWidth; ++i)
631 {
632 // single component byte index
633 Value *offset = VEXTRACT(vOffsets, C(i));
634 // byte pointer to component
635 Value *loadAddress = GEP(pBase, offset);
636 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
637 // pointer to the value to load if we're masking off a component
638 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
639 Value *selMask = VEXTRACT(mask, C(i));
640             // switch in a safe address to load from if this lane is masked off
641 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
642 Value *val = LOAD(validAddress, C(0));
643 vGather = VINSERT(vGather, val, C(i));
644 }
645
646 STACKRESTORE(pStack);
647 }
648 return vGather;
649 }
650
651 //////////////////////////////////////////////////////////////////////////
652 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
653 Value* Builder::MASK(Value* vmask)
654 {
655 Value* src = BITCAST(vmask, mSimdInt32Ty);
656 return ICMP_SLT(src, VIMMED1(0));
657 }
658
659 //////////////////////////////////////////////////////////////////////////
660 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
661 Value* Builder::VMASK(Value* mask)
662 {
663 return S_EXT(mask, mSimdInt32Ty);
664 }
665
666 //////////////////////////////////////////////////////////////////////////
667 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
668 /// supported on the underlying platform, emulate it
669 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
670 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
671 /// Byte masks in the lower 128bit lane of b select 8 bit values from the lower
672 /// 128bits of a, and vice versa for the upper lanes. If the mask
673 /// value is negative, '0' is inserted.
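/// Example (illustrative): with a holding bytes 0..31, a mask byte of 0x02 in
/// the low 128bit lane selects a[2], while a mask byte of 0x80 (negative)
/// produces 0 in that output position.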
674 Value *Builder::PSHUFB(Value* a, Value* b)
675 {
676 Value* res;
677 // use avx2 pshufb instruction if available
678 if(JM()->mArch.AVX2())
679 {
680 res = VPSHUFB(a, b);
681 }
682 else
683 {
684 Constant* cB = dyn_cast<Constant>(b);
685 // number of 8 bit elements in b
686 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
687 // output vector
688 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
689
690 // insert an 8 bit value from the high and low lanes of a per loop iteration
691 numElms /= 2;
692 for(uint32_t i = 0; i < numElms; i++)
693 {
694 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
695 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
696
697 // extract values from constant mask
698 char valLow128bLane = (char)(cLow128b->getSExtValue());
699 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
700
701 Value* insertValLow128b;
702 Value* insertValHigh128b;
703
704 // if the mask value is negative, insert a '0' in the respective output position
705 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
706 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
707 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
708
709 vShuf = VINSERT(vShuf, insertValLow128b, i);
710 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
711 }
712 res = vShuf;
713 }
714 return res;
715 }
716
717 //////////////////////////////////////////////////////////////////////////
718 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
719 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
720 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
721 /// lower 8 values are used.
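/// Example (illustrative): an input byte of 0xFF (-1) in lane 3 becomes
/// 0xFFFFFFFF (-1) in lane 3 of the 8x32bit result.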
722 Value *Builder::PMOVSXBD(Value* a)
723 {
724 Value* res;
725 // use avx2 byte sign extend instruction if available
726 if(JM()->mArch.AVX2())
727 {
728 res = VPMOVSXBD(a);
729 }
730 else
731 {
732 // VPMOVSXBD output type
733 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
734 // Extract 8 values from 128bit lane and sign extend
735 res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
736 }
737 return res;
738 }
739
740 //////////////////////////////////////////////////////////////////////////
741 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
742 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
743 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
744 Value *Builder::PMOVSXWD(Value* a)
745 {
746 Value* res;
747 // use avx2 word sign extend if available
748 if(JM()->mArch.AVX2())
749 {
750 res = VPMOVSXWD(a);
751 }
752 else
753 {
754 // VPMOVSXWD output type
755 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
756 // Extract 8 values from 128bit lane and sign extend
757 res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
758 }
759 return res;
760 }
761
762 //////////////////////////////////////////////////////////////////////////
763 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
764 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
765 /// platform, emulate it
766 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
767 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
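/// Example (illustrative): idx = {7, 6, 5, 4, 3, 2, 1, 0} reverses the eight
/// 32bit elements of a, including moves across the 128bit lane boundary.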
768 Value *Builder::PERMD(Value* a, Value* idx)
769 {
770 Value* res;
771 // use avx2 permute instruction if available
772 if(JM()->mArch.AVX2())
773 {
774 // llvm 3.6.0 swapped the order of the args to vpermd
775 res = VPERMD(idx, a);
776 }
777 else
778 {
779 if (isa<Constant>(idx))
780 {
781 res = VSHUFFLE(a, a, idx);
782 }
783 else
784 {
785 res = VUNDEF_I();
786 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
787 {
788 Value* pIndex = VEXTRACT(idx, C(l));
789 Value* pVal = VEXTRACT(a, pIndex);
790 res = VINSERT(res, pVal, C(l));
791 }
792 }
793 }
794 return res;
795 }
796
797 //////////////////////////////////////////////////////////////////////////
798 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
799 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
800 /// platform, emulate it
801 /// @param a - 256bit SIMD lane(8x32bit) of float values.
802 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
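/// Usage sketch (illustrative; vFloats and vIndices are hypothetical values):
///     Value* vShuffled = PERMPS(vFloats, vIndices);
/// same lane-crossing shuffle as PERMD above, but the elements stay typed as
/// floats.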
803 Value *Builder::PERMPS(Value* a, Value* idx)
804 {
805 Value* res;
806 // use avx2 permute instruction if available
807 if (JM()->mArch.AVX2())
808 {
809 // llvm 3.6.0 swapped the order of the args to vpermd
810 res = VPERMPS(idx, a);
811 }
812 else
813 {
814 if (isa<Constant>(idx))
815 {
816 res = VSHUFFLE(a, a, idx);
817 }
818 else
819 {
820 res = VUNDEF_F();
821 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
822 {
823 Value* pIndex = VEXTRACT(idx, C(l));
824 Value* pVal = VEXTRACT(a, pIndex);
825 res = VINSERT(res, pVal, C(l));
826 }
827 }
828 }
829
830 return res;
831 }
832
833 //////////////////////////////////////////////////////////////////////////
834 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
835 /// in LLVM IR. If not supported on the underlying platform, emulate it
836 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
837 Value *Builder::CVTPH2PS(Value* a)
838 {
839 if (JM()->mArch.F16C())
840 {
841 return VCVTPH2PS(a);
842 }
843 else
844 {
845 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
846 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
847
848 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
849 {
850 sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
851 }
852
853 Value* pResult = UndefValue::get(mSimdFP32Ty);
854 for (uint32_t i = 0; i < mVWidth; ++i)
855 {
856 Value* pSrc = VEXTRACT(a, C(i));
857 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
858 pResult = VINSERT(pResult, pConv, C(i));
859 }
860
861 return pResult;
862 }
863 }
864
865 //////////////////////////////////////////////////////////////////////////
866 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
867 /// in LLVM IR. If not supported on the underlying platform, emulate it
868 /// @param a - 256bit SIMD lane(8x32bit) of float32 values.
869 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
870 {
871 if (JM()->mArch.F16C())
872 {
873 return VCVTPS2PH(a, rounding);
874 }
875 else
876 {
877 // call scalar C function for now
878 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
879 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
880
881 if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
882 {
883 sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
884 }
885
886 Value* pResult = UndefValue::get(mSimdInt16Ty);
887 for (uint32_t i = 0; i < mVWidth; ++i)
888 {
889 Value* pSrc = VEXTRACT(a, C(i));
890 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
891 pResult = VINSERT(pResult, pConv, C(i));
892 }
893
894 return pResult;
895 }
896 }
897
898 Value *Builder::PMAXSD(Value* a, Value* b)
899 {
900 if (JM()->mArch.AVX2())
901 {
902 return VPMAXSD(a, b);
903 }
904 else
905 {
906 // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
907 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
908
909 // low 128
910 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
911 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
912 Value* resLo = CALL(pmaxsd, {aLo, bLo});
913
914 // high 128
915 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
916 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
917 Value* resHi = CALL(pmaxsd, {aHi, bHi});
918
919 // combine
920 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
921 result = VINSERTI128(result, resHi, C((uint8_t)1));
922
923 return result;
924 }
925 }
926
927 Value *Builder::PMINSD(Value* a, Value* b)
928 {
929 if (JM()->mArch.AVX2())
930 {
931 return VPMINSD(a, b);
932 }
933 else
934 {
935         // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
936 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
937
938 // low 128
939 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
940 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
941 Value* resLo = CALL(pminsd, {aLo, bLo});
942
943 // high 128
944 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
945 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
946 Value* resHi = CALL(pminsd, {aHi, bHi});
947
948 // combine
949 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
950 result = VINSERTI128(result, resHi, C((uint8_t)1));
951
952 return result;
953 }
954 }
955
956 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
957 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
958 {
959 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
960 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
961 {
962 // ensure our mask is the correct type
963 mask = BITCAST(mask, mSimdFP32Ty);
964 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
965 }
966 else
967 {
968 // ensure our mask is the correct type
969 mask = BITCAST(mask, mSimdInt32Ty);
970 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
971 }
972 }
973
974 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
975 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
976 {
977 switch(info.bpp / info.numComps)
978 {
979 case 16:
980 {
981 Value* vGatherResult[2];
982 Value *vMask;
983
984 // TODO: vGatherMaskedVal
985 Value* vGatherMaskedVal = VIMMED1((float)0);
986
987 // always have at least one component out of x or y to fetch
988
989 // save mask as it is zero'd out after each gather
990 vMask = mask;
991
992 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
993 // e.g. result of first 8x32bit integer gather for 16bit components
994 // 256i - 0 1 2 3 4 5 6 7
995 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
996 //
997
998             // if we have at least one component out of z or w to fetch
999 if(info.numComps > 2)
1000 {
1001 // offset base to the next components(zw) in the vertex to gather
1002 pSrcBase = GEP(pSrcBase, C((char)4));
1003 vMask = mask;
1004
1005 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1006 // e.g. result of second 8x32bit integer gather for 16bit components
1007 // 256i - 0 1 2 3 4 5 6 7
1008 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1009 //
1010 }
1011 else
1012 {
1013 vGatherResult[1] = vGatherMaskedVal;
1014 }
1015
1016 // Shuffle gathered components into place, each row is a component
1017 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1018 }
1019 break;
1020 case 32:
1021 {
1022 // apply defaults
1023 for (uint32_t i = 0; i < 4; ++i)
1024 {
1025 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1026 }
1027
1028 for(uint32_t i = 0; i < info.numComps; i++)
1029 {
1030 uint32_t swizzleIndex = info.swizzle[i];
1031
1032 // save mask as it is zero'd out after each gather
1033 Value *vMask = mask;
1034
1035 // Gather a SIMD of components
1036 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1037
1038 // offset base to the next component to gather
1039 pSrcBase = GEP(pSrcBase, C((char)4));
1040 }
1041 }
1042 break;
1043 default:
1044 SWR_ASSERT(0, "Invalid float format");
1045 break;
1046 }
1047 }
1048
1049 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1050 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1051 {
1052 switch (info.bpp / info.numComps)
1053 {
1054 case 8:
1055 {
1056 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1057 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
1058 // e.g. result of an 8x32bit integer gather for 8bit components
1059 // 256i - 0 1 2 3 4 5 6 7
1060 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1061
1062 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1063 }
1064 break;
1065 case 16:
1066 {
1067 Value* vGatherResult[2];
1068 Value *vMask;
1069
1070 // TODO: vGatherMaskedVal
1071 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1072
1073 // always have at least one component out of x or y to fetch
1074
1075 // save mask as it is zero'd out after each gather
1076 vMask = mask;
1077
1078 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1079 // e.g. result of first 8x32bit integer gather for 16bit components
1080 // 256i - 0 1 2 3 4 5 6 7
1081 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1082 //
1083
1084             // if we have at least one component out of z or w to fetch
1085 if(info.numComps > 2)
1086 {
1087 // offset base to the next components(zw) in the vertex to gather
1088 pSrcBase = GEP(pSrcBase, C((char)4));
1089 vMask = mask;
1090
1091 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1092 // e.g. result of second 8x32bit integer gather for 16bit components
1093 // 256i - 0 1 2 3 4 5 6 7
1094 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1095 //
1096 }
1097 else
1098 {
1099 vGatherResult[1] = vGatherMaskedVal;
1100 }
1101
1102 // Shuffle gathered components into place, each row is a component
1103 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1104
1105 }
1106 break;
1107 case 32:
1108 {
1109 // apply defaults
1110 for (uint32_t i = 0; i < 4; ++i)
1111 {
1112 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1113 }
1114
1115 for(uint32_t i = 0; i < info.numComps; i++)
1116 {
1117 uint32_t swizzleIndex = info.swizzle[i];
1118
1119 // save mask as it is zero'd out after each gather
1120 Value *vMask = mask;
1121
1122 // Gather a SIMD of components
1123 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1124
1125 // offset base to the next component to gather
1126 pSrcBase = GEP(pSrcBase, C((char)4));
1127 }
1128 }
1129 break;
1130 default:
1131 SWR_ASSERT(0, "unsupported format");
1132 break;
1133 }
1134 }
1135
1136 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1137 {
1138 // cast types
1139 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1140 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1141
1142 // input could either be float or int vector; do shuffle work in int
1143 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1144 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1145
1146 if(bPackedOutput)
1147 {
1148 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1149
1150 // shuffle mask
1151 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1152 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1153 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1154 // after pshufb: group components together in each 128bit lane
1155 // 256i - 0 1 2 3 4 5 6 7
1156 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1157
1158 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1159 // after PERMD: move and pack xy components into each 128bit lane
1160 // 256i - 0 1 2 3 4 5 6 7
1161 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1162
1163 // do the same for zw components
1164 Value* vi128ZW = nullptr;
1165 if(info.numComps > 2)
1166 {
1167 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1168 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1169 }
1170
1171 for(uint32_t i = 0; i < 4; i++)
1172 {
1173 uint32_t swizzleIndex = info.swizzle[i];
1174             // todo: fix for packed
1175 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1176 if(i >= info.numComps)
1177 {
1178 // set the default component val
1179 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1180 continue;
1181 }
1182
1183 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1184 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1185 // if x or y, use vi128XY permute result, else use vi128ZW
1186 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1187
1188 // extract packed component 128 bit lanes
1189 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1190 }
1191
1192 }
1193 else
1194 {
1195 // pshufb masks for each component
1196 Value* vConstMask[2];
1197 // x/z shuffle mask
1198 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1199 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1200
1201 // y/w shuffle mask
1202 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1203 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1204
1205
1206 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1207 // apply defaults
1208 for (uint32_t i = 0; i < 4; ++i)
1209 {
1210 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1211 }
1212
1213 for(uint32_t i = 0; i < info.numComps; i++)
1214 {
1215 uint32_t swizzleIndex = info.swizzle[i];
1216
1217 // select correct constMask for x/z or y/w pshufb
1218 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1219             // if x or y, use the first gather result, else use the second
1220 uint32_t selectedGather = (i < 2) ? 0 : 1;
1221
1222 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1223 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1224 // 256i - 0 1 2 3 4 5 6 7
1225 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1226 }
1227 }
1228 }
1229
1230 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1231 {
1232 // cast types
1233 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1234 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1235
1236 if(bPackedOutput)
1237 {
1238 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1239 // shuffle mask
1240 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1241 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1242 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1243 // after pshufb: group components together in each 128bit lane
1244 // 256i - 0 1 2 3 4 5 6 7
1245 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1246
1247 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1248 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1249 // 256i - 0 1 2 3 4 5 6 7
1250 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1251
1252 // do the same for zw components
1253 Value* vi128ZW = nullptr;
1254 if(info.numComps > 2)
1255 {
1256 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1257 }
1258
1259         // extract the packed 128bit lanes for each enabled component; disabled components get their default value
1260 for(uint32_t i = 0; i < 4; i++)
1261 {
1262 uint32_t swizzleIndex = info.swizzle[i];
1263 // todo: fix for packed
1264 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1265 if(i >= info.numComps)
1266 {
1267 // set the default component val
1268 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1269 continue;
1270 }
1271
1272 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1273 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1274 // if x or y, use vi128XY permute result, else use vi128ZW
1275 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1276
1277 // sign extend
1278 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1279 }
1280 }
1281 // else zero extend
1282 else{
1283 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1284 // apply defaults
1285 for (uint32_t i = 0; i < 4; ++i)
1286 {
1287 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1288 }
1289
1290 for(uint32_t i = 0; i < info.numComps; i++){
1291 uint32_t swizzleIndex = info.swizzle[i];
1292
1293 // pshufb masks for each component
1294 Value* vConstMask;
1295 switch(i)
1296 {
1297 case 0:
1298 // x shuffle mask
1299 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1300 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1301 break;
1302 case 1:
1303 // y shuffle mask
1304 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1305 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1306 break;
1307 case 2:
1308 // z shuffle mask
1309 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1310 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1311 break;
1312 case 3:
1313 // w shuffle mask
1314 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1315 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1316 break;
1317 default:
1318 vConstMask = nullptr;
1319 break;
1320 }
1321
1322 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1323 // after pshufb for x channel
1324 // 256i - 0 1 2 3 4 5 6 7
1325 // x000 x000 x000 x000 x000 x000 x000 x000
1326 }
1327 }
1328 }
1329
1330 //////////////////////////////////////////////////////////////////////////
1331 /// @brief emulates a scatter operation.
1332 /// @param pDst - pointer to destination
1333 /// @param vSrc - vector of src data to scatter
1334 /// @param vOffsets - vector of byte offsets from pDst
1335 /// @param vMask - mask of valid lanes
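/// Usage sketch (illustrative; the names are hypothetical):
///     SCATTERPS(pDstBase, vValues, vByteOffsets, vLaneMask);
/// each active lane stores vValues[lane] to pDstBase + vByteOffsets[lane];
/// masked-off lanes write to a throwaway stack slot instead.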
1336 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1337 {
1338 Value* pStack = STACKSAVE();
1339
1340 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1341
1342 // allocate tmp stack for masked off lanes
1343 Value* vTmpPtr = ALLOCA(pSrcTy);
1344
1345 Value *mask = MASK(vMask);
1346 for (uint32_t i = 0; i < mVWidth; ++i)
1347 {
1348 Value *offset = VEXTRACT(vOffsets, C(i));
1349 // byte pointer to component
1350 Value *storeAddress = GEP(pDst, offset);
1351 storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0));
1352 Value *selMask = VEXTRACT(mask, C(i));
1353 Value *srcElem = VEXTRACT(vSrc, C(i));
1354         // switch in a safe address to store to if this lane is masked off
1355 Value *validAddress = SELECT(selMask, storeAddress, vTmpPtr);
1356 STORE(srcElem, validAddress);
1357 }
1358
1359 STACKRESTORE(pStack);
1360 }
1361
1362 Value* Builder::VABSPS(Value* a)
1363 {
1364 Value* asInt = BITCAST(a, mSimdInt32Ty);
1365 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1366 return result;
1367 }
1368
1369 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1370 {
1371 Value *lowCmp = ICMP_SLT(src, low);
1372 Value *ret = SELECT(lowCmp, low, src);
1373
1374 Value *highCmp = ICMP_SGT(ret, high);
1375 ret = SELECT(highCmp, high, ret);
1376
1377 return ret;
1378 }
1379
1380 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1381 {
1382 Value *lowCmp = FCMP_OLT(src, low);
1383 Value *ret = SELECT(lowCmp, low, src);
1384
1385 Value *highCmp = FCMP_OGT(ret, high);
1386 ret = SELECT(highCmp, high, ret);
1387
1388 return ret;
1389 }
1390
1391 Value *Builder::FCLAMP(Value* src, float low, float high)
1392 {
1393 Value* result = VMAXPS(src, VIMMED1(low));
1394 result = VMINPS(result, VIMMED1(high));
1395
1396 return result;
1397 }
1398
1399 //////////////////////////////////////////////////////////////////////////
1400 /// @brief save/restore stack, providing ability to push/pop the stack and
1401 /// reduce overall stack requirements for temporary stack use
1402 Value* Builder::STACKSAVE()
1403 {
1404 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1405 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1406 return CALL(pfnStackSave);
1407 #else
1408 return CALLA(pfnStackSave);
1409 #endif
1410 }
1411
1412 void Builder::STACKRESTORE(Value* pSaved)
1413 {
1414 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1415 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1416 }
1417
1418 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1419 {
1420 Value* vOut;
1421 // use FMADs if available
1422 if(JM()->mArch.AVX2())
1423 {
1424 vOut = VFMADDPS(a, b, c);
1425 }
1426 else
1427 {
1428 vOut = FADD(FMUL(a, b), c);
1429 }
1430 return vOut;
1431 }
1432
1433 Value* Builder::POPCNT(Value* a)
1434 {
1435 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1436 return CALL(pCtPop, std::initializer_list<Value*>{a});
1437 }
1438
1439 //////////////////////////////////////////////////////////////////////////
1440 /// @brief C functions called by LLVM IR
1441 //////////////////////////////////////////////////////////////////////////
1442
1443 //////////////////////////////////////////////////////////////////////////
1444 /// @brief called in JIT code, inserted by PRINT
1445 /// output to both stdout and visual studio debug console
1446 void __cdecl CallPrint(const char* fmt, ...)
1447 {
1448 va_list args;
1449 va_start(args, fmt);
1450 vprintf(fmt, args);
1451
1452 #if defined( _WIN32 )
1453 char strBuf[1024];
1454 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1455 OutputDebugString(strBuf);
1456 #endif
1457 }
1458
1459 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1460 {
1461 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1462 Function *func =
1463 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1464 Intrinsic::x86_avx_vextractf128_si_256);
1465 return CALL(func, {a, imm8});
1466 #else
1467 bool flag = !imm8->isZeroValue();
1468 SmallVector<Constant*,8> idx;
1469 for (unsigned i = 0; i < mVWidth / 2; i++) {
1470 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1471 }
1472 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1473 #endif
1474 }
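// Illustrative note: for mVWidth == 8 the shuffle above selects elements
// {0,1,2,3} when imm8 is zero and {4,5,6,7} otherwise, i.e. the low or the
// high 128 bits of the 256bit source.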
1475
1476 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1477 {
1478 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1479 Function *func =
1480 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1481 Intrinsic::x86_avx_vinsertf128_si_256);
1482 return CALL(func, {a, b, imm8});
1483 #else
1484 bool flag = !imm8->isZeroValue();
1485 SmallVector<Constant*,8> idx;
1486 for (unsigned i = 0; i < mVWidth; i++) {
1487 idx.push_back(C(i));
1488 }
1489 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1490
1491 SmallVector<Constant*,8> idx2;
1492 for (unsigned i = 0; i < mVWidth / 2; i++) {
1493 idx2.push_back(C(flag ? i : i + mVWidth));
1494 }
1495 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1496 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1497 }
1498 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1499 #endif
1500 }
1501
1502 // rdtsc buckets macros
1503 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1504 {
1505 std::vector<Type*> args{
1506 PointerType::get(mInt32Ty, 0), // pBucketMgr
1507 mInt32Ty // id
1508 };
1509
1510 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1511 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1512 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1513 {
1514 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1515 }
1516
1517 CALL(pFunc, { pBucketMgr, pId });
1518 }
1519
1520 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1521 {
1522 std::vector<Type*> args{
1523 PointerType::get(mInt32Ty, 0), // pBucketMgr
1524 mInt32Ty // id
1525 };
1526
1527 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1528 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1529 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1530 {
1531 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1532 }
1533
1534 CALL(pFunc, { pBucketMgr, pId });
1535 }
1536