swr: [rasterizer jitter] Fix type mismatch on select args for SCATTERPS
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "llvm/Support/DynamicLibrary.h"
32
33 void __cdecl CallPrint(const char* fmt, ...);
34
35 //////////////////////////////////////////////////////////////////////////
36 /// @brief Convert an IEEE 754 32-bit single precision float to an
37 /// 16 bit float with 5 exponent bits and a variable
38 /// number of mantissa bits.
39 /// @param val - 32-bit float
40 /// @todo Maybe move this outside of this file into a header?
41 static uint16_t Convert32To16Float(float val)
42 {
43 uint32_t sign, exp, mant;
44 uint32_t roundBits;
45
46 // Extract the sign, exponent, and mantissa
47 uint32_t uf = *(uint32_t*)&val;
48 sign = (uf & 0x80000000) >> 31;
49 exp = (uf & 0x7F800000) >> 23;
50 mant = uf & 0x007FFFFF;
51
52 // Check for out of range
53 if (std::isnan(val))
54 {
55 exp = 0x1F;
56 mant = 0x200;
57 sign = 1; // set the sign bit for NANs
58 }
59 else if (std::isinf(val))
60 {
61 exp = 0x1f;
62 mant = 0x0;
63 }
64 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
65 {
66 exp = 0x1E;
67 mant = 0x3FF;
68 }
69 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
70 {
71 mant |= 0x00800000;
72 for (; exp <= 0x70; mant >>= 1, exp++)
73 ;
74 exp = 0;
75 mant = mant >> 13;
76 }
77 else if (exp < 0x66) // Too small to represent -> Zero
78 {
79 exp = 0;
80 mant = 0;
81 }
82 else
83 {
84 // Saves bits that will be shifted off for rounding
85 roundBits = mant & 0x1FFFu;
86 // convert exponent and mantissa to 16 bit format
87 exp = exp - 0x70;
88 mant = mant >> 13;
89
90 // Essentially RTZ, but round up if off by only 1 lsb
91 if (roundBits == 0x1FFFu)
92 {
93 mant++;
94 // check for overflow
95 if ((mant & 0xC00u) != 0)
96 exp++;
97 // make sure only the needed bits are used
98 mant &= 0x3FF;
99 }
100 }
101
102 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
103 return (uint16_t)tmpVal;
104 }
105
106 //////////////////////////////////////////////////////////////////////////
107 /// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision
108 /// float
109 /// @param val - 16-bit float
110 /// @todo Maybe move this outside of this file into a header?
111 static float ConvertSmallFloatTo32(UINT val)
112 {
113 UINT result;
114 if ((val & 0x7fff) == 0)
115 {
116 result = ((uint32_t)(val & 0x8000)) << 16;
117 }
118 else if ((val & 0x7c00) == 0x7c00)
119 {
120 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
121 result |= ((uint32_t)val & 0x8000) << 16;
122 }
123 else
124 {
125 uint32_t sign = (val & 0x8000) << 16;
126 uint32_t mant = (val & 0x3ff) << 13;
127 uint32_t exp = (val >> 10) & 0x1f;
128 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
129 {
130 mant <<= 1;
131 while (mant < (0x400 << 13))
132 {
133 exp--;
134 mant <<= 1;
135 }
136 mant &= (0x3ff << 13);
137 }
138 exp = ((exp - 15 + 127) & 0xff) << 23;
139 result = sign | exp | mant;
140 }
141
142 return *(float*)&result;
143 }
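// Illustrative spot checks (not part of the original file) for the two scalar
// conversion helpers above; the expected bit patterns are the standard IEEE
// half-precision encodings of 1.0 and -2.0.
//
//   assert(Convert32To16Float(1.0f)  == 0x3C00);
//   assert(Convert32To16Float(-2.0f) == 0xC000);
//   assert(ConvertSmallFloatTo32(0x3C00) == 1.0f);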
144
145 Constant *Builder::C(bool i)
146 {
147 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
148 }
149
150 Constant *Builder::C(char i)
151 {
152 return ConstantInt::get(IRB()->getInt8Ty(), i);
153 }
154
155 Constant *Builder::C(uint8_t i)
156 {
157 return ConstantInt::get(IRB()->getInt8Ty(), i);
158 }
159
160 Constant *Builder::C(int i)
161 {
162 return ConstantInt::get(IRB()->getInt32Ty(), i);
163 }
164
165 Constant *Builder::C(int64_t i)
166 {
167 return ConstantInt::get(IRB()->getInt64Ty(), i);
168 }
169
170 Constant *Builder::C(uint16_t i)
171 {
172 return ConstantInt::get(mInt16Ty,i);
173 }
174
175 Constant *Builder::C(uint32_t i)
176 {
177 return ConstantInt::get(IRB()->getInt32Ty(), i);
178 }
179
180 Constant *Builder::C(float i)
181 {
182 return ConstantFP::get(IRB()->getFloatTy(), i);
183 }
184
185 Constant *Builder::PRED(bool pred)
186 {
187 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
188 }
189
190 Value *Builder::VIMMED1(int i)
191 {
192 return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
193 }
194
195 Value *Builder::VIMMED1(uint32_t i)
196 {
197 return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
198 }
199
200 Value *Builder::VIMMED1(float i)
201 {
202 return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantFP>(C(i)));
203 }
204
205 Value *Builder::VIMMED1(bool i)
206 {
207 return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
208 }
209
210 Value *Builder::VUNDEF_IPTR()
211 {
212 return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth));
213 }
214
215 Value *Builder::VUNDEF_I()
216 {
217 return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth));
218 }
219
220 Value *Builder::VUNDEF(Type *ty, uint32_t size)
221 {
222 return UndefValue::get(VectorType::get(ty, size));
223 }
224
225 Value *Builder::VUNDEF_F()
226 {
227 return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth));
228 }
229
230 Value *Builder::VUNDEF(Type* t)
231 {
232 return UndefValue::get(VectorType::get(t, JM()->mVWidth));
233 }
234
235 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
236 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
237 {
238 return VINSERT(vec, val, C((int64_t)index));
239 }
240 #endif
241
242 Value *Builder::VBROADCAST(Value *src)
243 {
244 // check if src is already a vector
245 if (src->getType()->isVectorTy())
246 {
247 return src;
248 }
249
250 return VECTOR_SPLAT(JM()->mVWidth, src);
251 }
252
253 uint32_t Builder::IMMED(Value* v)
254 {
255 SWR_ASSERT(isa<ConstantInt>(v));
256 ConstantInt *pValConst = cast<ConstantInt>(v);
257 return pValConst->getZExtValue();
258 }
259
260 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
261 {
262 std::vector<Value*> indices;
263 for (auto i : indexList)
264 indices.push_back(i);
265 return GEPA(ptr, indices);
266 }
267
268 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
269 {
270 std::vector<Value*> indices;
271 for (auto i : indexList)
272 indices.push_back(C(i));
273 return GEPA(ptr, indices);
274 }
275
276 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
277 {
278 std::vector<Value*> valIndices;
279 for (auto i : indices)
280 valIndices.push_back(C(i));
281 return LOAD(GEPA(basePtr, valIndices), name);
282 }
283
284 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
285 {
286 std::vector<Value*> valIndices;
287 for (auto i : indices)
288 valIndices.push_back(i);
289 return LOAD(GEPA(basePtr, valIndices), name);
290 }
291
292 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
293 {
294 std::vector<Value*> valIndices;
295 for (auto i : indices)
296 valIndices.push_back(C(i));
297 return STORE(val, GEPA(basePtr, valIndices));
298 }
299
300 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
301 {
302 std::vector<Value*> valIndices;
303 for (auto i : indices)
304 valIndices.push_back(i);
305 return STORE(val, GEPA(basePtr, valIndices));
306 }
307
308 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
309 {
310 std::vector<Value*> args;
311 for (auto arg : argsList)
312 args.push_back(arg);
313 return CALLA(Callee, args);
314 }
315
316 Value *Builder::VRCP(Value *va)
317 {
318 return FDIV(VIMMED1(1.0f), va); // 1 / a
319 }
320
321 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
322 {
323 Value* vOut = FMADDPS(vA, vX, vC);
324 vOut = FMADDPS(vB, vY, vOut);
325 return vOut;
326 }
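// Example usage (illustrative): evaluating a plane equation a*x + b*y + c
// across a SIMD of positions, e.g. when interpolating an attribute. vA, vB,
// vC, vX and vY here are placeholder SIMD float values.
//
//   Value* vInterp = VPLANEPS(vA, vB, vC, vX, vY);   // vA*vX + vB*vY + vC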
327
328 //////////////////////////////////////////////////////////////////////////
329 /// @brief Generate an i32 masked load operation in LLVM IR. If not
330 /// supported on the underlying platform, emulate it with float masked load
331 /// @param src - base address pointer for the load
332 /// @param vMask - SIMD wide mask that controls whether to access memory or load 0
333 Value *Builder::MASKLOADD(Value* src,Value* mask)
334 {
335 Value* vResult;
336 // use avx2 masked load instruction if available
337 if(JM()->mArch.AVX2())
338 {
339 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
340 vResult = CALL(func,{src,mask});
341 }
342 else
343 {
344 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
345 Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth));
346 vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,JM()->mVWidth));
347 }
348 return vResult;
349 }
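// Illustrative call (assumption; pAddr and vMask are placeholder values):
// load 8 dwords from pAddr, returning 0 for lanes whose mask element does not
// have its sign bit set.
//
//   Value* vData = MASKLOADD(pAddr, vMask);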
350
351 //////////////////////////////////////////////////////////////////////////
352 /// @brief insert a JIT call to CallPrint
353 /// - outputs formatted string to both stdout and VS output window
354 /// - DEBUG builds only
355 /// Usage example:
356 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
357 /// where C(lane) creates a constant value to print, and pIndex is the Value*
358 /// result from a GEP, printing out the pointer to memory
359 /// @param printStr - constant string to print, which includes format specifiers
360 /// @param printArgs - initializer list of Value*'s to print to std out
361 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
362 {
363 // push the arguments to CallPrint into a vector
364 std::vector<Value*> printCallArgs;
365 // save room for the format string. we still need to modify it for vectors
366 printCallArgs.resize(1);
367
368 // search through the format string for special processing
369 size_t pos = 0;
370 std::string tempStr(printStr);
371 pos = tempStr.find('%', pos);
372 auto v = printArgs.begin();
373
374 while ((pos != std::string::npos) && (v != printArgs.end()))
375 {
376 Value* pArg = *v;
377 Type* pType = pArg->getType();
378
379 if (tempStr[pos + 1] == 't')
380 {
381 if (pType->isVectorTy())
382 {
383 Type* pContainedType = pType->getContainedType(0);
384
385 std::string vectorFormatStr;
386
387 if (pContainedType->isFloatTy())
388 {
389 tempStr[pos + 1] = 'f'; // Ensure it's %f
390 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(0)), mDoubleTy));
391
392 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
393 {
394 vectorFormatStr += "%f ";
395 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), mDoubleTy));
396 }
397 }
398 else if (pContainedType->isIntegerTy())
399 {
400 tempStr[pos + 1] = 'd'; // Ensure it's %d
401 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
402
403 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
404 {
405 vectorFormatStr += "%d ";
406 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
407 }
408 }
409 else
410 {
411 SWR_ASSERT(0, "Unsupported tyep");
412 }
413
414 tempStr.insert(pos, vectorFormatStr);
415 pos += vectorFormatStr.size();
416 }
417 else
418 {
419 if (pType->isFloatTy())
420 {
421 tempStr[pos + 1] = 'f'; // Ensure it's %f
422 printCallArgs.push_back(FP_EXT(pArg, mDoubleTy));
423 }
424 else if (pType->isIntegerTy())
425 {
426 tempStr[pos + 1] = 'd'; // Ensure it's %d
427 printCallArgs.push_back(pArg);
428 }
429 }
430 }
431 else if (toupper(tempStr[pos + 1]) == 'X')
432 {
433 if (pType->isVectorTy())
434 {
435 tempStr[pos] = '0';
436 tempStr.insert(pos + 1, "x%08");
437
438 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
439
440 std::string vectorFormatStr;
441 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
442 {
443 vectorFormatStr += "0x%08X ";
444 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
445 }
446
447 tempStr.insert(pos, vectorFormatStr);
448 pos += vectorFormatStr.size();
449 }
450 else
451 {
452 tempStr[pos] = '0';
453 tempStr.insert(pos + 1, "x%08");
454 printCallArgs.push_back(pArg);
455 pos += 3;
456 }
457 }
458 // for %f we need to cast float Values to doubles so that they print out correctly
459 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
460 {
461 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
462 pos++;
463 }
464 // add special handling for %f and %d format specifiers to make printing llvm vector types easier
465 else if (pType->isVectorTy())
466 {
467 Type* pContainedType = pType->getContainedType(0);
468
469 if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
470 {
471 uint32_t i = 0;
472 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
473 {
474 tempStr.insert(pos, std::string("%f "));
475 pos += 3;
476 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
477 }
478 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
479 }
480 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
481 {
482 uint32_t i = 0;
483 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
484 {
485 tempStr.insert(pos, std::string("%d "));
486 pos += 3;
487 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
488 }
489 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
490 }
491 else
492 {
493 /// not a supported vector to print
494 /// @todo pointer types too
495 SWR_ASSERT(0);
496 }
497 }
498 else
499 {
500 printCallArgs.push_back(pArg);
501 }
502
503 // advance to the next argument
504 v++;
505 pos = tempStr.find('%', ++pos);
506 }
507
508 // create global variable constant string
509 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
510 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
511 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
512
513 // get a pointer to the first character in the constant string array
514 std::vector<Constant*> geplist{C(0),C(0)};
515 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
516 Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
517 #else
518 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
519 #endif
520
521 // insert the pointer to the format string in the argument vector
522 printCallArgs[0] = strGEP;
523
524 // get pointer to CallPrint function and insert decl into the module if needed
525 std::vector<Type*> args;
526 args.push_back(PointerType::get(mInt8Ty,0));
527 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
528 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
529
530 // if we haven't yet added the symbol to the symbol table
531 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
532 {
533 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
534 }
535
536 // insert a call to CallPrint
537 return CALLA(callPrintFn,printCallArgs);
538 }
539
540 //////////////////////////////////////////////////////////////////////////
541 /// @brief Wrapper around PRINT with initializer list.
542 CallInst* Builder::PRINT(const std::string &printStr)
543 {
544 return PRINT(printStr, {});
545 }
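// Additional usage sketch (assumption; vOffsets is a placeholder Value*): the
// %t specifier handled above expands a whole SIMD register into one %d/%f per
// lane.
//
//   PRINT("vOffsets = %t\n", {vOffsets});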
546
547 //////////////////////////////////////////////////////////////////////////
548 /// @brief Generate a masked gather operation in LLVM IR. If not
549 /// supported on the underlying platform, emulate it with loads
550 /// @param vSrc - SIMD wide value that will be used for lanes where the mask is not set
551 /// @param pBase - Int8* base VB address pointer value
552 /// @param vIndices - SIMD wide value of VB byte offsets
553 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
554 /// @param scale - value to scale indices by
555 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
556 {
557 Value* vGather;
558
559 // use avx2 gather instruction if available
560 if(JM()->mArch.AVX2())
561 {
562 // force mask to <N x float>, required by vgather
563 vMask = BITCAST(vMask, mSimdFP32Ty);
564 vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
565 }
566 else
567 {
568 Value* pStack = STACKSAVE();
569
570 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
571 Value* vSrcPtr = ALLOCA(vSrc->getType());
572 STORE(vSrc, vSrcPtr);
573
574 vGather = VUNDEF_F();
575 Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
576 Value *vOffsets = MUL(vIndices,vScaleVec);
577 Value *mask = MASK(vMask);
578 for(uint32_t i = 0; i < JM()->mVWidth; ++i)
579 {
580 // single component byte index
581 Value *offset = VEXTRACT(vOffsets,C(i));
582 // byte pointer to component
583 Value *loadAddress = GEP(pBase,offset);
584 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
585 // pointer to the value to load if we're masking off a component
586 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
587 Value *selMask = VEXTRACT(mask,C(i));
588 // switch in a safe address to load from if this lane is masked off
589 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
590 Value *val = LOAD(validAddress);
591 vGather = VINSERT(vGather,val,C(i));
592 }
593 STACKRESTORE(pStack);
594 }
595
596 return vGather;
597 }
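// Illustrative call (assumption, mirroring GATHER4PS below; vOld, pSrcBase,
// byteOffsets and vMask are placeholders): gather one float per lane from
// pSrcBase + byteOffsets, keeping the lanes of vOld where the mask is not
// set. The last argument is the index scale, 1 here since byteOffsets are
// already byte offsets.
//
//   Value* vResult = GATHERPS(vOld, pSrcBase, byteOffsets, vMask, C((char)1));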
598
599 //////////////////////////////////////////////////////////////////////////
600 /// @brief Generate a masked gather operation in LLVM IR. If not
601 /// supported on the underlying platform, emulate it with loads
602 /// @param vSrc - SIMD wide value that will be used for lanes where the mask is not set
603 /// @param pBase - Int8* base VB address pointer value
604 /// @param vIndices - SIMD wide value of VB byte offsets
605 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
606 /// @param scale - value to scale indices by
607 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
608 {
609 Value* vGather;
610
611 // use avx2 gather instruction if available
612 if(JM()->mArch.AVX2())
613 {
614 vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
615 }
616 else
617 {
618 Value* pStack = STACKSAVE();
619
620 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
621 Value* vSrcPtr = ALLOCA(vSrc->getType());
622 STORE(vSrc, vSrcPtr);
623
624 vGather = VUNDEF_I();
625 Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
626 Value *vOffsets = MUL(vIndices, vScaleVec);
627 Value *mask = MASK(vMask);
628 for(uint32_t i = 0; i < JM()->mVWidth; ++i)
629 {
630 // single component byte index
631 Value *offset = VEXTRACT(vOffsets, C(i));
632 // byte pointer to component
633 Value *loadAddress = GEP(pBase, offset);
634 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
635 // pointer to the value to load if we're masking off a component
636 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
637 Value *selMask = VEXTRACT(mask, C(i));
638 // switch in a safe address to load from if this lane is masked off
639 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
640 Value *val = LOAD(validAddress, C(0));
641 vGather = VINSERT(vGather, val, C(i));
642 }
643
644 STACKRESTORE(pStack);
645 }
646 return vGather;
647 }
648
649 //////////////////////////////////////////////////////////////////////////
650 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
651 Value* Builder::MASK(Value* vmask)
652 {
653 Value* src = BITCAST(vmask, mSimdInt32Ty);
654 return ICMP_SLT(src, VIMMED1(0));
655 }
656
657 //////////////////////////////////////////////////////////////////////////
658 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
659 Value* Builder::VMASK(Value* mask)
660 {
661 return S_EXT(mask, mSimdInt32Ty);
662 }
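// Note (illustrative): MASK() and VMASK() are inverses in the sense that the
// x86 representation keeps the sign bit of each lane set for active lanes.
//
//   Value* vI1  = MASK(vXmmMask);   // <N x i32>/<N x float> -> <N x i1>
//   Value* vXmm = VMASK(vI1);       // <N x i1> -> <N x i32> (0 or -1)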
663
664 //////////////////////////////////////////////////////////////////////////
665 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
666 /// supported on the underlying platform, emulate it
667 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
668 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
669 /// Byte masks in the lower 128-bit lane of b select 8 bit values from the
670 /// lower 128 bits of a, and likewise for the upper lane. If a mask
671 /// value is negative, '0' is inserted.
672 Value *Builder::PSHUFB(Value* a, Value* b)
673 {
674 Value* res;
675 // use avx2 pshufb instruction if available
676 if(JM()->mArch.AVX2())
677 {
678 res = VPSHUFB(a, b);
679 }
680 else
681 {
682 Constant* cB = dyn_cast<Constant>(b);
683 // number of 8 bit elements in b
684 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
685 // output vector
686 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
687
688 // insert an 8 bit value from the high and low lanes of a per loop iteration
689 numElms /= 2;
690 for(uint32_t i = 0; i < numElms; i++)
691 {
692 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
693 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
694
695 // extract values from constant mask
696 char valLow128bLane = (char)(cLow128b->getSExtValue());
697 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
698
699 Value* insertValLow128b;
700 Value* insertValHigh128b;
701
702 // if the mask value is negative, insert a '0' in the respective output position
703 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
704 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
705 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
706
707 vShuf = VINSERT(vShuf, insertValLow128b, i);
708 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
709 }
710 res = vShuf;
711 }
712 return res;
713 }
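// Illustrative behaviour (restating the emulation above): with a constant mask
// b, byte i of each 128-bit output lane is a[b[i] & 0xF] taken from the same
// lane of a, or 0 when b[i] is negative; e.g. an all-zero mask splats byte 0
// of each lane, and an all -1 mask produces an all-zero result.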
714
715 //////////////////////////////////////////////////////////////////////////
716 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
717 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
718 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
719 /// lower 8 values are used.
720 Value *Builder::PMOVSXBD(Value* a)
721 {
722 Value* res;
723 // use avx2 byte sign extend instruction if available
724 if(JM()->mArch.AVX2())
725 {
726 res = VPMOVSXBD(a);
727 }
728 else
729 {
730 // VPMOVSXBD output type
731 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
732 // Extract 8 values from 128bit lane and sign extend
733 res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
734 }
735 return res;
736 }
737
738 //////////////////////////////////////////////////////////////////////////
739 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
740 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
741 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
742 Value *Builder::PMOVSXWD(Value* a)
743 {
744 Value* res;
745 // use avx2 word sign extend if available
746 if(JM()->mArch.AVX2())
747 {
748 res = VPMOVSXWD(a);
749 }
750 else
751 {
752 // VPMOVSXWD output type
753 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
754 // Extract 8 values from 128bit lane and sign extend
755 res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
756 }
757 return res;
758 }
759
760 //////////////////////////////////////////////////////////////////////////
761 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
762 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
763 /// platform, emulate it
764 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
765 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
766 Value *Builder::PERMD(Value* a, Value* idx)
767 {
768 Value* res;
769 // use avx2 permute instruction if available
770 if(JM()->mArch.AVX2())
771 {
772 // llvm 3.6.0 swapped the order of the args to vpermd
773 res = VPERMD(idx, a);
774 }
775 else
776 {
777 res = VSHUFFLE(a, a, idx);
778 }
779 return res;
780 }
781
782 //////////////////////////////////////////////////////////////////////////
783 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
784 /// in LLVM IR. If not supported on the underlying platform, emulate it
785 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
786 Value *Builder::CVTPH2PS(Value* a)
787 {
788 if (JM()->mArch.F16C())
789 {
790 return VCVTPH2PS(a);
791 }
792 else
793 {
794 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
795 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
796
797 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
798 {
799 sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
800 }
801
802 Value* pResult = UndefValue::get(mSimdFP32Ty);
803 for (uint32_t i = 0; i < JM()->mVWidth; ++i)
804 {
805 Value* pSrc = VEXTRACT(a, C(i));
806 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
807 pResult = VINSERT(pResult, pConv, C(i));
808 }
809
810 return pResult;
811 }
812 }
813
814 //////////////////////////////////////////////////////////////////////////
815 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
816 /// in LLVM IR. If not supported on the underlying platform, emulate it
817 /// @param a - 256bit SIMD lane(8x32bit) of float32 values to convert.
818 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
819 {
820 if (JM()->mArch.F16C())
821 {
822 return VCVTPS2PH(a, rounding);
823 }
824 else
825 {
826 // call scalar C function for now
827 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
828 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
829
830 if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
831 {
832 sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
833 }
834
835 Value* pResult = UndefValue::get(mSimdInt16Ty);
836 for (uint32_t i = 0; i < JM()->mVWidth; ++i)
837 {
838 Value* pSrc = VEXTRACT(a, C(i));
839 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
840 pResult = VINSERT(pResult, pConv, C(i));
841 }
842
843 return pResult;
844 }
845 }
846
847 Value *Builder::PMAXSD(Value* a, Value* b)
848 {
849 if (JM()->mArch.AVX2())
850 {
851 return VPMAXSD(a, b);
852 }
853 else
854 {
855 // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
856 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
857
858 // low 128
859 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
860 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
861 Value* resLo = CALL(pmaxsd, {aLo, bLo});
862
863 // high 128
864 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
865 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
866 Value* resHi = CALL(pmaxsd, {aHi, bHi});
867
868 // combine
869 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
870 result = VINSERTI128(result, resHi, C((uint8_t)1));
871
872 return result;
873 }
874 }
875
876 Value *Builder::PMINSD(Value* a, Value* b)
877 {
878 if (JM()->mArch.AVX2())
879 {
880 return VPMINSD(a, b);
881 }
882 else
883 {
884 // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
885 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
886
887 // low 128
888 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
889 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
890 Value* resLo = CALL(pminsd, {aLo, bLo});
891
892 // high 128
893 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
894 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
895 Value* resHi = CALL(pminsd, {aHi, bHi});
896
897 // combine
898 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
899 result = VINSERTI128(result, resHi, C((uint8_t)1));
900
901 return result;
902 }
903 }
904
905 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
906 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
907 {
908 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
909 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
910 {
911 // ensure our mask is the correct type
912 mask = BITCAST(mask, mSimdFP32Ty);
913 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
914 }
915 else
916 {
917 // ensure our mask is the correct type
918 mask = BITCAST(mask, mSimdInt32Ty);
919 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
920 }
921 }
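// Dispatch sketch (illustrative): 32-bit float formats such as
// R32G32B32A32_FLOAT take the GATHER4PS path above, while integer and
// normalized formats such as R8G8B8A8_UNORM take the GATHER4DD path and are
// shuffled into place afterwards.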
922
923 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
924 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
925 {
926 switch(info.bpp / info.numComps)
927 {
928 case 16:
929 {
930 Value* vGatherResult[2];
931 Value *vMask;
932
933 // TODO: vGatherMaskedVal
934 Value* vGatherMaskedVal = VIMMED1((float)0);
935
936 // always have at least one component out of x or y to fetch
937
938 // save mask as it is zero'd out after each gather
939 vMask = mask;
940
941 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
942 // e.g. result of first 8x32bit integer gather for 16bit components
943 // 256i - 0 1 2 3 4 5 6 7
944 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
945 //
946
947 // if we have at least one component out of z or w to fetch
948 if(info.numComps > 2)
949 {
950 // offset base to the next components(zw) in the vertex to gather
951 pSrcBase = GEP(pSrcBase, C((char)4));
952 vMask = mask;
953
954 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
955 // e.g. result of second 8x32bit integer gather for 16bit components
956 // 256i - 0 1 2 3 4 5 6 7
957 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
958 //
959 }
960 else
961 {
962 vGatherResult[1] = vGatherMaskedVal;
963 }
964
965 // Shuffle gathered components into place, each row is a component
966 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
967 }
968 break;
969 case 32:
970 {
971 // apply defaults
972 for (uint32_t i = 0; i < 4; ++i)
973 {
974 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
975 }
976
977 for(uint32_t i = 0; i < info.numComps; i++)
978 {
979 uint32_t swizzleIndex = info.swizzle[i];
980
981 // save mask as it is zero'd out after each gather
982 Value *vMask = mask;
983
984 // Gather a SIMD of components
985 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
986
987 // offset base to the next component to gather
988 pSrcBase = GEP(pSrcBase, C((char)4));
989 }
990 }
991 break;
992 default:
993 SWR_ASSERT(0, "Invalid float format");
994 break;
995 }
996 }
997
998 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
999 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1000 {
1001 switch (info.bpp / info.numComps)
1002 {
1003 case 8:
1004 {
1005 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1006 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
1007 // e.g. result of an 8x32bit integer gather for 8bit components
1008 // 256i - 0 1 2 3 4 5 6 7
1009 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1010
1011 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1012 }
1013 break;
1014 case 16:
1015 {
1016 Value* vGatherResult[2];
1017 Value *vMask;
1018
1019 // TODO: vGatherMaskedVal
1020 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1021
1022 // always have at least one component out of x or y to fetch
1023
1024 // save mask as it is zero'd out after each gather
1025 vMask = mask;
1026
1027 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1028 // e.g. result of first 8x32bit integer gather for 16bit components
1029 // 256i - 0 1 2 3 4 5 6 7
1030 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1031 //
1032
1033 // if we have at least one component out of z or w to fetch
1034 if(info.numComps > 2)
1035 {
1036 // offset base to the next components(zw) in the vertex to gather
1037 pSrcBase = GEP(pSrcBase, C((char)4));
1038 vMask = mask;
1039
1040 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1041 // e.g. result of second 8x32bit integer gather for 16bit components
1042 // 256i - 0 1 2 3 4 5 6 7
1043 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1044 //
1045 }
1046 else
1047 {
1048 vGatherResult[1] = vGatherMaskedVal;
1049 }
1050
1051 // Shuffle gathered components into place, each row is a component
1052 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1053
1054 }
1055 break;
1056 case 32:
1057 {
1058 // apply defaults
1059 for (uint32_t i = 0; i < 4; ++i)
1060 {
1061 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1062 }
1063
1064 for(uint32_t i = 0; i < info.numComps; i++)
1065 {
1066 uint32_t swizzleIndex = info.swizzle[i];
1067
1068 // save mask as it is zero'd out after each gather
1069 Value *vMask = mask;
1070
1071 // Gather a SIMD of components
1072 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1073
1074 // offset base to the next component to gather
1075 pSrcBase = GEP(pSrcBase, C((char)4));
1076 }
1077 }
1078 break;
1079 default:
1080 SWR_ASSERT(0, "unsupported format");
1081 break;
1082 }
1083 }
1084
1085 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1086 {
1087 // cast types
1088 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
1089 Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
1090
1091 // input could either be float or int vector; do shuffle work in int
1092 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1093 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1094
1095 if(bPackedOutput)
1096 {
1097 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
1098
1099 // shuffle mask
1100 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1101 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1102 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1103 // after pshufb: group components together in each 128bit lane
1104 // 256i - 0 1 2 3 4 5 6 7
1105 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1106
1107 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1108 // after PERMD: move and pack xy components into each 128bit lane
1109 // 256i - 0 1 2 3 4 5 6 7
1110 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1111
1112 // do the same for zw components
1113 Value* vi128ZW = nullptr;
1114 if(info.numComps > 2)
1115 {
1116 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1117 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1118 }
1119
1120 for(uint32_t i = 0; i < 4; i++)
1121 {
1122 uint32_t swizzleIndex = info.swizzle[i];
1123 // todo: fix for packed
1124 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1125 if(i >= info.numComps)
1126 {
1127 // set the default component val
1128 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1129 continue;
1130 }
1131
1132 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1133 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1134 // if x or y, use vi128XY permute result, else use vi128ZW
1135 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1136
1137 // extract packed component 128 bit lanes
1138 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1139 }
1140
1141 }
1142 else
1143 {
1144 // pshufb masks for each component
1145 Value* vConstMask[2];
1146 // x/z shuffle mask
1147 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1148 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1149
1150 // y/w shuffle mask
1151 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1152 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1153
1154
1155 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1156 // apply defaults
1157 for (uint32_t i = 0; i < 4; ++i)
1158 {
1159 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1160 }
1161
1162 for(uint32_t i = 0; i < info.numComps; i++)
1163 {
1164 uint32_t swizzleIndex = info.swizzle[i];
1165
1166 // select correct constMask for x/z or y/w pshufb
1167 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1168 // if x or y, use vi128XY permute result, else use vi128ZW
1169 uint32_t selectedGather = (i < 2) ? 0 : 1;
1170
1171 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1172 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1173 // 256i - 0 1 2 3 4 5 6 7
1174 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1175 }
1176 }
1177 }
1178
1179 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1180 {
1181 // cast types
1182 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
1183 Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
1184
1185 if(bPackedOutput)
1186 {
1187 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
1188 // shuffle mask
1189 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1190 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1191 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1192 // after pshufb: group components together in each 128bit lane
1193 // 256i - 0 1 2 3 4 5 6 7
1194 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1195
1196 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1197 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1198 // 256i - 0 1 2 3 4 5 6 7
1199 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1200
1201 // do the same for zw components
1202 Value* vi128ZW = nullptr;
1203 if(info.numComps > 2)
1204 {
1205 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1206 }
1207
1208 // extract the packed components; any component beyond numComps gets its default value
1209 for(uint32_t i = 0; i < 4; i++)
1210 {
1211 uint32_t swizzleIndex = info.swizzle[i];
1212 // todo: fix for packed
1213 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1214 if(i >= info.numComps)
1215 {
1216 // set the default component val
1217 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1218 continue;
1219 }
1220
1221 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1222 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1223 // if x or y, use vi128XY permute result, else use vi128ZW
1224 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1225
1226 // extract packed component 128 bit lanes
1227 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1228 }
1229 }
1230 // else zero extend
1231 else{
1232 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1233 // apply defaults
1234 for (uint32_t i = 0; i < 4; ++i)
1235 {
1236 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1237 }
1238
1239 for(uint32_t i = 0; i < info.numComps; i++){
1240 uint32_t swizzleIndex = info.swizzle[i];
1241
1242 // pshufb masks for each component
1243 Value* vConstMask;
1244 switch(i)
1245 {
1246 case 0:
1247 // x shuffle mask
1248 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1249 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1250 break;
1251 case 1:
1252 // y shuffle mask
1253 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1254 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1255 break;
1256 case 2:
1257 // z shuffle mask
1258 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1259 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1260 break;
1261 case 3:
1262 // w shuffle mask
1263 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1264 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1265 break;
1266 default:
1267 vConstMask = nullptr;
1268 break;
1269 }
1270
1271 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1272 // after pshufb for x channel
1273 // 256i - 0 1 2 3 4 5 6 7
1274 // x000 x000 x000 x000 x000 x000 x000 x000
1275 }
1276 }
1277 }
1278
1279 //////////////////////////////////////////////////////////////////////////
1280 /// @brief emulates a scatter operation.
1281 /// @param pDst - pointer to destination
1282 /// @param vSrc - vector of src data to scatter
1283 /// @param vOffsets - vector of byte offsets from pDst
1284 /// @param vMask - mask of valid lanes
1285 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1286 {
1287 Value* pStack = STACKSAVE();
1288
1289 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1290
1291 // allocate tmp stack for masked off lanes
1292 Value* vTmpPtr = ALLOCA(pSrcTy);
1293
1294 Value *mask = MASK(vMask);
1295 for (uint32_t i = 0; i < JM()->mVWidth; ++i)
1296 {
1297 Value *offset = VEXTRACT(vOffsets, C(i));
1298 // byte pointer to component
1299 Value *storeAddress = GEP(pDst, offset);
1300 storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0));
1301 Value *selMask = VEXTRACT(mask, C(i));
1302 Value *srcElem = VEXTRACT(vSrc, C(i));
1303 // switch in a safe address to store to if this lane is masked off
1304 Value *validAddress = SELECT(selMask, storeAddress, vTmpPtr);
1305 STORE(srcElem, validAddress);
1306 }
1307
1308 STACKRESTORE(pStack);
1309 }
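// Illustrative call (assumption; pDst, vSrc, vOffsets and vMask are
// placeholders): scatter the lanes of vSrc to pDst + vOffsets bytes, with
// masked-off lanes written to a stack temporary instead of memory.
//
//   SCATTERPS(pDst, vSrc, vOffsets, vMask);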
1310
1311 Value* Builder::VABSPS(Value* a)
1312 {
1313 Value* asInt = BITCAST(a, mSimdInt32Ty);
1314 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1315 return result;
1316 }
1317
1318 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1319 {
1320 Value *lowCmp = ICMP_SLT(src, low);
1321 Value *ret = SELECT(lowCmp, low, src);
1322
1323 Value *highCmp = ICMP_SGT(ret, high);
1324 ret = SELECT(highCmp, high, ret);
1325
1326 return ret;
1327 }
1328
1329 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1330 {
1331 Value *lowCmp = FCMP_OLT(src, low);
1332 Value *ret = SELECT(lowCmp, low, src);
1333
1334 Value *highCmp = FCMP_OGT(ret, high);
1335 ret = SELECT(highCmp, high, ret);
1336
1337 return ret;
1338 }
1339
1340 Value *Builder::FCLAMP(Value* src, float low, float high)
1341 {
1342 Value* result = VMAXPS(src, VIMMED1(low));
1343 result = VMINPS(result, VIMMED1(high));
1344
1345 return result;
1346 }
1347
1348 //////////////////////////////////////////////////////////////////////////
1349 /// @brief save/restore stack, providing ability to push/pop the stack and
1350 /// reduce overall stack requirements for temporary stack use
1351 Value* Builder::STACKSAVE()
1352 {
1353 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1354 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1355 return CALL(pfnStackSave);
1356 #else
1357 return CALLA(pfnStackSave);
1358 #endif
1359 }
1360
1361 void Builder::STACKRESTORE(Value* pSaved)
1362 {
1363 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1364 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1365 }
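// Typical pattern (illustrative), as used by the gather/scatter emulation
// above; ty is a placeholder Type*.
//
//   Value* pStack = STACKSAVE();
//   Value* pTmp   = ALLOCA(ty);      // temporary live only in this block
//   /* ... use pTmp ... */
//   STACKRESTORE(pStack);            // reclaim the temporary's stack space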
1366
1367 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1368 {
1369 Value* vOut;
1370 // use FMADs if available
1371 if(JM()->mArch.AVX2())
1372 {
1373 vOut = VFMADDPS(a, b, c);
1374 }
1375 else
1376 {
1377 vOut = FADD(FMUL(a, b), c);
1378 }
1379 return vOut;
1380 }
1381
1382 Value* Builder::POPCNT(Value* a)
1383 {
1384 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1385 return CALL(pCtPop, std::initializer_list<Value*>{a});
1386 }
1387
1388 //////////////////////////////////////////////////////////////////////////
1389 /// @brief C functions called by LLVM IR
1390 //////////////////////////////////////////////////////////////////////////
1391
1392 //////////////////////////////////////////////////////////////////////////
1393 /// @brief called in JIT code, inserted by PRINT
1394 /// output to both stdout and visual studio debug console
1395 void __cdecl CallPrint(const char* fmt, ...)
1396 {
1397 va_list args;
1398 va_start(args, fmt);
1399 vprintf(fmt, args);
1400
1401 #if defined( _WIN32 )
1402 char strBuf[1024];
1403 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1404 OutputDebugString(strBuf);
1405 #endif
1406 }
1407
1408 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1409 {
1410 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1411 Function *func =
1412 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1413 Intrinsic::x86_avx_vextractf128_si_256);
1414 return CALL(func, {a, imm8});
1415 #else
1416 bool flag = !imm8->isZeroValue();
1417 SmallVector<Constant*,8> idx;
1418 for (unsigned i = 0; i < JM()->mVWidth / 2; i++) {
1419 idx.push_back(C(flag ? i + JM()->mVWidth / 2 : i));
1420 }
1421 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1422 #endif
1423 }
1424
1425 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1426 {
1427 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1428 Function *func =
1429 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1430 Intrinsic::x86_avx_vinsertf128_si_256);
1431 return CALL(func, {a, b, imm8});
1432 #else
1433 bool flag = !imm8->isZeroValue();
1434 SmallVector<Constant*,8> idx;
1435 for (unsigned i = 0; i < JM()->mVWidth; i++) {
1436 idx.push_back(C(i));
1437 }
1438 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1439
1440 SmallVector<Constant*,8> idx2;
1441 for (unsigned i = 0; i < JM()->mVWidth / 2; i++) {
1442 idx2.push_back(C(flag ? i : i + JM()->mVWidth));
1443 }
1444 for (unsigned i = JM()->mVWidth / 2; i < JM()->mVWidth; i++) {
1445 idx2.push_back(C(flag ? i + JM()->mVWidth / 2 : i));
1446 }
1447 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1448 #endif
1449 }