src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp

   1 /****************************************************************************
   2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * @file builder_misc.cpp
  24  *
  25  * @brief Implementation for miscellaneous builder functions
  26  *
  27  * Notes:
  28  *
  29  ******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>
#include <cstring>
  35
  36 extern "C" void CallPrint(const char* fmt, ...);
  37
  38 namespace SwrJit
  39 {
  40     //////////////////////////////////////////////////////////////////////////
  41     /// @brief Convert an IEEE 754 32-bit single precision float to an
  42     ///        16 bit float with 5 exponent bits and a variable
  43     ///        number of mantissa bits.
  44     /// @param val - 32-bit float
  45     /// @todo Maybe move this outside of this file into a header?
  46     static uint16_t ConvertFloat32ToFloat16(float val)
  47     {
  48         uint32_t sign, exp, mant;
  49         uint32_t roundBits;
  50
  51         // Extract the sign, exponent, and mantissa
  52         uint32_t uf = *(uint32_t*)&val;
  53         sign        = (uf & 0x80000000) >> 31;
  54         exp         = (uf & 0x7F800000) >> 23;
  55         mant        = uf & 0x007FFFFF;
  56
  57         // Check for out of range
  58         if (std::isnan(val))
  59         {
  60             exp  = 0x1F;
  61             mant = 0x200;
  62             sign = 1; // set the sign bit for NANs
  63         }
  64         else if (std::isinf(val))
  65         {
  66             exp  = 0x1f;
  67             mant = 0x0;
  68         }
  69         else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
  70         {
  71             exp  = 0x1E;
  72             mant = 0x3FF;
  73         }
  74         else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
  75         {
  76             mant |= 0x00800000;
  77             for (; exp <= 0x70; mant >>= 1, exp++)
  78                 ;
  79             exp  = 0;
  80             mant = mant >> 13;
  81         }
  82         else if (exp < 0x66) // Too small to represent -> Zero
  83         {
  84             exp  = 0;
  85             mant = 0;
  86         }
  87         else
  88         {
  89             // Saves bits that will be shifted off for rounding
  90             roundBits = mant & 0x1FFFu;
  91             // convert exponent and mantissa to 16 bit format
  92             exp  = exp - 0x70;
  93             mant = mant >> 13;
  94
  95             // Essentially RTZ, but round up if off by only 1 lsb
  96             if (roundBits == 0x1FFFu)
  97             {
  98                 mant++;
  99                 // check for overflow
 100                 if ((mant & 0xC00u) != 0)
 101                     exp++;
 102                 // make sure only the needed bits are used
 103                 mant &= 0x3FF;
 104             }
 105         }
 106
 107         uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
 108         return (uint16_t)tmpVal;
 109     }
 110
 111     Constant* Builder::C(bool i) { return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); }
 112
 113     Constant* Builder::C(char i) { return ConstantInt::get(IRB()->getInt8Ty(), i); }
 114
 115     Constant* Builder::C(uint8_t i) { return ConstantInt::get(IRB()->getInt8Ty(), i); }
 116
 117     Constant* Builder::C(int i) { return ConstantInt::get(IRB()->getInt32Ty(), i); }
 118
 119     Constant* Builder::C(int64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); }
 120
 121     Constant* Builder::C(uint16_t i) { return ConstantInt::get(mInt16Ty, i); }
 122
 123     Constant* Builder::C(uint32_t i) { return ConstantInt::get(IRB()->getInt32Ty(), i); }
 124
 125     Constant* Builder::C(uint64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); }
 126
 127     Constant* Builder::C(float i) { return ConstantFP::get(IRB()->getFloatTy(), i); }
 128
 129     Constant* Builder::PRED(bool pred)
 130     {
 131         return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
 132     }
 133
 134     Value* Builder::VIMMED1(uint64_t i)
 135     {
 136 #if LLVM_VERSION_MAJOR > 10
 137         return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i)));
 138 #else
 139         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 140 #endif
 141     }
 142
 143     Value* Builder::VIMMED1_16(uint64_t i)
 144     {
 145 #if LLVM_VERSION_MAJOR > 10
 146         return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i)));
 147 #else
 148         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
 149 #endif
 150     }
 151
 152     Value* Builder::VIMMED1(int i)
 153     {
 154 #if LLVM_VERSION_MAJOR > 10
 155         return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i)));
 156 #else
 157         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 158 #endif
 159     }
 160
 161     Value* Builder::VIMMED1_16(int i)
 162     {
 163 #if LLVM_VERSION_MAJOR > 10
 164         return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i)));
 165 #else
 166         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
 167 #endif
 168     }
 169
 170     Value* Builder::VIMMED1(uint32_t i)
 171     {
 172 #if LLVM_VERSION_MAJOR > 10
 173         return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i)));
 174 #else
 175         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 176 #endif
 177     }
 178
 179     Value* Builder::VIMMED1_16(uint32_t i)
 180     {
 181 #if LLVM_VERSION_MAJOR > 10
 182         return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i)));
 183 #else
 184         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
 185 #endif
 186     }
 187
 188     Value* Builder::VIMMED1(float i)
 189     {
 190 #if LLVM_VERSION_MAJOR > 10
 191         return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantFP>(C(i)));
 192 #else
 193         return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
 194 #endif
 195     }
 196
 197     Value* Builder::VIMMED1_16(float i)
 198     {
 199 #if LLVM_VERSION_MAJOR > 10
 200         return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantFP>(C(i)));
 201 #else
 202         return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i)));
 203 #endif
 204     }
 205
 206     Value* Builder::VIMMED1(bool i)
 207     {
 208 #if LLVM_VERSION_MAJOR > 10
 209         return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<ConstantInt>(C(i)));
 210 #else
 211         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 212 #endif
 213     }
 214
 215     Value* Builder::VIMMED1_16(bool i)
 216     {
 217 #if LLVM_VERSION_MAJOR > 10
 218         return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<ConstantInt>(C(i)));
 219 #else
 220         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
 221 #endif
 222     }
 223
 224     Value* Builder::VUNDEF_IPTR() { return UndefValue::get(getVectorType(mInt32PtrTy, mVWidth)); }
 225
 226     Value* Builder::VUNDEF(Type* t) { return UndefValue::get(getVectorType(t, mVWidth)); }
 227
 228     Value* Builder::VUNDEF_I() { return UndefValue::get(getVectorType(mInt32Ty, mVWidth)); }
 229
 230     Value* Builder::VUNDEF_I_16() { return UndefValue::get(getVectorType(mInt32Ty, mVWidth16)); }
 231
 232     Value* Builder::VUNDEF_F() { return UndefValue::get(getVectorType(mFP32Ty, mVWidth)); }
 233
 234     Value* Builder::VUNDEF_F_16() { return UndefValue::get(getVectorType(mFP32Ty, mVWidth16)); }
 235
 236     Value* Builder::VUNDEF(Type* ty, uint32_t size)
 237     {
 238         return UndefValue::get(getVectorType(ty, size));
 239     }
 240
 241     Value* Builder::VBROADCAST(Value* src, const llvm::Twine& name)
 242     {
 243         // check if src is already a vector
 244         if (src->getType()->isVectorTy())
 245         {
 246             return src;
 247         }
 248
 249         return VECTOR_SPLAT(mVWidth, src, name);
 250     }
 251
 252     Value* Builder::VBROADCAST_16(Value* src)
 253     {
 254         // check if src is already a vector
 255         if (src->getType()->isVectorTy())
 256         {
 257             return src;
 258         }
 259
 260         return VECTOR_SPLAT(mVWidth16, src);
 261     }
 262
 263     uint32_t Builder::IMMED(Value* v)
 264     {
 265         SWR_ASSERT(isa<ConstantInt>(v));
 266         ConstantInt* pValConst = cast<ConstantInt>(v);
 267         return pValConst->getZExtValue();
 268     }
 269
 270     int32_t Builder::S_IMMED(Value* v)
 271     {
 272         SWR_ASSERT(isa<ConstantInt>(v));
 273         ConstantInt* pValConst = cast<ConstantInt>(v);
 274         return pValConst->getSExtValue();
 275     }
 276
 277     CallInst* Builder::CALL(Value*                               Callee,
 278                             const std::initializer_list<Value*>& argsList,
 279                             const llvm::Twine&                   name)
 280     {
 281         std::vector<Value*> args;
 282         for (auto arg : argsList)
 283             args.push_back(arg);
 284 #if LLVM_VERSION_MAJOR >= 11
 285         // see comment to CALLA(Callee) function in the header
 286         return CALLA(FunctionCallee(cast<Function>(Callee)), args, name);
 287 #else
 288         return CALLA(Callee, args, name);
 289 #endif
 290     }
 291
 292     CallInst* Builder::CALL(Value* Callee, Value* arg)
 293     {
 294         std::vector<Value*> args;
 295         args.push_back(arg);
 296 #if LLVM_VERSION_MAJOR >= 11
 297         // see comment to CALLA(Callee) function in the header
 298         return CALLA(FunctionCallee(cast<Function>(Callee)), args);
 299 #else
 300         return CALLA(Callee, args);
 301 #endif
 302     }
 303
 304     CallInst* Builder::CALL2(Value* Callee, Value* arg1, Value* arg2)
 305     {
 306         std::vector<Value*> args;
 307         args.push_back(arg1);
 308         args.push_back(arg2);
 309 #if LLVM_VERSION_MAJOR >= 11
 310         // see comment to CALLA(Callee) function in the header
 311         return CALLA(FunctionCallee(cast<Function>(Callee)), args);
 312 #else
 313         return CALLA(Callee, args);
 314 #endif
 315     }
 316
 317     CallInst* Builder::CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3)
 318     {
 319         std::vector<Value*> args;
 320         args.push_back(arg1);
 321         args.push_back(arg2);
 322         args.push_back(arg3);
 323 #if LLVM_VERSION_MAJOR >= 11
 324         // see comment to CALLA(Callee) function in the header
 325         return CALLA(FunctionCallee(cast<Function>(Callee)), args);
 326 #else
 327         return CALLA(Callee, args);
 328 #endif
 329     }
 330
 331     Value* Builder::VRCP(Value* va, const llvm::Twine& name)
 332     {
 333         return FDIV(VIMMED1(1.0f), va, name); // 1 / a
 334     }
 335
 336     Value* Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY)
 337     {
 338         Value* vOut = FMADDPS(vA, vX, vC);
 339         vOut        = FMADDPS(vB, vY, vOut);
 340         return vOut;
 341     }
 342
 343     //////////////////////////////////////////////////////////////////////////
 344     /// @brief insert a JIT call to CallPrint
 345     /// - outputs formatted string to both stdout and VS output window
 346     /// - DEBUG builds only
 347     /// Usage example:
 348     ///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
 349     ///   where C(lane) creates a constant value to print, and pIndex is the Value*
 350     ///   result from a GEP, printing out the pointer to memory
 351     /// @param printStr - constant string to print, which includes format specifiers
 352     /// @param printArgs - initializer list of Value*'s to print to std out
 353     CallInst* Builder::PRINT(const std::string&                   printStr,
 354                              const std::initializer_list<Value*>& printArgs)
 355     {
 356         // push the arguments to CallPrint into a vector
 357         std::vector<Value*> printCallArgs;
 358         // save room for the format string.  we still need to modify it for vectors
 359         printCallArgs.resize(1);
 360
 361         // search through the format string for special processing
 362         size_t      pos = 0;
 363         std::string tempStr(printStr);
 364         pos    = tempStr.find('%', pos);
 365         auto v = printArgs.begin();
 366
 367         while ((pos != std::string::npos) && (v != printArgs.end()))
 368         {
 369             Value* pArg  = *v;
 370             Type*  pType = pArg->getType();
 371
 372             if (pType->isVectorTy())
 373             {
 374                 Type* pContainedType = pType->getContainedType(0);
 375 #if LLVM_VERSION_MAJOR >= 11
 376                 VectorType* pVectorType = cast<VectorType>(pType);
 377 #endif
 378                 if (toupper(tempStr[pos + 1]) == 'X')
 379                 {
 380                     tempStr[pos]     = '0';
 381                     tempStr[pos + 1] = 'x';
 382                     tempStr.insert(pos + 2, "%08X ");
 383                     pos += 7;
 384
 385                     printCallArgs.push_back(VEXTRACT(pArg, C(0)));
 386
 387                     std::string vectorFormatStr;
 388 #if LLVM_VERSION_MAJOR >= 11
 389                     for (uint32_t i = 1; i < pVectorType->getNumElements(); ++i)
 390 #else
 391                     for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
 392 #endif
 393                     {
 394                         vectorFormatStr += "0x%08X ";
 395                         printCallArgs.push_back(VEXTRACT(pArg, C(i)));
 396                     }
 397
 398                     tempStr.insert(pos, vectorFormatStr);
 399                     pos += vectorFormatStr.size();
 400                 }
 401                 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
 402                 {
 403                     uint32_t i = 0;
 404 #if LLVM_VERSION_MAJOR >= 11
 405                     for (; i < pVectorType->getNumElements() - 1; i++)
 406 #else
 407                     for (; i < pType->getVectorNumElements() - 1; i++)
 408 #endif
 409                     {
 410                         tempStr.insert(pos, std::string("%f "));
 411                         pos += 3;
 412                         printCallArgs.push_back(
 413                             FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
 414                     }
 415                     printCallArgs.push_back(
 416                         FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
 417                 }
 418                 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
 419                 {
 420                     uint32_t i = 0;
 421 #if LLVM_VERSION_MAJOR >= 11
 422                     for (; i < pVectorType->getNumElements() - 1; i++)
 423 #else
 424                     for (; i < pType->getVectorNumElements() - 1; i++)
 425 #endif
 426                     {
 427                         tempStr.insert(pos, std::string("%d "));
 428                         pos += 3;
 429                         printCallArgs.push_back(
 430                             S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
 431                     }
 432                     printCallArgs.push_back(
 433                         S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
 434                 }
 435                 else if ((tempStr[pos + 1] == 'u') && (pContainedType->isIntegerTy()))
 436                 {
 437                     uint32_t i = 0;
 438 #if LLVM_VERSION_MAJOR >= 11
 439                     for (; i < pVectorType->getNumElements() - 1; i++)
 440 #else
 441                     for (; i < pType->getVectorNumElements() - 1; i++)
 442 #endif
 443                     {
 444                         tempStr.insert(pos, std::string("%d "));
 445                         pos += 3;
 446                         printCallArgs.push_back(
 447                             Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
 448                     }
 449                     printCallArgs.push_back(
 450                         Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
 451                 }
 452             }
 453             else
 454             {
 455                 if (toupper(tempStr[pos + 1]) == 'X')
 456                 {
 457                     tempStr[pos] = '0';
 458                     tempStr.insert(pos + 1, "x%08");
 459                     printCallArgs.push_back(pArg);
 460                     pos += 3;
 461                 }
 462                 // for %f we need to cast float Values to doubles so that they print out correctly
 463                 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
 464                 {
 465                     printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
 466                     pos++;
 467                 }
 468                 else
 469                 {
 470                     printCallArgs.push_back(pArg);
 471                 }
 472             }
 473
 474             // advance to the next arguement
 475             v++;
 476             pos = tempStr.find('%', ++pos);
 477         }
 478
 479         // create global variable constant string
 480         Constant*       constString = ConstantDataArray::getString(JM()->mContext, tempStr, true);
 481         GlobalVariable* gvPtr       = new GlobalVariable(
 482             constString->getType(), true, GlobalValue::InternalLinkage, constString, "printStr");
 483         JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
 484
 485         // get a pointer to the first character in the constant string array
 486         std::vector<Constant*> geplist{C(0), C(0)};
 487         Constant* strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr, geplist, false);
 488
 489         // insert the pointer to the format string in the argument vector
 490         printCallArgs[0] = strGEP;
 491
 492         // get pointer to CallPrint function and insert decl into the module if needed
 493         std::vector<Type*> args;
 494         args.push_back(PointerType::get(mInt8Ty, 0));
 495         FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, true);
 496         Function*     callPrintFn =
 497 #if LLVM_VERSION_MAJOR >= 9
 498             cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy).getCallee());
 499 #else
 500             cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
 501 #endif
 502
 503         // if we haven't yet added the symbol to the symbol table
 504         if ((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
 505         {
 506             sys::DynamicLibrary::AddSymbol("CallPrint", (void*)&CallPrint);
 507         }
 508
 509         // insert a call to CallPrint
 510         return CALLA(callPrintFn, printCallArgs);
 511     }
 512
 513     //////////////////////////////////////////////////////////////////////////
 514     /// @brief Wrapper around PRINT with initializer list.
 515     CallInst* Builder::PRINT(const std::string& printStr) { return PRINT(printStr, {}); }
 516
 517     Value* Builder::EXTRACT_16(Value* x, uint32_t imm)
 518     {
 519         if (imm == 0)
 520         {
 521             return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 5, 6, 7});
 522         }
 523         else
 524         {
 525             return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 12, 13, 14, 15});
 526         }
 527     }
 528
 529     Value* Builder::JOIN_16(Value* a, Value* b)
 530     {
 531         return VSHUFFLE(a, b, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
 532     }
 533
 534     //////////////////////////////////////////////////////////////////////////
 535     /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
 536     Value* Builder::MASK(Value* vmask)
 537     {
 538         Value* src = BITCAST(vmask, mSimdInt32Ty);
 539         return ICMP_SLT(src, VIMMED1(0));
 540     }
 541
 542     Value* Builder::MASK_16(Value* vmask)
 543     {
 544         Value* src = BITCAST(vmask, mSimd16Int32Ty);
 545         return ICMP_SLT(src, VIMMED1_16(0));
 546     }
 547
 548     //////////////////////////////////////////////////////////////////////////
 549     /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
 550     Value* Builder::VMASK(Value* mask) { return S_EXT(mask, mSimdInt32Ty); }
 551
 552     Value* Builder::VMASK_16(Value* mask) { return S_EXT(mask, mSimd16Int32Ty); }
 553
 554     /// @brief Convert <Nxi1> llvm mask to integer
 555     Value* Builder::VMOVMSK(Value* mask)
 556     {
 557 #if LLVM_VERSION_MAJOR >= 11
 558         VectorType* pVectorType = cast<VectorType>(mask->getType());
 559         SWR_ASSERT(pVectorType->getElementType() == mInt1Ty);
 560         uint32_t numLanes = pVectorType->getNumElements();
 561 #else
 562         SWR_ASSERT(mask->getType()->getVectorElementType() == mInt1Ty);
 563         uint32_t numLanes = mask->getType()->getVectorNumElements();
 564 #endif
 565         Value*   i32Result;
 566         if (numLanes == 8)
 567         {
 568             i32Result = BITCAST(mask, mInt8Ty);
 569         }
 570         else if (numLanes == 16)
 571         {
 572             i32Result = BITCAST(mask, mInt16Ty);
 573         }
 574         else
 575         {
 576             SWR_ASSERT("Unsupported vector width");
 577             i32Result = BITCAST(mask, mInt8Ty);
 578         }
 579         return Z_EXT(i32Result, mInt32Ty);
 580     }
 581
 582     //////////////////////////////////////////////////////////////////////////
 583     /// @brief Generate a VPSHUFB operation in LLVM IR.  If not
 584     /// supported on the underlying platform, emulate it
 585     /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
 586     /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
 587     /// Byte masks in lower 128 lane of b selects 8 bit values from lower
 588     /// 128bits of a, and vice versa for the upper lanes.  If the mask
 589     /// value is negative, '0' is inserted.
 590     Value* Builder::PSHUFB(Value* a, Value* b)
 591     {
 592         Value* res;
 593         // use avx2 pshufb instruction if available
 594         if (JM()->mArch.AVX2())
 595         {
 596             res = VPSHUFB(a, b);
 597         }
 598         else
 599         {
 600             Constant* cB = dyn_cast<Constant>(b);
 601             assert(cB != nullptr);
 602             // number of 8 bit elements in b
 603             uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
 604             // output vector
 605             Value* vShuf = UndefValue::get(getVectorType(mInt8Ty, numElms));
 606
 607             // insert an 8 bit value from the high and low lanes of a per loop iteration
 608             numElms /= 2;
 609             for (uint32_t i = 0; i < numElms; i++)
 610             {
 611                 ConstantInt* cLow128b  = cast<ConstantInt>(cB->getAggregateElement(i));
 612                 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
 613
 614                 // extract values from constant mask
 615                 char valLow128bLane  = (char)(cLow128b->getSExtValue());
 616                 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
 617
 618                 Value* insertValLow128b;
 619                 Value* insertValHigh128b;
 620
 621                 // if the mask value is negative, insert a '0' in the respective output position
 622                 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask
 623                 // byte) in a and insert in output vector
 624                 insertValLow128b =
 625                     (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
 626                 insertValHigh128b = (valHigh128bLane < 0)
 627                                         ? C((char)0)
 628                                         : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
 629
 630                 vShuf = VINSERT(vShuf, insertValLow128b, i);
 631                 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
 632             }
 633             res = vShuf;
 634         }
 635         return res;
 636     }
 637
 638     //////////////////////////////////////////////////////////////////////////
 639     /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32
 640     /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
 641     /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
 642     /// lower 8 values are used.
 643     Value* Builder::PMOVSXBD(Value* a)
 644     {
 645         // VPMOVSXBD output type
 646         Type* v8x32Ty = getVectorType(mInt32Ty, 8);
 647         // Extract 8 values from 128bit lane and sign extend
 648         return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
 649     }
 650
 651     //////////////////////////////////////////////////////////////////////////
 652     /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32
 653     /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
 654     /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
 655     Value* Builder::PMOVSXWD(Value* a)
 656     {
 657         // VPMOVSXWD output type
 658         Type* v8x32Ty = getVectorType(mInt32Ty, 8);
 659         // Extract 8 values from 128bit lane and sign extend
 660         return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
 661     }
 662
 663     //////////////////////////////////////////////////////////////////////////
 664     /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
 665     /// in LLVM IR.  If not supported on the underlying platform, emulate it
 666     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
 667     Value* Builder::CVTPH2PS(Value* a, const llvm::Twine& name)
 668     {
 669         // Bitcast Nxint16 to Nxhalf
 670 #if LLVM_VERSION_MAJOR >= 11
 671         uint32_t numElems = cast<VectorType>(a->getType())->getNumElements();
 672 #else
 673         uint32_t numElems = a->getType()->getVectorNumElements();
 674 #endif
 675         Value*   input    = BITCAST(a, getVectorType(mFP16Ty, numElems));
 676
 677         return FP_EXT(input, getVectorType(mFP32Ty, numElems), name);
 678     }
 679
 680     //////////////////////////////////////////////////////////////////////////
 681     /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
 682     /// in LLVM IR.  If not supported on the underlying platform, emulate it
 683     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
 684     Value* Builder::CVTPS2PH(Value* a, Value* rounding)
 685     {
 686         if (JM()->mArch.F16C())
 687         {
 688             return VCVTPS2PH(a, rounding);
 689         }
 690         else
 691         {
 692             // call scalar C function for now
 693             FunctionType* pFuncTy   = FunctionType::get(mInt16Ty, mFP32Ty);
 694             Function*     pCvtPs2Ph = cast<Function>(
 695 #if LLVM_VERSION_MAJOR >= 9
 696                 JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy).getCallee());
 697 #else
 698                 JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
 699 #endif
 700
 701             if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
 702             {
 703                 sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16",
 704                                                (void*)&ConvertFloat32ToFloat16);
 705             }
 706
 707             Value* pResult = UndefValue::get(mSimdInt16Ty);
 708             for (uint32_t i = 0; i < mVWidth; ++i)
 709             {
 710                 Value* pSrc  = VEXTRACT(a, C(i));
 711                 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
 712                 pResult      = VINSERT(pResult, pConv, C(i));
 713             }
 714
 715             return pResult;
 716         }
 717     }
 718
 719     Value* Builder::PMAXSD(Value* a, Value* b)
 720     {
 721         Value* cmp = ICMP_SGT(a, b);
 722         return SELECT(cmp, a, b);
 723     }
 724
 725     Value* Builder::PMINSD(Value* a, Value* b)
 726     {
 727         Value* cmp = ICMP_SLT(a, b);
 728         return SELECT(cmp, a, b);
 729     }
 730
 731     Value* Builder::PMAXUD(Value* a, Value* b)
 732     {
 733         Value* cmp = ICMP_UGT(a, b);
 734         return SELECT(cmp, a, b);
 735     }
 736
 737     Value* Builder::PMINUD(Value* a, Value* b)
 738     {
 739         Value* cmp = ICMP_ULT(a, b);
 740         return SELECT(cmp, a, b);
 741     }
 742
 743     // Helper function to create alloca in entry block of function
 744     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
 745     {
 746         auto saveIP = IRB()->saveIP();
 747         IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin());
 748         Value* pAlloca = ALLOCA(pType);
 749         if (saveIP.isSet())
 750             IRB()->restoreIP(saveIP);
 751         return pAlloca;
 752     }
 753
 754     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
 755     {
 756         auto saveIP = IRB()->saveIP();
 757         IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin());
 758         Value* pAlloca = ALLOCA(pType, pArraySize);
 759         if (saveIP.isSet())
 760             IRB()->restoreIP(saveIP);
 761         return pAlloca;
 762     }
 763
 764     Value* Builder::VABSPS(Value* a)
 765     {
 766         Value* asInt  = BITCAST(a, mSimdInt32Ty);
 767         Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
 768         return result;
 769     }
 770
 771     Value* Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name)
 772     {
 773         Value* lowCmp = ICMP_SLT(src, low);
 774         Value* ret    = SELECT(lowCmp, low, src);
 775
 776         Value* highCmp = ICMP_SGT(ret, high);
 777         ret            = SELECT(highCmp, high, ret, name);
 778
 779         return ret;
 780     }
 781
 782     Value* Builder::FCLAMP(Value* src, Value* low, Value* high)
 783     {
 784         Value* lowCmp = FCMP_OLT(src, low);
 785         Value* ret    = SELECT(lowCmp, low, src);
 786
 787         Value* highCmp = FCMP_OGT(ret, high);
 788         ret            = SELECT(highCmp, high, ret);
 789
 790         return ret;
 791     }
 792
 793     Value* Builder::FCLAMP(Value* src, float low, float high)
 794     {
 795         Value* result = VMAXPS(src, VIMMED1(low));
 796         result        = VMINPS(result, VIMMED1(high));
 797
 798         return result;
 799     }
 800
 801     Value* Builder::FMADDPS(Value* a, Value* b, Value* c)
 802     {
 803         Value* vOut;
 804         // This maps to LLVM fmuladd intrinsic
 805         vOut = VFMADDPS(a, b, c);
 806         return vOut;
 807     }
 808
 809     //////////////////////////////////////////////////////////////////////////
 810     /// @brief pop count on vector mask (e.g. <8 x i1>)
 811     Value* Builder::VPOPCNT(Value* a) { return POPCNT(VMOVMSK(a)); }
 812
 813     //////////////////////////////////////////////////////////////////////////
 814     /// @brief Float / Fixed-point conversions
 815     //////////////////////////////////////////////////////////////////////////
 816     Value* Builder::VCVT_F32_FIXED_SI(Value*             vFloat,
 817                                       uint32_t           numIntBits,
 818                                       uint32_t           numFracBits,
 819                                       const llvm::Twine& name)
 820     {
 821         SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
 822         Value* fixed = nullptr;
 823
 824 #if 0   // This doesn't work for negative numbers!!
 825         {
 826             fixed = FP_TO_SI(VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
 827                                     C(_MM_FROUND_TO_NEAREST_INT)),
 828                              mSimdInt32Ty);
 829         }
 830         else
 831 #endif
 832         {
 833             // Do round to nearest int on fractional bits first
 834             // Not entirely perfect for negative numbers, but close enough
 835             vFloat = VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
 836                             C(_MM_FROUND_TO_NEAREST_INT));
 837             vFloat = FMUL(vFloat, VIMMED1(1.0f / float(1 << numFracBits)));
 838
 839             // TODO: Handle INF, NAN, overflow / underflow, etc.
 840
 841             Value* vSgn      = FCMP_OLT(vFloat, VIMMED1(0.0f));
 842             Value* vFloatInt = BITCAST(vFloat, mSimdInt32Ty);
 843             Value* vFixed    = AND(vFloatInt, VIMMED1((1 << 23) - 1));
 844             vFixed           = OR(vFixed, VIMMED1(1 << 23));
 845             vFixed           = SELECT(vSgn, NEG(vFixed), vFixed);
 846
 847             Value* vExp = LSHR(SHL(vFloatInt, VIMMED1(1)), VIMMED1(24));
 848             vExp        = SUB(vExp, VIMMED1(127));
 849
 850             Value* vExtraBits = SUB(VIMMED1(23 - numFracBits), vExp);
 851
 852             fixed = ASHR(vFixed, vExtraBits, name);
 853         }
 854
 855         return fixed;
 856     }
 857
 858     Value* Builder::VCVT_FIXED_SI_F32(Value*             vFixed,
 859                                       uint32_t           numIntBits,
 860                                       uint32_t           numFracBits,
 861                                       const llvm::Twine& name)
 862     {
 863         SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
 864         uint32_t extraBits = 32 - numIntBits - numFracBits;
 865         if (numIntBits && extraBits)
 866         {
 867             // Sign extend
 868             Value* shftAmt = VIMMED1(extraBits);
 869             vFixed         = ASHR(SHL(vFixed, shftAmt), shftAmt);
 870         }
 871
 872         Value* fVal  = VIMMED1(0.0f);
 873         Value* fFrac = VIMMED1(0.0f);
 874         if (numIntBits)
 875         {
 876             fVal = SI_TO_FP(ASHR(vFixed, VIMMED1(numFracBits)), mSimdFP32Ty, name);
 877         }
 878
 879         if (numFracBits)
 880         {
 881             fFrac = UI_TO_FP(AND(vFixed, VIMMED1((1 << numFracBits) - 1)), mSimdFP32Ty);
 882             fFrac = FDIV(fFrac, VIMMED1(float(1 << numFracBits)), name);
 883         }
 884
 885         return FADD(fVal, fFrac, name);
 886     }
 887
 888     Value* Builder::VCVT_F32_FIXED_UI(Value*             vFloat,
 889                                       uint32_t           numIntBits,
 890                                       uint32_t           numFracBits,
 891                                       const llvm::Twine& name)
 892     {
 893         SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
 894         Value* fixed = nullptr;
 895 #if 1   // KNOB_SIM_FAST_MATH?  Below works correctly from a precision
 896         // standpoint...
 897         {
 898             fixed = FP_TO_UI(VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
 899                                     C(_MM_FROUND_TO_NEAREST_INT)),
 900                              mSimdInt32Ty);
 901         }
 902 #else
 903         {
 904             // Do round to nearest int on fractional bits first
 905             vFloat = VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
 906                             C(_MM_FROUND_TO_NEAREST_INT));
 907             vFloat = FMUL(vFloat, VIMMED1(1.0f / float(1 << numFracBits)));
 908
 909             // TODO: Handle INF, NAN, overflow / underflow, etc.
 910
 911             Value* vSgn      = FCMP_OLT(vFloat, VIMMED1(0.0f));
 912             Value* vFloatInt = BITCAST(vFloat, mSimdInt32Ty);
 913             Value* vFixed    = AND(vFloatInt, VIMMED1((1 << 23) - 1));
 914             vFixed           = OR(vFixed, VIMMED1(1 << 23));
 915
 916             Value* vExp = LSHR(SHL(vFloatInt, VIMMED1(1)), VIMMED1(24));
 917             vExp        = SUB(vExp, VIMMED1(127));
 918
 919             Value* vExtraBits = SUB(VIMMED1(23 - numFracBits), vExp);
 920
 921             fixed = LSHR(vFixed, vExtraBits, name);
 922         }
 923 #endif
 924         return fixed;
 925     }
 926
 927     Value* Builder::VCVT_FIXED_UI_F32(Value*             vFixed,
 928                                       uint32_t           numIntBits,
 929                                       uint32_t           numFracBits,
 930                                       const llvm::Twine& name)
 931     {
 932         SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
 933         uint32_t extraBits = 32 - numIntBits - numFracBits;
 934         if (numIntBits && extraBits)
 935         {
 936             // Sign extend
 937             Value* shftAmt = VIMMED1(extraBits);
 938             vFixed         = ASHR(SHL(vFixed, shftAmt), shftAmt);
 939         }
 940
 941         Value* fVal  = VIMMED1(0.0f);
 942         Value* fFrac = VIMMED1(0.0f);
 943         if (numIntBits)
 944         {
 945             fVal = UI_TO_FP(LSHR(vFixed, VIMMED1(numFracBits)), mSimdFP32Ty, name);
 946         }
 947
 948         if (numFracBits)
 949         {
 950             fFrac = UI_TO_FP(AND(vFixed, VIMMED1((1 << numFracBits) - 1)), mSimdFP32Ty);
 951             fFrac = FDIV(fFrac, VIMMED1(float(1 << numFracBits)), name);
 952         }
 953
 954         return FADD(fVal, fFrac, name);
 955     }
 956
 957     //////////////////////////////////////////////////////////////////////////
 958     /// @brief C functions called by LLVM IR
 959     //////////////////////////////////////////////////////////////////////////
 960
 961     Value* Builder::VEXTRACTI128(Value* a, Constant* imm8)
 962     {
 963         bool                      flag = !imm8->isZeroValue();
 964         SmallVector<Constant*, 8> idx;
 965         for (unsigned i = 0; i < mVWidth / 2; i++)
 966         {
 967             idx.push_back(C(flag ? i + mVWidth / 2 : i));
 968         }
 969         return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
 970     }
 971
 972     Value* Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
 973     {
 974         bool                      flag = !imm8->isZeroValue();
 975         SmallVector<Constant*, 8> idx;
 976         for (unsigned i = 0; i < mVWidth; i++)
 977         {
 978             idx.push_back(C(i));
 979         }
 980         Value* inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
 981
 982         SmallVector<Constant*, 8> idx2;
 983         for (unsigned i = 0; i < mVWidth / 2; i++)
 984         {
 985             idx2.push_back(C(flag ? i : i + mVWidth));
 986         }
 987         for (unsigned i = mVWidth / 2; i < mVWidth; i++)
 988         {
 989             idx2.push_back(C(flag ? i + mVWidth / 2 : i));
 990         }
 991         return VSHUFFLE(a, inter, ConstantVector::get(idx2));
 992     }
 993
 994     // rdtsc buckets macros
 995     void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
 996     {
 997         // @todo due to an issue with thread local storage propagation in llvm, we can only safely
 998         // call into buckets framework when single threaded
 999         if (KNOB_SINGLE_THREADED)
1000         {
1001             std::vector<Type*> args{
1002                 PointerType::get(mInt32Ty, 0), // pBucketMgr
1003                 mInt32Ty                       // id
1004             };
1005
1006             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1007             Function*     pFunc   = cast<Function>(
1008 #if LLVM_VERSION_MAJOR >= 9
1009                 JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy).getCallee());
1010 #else
1011                 JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1012 #endif
1013             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") ==
1014                 nullptr)
1015             {
1016                 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket",
1017                                                (void*)&BucketManager_StartBucket);
1018             }
1019
1020             CALL(pFunc, {pBucketMgr, pId});
1021         }
1022     }
1023
1024     void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1025     {
1026         // @todo due to an issue with thread local storage propagation in llvm, we can only safely
1027         // call into buckets framework when single threaded
1028         if (KNOB_SINGLE_THREADED)
1029         {
1030             std::vector<Type*> args{
1031                 PointerType::get(mInt32Ty, 0), // pBucketMgr
1032                 mInt32Ty                       // id
1033             };
1034
1035             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1036             Function*     pFunc   = cast<Function>(
1037 #if LLVM_VERSION_MAJOR >= 9
1038                 JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy).getCallee());
1039 #else
1040                 JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1041 #endif
1042             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") ==
1043                 nullptr)
1044             {
1045                 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket",
1046                                                (void*)&BucketManager_StopBucket);
1047             }
1048
1049             CALL(pFunc, {pBucketMgr, pId});
1050         }
1051     }
1052
1053     uint32_t Builder::GetTypeSize(Type* pType)
1054     {
1055         if (pType->isStructTy())
1056         {
1057             uint32_t numElems = pType->getStructNumElements();
1058             Type*    pElemTy  = pType->getStructElementType(0);
1059             return numElems * GetTypeSize(pElemTy);
1060         }
1061
1062         if (pType->isArrayTy())
1063         {
1064             uint32_t numElems = pType->getArrayNumElements();
1065             Type*    pElemTy  = pType->getArrayElementType();
1066             return numElems * GetTypeSize(pElemTy);
1067         }
1068
1069         if (pType->isIntegerTy())
1070         {
1071             uint32_t bitSize = pType->getIntegerBitWidth();
1072             return bitSize / 8;
1073         }
1074
1075         if (pType->isFloatTy())
1076         {
1077             return 4;
1078         }
1079
1080         if (pType->isHalfTy())
1081         {
1082             return 2;
1083         }
1084
1085         if (pType->isDoubleTy())
1086         {
1087             return 8;
1088         }
1089
1090         SWR_ASSERT(false, "Unimplemented type.");
1091         return 0;
1092     }
1093 } // namespace SwrJit