src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file builder_misc.cpp
  24 *
  25 * @brief Implementation for miscellaneous builder functions
  26 *
  27 * Notes:
  28 *
  29 ******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>
#include <cstring>
  35
  36 namespace SwrJit
  37 {
    // Forward declaration of the variadic, printf-style helper that
    // Builder::PRINT registers as a symbol for JIT'd code; it is defined
    // further down in this file.
    void __cdecl CallPrint(const char* fmt, ...);
  39
  40     //////////////////////////////////////////////////////////////////////////
  41     /// @brief Convert an IEEE 754 32-bit single precision float to an
  42     ///        16 bit float with 5 exponent bits and a variable
  43     ///        number of mantissa bits.
  44     /// @param val - 32-bit float
  45     /// @todo Maybe move this outside of this file into a header?
  46     static uint16_t ConvertFloat32ToFloat16(float val)
  47     {
  48         uint32_t sign, exp, mant;
  49         uint32_t roundBits;
  50
  51         // Extract the sign, exponent, and mantissa
  52         uint32_t uf = *(uint32_t*)&val;
  53         sign = (uf & 0x80000000) >> 31;
  54         exp = (uf & 0x7F800000) >> 23;
  55         mant = uf & 0x007FFFFF;
  56
  57         // Check for out of range
  58         if (std::isnan(val))
  59         {
  60             exp = 0x1F;
  61             mant = 0x200;
  62             sign = 1;                     // set the sign bit for NANs
  63         }
  64         else if (std::isinf(val))
  65         {
  66             exp = 0x1f;
  67             mant = 0x0;
  68         }
  69         else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
  70         {
  71             exp = 0x1E;
  72             mant = 0x3FF;
  73         }
  74         else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
  75         {
  76             mant |= 0x00800000;
  77             for (; exp <= 0x70; mant >>= 1, exp++)
  78                 ;
  79             exp = 0;
  80             mant = mant >> 13;
  81         }
  82         else if (exp < 0x66) // Too small to represent -> Zero
  83         {
  84             exp = 0;
  85             mant = 0;
  86         }
  87         else
  88         {
  89             // Saves bits that will be shifted off for rounding
  90             roundBits = mant & 0x1FFFu;
  91             // convert exponent and mantissa to 16 bit format
  92             exp = exp - 0x70;
  93             mant = mant >> 13;
  94
  95             // Essentially RTZ, but round up if off by only 1 lsb
  96             if (roundBits == 0x1FFFu)
  97             {
  98                 mant++;
  99                 // check for overflow
 100                 if ((mant & 0xC00u) != 0)
 101                     exp++;
 102                 // make sure only the needed bits are used
 103                 mant &= 0x3FF;
 104             }
 105         }
 106
 107         uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
 108         return (uint16_t)tmpVal;
 109     }
 110
 111     //////////////////////////////////////////////////////////////////////////
 112     /// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision
 113     ///        float
 114     /// @param val - 16-bit float
 115     /// @todo Maybe move this outside of this file into a header?
 116     static float ConvertFloat16ToFloat32(uint32_t val)
 117     {
 118         uint32_t result;
 119         if ((val & 0x7fff) == 0)
 120         {
 121             result = ((uint32_t)(val & 0x8000)) << 16;
 122         }
 123         else if ((val & 0x7c00) == 0x7c00)
 124         {
 125             result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
 126             result |= ((uint32_t)val & 0x8000) << 16;
 127         }
 128         else
 129         {
 130             uint32_t sign = (val & 0x8000) << 16;
 131             uint32_t mant = (val & 0x3ff) << 13;
 132             uint32_t exp = (val >> 10) & 0x1f;
 133             if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
 134             {
 135                 mant <<= 1;
 136                 while (mant < (0x400 << 13))
 137                 {
 138                     exp--;
 139                     mant <<= 1;
 140                 }
 141                 mant &= (0x3ff << 13);
 142             }
 143             exp = ((exp - 15 + 127) & 0xff) << 23;
 144             result = sign | exp | mant;
 145         }
 146
 147         return *(float*)&result;
 148     }
 149
 150     Constant *Builder::C(bool i)
 151     {
 152         return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
 153     }
 154
 155     Constant *Builder::C(char i)
 156     {
 157         return ConstantInt::get(IRB()->getInt8Ty(), i);
 158     }
 159
 160     Constant *Builder::C(uint8_t i)
 161     {
 162         return ConstantInt::get(IRB()->getInt8Ty(), i);
 163     }
 164
 165     Constant *Builder::C(int i)
 166     {
 167         return ConstantInt::get(IRB()->getInt32Ty(), i);
 168     }
 169
 170     Constant *Builder::C(int64_t i)
 171     {
 172         return ConstantInt::get(IRB()->getInt64Ty(), i);
 173     }
 174
 175     Constant *Builder::C(uint16_t i)
 176     {
 177         return ConstantInt::get(mInt16Ty,i);
 178     }
 179
 180     Constant *Builder::C(uint32_t i)
 181     {
 182         return ConstantInt::get(IRB()->getInt32Ty(), i);
 183     }
 184
 185     Constant *Builder::C(float i)
 186     {
 187         return ConstantFP::get(IRB()->getFloatTy(), i);
 188     }
 189
 190     Constant *Builder::PRED(bool pred)
 191     {
 192         return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
 193     }
 194
 195     Value *Builder::VIMMED1(int i)
 196     {
 197         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 198     }
 199
 200     Value *Builder::VIMMED1_16(int i)
 201     {
 202         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
 203     }
 204
 205     Value *Builder::VIMMED1(uint32_t i)
 206     {
 207         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 208     }
 209
 210     Value *Builder::VIMMED1_16(uint32_t i)
 211     {
 212         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
 213     }
 214
 215     Value *Builder::VIMMED1(float i)
 216     {
 217         return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
 218     }
 219
 220     Value *Builder::VIMMED1_16(float i)
 221     {
 222         return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i)));
 223     }
 224
 225     Value *Builder::VIMMED1(bool i)
 226     {
 227         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 228     }
 229
 230     Value *Builder::VIMMED1_16(bool i)
 231     {
 232         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
 233     }
 234
 235     Value *Builder::VUNDEF_IPTR()
 236     {
 237         return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
 238     }
 239
 240     Value *Builder::VUNDEF(Type* t)
 241     {
 242         return UndefValue::get(VectorType::get(t, mVWidth));
 243     }
 244
 245     Value *Builder::VUNDEF_I()
 246     {
 247         return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
 248     }
 249
 250     Value *Builder::VUNDEF_I_16()
 251     {
 252         return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16));
 253     }
 254
 255     Value *Builder::VUNDEF_F()
 256     {
 257         return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
 258     }
 259
 260     Value *Builder::VUNDEF_F_16()
 261     {
 262         return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16));
 263     }
 264
 265     Value *Builder::VUNDEF(Type *ty, uint32_t size)
 266     {
 267         return UndefValue::get(VectorType::get(ty, size));
 268     }
 269
 270     Value *Builder::VBROADCAST(Value *src, const llvm::Twine& name)
 271     {
 272         // check if src is already a vector
 273         if (src->getType()->isVectorTy())
 274         {
 275             return src;
 276         }
 277
 278         return VECTOR_SPLAT(mVWidth, src, name);
 279     }
 280
 281     Value *Builder::VBROADCAST_16(Value *src)
 282     {
 283         // check if src is already a vector
 284         if (src->getType()->isVectorTy())
 285         {
 286             return src;
 287         }
 288
 289         return VECTOR_SPLAT(mVWidth16, src);
 290     }
 291
 292     uint32_t Builder::IMMED(Value* v)
 293     {
 294         SWR_ASSERT(isa<ConstantInt>(v));
 295         ConstantInt *pValConst = cast<ConstantInt>(v);
 296         return pValConst->getZExtValue();
 297     }
 298
 299     int32_t Builder::S_IMMED(Value* v)
 300     {
 301         SWR_ASSERT(isa<ConstantInt>(v));
 302         ConstantInt *pValConst = cast<ConstantInt>(v);
 303         return pValConst->getSExtValue();
 304     }
 305
 306     CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList, const llvm::Twine& name)
 307     {
 308         std::vector<Value*> args;
 309         for (auto arg : argsList)
 310             args.push_back(arg);
 311         return CALLA(Callee, args, name);
 312     }
 313
 314     CallInst *Builder::CALL(Value *Callee, Value* arg)
 315     {
 316         std::vector<Value*> args;
 317         args.push_back(arg);
 318         return CALLA(Callee, args);
 319     }
 320
 321     CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
 322     {
 323         std::vector<Value*> args;
 324         args.push_back(arg1);
 325         args.push_back(arg2);
 326         return CALLA(Callee, args);
 327     }
 328
 329     CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
 330     {
 331         std::vector<Value*> args;
 332         args.push_back(arg1);
 333         args.push_back(arg2);
 334         args.push_back(arg3);
 335         return CALLA(Callee, args);
 336     }
 337
 338     //////////////////////////////////////////////////////////////////////////
 339     Value *Builder::DEBUGTRAP()
 340     {
 341         Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
 342         return CALL(func);
 343     }
 344
 345     Value *Builder::VRCP(Value *va, const llvm::Twine& name)
 346     {
 347         return FDIV(VIMMED1(1.0f), va, name);  // 1 / a
 348     }
 349
 350     Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
 351     {
 352         Value* vOut = FMADDPS(vA, vX, vC);
 353         vOut = FMADDPS(vB, vY, vOut);
 354         return vOut;
 355     }
 356
    //////////////////////////////////////////////////////////////////////////
    /// @brief insert a JIT call to CallPrint
    /// - outputs formatted string to both stdout and VS output window
    /// - DEBUG builds only
    /// Usage example:
    ///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
    ///   where C(lane) creates a constant value to print, and pIndex is the Value*
    ///   result from a GEP, printing out the pointer to memory
    ///
    /// The format string is rewritten before it is emitted.  Each '%' is paired
    /// with the next entry of printArgs:
    /// - vector argument with %x/%X: expanded to one "0x%08X " per element
    /// - vector argument with %f (float elements) or %d (integer elements):
    ///   the specifier is repeated once per element, separated by spaces
    /// - scalar argument with %x/%X: rewritten to "0x%08x" / "0x%08X"
    /// - scalar float with %f: the value is extended to double
    /// - any other scalar is passed through unchanged
    /// A vector argument whose specifier matches none of the vector cases is
    /// not added to the call arguments.
    /// @param printStr - constant string to print, which includes format specifiers
    /// @param printArgs - initializer list of Value*'s to print to std out
    /// @return the inserted call instruction
    CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
    {
        // push the arguments to CallPrint into a vector
        std::vector<Value*> printCallArgs;
        // save room for the format string.  we still need to modify it for vectors
        printCallArgs.resize(1);

        // search through the format string for special processing
        size_t pos = 0;
        std::string tempStr(printStr);
        pos = tempStr.find('%', pos);
        auto v = printArgs.begin();

        // walk the '%' specifiers and the arguments in lock step; stop when either runs out
        while ((pos != std::string::npos) && (v != printArgs.end()))
        {
            Value* pArg = *v;
            Type* pType = pArg->getType();

            if (pType->isVectorTy())
            {
                Type* pContainedType = pType->getContainedType(0);

                if (toupper(tempStr[pos + 1]) == 'X')
                {
                    // overwrite "%x" with "0x" and insert "%08X " after it for element 0
                    tempStr[pos] = '0';
                    tempStr[pos + 1] = 'x';
                    tempStr.insert(pos + 2, "%08X ");
                    // skip past the 7 characters of "0x%08X "
                    pos += 7;

                    printCallArgs.push_back(VEXTRACT(pArg, C(0)));

                    // remaining elements each get their own "0x%08X "
                    std::string vectorFormatStr;
                    for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
                    {
                        vectorFormatStr += "0x%08X ";
                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                    }

                    tempStr.insert(pos, vectorFormatStr);
                    pos += vectorFormatStr.size();
                }
                else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
                {
                    // insert one "%f " in front of the original specifier for every element
                    // but the last; the original "%f" then formats the last element
                    uint32_t i = 0;
                    for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
                    {
                        tempStr.insert(pos, std::string("%f "));
                        pos += 3;
                        printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
                    }
                    printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
                }
                else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
                {
                    // same scheme as the %f case, without the extension to double
                    uint32_t i = 0;
                    for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
                    {
                        tempStr.insert(pos, std::string("%d "));
                        pos += 3;
                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                    }
                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                }
            }
            else
            {
                if (toupper(tempStr[pos + 1]) == 'X')
                {
                    // turn "%x" into "0x%08x": the '%' becomes '0' and "x%08" is
                    // inserted in front of the original conversion character
                    tempStr[pos] = '0';
                    tempStr.insert(pos + 1, "x%08");
                    printCallArgs.push_back(pArg);
                    // leave pos on the inserted '%' so the search below resumes after it
                    pos += 3;
                }
                // for %f we need to cast float Values to doubles so that they print out correctly
                else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
                {
                    printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
                    pos++;
                }
                else
                {
                    printCallArgs.push_back(pArg);
                }
            }

            // advance to the next argument
            v++;
            pos = tempStr.find('%', ++pos);
        }

        // create global variable constant string
        Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
        GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
        JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);

        // get a pointer to the first character in the constant string array
        std::vector<Constant*> geplist{C(0),C(0)};
        Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);

        // insert the pointer to the format string in the argument vector
        printCallArgs[0] = strGEP;

        // get pointer to CallPrint function and insert decl into the module if needed
        // (declared as a variadic function returning void whose only fixed parameter is the format pointer)
        std::vector<Type*> args;
        args.push_back(PointerType::get(mInt8Ty,0));
        FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
        Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));

        // if we haven't yet added the symbol to the symbol table
        if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
        {
            sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
        }

        // insert a call to CallPrint
        return CALLA(callPrintFn,printCallArgs);
    }
 484
 485     //////////////////////////////////////////////////////////////////////////
 486     /// @brief Wrapper around PRINT with initializer list.
 487     CallInst* Builder::PRINT(const std::string &printStr)
 488     {
 489         return PRINT(printStr, {});
 490     }
 491
 492     Value *Builder::EXTRACT_16(Value *x, uint32_t imm)
 493     {
 494         if (imm == 0)
 495         {
 496             return VSHUFFLE(x, UndefValue::get(x->getType()), { 0, 1, 2, 3, 4, 5, 6, 7 });
 497         }
 498         else
 499         {
 500             return VSHUFFLE(x, UndefValue::get(x->getType()), { 8, 9, 10, 11, 12, 13, 14, 15 });
 501         }
 502     }
 503
 504     Value *Builder::JOIN_16(Value *a, Value *b)
 505     {
 506         return VSHUFFLE(a, b, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
 507     }
 508
 509     //////////////////////////////////////////////////////////////////////////
 510     /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
 511     Value *Builder::MASK(Value *vmask)
 512     {
 513         Value *src = BITCAST(vmask, mSimdInt32Ty);
 514         return ICMP_SLT(src, VIMMED1(0));
 515     }
 516
 517     Value *Builder::MASK_16(Value *vmask)
 518     {
 519         Value *src = BITCAST(vmask, mSimd16Int32Ty);
 520         return ICMP_SLT(src, VIMMED1_16(0));
 521     }
 522
 523     //////////////////////////////////////////////////////////////////////////
 524     /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
 525     Value *Builder::VMASK(Value *mask)
 526     {
 527         return S_EXT(mask, mSimdInt32Ty);
 528     }
 529
 530     Value *Builder::VMASK_16(Value *mask)
 531     {
 532         return S_EXT(mask, mSimd16Int32Ty);
 533     }
 534
 535     //////////////////////////////////////////////////////////////////////////
 536     /// @brief Generate a VPSHUFB operation in LLVM IR.  If not
 537     /// supported on the underlying platform, emulate it
 538     /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
 539     /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
 540     /// Byte masks in lower 128 lane of b selects 8 bit values from lower
 541     /// 128bits of a, and vice versa for the upper lanes.  If the mask
 542     /// value is negative, '0' is inserted.
 543     Value *Builder::PSHUFB(Value* a, Value* b)
 544     {
 545         Value* res;
 546         // use avx2 pshufb instruction if available
 547         if(JM()->mArch.AVX2())
 548         {
 549             res = VPSHUFB(a, b);
 550         }
 551         else
 552         {
 553             Constant* cB = dyn_cast<Constant>(b);
 554             // number of 8 bit elements in b
 555             uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
 556             // output vector
 557             Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
 558
 559             // insert an 8 bit value from the high and low lanes of a per loop iteration
 560             numElms /= 2;
 561             for(uint32_t i = 0; i < numElms; i++)
 562             {
 563                 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
 564                 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
 565
 566                 // extract values from constant mask
 567                 char valLow128bLane =  (char)(cLow128b->getSExtValue());
 568                 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
 569
 570                 Value* insertValLow128b;
 571                 Value* insertValHigh128b;
 572
 573                 // if the mask value is negative, insert a '0' in the respective output position
 574                 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
 575                 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
 576                 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
 577
 578                 vShuf = VINSERT(vShuf, insertValLow128b, i);
 579                 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
 580             }
 581             res = vShuf;
 582         }
 583         return res;
 584     }
 585
 586     //////////////////////////////////////////////////////////////////////////
 587     /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32
 588     /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
 589     /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
 590     /// lower 8 values are used.
 591     Value *Builder::PMOVSXBD(Value* a)
 592     {
 593         // VPMOVSXBD output type
 594         Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
 595         // Extract 8 values from 128bit lane and sign extend
 596         return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
 597     }
 598
 599     //////////////////////////////////////////////////////////////////////////
 600     /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32
 601     /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
 602     /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
 603     Value *Builder::PMOVSXWD(Value* a)
 604     {
 605         // VPMOVSXWD output type
 606         Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
 607         // Extract 8 values from 128bit lane and sign extend
 608         return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
 609     }
 610
 611     //////////////////////////////////////////////////////////////////////////
 612     /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
 613     /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
 614     /// platform, emulate it
 615     /// @param a - 256bit SIMD lane(8x32bit) of integer values.
 616     /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
 617     Value *Builder::PERMD(Value* a, Value* idx)
 618     {
 619         Value* res;
 620         // use avx2 permute instruction if available
 621         if(JM()->mArch.AVX2())
 622         {
 623             res = VPERMD(a, idx);
 624         }
 625         else
 626         {
 627             if (isa<Constant>(idx))
 628             {
 629                 res = VSHUFFLE(a, a, idx);
 630             }
 631             else
 632             {
 633                 res = VUNDEF_I();
 634                 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
 635                 {
 636                     Value* pIndex = VEXTRACT(idx, C(l));
 637                     Value* pVal = VEXTRACT(a, pIndex);
 638                     res = VINSERT(res, pVal, C(l));
 639                 }
 640             }
 641         }
 642         return res;
 643     }
 644
 645     //////////////////////////////////////////////////////////////////////////
 646     /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
 647     /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
 648     /// platform, emulate it
 649     /// @param a - 256bit SIMD lane(8x32bit) of float values.
 650     /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
 651     Value *Builder::PERMPS(Value* a, Value* idx)
 652     {
 653         Value* res;
 654         // use avx2 permute instruction if available
 655         if (JM()->mArch.AVX2())
 656         {
 657             // llvm 3.6.0 swapped the order of the args to vpermd
 658             res = VPERMPS(idx, a);
 659         }
 660         else
 661         {
 662             if (isa<Constant>(idx))
 663             {
 664                 res = VSHUFFLE(a, a, idx);
 665             }
 666             else
 667             {
 668                 res = VUNDEF_F();
 669                 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
 670                 {
 671                     Value* pIndex = VEXTRACT(idx, C(l));
 672                     Value* pVal = VEXTRACT(a, pIndex);
 673                     res = VINSERT(res, pVal, C(l));
 674                 }
 675             }
 676         }
 677
 678         return res;
 679     }
 680
 681     //////////////////////////////////////////////////////////////////////////
 682     /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
 683     /// in LLVM IR.  If not supported on the underlying platform, emulate it
 684     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
 685     Value *Builder::CVTPH2PS(Value* a, const llvm::Twine& name)
 686     {
 687         if (JM()->mArch.F16C())
 688         {
 689             return VCVTPH2PS(a, name);
 690         }
 691         else
 692         {
 693             FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
 694             Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));
 695
 696             if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
 697             {
 698                 sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32);
 699             }
 700
 701             Value* pResult = UndefValue::get(mSimdFP32Ty);
 702             for (uint32_t i = 0; i < mVWidth; ++i)
 703             {
 704                 Value* pSrc = VEXTRACT(a, C(i));
 705                 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
 706                 pResult = VINSERT(pResult, pConv, C(i));
 707             }
 708
 709             pResult->setName(name);
 710             return pResult;
 711         }
 712     }
 713
 714     //////////////////////////////////////////////////////////////////////////
 715     /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
 716     /// in LLVM IR.  If not supported on the underlying platform, emulate it
 717     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
 718     Value *Builder::CVTPS2PH(Value* a, Value* rounding)
 719     {
 720         if (JM()->mArch.F16C())
 721         {
 722             return VCVTPS2PH(a, rounding);
 723         }
 724         else
 725         {
 726             // call scalar C function for now
 727             FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
 728             Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
 729
 730             if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
 731             {
 732                 sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16);
 733             }
 734
 735             Value* pResult = UndefValue::get(mSimdInt16Ty);
 736             for (uint32_t i = 0; i < mVWidth; ++i)
 737             {
 738                 Value* pSrc = VEXTRACT(a, C(i));
 739                 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
 740                 pResult = VINSERT(pResult, pConv, C(i));
 741             }
 742
 743             return pResult;
 744         }
 745     }
 746
 747     Value *Builder::PMAXSD(Value* a, Value* b)
 748     {
 749         Value* cmp = ICMP_SGT(a, b);
 750         return SELECT(cmp, a, b);
 751     }
 752
 753     Value *Builder::PMINSD(Value* a, Value* b)
 754     {
 755         Value* cmp = ICMP_SLT(a, b);
 756         return SELECT(cmp, a, b);
 757     }
 758
 759     // Helper function to create alloca in entry block of function
 760     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
 761     {
 762         auto saveIP = IRB()->saveIP();
 763         IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
 764                               pFunc->getEntryBlock().begin());
 765         Value* pAlloca = ALLOCA(pType);
 766         if (saveIP.isSet()) IRB()->restoreIP(saveIP);
 767         return pAlloca;
 768     }
 769
 770     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
 771     {
 772         auto saveIP = IRB()->saveIP();
 773         IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
 774             pFunc->getEntryBlock().begin());
 775         Value* pAlloca = ALLOCA(pType, pArraySize);
 776         if (saveIP.isSet()) IRB()->restoreIP(saveIP);
 777         return pAlloca;
 778     }
 779
 780     Value* Builder::VABSPS(Value* a)
 781     {
 782         Value* asInt = BITCAST(a, mSimdInt32Ty);
 783         Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
 784         return result;
 785     }
 786
 787     Value *Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name)
 788     {
 789         Value *lowCmp = ICMP_SLT(src, low);
 790         Value *ret = SELECT(lowCmp, low, src);
 791
 792         Value *highCmp = ICMP_SGT(ret, high);
 793         ret = SELECT(highCmp, high, ret, name);
 794
 795         return ret;
 796     }
 797
 798     Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
 799     {
 800         Value *lowCmp = FCMP_OLT(src, low);
 801         Value *ret = SELECT(lowCmp, low, src);
 802
 803         Value *highCmp = FCMP_OGT(ret, high);
 804         ret = SELECT(highCmp, high, ret);
 805
 806         return ret;
 807     }
 808
 809     Value *Builder::FCLAMP(Value* src, float low, float high)
 810     {
 811         Value* result = VMAXPS(src, VIMMED1(low));
 812         result = VMINPS(result, VIMMED1(high));
 813
 814         return result;
 815     }
 816
 817     Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
 818     {
 819         Value* vOut;
 820         // use FMADs if available
 821         if(JM()->mArch.AVX2())
 822         {
 823             vOut = VFMADDPS(a, b, c);
 824         }
 825         else
 826         {
 827             vOut = FADD(FMUL(a, b), c);
 828         }
 829         return vOut;
 830     }
 831
 832     Value* Builder::POPCNT(Value* a)
 833     {
 834         Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
 835         return CALL(pCtPop, std::initializer_list<Value*>{a});
 836     }
 837
 838     //////////////////////////////////////////////////////////////////////////
 839     /// @brief C functions called by LLVM IR
 840     //////////////////////////////////////////////////////////////////////////
 841
 842     //////////////////////////////////////////////////////////////////////////
 843     /// @brief called in JIT code, inserted by PRINT
 844     /// output to both stdout and visual studio debug console
 845     void __cdecl CallPrint(const char* fmt, ...)
 846     {
 847         va_list args;
 848         va_start(args, fmt);
 849         vprintf(fmt, args);
 850
 851     #if defined( _WIN32 )
 852         char strBuf[1024];
 853         vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
 854         OutputDebugStringA(strBuf);
 855     #endif
 856
 857         va_end(args);
 858     }
 859
 860     Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
 861     {
 862         bool flag = !imm8->isZeroValue();
 863         SmallVector<Constant*,8> idx;
 864         for (unsigned i = 0; i < mVWidth / 2; i++) {
 865             idx.push_back(C(flag ? i + mVWidth / 2 : i));
 866         }
 867         return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
 868     }
 869
 870     Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
 871     {
 872         bool flag = !imm8->isZeroValue();
 873         SmallVector<Constant*,8> idx;
 874         for (unsigned i = 0; i < mVWidth; i++) {
 875             idx.push_back(C(i));
 876         }
 877         Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
 878
 879         SmallVector<Constant*,8> idx2;
 880         for (unsigned i = 0; i < mVWidth / 2; i++) {
 881             idx2.push_back(C(flag ? i : i + mVWidth));
 882         }
 883         for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
 884             idx2.push_back(C(flag ? i + mVWidth / 2 : i));
 885         }
 886         return VSHUFFLE(a, inter, ConstantVector::get(idx2));
 887     }
 888
 889     // rdtsc buckets macros
 890     void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
 891     {
 892         // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
 893         // buckets framework when single threaded
 894         if (KNOB_SINGLE_THREADED)
 895         {
 896             std::vector<Type*> args{
 897                 PointerType::get(mInt32Ty, 0),   // pBucketMgr
 898                 mInt32Ty                        // id
 899             };
 900
 901             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
 902             Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
 903             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
 904             {
 905                 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
 906             }
 907
 908             CALL(pFunc, { pBucketMgr, pId });
 909         }
 910     }
 911
 912     void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
 913     {
 914         // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
 915         // buckets framework when single threaded
 916         if (KNOB_SINGLE_THREADED)
 917         {
 918             std::vector<Type*> args{
 919                 PointerType::get(mInt32Ty, 0),   // pBucketMgr
 920                 mInt32Ty                        // id
 921             };
 922
 923             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
 924             Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
 925             if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
 926             {
 927                 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
 928             }
 929
 930             CALL(pFunc, { pBucketMgr, pId });
 931         }
 932     }
 933
 934     uint32_t Builder::GetTypeSize(Type* pType)
 935     {
 936         if (pType->isStructTy())
 937         {
 938             uint32_t numElems = pType->getStructNumElements();
 939             Type* pElemTy = pType->getStructElementType(0);
 940             return numElems * GetTypeSize(pElemTy);
 941         }
 942
 943         if (pType->isArrayTy())
 944         {
 945             uint32_t numElems = pType->getArrayNumElements();
 946             Type* pElemTy = pType->getArrayElementType();
 947             return numElems * GetTypeSize(pElemTy);
 948         }
 949
 950         if (pType->isIntegerTy())
 951         {
 952             uint32_t bitSize = pType->getIntegerBitWidth();
 953             return bitSize / 8;
 954         }
 955
 956         if (pType->isFloatTy())
 957         {
 958             return 4;
 959         }
 960
 961         if (pType->isHalfTy())
 962         {
 963             return 2;
 964         }
 965
 966         if (pType->isDoubleTy())
 967         {
 968             return 8;
 969         }
 970
 971         SWR_ASSERT(false, "Unimplemented type.");
 972         return 0;
 973     }
 974 }