swr/rast: Enable generalized fetch jit
[mesa.git] / src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder_gfx_mem.h"
32 #include "jit_api.h"
33 #include "fetch_jit.h"
34 #include "gen_state_llvm.h"
35 #include "functionpasses/passes.h"
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public BuilderGfxMem
56 {
57 FetchJit(JitManager* pJitMgr) :
58 BuilderGfxMem(pJitMgr)
59 {}
60
61 Function* Create(const FETCH_COMPILE_STATE& fetchState);
62
63 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
64 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
65 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
66
67 // package up Shuffle*bpcGatherd args into a tuple for convenience
68 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
69 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
70 const uint32_t(&)[4]> Shuffle8bpcArgs;
71
72 void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args);
73 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
74
75 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
76 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
77
78 void Shuffle16bpcGather16(Shuffle16bpcArgs &args);
79 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
80
81 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
82
83 Value *GenerateCompCtrlVector(const ComponentControl ctrl);
84
85 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
86
87 bool IsOddFormat(SWR_FORMAT format);
88 bool IsUniformFormat(SWR_FORMAT format);
89 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
90 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
91 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
92
93 Value* mpFetchInfo;
94 };
95
96 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
97 {
98 std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
99 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
100
101 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
102 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
103
104 fetch->getParent()->setModuleIdentifier(fetch->getName());
105
106 IRB()->SetInsertPoint(entry);
107
108 auto argitr = fetch->arg_begin();
109
110 // Fetch shader arguments
111 Value* privateContext = &*argitr; ++argitr;
112 privateContext->setName("privateContext");
113 SetPrivateContext(privateContext);
114
115 mpFetchInfo = &*argitr; ++argitr;
116 mpFetchInfo->setName("fetchInfo");
117 Value* pVtxOut = &*argitr;
118 pVtxOut->setName("vtxOutput");
119
120 uint32_t baseWidth = mVWidth;
121
122 SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);
123
124 // Override builder target width to force 16-wide SIMD
125 #if USE_SIMD16_SHADERS
126 SetTargetWidth(16);
127 #endif
128
129 pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));
130
131 // SWR_FETCH_CONTEXT::pStreams
132 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
133 streams->setName("pStreams");
134
135 // SWR_FETCH_CONTEXT::pIndices
136 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
137 indices->setName("pIndices");
138
139 // SWR_FETCH_CONTEXT::pLastIndex
140 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
141 pLastIndex->setName("pLastIndex");
142
143 Value* vIndices;
144 switch(fetchState.indexType)
145 {
146 case R8_UINT:
147 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
148 if(fetchState.bDisableIndexOOBCheck)
149 {
150 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
151 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
152 }
153 else
154 {
155 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
156 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
157 }
158 break;
159 case R16_UINT:
160 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
161 if(fetchState.bDisableIndexOOBCheck)
162 {
163 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
164 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
165 }
166 else
167 {
168 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
169 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
170 }
171 break;
172 case R32_UINT:
173         vIndices = fetchState.bDisableIndexOOBCheck ? LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty, 0)), {(uint32_t)0})
174                                                      : GetSimdValid32bitIndices(indices, pLastIndex);
175 break; // incoming type is already 32bit int
176 default:
177 SWR_INVALID("Unsupported index type");
178 vIndices = nullptr;
179 break;
180 }
181
182 if(fetchState.bForceSequentialAccessEnable)
183 {
184 Value* pOffsets = mVWidth == 8 ? C({ 0, 1, 2, 3, 4, 5, 6, 7 }) :
185 C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
186
187         // VertexData buffers are accessed sequentially; the index is equal to the vertex number
188 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
189 vIndices = ADD(vIndices, pOffsets);
190 }
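    // Illustrative note: with StartVertex = 100 and an 8-wide SIMD, the sequential path above
    // yields vIndices = { 100, 101, ..., 107 } -- each lane simply fetches the next vertex in
    // the buffer and the index stream is ignored.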
191
192 Value* vVertexId = vIndices;
193 if (fetchState.bVertexIDOffsetEnable)
194 {
195 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
196 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
197 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
198 vVertexId = ADD(vIndices, vBaseVertex);
199 vVertexId = ADD(vVertexId, vStartVertex);
200 }
201
202 // store out vertex IDs
203 if (mVWidth == 16)
204 {
205 // store out in simd8 halves until core supports 16-wide natively
206 auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
207 auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
208 STORE(vVertexIdLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
209 STORE(vVertexIdHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
210 }
211 else if (mVWidth == 8)
212 {
213 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
214 }
215
216 // store out cut mask if enabled
217 if (fetchState.bEnableCutIndex)
218 {
219 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
220 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
221
222 if (mVWidth == 16)
223 {
224 auto cutMaskLo = EXTRACT_16(cutMask, 0);
225 auto cutMaskHi = EXTRACT_16(cutMask, 1);
226 STORE(cutMaskLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
227 STORE(cutMaskHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
228 }
229 else if (mVWidth == 8)
230 {
231 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
232 }
233 }
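    // The cut index is the primitive-restart index: any lane whose fetched index equals
    // fetchState.cutIndex has its bit set in the stored cut mask so downstream stages can
    // restart the primitive. e.g. with cutIndex = 0xFFFF and indices { 0, 1, 0xFFFF, 2, ... },
    // lane 2 is flagged.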
234
235 // Fetch attributes from memory and output to a simdvertex struct
236 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
237 JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
238
239 RET_VOID();
240
241 JitManager::DumpToFile(fetch, "src");
242
243 #if defined(_DEBUG)
244 verifyFunction(*fetch);
245 #endif
246
247 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
248
249 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
250 setupPasses.add(createBreakCriticalEdgesPass());
251 setupPasses.add(createCFGSimplificationPass());
252 setupPasses.add(createEarlyCSEPass());
253 setupPasses.add(createPromoteMemoryToRegisterPass());
254
255 setupPasses.run(*fetch);
256
257 JitManager::DumpToFile(fetch, "se");
258
259 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
260
261 ///@todo Haven't touched these either. Need to remove some of these and add others.
262 optPasses.add(createCFGSimplificationPass());
263 optPasses.add(createEarlyCSEPass());
264 optPasses.add(createInstructionCombiningPass());
265 optPasses.add(createInstructionSimplifierPass());
266 optPasses.add(createConstantPropagationPass());
267 optPasses.add(createSCCPPass());
268 optPasses.add(createAggressiveDCEPass());
269
270 optPasses.run(*fetch);
271
272 optPasses.add(createLowerX86Pass(JM(), this));
273 optPasses.run(*fetch);
274
275 JitManager::DumpToFile(fetch, "opt");
276
277
278 // Revert 16-wide override
279 #if USE_SIMD16_SHADERS
280 SetTargetWidth(baseWidth);
281 #endif
282
283 return fetch;
284 }
285
286 // returns true for odd formats that require special gather handling
287 bool FetchJit::IsOddFormat(SWR_FORMAT format)
288 {
289 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
290 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
291 {
292 return true;
293 }
294 return false;
295 }
296
297 // format is uniform if all components are the same size and type
298 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
299 {
300 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
301 uint32_t bpc0 = info.bpc[0];
302 uint32_t type0 = info.type[0];
303
304 for (uint32_t c = 1; c < info.numComps; ++c)
305 {
306 if (bpc0 != info.bpc[c] || type0 != info.type[c])
307 {
308 return false;
309 }
310 }
311 return true;
312 }
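// e.g. R10G10B10A2_UNORM (bpc = 10/10/10/2) is an "odd" format and takes the special gather
// path below, while R32G32B32A32_FLOAT (four identical 32-bit float components) is "uniform"
// and can use the standard per-component gather path.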
313
314 // unpacks components based on format
315 // foreach component in the pixel
316 // mask off everything but this component
317 // shift component to LSB
318 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
319 {
320 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
321
322 uint32_t bitOffset = 0;
323 for (uint32_t c = 0; c < info.numComps; ++c)
324 {
325 uint32_t swizzledIndex = info.swizzle[c];
326 uint32_t compBits = info.bpc[c];
327 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
328 Value* comp = AND(vInput, bitmask);
329 comp = LSHR(comp, bitOffset);
330
331 result[swizzledIndex] = comp;
332 bitOffset += compBits;
333 }
334 }
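// Illustrative worked example for a 10/10/10/2 packed format such as R10G10B10A2_UNORM:
//   c=0: bitmask 0x000003FF, shift  0
//   c=1: bitmask 0x000FFC00, shift 10
//   c=2: bitmask 0x3FF00000, shift 20
//   c=3: bitmask 0xC0000000, shift 30
// The swizzle table then decides which result[] slot each unpacked component lands in.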
335
336 // gather for odd component size formats
337 // gather SIMD full pixels per lane, then shift/mask to move each component into its
338 // own vector
339 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
340 {
341 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
342
343 // only works if pixel size is <= 32bits
344 SWR_ASSERT(info.bpp <= 32);
345
346 Value *pGather;
347 if (info.bpp == 32)
348 {
349 pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
350 }
351 else
352 {
353         // Can't use a 32-bit gather for items smaller than 32 bits, since it could cause page faults.
354 Value *pMem = ALLOCA(mSimdInt32Ty);
355 STORE(VIMMED1(0u), pMem);
356
357 pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
358 Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
359
360 for (uint32_t lane = 0; lane < mVWidth; ++lane)
361 {
362 // Get index
363 Value* index = VEXTRACT(pOffsets, C(lane));
364 Value* mask = VEXTRACT(pMask, C(lane));
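            // Per-lane scalar equivalent of the masked loads below: read from the real source
            // address only when the lane is active, otherwise re-read the zero we just stored,
            // so inactive lanes never touch (possibly unmapped) memory:
            //
            //     value = *(mask ? pSrc : pDst);
            //     *pDst = value;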
365 switch (info.bpp)
366 {
367 case 8:
368 {
369 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
370 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
371 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
372 break;
373 }
374
375 case 16:
376 {
377 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
378 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
379 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
380 break;
381 }
383
384 case 24:
385 {
386 // First 16-bits of data
387 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
388 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
389 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
390
391 // Last 8-bits of data
392 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
393 pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
394 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
395 break;
396 }
397
398 default:
399 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
400 break;
401 }
402 }
403
404 pGather = LOAD(pMem);
405 }
406
407 for (uint32_t comp = 0; comp < 4; ++comp)
408 {
409 pResult[comp] = VIMMED1((int)info.defaults[comp]);
410 }
411
412 UnpackComponents(format, pGather, pResult);
413
414 // cast to fp32
415 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
416 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
417 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
418 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
419 }
420
421 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
422 {
423 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
424
425 for (uint32_t c = 0; c < info.numComps; ++c)
426 {
427 uint32_t compIndex = info.swizzle[c];
428
429 // skip any conversion on UNUSED components
430 if (info.type[c] == SWR_TYPE_UNUSED)
431 {
432 continue;
433 }
434
435 if (info.isNormalized[c])
436 {
437 if (info.type[c] == SWR_TYPE_SNORM)
438 {
439 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
440
441                 /// result = c * (1.0f / (2^(n-1) - 1))
442 uint32_t n = info.bpc[c];
443 uint32_t pow2 = 1 << (n - 1);
444 float scale = 1.0f / (float)(pow2 - 1);
445 Value *vScale = VIMMED1(scale);
446 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
447 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
448 texels[compIndex] = FMUL(texels[compIndex], vScale);
449 }
450 else
451 {
452 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
453
454 /// result = c * (1.0f / (2^n - 1))
455 uint32_t n = info.bpc[c];
456 uint32_t pow2 = 1 << n;
457 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
458 if (n == 24)
459 {
460 float scale = (float)(pow2 - 1);
461 Value* vScale = VIMMED1(scale);
462 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
463 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
464 texels[compIndex] = FDIV(texels[compIndex], vScale);
465 }
466 else
467 {
468 float scale = 1.0f / (float)(pow2 - 1);
469 Value *vScale = VIMMED1(scale);
470 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
471 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
472 texels[compIndex] = FMUL(texels[compIndex], vScale);
473 }
474 }
475 continue;
476 }
477 }
478 }
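// Illustrative scalar sketch (not emitted by the jitter) of the normalized conversions above,
// for a single n-bit component value c:
//
//     float snorm_to_float(int32_t c, uint32_t n)  { return (float)c * (1.0f / (float)((1u << (n - 1)) - 1u)); }
//     float unorm_to_float(uint32_t c, uint32_t n) { return (float)c * (1.0f / (float)((1u << n) - 1u)); }
//
// e.g. an 8-bit UNORM value of 255 maps to 255 * (1.0f / 255.0f) = 1.0f.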
479
480 //////////////////////////////////////////////////////////////////////////
481 /// @brief Loads attributes from memory using AVX2 GATHER(s)
482 /// @param fetchState - info about attributes to be fetched from memory
483 /// @param streams - value pointer to the current vertex stream
484 /// @param vIndices - vector value of indices to gather
485 /// @param pVtxOut - value pointer to output simdvertex struct
486 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
487 Value* streams, Value* vIndices, Value* pVtxOut)
488 {
489 uint32_t currentVertexElement = 0;
490 uint32_t outputElt = 0;
491 Value* vVertexElements[4];
492
493 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
494 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
495 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
496 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
497 curInstance->setName("curInstance");
498
499 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
500 {
501 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
502
503 // skip element if all components are disabled
504 if (ied.ComponentPacking == ComponentEnable::NONE)
505 {
506 continue;
507 }
508
509 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
510 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
511 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
512
513 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
514
515 // VGATHER* takes an *i8 src pointer
516 Value *pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));
517
518 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
519 Value *vStride = VBROADCAST(stride);
520
521 // max vertex index that is fully in bounds
522 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
523 maxVertex = LOAD(maxVertex);
524
525 Value *minVertex = NULL;
526 if (fetchState.bPartialVertexBuffer)
527 {
528 // min vertex index for low bounds OOB checking
529 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
530 minVertex = LOAD(minVertex);
531 }
532
533 if (fetchState.bInstanceIDOffsetEnable)
534 {
535 // the InstanceID (curInstance) value is offset by StartInstanceLocation
536 curInstance = ADD(curInstance, startInstance);
537 }
538
539 Value *vCurIndices;
540 Value *startOffset;
541 Value *vInstanceStride = VIMMED1(0);
542
543 if (ied.InstanceEnable)
544 {
545 Value* stepRate = C(ied.InstanceAdvancementState);
546
547 // prevent a div by 0 for 0 step rate
548 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
549 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
550
551 // calc the current offset into instanced data buffer
552 Value* calcInstance = UDIV(curInstance, stepRate);
553
554 // if step rate is 0, every instance gets instance 0
555 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
556
557 vCurIndices = VBROADCAST(calcInstance);
558 startOffset = startInstance;
559 }
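        // e.g. with an advancement (step) rate of 4, instances 0..7 read instanced-data
        // records 0, 0, 0, 0, 1, 1, 1, 1; a step rate of 0 pins every instance to record 0
        // via the SELECTs above.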
560 else if (ied.InstanceStrideEnable)
561 {
562 // grab the instance advancement state, determines stride in bytes from one instance to the next
563 Value* stepRate = C(ied.InstanceAdvancementState);
564 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
565
566 // offset indices by baseVertex
567 vCurIndices = ADD(vIndices, vBaseVertex);
568
569 startOffset = startVertex;
570 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
571 }
572 else
573 {
574 // offset indices by baseVertex
575 vCurIndices = ADD(vIndices, vBaseVertex);
576 startOffset = startVertex;
577 }
578
579 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
580 // do 64bit address offset calculations.
581
582 // calculate byte offset to the start of the VB
583 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
584 pStreamBase = GEP(pStreamBase, baseOffset);
585 Value* pStreamBaseGFX = ADD(stream, baseOffset);
586
587 // if we have a start offset, subtract from max vertex. Used for OOB check
588 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
589 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
590 // if we have a negative value, we're already OOB. clamp at 0.
591 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
592
593 if (fetchState.bPartialVertexBuffer)
594 {
595             // similarly for min vertex
596 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
597 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
598 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
599 }
600
601 // Load the in bounds size of a partially valid vertex
602 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
603 partialInboundsSize = LOAD(partialInboundsSize);
604 Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
605 Value *vBpp = VBROADCAST(C(info.Bpp));
606 Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
607
608         // is the element <= the partially valid size?
609 Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
610
611 // override cur indices with 0 if pitch is 0
612 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
613 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
614
615 // are vertices partially OOB?
616 Value* vMaxVertex = VBROADCAST(maxVertex);
617 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
618
619 // are vertices fully in bounds?
620 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
621
622 Value *vGatherMask;
623 if (fetchState.bPartialVertexBuffer)
624 {
625 // are vertices below minVertex limit?
626 Value *vMinVertex = VBROADCAST(minVertex);
627 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
628
629 // only fetch lanes that pass both tests
630 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
631 }
632 else
633 {
634 vGatherMask = vMaxGatherMask;
635 }
636
637 // blend in any partially OOB indices that have valid elements
638 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
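        // Per-lane summary of the final gather predicate:
        //   index == maxVertex -> fetch only if the element fits within the partially valid tail vertex
        //   otherwise          -> fetch if index < maxVertex (and index >= minVertex for partial VBs)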
639
640 // calculate the actual offsets into the VB
641 Value* vOffsets = MUL(vCurIndices, vStride);
642 vOffsets = ADD(vOffsets, vAlignmentOffsets);
643
644 // if instance stride enable is:
645         //     true  - add the product of the instanceID and advancement state to the offset into the VB
646         //     false - vInstanceStride has been initialized to zero
647 vOffsets = ADD(vOffsets, vInstanceStride);
648
649 // Packing and component control
650 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
651 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
652 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
653
654 // Special gather/conversion for formats without equal component sizes
655 if (IsOddFormat((SWR_FORMAT)ied.Format))
656 {
657 Value *pResults[4];
658 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
659 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
660
661 for (uint32_t c = 0; c < 4; c += 1)
662 {
663 if (isComponentEnabled(compMask, c))
664 {
665 vVertexElements[currentVertexElement++] = pResults[c];
666 if (currentVertexElement > 3)
667 {
668 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
669 // reset to the next vVertexElement to output
670 currentVertexElement = 0;
671 }
672 }
673 }
674 }
675 else if(info.type[0] == SWR_TYPE_FLOAT)
676 {
677 ///@todo: support 64 bit vb accesses
678 Value *gatherSrc = VIMMED1(0.0f);
679
680 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
681 "Unsupported format for standard gather fetch.");
682
683 // Gather components from memory to store in a simdvertex structure
684 switch (bpc)
685 {
686 case 16:
687 {
688 Value *vGatherResult[2];
689
690 // if we have at least one component out of x or y to fetch
691 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
692 {
693 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
694 // e.g. result of first 8x32bit integer gather for 16bit components
695 // 256i - 0 1 2 3 4 5 6 7
696 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
697 //
698 }
699
700 // if we have at least one component out of z or w to fetch
701 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
702 {
703 // offset base to the next components(zw) in the vertex to gather
704 pStreamBase = GEP(pStreamBase, C((char)4));
705
706 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
707 // e.g. result of second 8x32bit integer gather for 16bit components
708 // 256i - 0 1 2 3 4 5 6 7
709 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
710 //
711 }
712
713 // if we have at least one component to shuffle into place
714 if (compMask)
715 {
716 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
717 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
718
719 // Shuffle gathered components into place in simdvertex struct
720 mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args); // outputs to vVertexElements ref
721 }
722 }
723 break;
724 case 32:
725 {
726 for (uint32_t i = 0; i < 4; i += 1)
727 {
728 if (isComponentEnabled(compMask, i))
729 {
730 // if we need to gather the component
731 if (compCtrl[i] == StoreSrc)
732 {
733 // Gather a SIMD of vertices
734 // APIs allow a 4GB range for offsets
735 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
736 // But, we know that elements must be aligned for FETCH. :)
737 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
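                        // e.g. a byte offset of 0x80000004 (> 2GB) would be treated as negative by the
                        // signed gather; LSHR by 1 gives 0x40000002, and the gather scale of 2 rebuilds
                        // the full offset. This relies on offsets being even (4-byte aligned elements).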
738 Value *vShiftedOffsets = LSHR(vOffsets, 1);
739 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBaseGFX, vShiftedOffsets, vGatherMask, 2, GFX_MEM_CLIENT_FETCH);
740 }
741 else
742 {
743 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
744 }
745
746 if (currentVertexElement > 3)
747 {
748 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
749 // reset to the next vVertexElement to output
750 currentVertexElement = 0;
751 }
752 }
753
754 // offset base to the next component in the vertex to gather
755 pStreamBase = GEP(pStreamBase, C((char)4));
756 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
757 }
758 }
759 break;
760 case 64:
761 {
762 for (uint32_t i = 0; i < 4; i += 1)
763 {
764 if (isComponentEnabled(compMask, i))
765 {
766 // if we need to gather the component
767 if (compCtrl[i] == StoreSrc)
768 {
769 Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
770 Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
771
772 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
773 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
774
775 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
776
777 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
778 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
779
780 pGatherLo = VCVTPD2PS(pGatherLo);
781 pGatherHi = VCVTPD2PS(pGatherHi);
782
783 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
784
785 vVertexElements[currentVertexElement++] = pGather;
786 }
787 else
788 {
789 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
790 }
791
792 if (currentVertexElement > 3)
793 {
794 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
795 // reset to the next vVertexElement to output
796 currentVertexElement = 0;
797 }
798 }
799
800 // offset base to the next component in the vertex to gather
801 pStreamBase = GEP(pStreamBase, C((char)8));
802 }
803 }
804 break;
805 default:
806 SWR_INVALID("Tried to fetch invalid FP format");
807 break;
808 }
809 }
810 else
811 {
812 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
813 ConversionType conversionType = CONVERT_NONE;
814
815 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
816 "Unsupported format for standard gather fetch.");
817
818 switch(info.type[0])
819 {
820 case SWR_TYPE_UNORM:
821 conversionType = CONVERT_NORMALIZED;
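            // fallthrough: UNORM components are also zero extended (ZExt) before normalization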
822 case SWR_TYPE_UINT:
823 extendCastType = Instruction::CastOps::ZExt;
824 break;
825 case SWR_TYPE_SNORM:
826 conversionType = CONVERT_NORMALIZED;
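            // fallthrough: SNORM components are also sign extended (SExt) before normalization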
827 case SWR_TYPE_SINT:
828 extendCastType = Instruction::CastOps::SExt;
829 break;
830 case SWR_TYPE_USCALED:
831 conversionType = CONVERT_USCALED;
832 extendCastType = Instruction::CastOps::UIToFP;
833 break;
834 case SWR_TYPE_SSCALED:
835 conversionType = CONVERT_SSCALED;
836 extendCastType = Instruction::CastOps::SIToFP;
837 break;
838 case SWR_TYPE_SFIXED:
839 conversionType = CONVERT_SFIXED;
840 extendCastType = Instruction::CastOps::SExt;
841 break;
842 default:
843 break;
844 }
845
846 // value substituted when component of gather is masked
847 Value* gatherSrc = VIMMED1(0);
848
849 // Gather components from memory to store in a simdvertex structure
850 switch (bpc)
851 {
852 case 8:
853 {
854 // if we have at least one component to fetch
855 if (compMask)
856 {
857 Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
858 // e.g. result of an 8x32bit integer gather for 8bit components
859 // 256i - 0 1 2 3 4 5 6 7
860 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
861
862 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
863 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
864
865 // Shuffle gathered components into place in simdvertex struct
866 mVWidth == 16 ? Shuffle8bpcGatherd16(args) : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
867 }
868 }
869 break;
870 case 16:
871 {
872 Value *vGatherResult[2];
873
874 // if we have at least one component out of x or y to fetch
875 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
876 {
877 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
878 // e.g. result of first 8x32bit integer gather for 16bit components
879 // 256i - 0 1 2 3 4 5 6 7
880 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
881 //
882 }
883
884 // if we have at least one component out of z or w to fetch
885 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
886 {
887 // offset base to the next components(zw) in the vertex to gather
888 pStreamBase = GEP(pStreamBase, C((char)4));
889
890 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
891 // e.g. result of second 8x32bit integer gather for 16bit components
892 // 256i - 0 1 2 3 4 5 6 7
893 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
894 //
895 }
896
897 // if we have at least one component to shuffle into place
898 if (compMask)
899 {
900 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
901 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
902
903 // Shuffle gathered components into place in simdvertex struct
904 mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args); // outputs to vVertexElements ref
905 }
906 }
907 break;
908 case 32:
909 {
910                 // Gather components into place in simdvertex struct
911 for (uint32_t i = 0; i < 4; i++)
912 {
913 if (isComponentEnabled(compMask, i))
914 {
915 // if we need to gather the component
916 if (compCtrl[i] == StoreSrc)
917 {
918 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
919
920 if (conversionType == CONVERT_USCALED)
921 {
922 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
923 }
924 else if (conversionType == CONVERT_SSCALED)
925 {
926 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
927 }
928 else if (conversionType == CONVERT_SFIXED)
929 {
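                            // SFIXED is treated as 16.16 fixed point here; multiplying by 1/65536
                            // recovers the float value, e.g. raw 0x00018000 (98304) -> 1.5f.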
930 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
931 }
932
933 vVertexElements[currentVertexElement++] = pGather;
934
935 // e.g. result of a single 8x32bit integer gather for 32bit components
936 // 256i - 0 1 2 3 4 5 6 7
937 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
938 }
939 else
940 {
941 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
942 }
943
944 if (currentVertexElement > 3)
945 {
946 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
947
948 // reset to the next vVertexElement to output
949 currentVertexElement = 0;
950 }
951
952 }
953
954 // offset base to the next component in the vertex to gather
955 pStreamBase = GEP(pStreamBase, C((char)4));
956 }
957 }
958 break;
959 }
960 }
961 }
962
963 // if we have a partially filled vVertexElement struct, output it
964 if (currentVertexElement > 0)
965 {
966 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
967 }
968 }
969
970 //////////////////////////////////////////////////////////////////////////
971 /// @brief Loads a simd of valid indices. OOB indices are set to 0
972 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
973 /// support
974 /// @param pIndices - pointer to 8 bit indices
975 /// @param pLastIndex - pointer to last valid index
976 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
977 {
978     // can fit 4 8 bit integers per vWidth lane
979 Value* vIndices = VUNDEF_I();
980
981 // store 0 index on stack to be used to conditionally load from if index address is OOB
982 Value* pZeroIndex = ALLOCA(mInt8Ty);
983 STORE(C((uint8_t)0), pZeroIndex);
984
985 // Load a SIMD of index pointers
986 for(int64_t lane = 0; lane < mVWidth; lane++)
987 {
988 // Calculate the address of the requested index
989 Value *pIndex = GEP(pIndices, C(lane));
990
991 // check if the address is less than the max index,
992 Value* mask = ICMP_ULT(pIndex, pLastIndex);
993
994 // if valid, load the index. if not, load 0 from the stack
995 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
996 Value *index = LOAD(pValid, "valid index");
997
998         // zero extend index to 32 bits and insert into the correct simd lane
999 index = Z_EXT(index, mInt32Ty);
1000 vIndices = VINSERT(vIndices, index, lane);
1001 }
1002 return vIndices;
1003 }
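// Per-lane scalar sketch of the OOB-safe index load above:
//
//     uint8_t idx = ((pIndices + lane) < pLastIndex) ? pIndices[lane] : 0;
//     vIndices[lane] = (uint32_t)idx;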
1004
1005 //////////////////////////////////////////////////////////////////////////
1006 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1007 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1008 /// support
1009 /// @param pIndices - pointer to 16 bit indices
1010 /// @param pLastIndex - pointer to last valid index
1011 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1012 {
1013 // can fit 2 16 bit integers per vWidth lane
1014 Value* vIndices = VUNDEF_I();
1015
1016 // store 0 index on stack to be used to conditionally load from if index address is OOB
1017 Value* pZeroIndex = ALLOCA(mInt16Ty);
1018 STORE(C((uint16_t)0), pZeroIndex);
1019
1020 // Load a SIMD of index pointers
1021 for(int64_t lane = 0; lane < mVWidth; lane++)
1022 {
1023 // Calculate the address of the requested index
1024 Value *pIndex = GEP(pIndices, C(lane));
1025
1026 // check if the address is less than the max index,
1027 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1028
1029 // if valid, load the index. if not, load 0 from the stack
1030 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1031 Value *index = LOAD(pValid, "valid index", GFX_MEM_CLIENT_FETCH);
1032
1033         // zero extend index to 32 bits and insert into the correct simd lane
1034 index = Z_EXT(index, mInt32Ty);
1035 vIndices = VINSERT(vIndices, index, lane);
1036 }
1037 return vIndices;
1038 }
1039
1040 //////////////////////////////////////////////////////////////////////////
1041 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1042 /// @param pIndices - pointer to 32 bit indices
1043 /// @param pLastIndex - pointer to last valid index
1044 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1045 {
1046 DataLayout dL(JM()->mpCurrentModule);
1047 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1048 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1049 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1050
1051 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1052 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1053 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1054 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1055
1056 // create a vector of index counts from the base index ptr passed into the fetch
1057 Constant* vIndexOffsets;
1058 if (mVWidth == 8)
1059 {
1060 vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
1061 }
1062 else
1063 {
1064 vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
1065 }
1066
1067 // compare index count to the max valid index
1068 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1069 // vIndexOffsets 0 1 2 3 4 5 6 7
1070 // ------------------------------
1071 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1072 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1073 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1074 Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
1075
1076 // Load the indices; OOB loads 0
1077 pIndices = BITCAST(pIndices, PointerType::get(mSimdInt32Ty, 0));
1078 return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0));
1079 }
1080
1081 //////////////////////////////////////////////////////////////////////////
1082 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1083 /// denormalizes if needed, converts to F32 if needed, and positions in
1084 /// the proper SIMD rows to be output to the simdvertex structure
1085 /// @param args: (tuple of args, listed below)
1086 /// @param vGatherResult - 8 gathered 8bpc vertices
1087 /// @param pVtxOut - base pointer to output simdvertex struct
1088 /// @param extendType - sign extend or zero extend
1089 /// @param conversionType - type of conversion to apply (normalized, scaled, etc.)
1090 /// @param currentVertexElement - reference to the current vVertexElement
1091 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1092 /// @param compMask - component packing mask
1093 /// @param compCtrl - component control val
1094 /// @param vVertexElements[4] - vertex components to output
1095 /// @param swizzle[4] - component swizzle location
1096 void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
1097 {
1098 // Unpack tuple args
1099 Value*& vGatherResult = std::get<0>(args);
1100 Value* pVtxOut = std::get<1>(args);
1101 const Instruction::CastOps extendType = std::get<2>(args);
1102 const ConversionType conversionType = std::get<3>(args);
1103 uint32_t &currentVertexElement = std::get<4>(args);
1104 uint32_t &outputElt = std::get<5>(args);
1105 const ComponentEnable compMask = std::get<6>(args);
1106 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1107 Value* (&vVertexElements)[4] = std::get<8>(args);
1108 const uint32_t(&swizzle)[4] = std::get<9>(args);
1109
1110 // cast types
1111 Type *vGatherTy = VectorType::get(mInt32Ty, 8);
1112 Type *v32x8Ty = VectorType::get(mInt8Ty, 32);
1113
1114 // have to do extra work for sign extending
1115 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1116 {
1117         Type *v16x8Ty = VectorType::get(mInt8Ty, 16); // 16x8bit ints in a 128bit lane
1118 Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1119
1120 // shuffle mask, including any swizzling
1121 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1122 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1123 Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
1124 char(y), char(y + 4), char(y + 8), char(y + 12),
1125 char(z), char(z + 4), char(z + 8), char(z + 12),
1126 char(w), char(w + 4), char(w + 8), char(w + 12),
1127 char(x), char(x + 4), char(x + 8), char(x + 12),
1128 char(y), char(y + 4), char(y + 8), char(y + 12),
1129 char(z), char(z + 4), char(z + 8), char(z + 12),
1130 char(w), char(w + 4), char(w + 8), char(w + 12) });
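        // e.g. with the identity swizzle {0,1,2,3} this mask is
        //   { 0,4,8,12,  1,5,9,13,  2,6,10,14,  3,7,11,15,  ...repeated for the upper 128-bit lane }
        // which pulls byte 0 of every gathered dword together into x, byte 1 into y, and so on.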
1131
1132         // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1133
1134 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1135 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1136
1137 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1138 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1139
1140 // after pshufb: group components together in each 128bit lane
1141 // 256i - 0 1 2 3 4 5 6 7
1142 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1143
1144 Value *vi128XY_lo = nullptr;
1145 Value *vi128XY_hi = nullptr;
1146 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1147 {
1148 vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1149 vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1150
1151 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1152 // 256i - 0 1 2 3 4 5 6 7
1153 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1154 }
1155
1156 // do the same for zw components
1157 Value *vi128ZW_lo = nullptr;
1158 Value *vi128ZW_hi = nullptr;
1159 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1160 {
1161 vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1162 vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1163 }
1164
1165 // init denormalize variables if needed
1166 Instruction::CastOps fpCast;
1167 Value *conversionFactor;
1168
1169 switch (conversionType)
1170 {
1171 case CONVERT_NORMALIZED:
1172 fpCast = Instruction::CastOps::SIToFP;
1173 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1174 break;
1175 case CONVERT_SSCALED:
1176 fpCast = Instruction::CastOps::SIToFP;
1177 conversionFactor = VIMMED1((float)(1.0));
1178 break;
1179 case CONVERT_USCALED:
1180 SWR_INVALID("Type should not be sign extended!");
1181 conversionFactor = nullptr;
1182 break;
1183 default:
1184 SWR_ASSERT(conversionType == CONVERT_NONE);
1185 conversionFactor = nullptr;
1186 break;
1187 }
1188
1189         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1190 for (uint32_t i = 0; i < 4; i++)
1191 {
1192 if (isComponentEnabled(compMask, i))
1193 {
1194 if (compCtrl[i] == ComponentControl::StoreSrc)
1195 {
1196 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1197 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1198 // if x or y, use vi128XY permute result, else use vi128ZW
1199 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1200 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1201
1202 // sign extend
1203 Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
1204 Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
1205
1206 Value* temp = JOIN_16(temp_lo, temp_hi);
1207
1208 // denormalize if needed
1209 if (conversionType != CONVERT_NONE)
1210 {
1211 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1212 }
1213
1214 vVertexElements[currentVertexElement] = temp;
1215
1216 currentVertexElement += 1;
1217 }
1218 else
1219 {
1220 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1221 }
1222
1223 if (currentVertexElement > 3)
1224 {
1225 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1226 // reset to the next vVertexElement to output
1227 currentVertexElement = 0;
1228 }
1229 }
1230 }
1231 }
1232 // else zero extend
1233 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1234 {
1235 // init denormalize variables if needed
1236 Instruction::CastOps fpCast;
1237 Value *conversionFactor;
1238
1239 switch (conversionType)
1240 {
1241 case CONVERT_NORMALIZED:
1242 fpCast = Instruction::CastOps::UIToFP;
1243 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1244 break;
1245 case CONVERT_USCALED:
1246 fpCast = Instruction::CastOps::UIToFP;
1247 conversionFactor = VIMMED1((float)(1.0));
1248 break;
1249 case CONVERT_SSCALED:
1250 SWR_INVALID("Type should not be zero extended!");
1251 conversionFactor = nullptr;
1252 break;
1253 default:
1254 SWR_ASSERT(conversionType == CONVERT_NONE);
1255 conversionFactor = nullptr;
1256 break;
1257 }
1258
1259 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1260 for (uint32_t i = 0; i < 4; i++)
1261 {
1262 if (isComponentEnabled(compMask, i))
1263 {
1264 if (compCtrl[i] == ComponentControl::StoreSrc)
1265 {
1266 // pshufb masks for each component
1267 Value *vConstMask;
1268 switch (swizzle[i])
1269 {
1270 case 0:
1271 // x shuffle mask
1272 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1273 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1274 break;
1275 case 1:
1276 // y shuffle mask
1277 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1278 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1279 break;
1280 case 2:
1281 // z shuffle mask
1282 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1283 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1284 break;
1285 case 3:
1286 // w shuffle mask
1287 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1288 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1289 break;
1290 default:
1291 vConstMask = nullptr;
1292 break;
1293 }
1294
1295 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1296 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1297
1298 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1299 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1300
1301 // after pshufb for x channel
1302 // 256i - 0 1 2 3 4 5 6 7
1303 // x000 x000 x000 x000 x000 x000 x000 x000
1304
1305 Value* temp = JOIN_16(temp_lo, temp_hi);
1306
1307 // denormalize if needed
1308 if (conversionType != CONVERT_NONE)
1309 {
1310 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1311 }
1312
1313 vVertexElements[currentVertexElement] = temp;
1314
1315 currentVertexElement += 1;
1316 }
1317 else
1318 {
1319 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1320 }
1321
1322 if (currentVertexElement > 3)
1323 {
1324 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1325 // reset to the next vVertexElement to output
1326 currentVertexElement = 0;
1327 }
1328 }
1329 }
1330 }
1331 else
1332 {
1333 SWR_INVALID("Unsupported conversion type");
1334 }
1335 }
1336
1337 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1338 {
1339 // Unpack tuple args
1340 Value*& vGatherResult = std::get<0>(args);
1341 Value* pVtxOut = std::get<1>(args);
1342 const Instruction::CastOps extendType = std::get<2>(args);
1343 const ConversionType conversionType = std::get<3>(args);
1344 uint32_t &currentVertexElement = std::get<4>(args);
1345 uint32_t &outputElt = std::get<5>(args);
1346 const ComponentEnable compMask = std::get<6>(args);
1347 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1348 Value* (&vVertexElements)[4] = std::get<8>(args);
1349 const uint32_t(&swizzle)[4] = std::get<9>(args);
1350
1351 // cast types
1352 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1353
1354 for (uint32_t i = 0; i < 4; i++)
1355 {
1356 if (!isComponentEnabled(compMask, i))
1357 continue;
1358
1359 if (compCtrl[i] == ComponentControl::StoreSrc)
1360 {
1361 std::vector<uint32_t> vShuffleMasks[4] = {
1362 { 0, 4, 8, 12, 16, 20, 24, 28 }, // x
1363 { 1, 5, 9, 13, 17, 21, 25, 29 }, // y
1364 { 2, 6, 10, 14, 18, 22, 26, 30 }, // z
1365 { 3, 7, 11, 15, 19, 23, 27, 31 }, // w
1366 };
1367
1368 Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
1369 UndefValue::get(v32x8Ty),
1370 vShuffleMasks[swizzle[i]]);
1371
1372 if ((extendType == Instruction::CastOps::SExt) ||
1373 (extendType == Instruction::CastOps::SIToFP)) {
1374 switch (conversionType)
1375 {
1376 case CONVERT_NORMALIZED:
1377 val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
1378 break;
1379 case CONVERT_SSCALED:
1380 val = SI_TO_FP(val, mSimdFP32Ty);
1381 break;
1382 case CONVERT_USCALED:
1383 SWR_INVALID("Type should not be sign extended!");
1384 break;
1385 default:
1386 SWR_ASSERT(conversionType == CONVERT_NONE);
1387 val = S_EXT(val, mSimdInt32Ty);
1388 break;
1389 }
1390 }
1391 else if ((extendType == Instruction::CastOps::ZExt) ||
1392 (extendType == Instruction::CastOps::UIToFP)) {
1393 switch (conversionType)
1394 {
1395 case CONVERT_NORMALIZED:
1396 val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
1397 break;
1398 case CONVERT_SSCALED:
1399 SWR_INVALID("Type should not be zero extended!");
1400 break;
1401 case CONVERT_USCALED:
1402 val = UI_TO_FP(val, mSimdFP32Ty);
1403 break;
1404 default:
1405 SWR_ASSERT(conversionType == CONVERT_NONE);
1406 val = Z_EXT(val, mSimdInt32Ty);
1407 break;
1408 }
1409 }
1410 else
1411 {
1412 SWR_INVALID("Unsupported conversion type");
1413 }
1414
1415 vVertexElements[currentVertexElement++] = val;
1416 }
1417 else
1418 {
1419 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1420 }
1421
1422 if (currentVertexElement > 3)
1423 {
1424 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1425 // reset to the next vVertexElement to output
1426 currentVertexElement = 0;
1427 }
1428 }
1429 }
1430
1431 //////////////////////////////////////////////////////////////////////////
1432 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1433 /// denormalizes if needed, converts to F32 if needed, and positions in
1434 /// the proper SIMD rows to be output to the simdvertex structure
1435 /// @param args: (tuple of args, listed below)
1436 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1437 /// @param pVtxOut - base pointer to output simdvertex struct
1438 /// @param extendType - sign extend or zero extend
1439 /// @param conversionType - type of conversion to apply (normalized, scaled, etc.)
1440 /// @param currentVertexElement - reference to the current vVertexElement
1441 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1442 /// @param compMask - component packing mask
1443 /// @param compCtrl - component control val
1444 /// @param vVertexElements[4] - vertex components to output
1445 void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
1446 {
1447 // Unpack tuple args
1448 Value* (&vGatherResult)[2] = std::get<0>(args);
1449 Value* pVtxOut = std::get<1>(args);
1450 const Instruction::CastOps extendType = std::get<2>(args);
1451 const ConversionType conversionType = std::get<3>(args);
1452 uint32_t &currentVertexElement = std::get<4>(args);
1453 uint32_t &outputElt = std::get<5>(args);
1454 const ComponentEnable compMask = std::get<6>(args);
1455 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1456 Value* (&vVertexElements)[4] = std::get<8>(args);
1457
1458 // cast types
1459 Type *vGatherTy = VectorType::get(mInt32Ty, 8);
1460 Type *v32x8Ty = VectorType::get(mInt8Ty, 32);
1461
1462 // have to do extra work for sign extending
1463 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1464 {
1465         // is this a partial-precision (16-bit) float?
1466 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1467
1468 Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1469 Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1470
1471 // shuffle mask
1472 Value *vConstMask = C<uint8_t>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1473 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
1474 Value *vi128XY_lo = nullptr;
1475 Value *vi128XY_hi = nullptr;
1476 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1477 {
1478             // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1479
1480 Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
1481 Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
1482
1483 Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1484 Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1485
1486 // after pshufb: group components together in each 128bit lane
1487 // 256i - 0 1 2 3 4 5 6 7
1488 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1489
1490 vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1491 vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1492
1493 // after PERMD: move and pack xy components into each 128bit lane
1494 // 256i - 0 1 2 3 4 5 6 7
1495 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1496 }
1497
1498 // do the same for zw components
1499 Value *vi128ZW_lo = nullptr;
1500 Value *vi128ZW_hi = nullptr;
1501 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1502 {
1503 Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
1504 Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
1505
1506 Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1507 Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1508
1509 vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1510 vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1511 }
1512
1513 // init denormalize variables if needed
1514 Instruction::CastOps IntToFpCast;
1515 Value *conversionFactor;
1516
1517 switch (conversionType)
1518 {
1519 case CONVERT_NORMALIZED:
1520 IntToFpCast = Instruction::CastOps::SIToFP;
1521 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1522 break;
1523 case CONVERT_SSCALED:
1524 IntToFpCast = Instruction::CastOps::SIToFP;
1525 conversionFactor = VIMMED1((float)(1.0));
1526 break;
1527 case CONVERT_USCALED:
1528 SWR_INVALID("Type should not be sign extended!");
1529 conversionFactor = nullptr;
1530 break;
1531 default:
1532 SWR_ASSERT(conversionType == CONVERT_NONE);
1533 conversionFactor = nullptr;
1534 break;
1535 }
1536
1537         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1538 for (uint32_t i = 0; i < 4; i++)
1539 {
1540 if (isComponentEnabled(compMask, i))
1541 {
1542 if (compCtrl[i] == ComponentControl::StoreSrc)
1543 {
1544 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1545 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1546 // if x or y, use vi128XY permute result, else use vi128ZW
1547 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1548 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1549
1550 if (bFP)
1551 {
1552 // extract 128 bit lanes to sign extend each component
1553 /// @todo Force 8-wide cvt until we support generic cvt in x86 lowering pass
1554 Function* pCvtPh2Ps = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_vcvtph2ps_256);
1555 Value *temp_lo = CALL(pCvtPh2Ps, BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1556 Value *temp_hi = CALL(pCvtPh2Ps, BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1557
1558 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
1559 }
1560 else
1561 {
1562 // extract 128 bit lanes to sign extend each component
1563 Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1564 Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1565
1566 Value* temp = JOIN_16(temp_lo, temp_hi);
1567
1568 // denormalize if needed
1569 if (conversionType != CONVERT_NONE)
1570 {
1571 temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
1572 }
1573
1574 vVertexElements[currentVertexElement] = temp;
1575 }
1576
1577 currentVertexElement += 1;
1578 }
1579 else
1580 {
1581 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1582 }
1583
1584 if (currentVertexElement > 3)
1585 {
1586 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1587 // reset to the next vVertexElement to output
1588 currentVertexElement = 0;
1589 }
1590 }
1591 }
1592 }
1593 // else zero extend
1594 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1595 {
1596 // pshufb masks for each component
1597 Value *vConstMask[2];
1598
1599 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
1600 {
1601 // x/z shuffle mask
1602 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1603 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1604 }
1605
1606 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
1607 {
1608 // y/w shuffle mask
1609 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1610 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
1611 }
1612
1613 // init denormalize variables if needed
1614 Instruction::CastOps fpCast;
1615 Value* conversionFactor;
1616
1617 switch (conversionType)
1618 {
1619 case CONVERT_NORMALIZED:
1620 fpCast = Instruction::CastOps::UIToFP;
1621 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1622 break;
1623 case CONVERT_USCALED:
1624 fpCast = Instruction::CastOps::UIToFP;
1625             conversionFactor = VIMMED1(1.0f);
1626 break;
1627 case CONVERT_SSCALED:
1628 SWR_INVALID("Type should not be zero extended!");
1629 conversionFactor = nullptr;
1630 break;
1631 default:
1632 SWR_ASSERT(conversionType == CONVERT_NONE);
1633 conversionFactor = nullptr;
1634 break;
1635 }
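            // Editorial note (hedged): for zero-extended 16-bit data the factor below
            // applies the UNORM16 mapping, roughly
            //
            //     float f = (float)u * (1.0f / 65535.0f);   // u in [0, 65535]
            //
            // so 65535 maps to 1.0f; USCALED only converts to float with a factor of 1.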
1636
1637             // shuffle enabled components into the lower word of each 32bit lane, zero-extending to 32 bits
1638 for (uint32_t i = 0; i < 4; i++)
1639 {
1640 if (isComponentEnabled(compMask, i))
1641 {
1642 if (compCtrl[i] == ComponentControl::StoreSrc)
1643 {
1644 // select correct constMask for x/z or y/w pshufb
1645 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1646 // if x or y, use vi128XY permute result, else use vi128ZW
1647 uint32_t selectedGather = (i < 2) ? 0 : 1;
1648
1649                     // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1650
1651 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
1652 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
1653
1654 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1655 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1656
1657 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1658 // 256i - 0 1 2 3 4 5 6 7
1659 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1660
1661 Value* temp = JOIN_16(temp_lo, temp_hi);
1662
1663 // denormalize if needed
1664 if (conversionType != CONVERT_NONE)
1665 {
1666 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1667 }
1668
1669 vVertexElements[currentVertexElement] = temp;
1670
1671 currentVertexElement += 1;
1672 }
1673 else
1674 {
1675 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1676 }
1677
1678 if (currentVertexElement > 3)
1679 {
1680 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1681 // reset to the next vVertexElement to output
1682 currentVertexElement = 0;
1683 }
1684 }
1685 }
1686 }
1687 else
1688 {
1689 SWR_INVALID("Unsupported conversion type");
1690 }
1691 }
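// Editorial sketch (hedged): Shuffle16bpcGather16 above follows the SIMD16 pattern used
// throughout this file - split the 16-wide gather into two 8-wide halves, run the
// AVX2-style PSHUFB / permute / extend sequence on each half, then rejoin the results:
//
//     Value *lo  = EXTRACT_16(vWide, 0);        // lanes 0..7
//     Value *hi  = EXTRACT_16(vWide, 1);        // lanes 8..15
//     Value *out = JOIN_16(Op(lo), Op(hi));     // back to 16 lanes
//
// This sidesteps the 512-bit byte shuffles that AVX-512F (e.g. KNL) does not provide,
// as the comments above note.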
1692
1693 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1694 {
1695 // Unpack tuple args
1696 Value* (&vGatherResult)[2] = std::get<0>(args);
1697 Value* pVtxOut = std::get<1>(args);
1698 const Instruction::CastOps extendType = std::get<2>(args);
1699 const ConversionType conversionType = std::get<3>(args);
1700 uint32_t &currentVertexElement = std::get<4>(args);
1701 uint32_t &outputElt = std::get<5>(args);
1702 const ComponentEnable compMask = std::get<6>(args);
1703 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1704 Value* (&vVertexElements)[4] = std::get<8>(args);
1705
1706 // cast types
1707 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1708 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1709
1710 // have to do extra work for sign extending
1711 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
1712 (extendType == Instruction::CastOps::FPExt))
1713 {
1714         // is this a half-precision (FP16) float source?
1715         bool bFP = (extendType == Instruction::CastOps::FPExt);
1716
1717 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1718 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1719
1720 // shuffle mask
1721 Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1722 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
1723 Value* vi128XY = nullptr;
1724 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) {
1725 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1726 // after pshufb: group components together in each 128bit lane
1727 // 256i - 0 1 2 3 4 5 6 7
1728 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1729
1730 vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1731 // after PERMD: move and pack xy components into each 128bit lane
1732 // 256i - 0 1 2 3 4 5 6 7
1733 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1734 }
1735
1736 // do the same for zw components
1737 Value* vi128ZW = nullptr;
1738 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
1739 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1740 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1741 }
1742
1743 // init denormalize variables if needed
1744 Instruction::CastOps IntToFpCast;
1745 Value* conversionFactor;
1746
1747 switch (conversionType)
1748 {
1749 case CONVERT_NORMALIZED:
1750 IntToFpCast = Instruction::CastOps::SIToFP;
1751 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1752 break;
1753 case CONVERT_SSCALED:
1754 IntToFpCast = Instruction::CastOps::SIToFP;
1755         conversionFactor = VIMMED1(1.0f);
1756 break;
1757 case CONVERT_USCALED:
1758 SWR_INVALID("Type should not be sign extended!");
1759 conversionFactor = nullptr;
1760 break;
1761 default:
1762 SWR_ASSERT(conversionType == CONVERT_NONE);
1763 conversionFactor = nullptr;
1764 break;
1765 }
1766
1767         // sign extend all enabled components. If we have filled vVertexElements, output to the current simdvertex
1768 for (uint32_t i = 0; i < 4; i++)
1769 {
1770 if (isComponentEnabled(compMask, i))
1771 {
1772 if (compCtrl[i] == ComponentControl::StoreSrc)
1773 {
1774 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1775 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1776 // if x or y, use vi128XY permute result, else use vi128ZW
1777 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1778
1779 if (bFP) {
1780                     // extract 128 bit lanes and convert half-float components to full float
1781 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1782 }
1783 else {
1784 // extract 128 bit lanes to sign extend each component
1785 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1786
1787 // denormalize if needed
1788 if (conversionType != CONVERT_NONE) {
1789 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1790 }
1791 }
1792 currentVertexElement++;
1793 }
1794 else
1795 {
1796 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1797 }
1798
1799 if (currentVertexElement > 3)
1800 {
1801 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1802 // reset to the next vVertexElement to output
1803 currentVertexElement = 0;
1804 }
1805 }
1806 }
1807 }
1808 // else zero extend
1809 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1810 {
1811 // pshufb masks for each component
1812 Value* vConstMask[2];
1813 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) {
1814 // x/z shuffle mask
1815 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1816 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1817 }
1818
1819 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) {
1820 // y/w shuffle mask
1821 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1822 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
1823 }
1824
1825 // init denormalize variables if needed
1826 Instruction::CastOps fpCast;
1827 Value* conversionFactor;
1828
1829 switch (conversionType)
1830 {
1831 case CONVERT_NORMALIZED:
1832 fpCast = Instruction::CastOps::UIToFP;
1833 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1834 break;
1835 case CONVERT_USCALED:
1836 fpCast = Instruction::CastOps::UIToFP;
1837             conversionFactor = VIMMED1(1.0f);
1838 break;
1839 case CONVERT_SSCALED:
1840 SWR_INVALID("Type should not be zero extended!");
1841 conversionFactor = nullptr;
1842 break;
1843 default:
1844 SWR_ASSERT(conversionType == CONVERT_NONE);
1845 conversionFactor = nullptr;
1846 break;
1847 }
1848
1849         // shuffle enabled components into the lower word of each 32bit lane, zero-extending to 32 bits
1850 for (uint32_t i = 0; i < 4; i++)
1851 {
1852 if (isComponentEnabled(compMask, i))
1853 {
1854 if (compCtrl[i] == ComponentControl::StoreSrc)
1855 {
1856 // select correct constMask for x/z or y/w pshufb
1857 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1858 // if x or y, use vi128XY permute result, else use vi128ZW
1859 uint32_t selectedGather = (i < 2) ? 0 : 1;
1860
1861 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1862 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1863 // 256i - 0 1 2 3 4 5 6 7
1864 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1865
1866 // denormalize if needed
1867 if (conversionType != CONVERT_NONE)
1868 {
1869 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1870 }
1871 currentVertexElement++;
1872 }
1873 else
1874 {
1875 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1876 }
1877
1878 if (currentVertexElement > 3)
1879 {
1880 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1881 // reset to the next vVertexElement to output
1882 currentVertexElement = 0;
1883 }
1884 }
1885 }
1886 }
1887 else
1888 {
1889 SWR_INVALID("Unsupported conversion type");
1890 }
1891 }
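// Editorial summary (hedged) of how extendType drives the 16bpc handling above:
//
//     FPExt          -> half-float source, widened with CVTPH2PS
//     SExt / SIToFP  -> signed ints, PMOVSXWD then optional SNORM/SSCALED multiply
//     ZExt / UIToFP  -> unsigned ints, PSHUFB zero-extend then optional UNORM/USCALED multiply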
1892
1893 //////////////////////////////////////////////////////////////////////////
1894 /// @brief Output a simdvertex worth of elements to the current outputElt
1895 /// @param pVtxOut - base address of VIN output struct
1896 /// @param outputElt - simdvertex offset in VIN to write to
1897 /// @param numEltsToStore - number of simdvertex rows to write out
1898 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1899 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1900 {
1901 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
1902
1903 for (uint32_t c = 0; c < numEltsToStore; ++c)
1904 {
1905 // STORE expects FP32 x vWidth type, just bitcast if needed
1906 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
1907 {
1908 #if FETCH_DUMP_VERTEX
1909 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
1910 #endif
1911 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1912 }
1913 #if FETCH_DUMP_VERTEX
1914 else
1915 {
1916 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
1917 }
1918 #endif
1919         // outputElt * 4 offsets by the size of one 4-row (vec4) simdvertex element;
1920         // + c offsets to the 32bit x vWidth row within the current element
1921 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1922 STORE(vVertexElements[c], dest);
1923 }
1924 }
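// Editorial example (hedged): with the GEP indexing above, pVtxOut behaves as an array of
// FP32 x vWidth rows, four rows per output element. For instance outputElt = 2, c = 1
// stores to row 2 * 4 + 1 = 9, i.e. the y row of the third element written out.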
1925
1926 //////////////////////////////////////////////////////////////////////////
1927 /// @brief Generates a constant vector of values based on the
1928 /// ComponentControl value
1929 /// @param ctrl - ComponentControl value
1930 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1931 {
1932 switch (ctrl)
1933 {
1934 case NoStore:
1935 return VUNDEF_I();
1936 case Store0:
1937 return VIMMED1(0);
1938 case Store1Fp:
1939 return VIMMED1(1.0f);
1940 case Store1Int:
1941 return VIMMED1(1);
1942 case StoreVertexId:
1943 {
1944 if (mVWidth == 16)
1945 {
1946 Type* pSimd8FPTy = VectorType::get(mFP32Ty, 8);
1947 Value *pIdLo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), pSimd8FPTy);
1948 Value *pIdHi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), pSimd8FPTy);
1949 return JOIN_16(pIdLo, pIdHi);
1950 }
1951 else
1952 {
1953 return BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1954 }
1955 }
1956 case StoreInstanceId:
1957 {
1958 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
1959 return VBROADCAST(pId);
1960 }
1961
1962
1963 case StoreSrc:
1964 default:
1965 SWR_INVALID("Invalid component control");
1966 return VUNDEF_I();
1967 }
1968 }
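// Editorial example (hedged): compCtrl values are typically used to fill components that
// the source vertex format does not supply. A two-component 16-bit attribute expanded to
// a vec4 might use { StoreSrc, StoreSrc, Store0, Store1Fp }, producing (x, y, 0, 1).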
1969
1970 //////////////////////////////////////////////////////////////////////////
1971 /// @brief Returns the enable mask for the specified component.
1972 /// @param enableMask - enable bits
1973 /// @param component - component to check if enabled.
1974 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1975 {
1976 switch (component)
1977 {
1978 // X
1979 case 0: return (enableMask & ComponentEnable::X);
1980 // Y
1981 case 1: return (enableMask & ComponentEnable::Y);
1982 // Z
1983 case 2: return (enableMask & ComponentEnable::Z);
1984 // W
1985 case 3: return (enableMask & ComponentEnable::W);
1986
1987 default: return false;
1988 }
1989 }
1990
1991 // Don't allow two threads to compile the same fetch shader simultaneously;
1992 // the JIT cache implementation has problems with concurrent compilation.
1993 // Right now this is only a problem for fetch.
1994 static std::mutex gFetchCodegenMutex;
1995
1996 //////////////////////////////////////////////////////////////////////////
1997 /// @brief JITs the fetch shader IR into executable code
1998 /// @param hJitMgr - JitManager handle
1999 /// @param hFunc - handle to the fetch shader's LLVM function IR
2000 /// @return PFN_FETCH_FUNC - pointer to fetch code
2001 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2002 {
2003 const llvm::Function* func = (const llvm::Function*)hFunc;
2004 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2005 PFN_FETCH_FUNC pfnFetch;
2006
2007 gFetchCodegenMutex.lock();
2008 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2009     // MCJIT finalizes modules the first time code is JITted from them. Once finalized, no new IR can be added to the module.
2010 pJitMgr->mIsModuleFinalized = true;
2011
2012 #if defined(KNOB_SWRC_TRACING)
2013 char fName[1024];
2014 const char *funcName = func->getName().data();
2015     snprintf(fName, sizeof(fName), "%s.bin", funcName);
2016 FILE *fd = fopen(fName, "wb");
2017     if (fd) { fwrite((void *)pfnFetch, 1, 2048, fd); fclose(fd); }
2019 #endif
2020
2021 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2022 gFetchCodegenMutex.unlock();
2023
2026 return pfnFetch;
2027 }
2028
2029 //////////////////////////////////////////////////////////////////////////
2030 /// @brief JIT compiles fetch shader
2031 /// @param hJitMgr - JitManager handle
2032 /// @param state - fetch state to build function from
2033 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2034 {
2035 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2036
2037 pJitMgr->SetupNewModule();
2038
2039 FetchJit theJit(pJitMgr);
2040 HANDLE hFunc = theJit.Create(state);
2041
2042 return JitFetchFunc(hJitMgr, hFunc);
2043 }
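// Editorial usage sketch (hedged, hypothetical helper names): a driver front end would
// typically compile once per unique fetch state and cache the result, e.g.
//
//     FETCH_COMPILE_STATE fetchState = BuildFetchState(vertexElements, vertexBuffers); // hypothetical
//     PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, fetchState);
//     // pfnFetch is later invoked by the frontend with a populated SWR_FETCH_CONTEXT
//     // to produce one simdvertex worth of shader inputs per call.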