swr/rasterizer: Refactor events collection mechanism
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / fetch_jit.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder_gfx_mem.h"
32 #include "jit_api.h"
33 #include "fetch_jit.h"
34 #include "gen_state_llvm.h"
35 #include "functionpasses/passes.h"
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public BuilderGfxMem
56 {
57 FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr) {}
58
59 Function* Create(const FETCH_COMPILE_STATE& fetchState);
60
61 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
62 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
63 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
64 template <typename T>
65 Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);
66
67 // package up Shuffle*bpcGatherd args into a tuple for convenience
68 typedef std::tuple<Value*&,
69 Value*,
70 const Instruction::CastOps,
71 const ConversionType,
72 uint32_t&,
73 uint32_t&,
74 const ComponentEnable,
75 const ComponentControl (&)[4],
76 Value* (&)[4],
77 const uint32_t (&)[4]>
78 Shuffle8bpcArgs;
79
80 void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args);
81 void Shuffle8bpcGatherd(Shuffle8bpcArgs& args);
82
83 typedef std::tuple<Value* (&)[2],
84 Value*,
85 const Instruction::CastOps,
86 const ConversionType,
87 uint32_t&,
88 uint32_t&,
89 const ComponentEnable,
90 const ComponentControl (&)[4],
91 Value* (&)[4]>
92 Shuffle16bpcArgs;
93
94 void Shuffle16bpcGather16(Shuffle16bpcArgs& args);
95 void Shuffle16bpcGather(Shuffle16bpcArgs& args);
96
97 void StoreVertexElements(Value* pVtxOut,
98 const uint32_t outputElt,
99 const uint32_t numEltsToStore,
100 Value* (&vVertexElements)[4]);
101
102 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
103
104 void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
105 Value* streams,
106 Value* vIndices,
107 Value* pVtxOut);
108
109 bool IsOddFormat(SWR_FORMAT format);
110 bool IsUniformFormat(SWR_FORMAT format);
111 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
112 void CreateGatherOddFormats(
113 SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
114 void ConvertFormat(SWR_FORMAT format, Value* texels[4]);
115
116 Value* mpFetchInfo;
117 };
118
119 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
120 {
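// Name the function with a CRC of the compile state so each unique FETCH_COMPILE_STATE
// produces a distinct jitted fetch function.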
121 std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
122 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
123
124 Function* fetch = Function::Create(
125 JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
126 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
127
128 fetch->getParent()->setModuleIdentifier(fetch->getName());
129
130 IRB()->SetInsertPoint(entry);
131
132 auto argitr = fetch->arg_begin();
133
134 // Fetch shader arguments
135 Value* privateContext = &*argitr;
136 ++argitr;
137 privateContext->setName("privateContext");
138 SetPrivateContext(privateContext);
139
140 mpWorkerData = &*argitr;
141 ++argitr;
142 mpWorkerData->setName("pWorkerData");
143
144 mpFetchInfo = &*argitr;
145 ++argitr;
146 mpFetchInfo->setName("fetchInfo");
147 Value* pVtxOut = &*argitr;
148 pVtxOut->setName("vtxOutput");
149
150 uint32_t baseWidth = mVWidth;
151
152 SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);
153
154 // Override builder target width to force 16-wide SIMD
155 #if USE_SIMD16_SHADERS
156 SetTargetWidth(16);
157 #endif
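// With the override in place, the gathers below operate on 16 lanes; results are still
// written back in two SIMD8 halves where the core expects them (see the EXTRACT_16 stores).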
158
159 pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));
160
161 // SWR_FETCH_CONTEXT::pStreams
162 Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams});
163 streams->setName("pStreams");
164
165 // SWR_FETCH_CONTEXT::pIndices
166 Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices});
167 indices->setName("pIndices");
168
169 // SWR_FETCH_CONTEXT::pLastIndex
170 Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex});
171 pLastIndex->setName("pLastIndex");
172
173 Value* vIndices;
174 switch (fetchState.indexType)
175 {
176 case R8_UINT:
177 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
178 if (fetchState.bDisableIndexOOBCheck)
179 {
180 vIndices = LOAD(
181 BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)),
182 {(uint32_t)0});
183 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
184 }
185 else
186 {
187 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
188 }
189 break;
190 case R16_UINT:
191 if (fetchState.bDisableIndexOOBCheck)
192 {
193 vIndices = LOAD(
194 BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)),
195 {(uint32_t)0});
196 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
197 }
198 else
199 {
200 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
201 }
202 break;
203 case R32_UINT:
204 (fetchState.bDisableIndexOOBCheck)
205 ? vIndices = LOAD(indices,
206 "",
207 PointerType::get(mSimdInt32Ty, 0),
208 MEM_CLIENT::GFX_MEM_CLIENT_FETCH)
209 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
210 break; // incoming type is already 32bit int
211 default:
212 SWR_INVALID("Unsupported index type");
213 vIndices = nullptr;
214 break;
215 }
216
217 if (fetchState.bForceSequentialAccessEnable)
218 {
219 Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7})
220 : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
221
222 // VertexData buffers are accessed sequentially; the index is equal to the vertex number
223 vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
224 vIndices = ADD(vIndices, pOffsets);
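// i.e. lane i fetches vertex StartVertex + i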
225 }
226
227 Value* vVertexId = vIndices;
228 if (fetchState.bVertexIDOffsetEnable)
229 {
230 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally
231 // correct
232 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
233 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
234 vVertexId = ADD(vIndices, vBaseVertex);
235 vVertexId = ADD(vVertexId, vStartVertex);
236 }
237
238 // store out vertex IDs
239 if (mVWidth == 16)
240 {
241 // store out in simd8 halves until core supports 16-wide natively
242 auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
243 auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
244 STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
245 STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2}));
246 }
247 else if (mVWidth == 8)
248 {
249 STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
250 }
251
252 // store out cut mask if enabled
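// (flags lanes whose index equals the cut index, used for primitive restart)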
253 if (fetchState.bEnableCutIndex)
254 {
255 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
256 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
257
258 if (mVWidth == 16)
259 {
260 auto cutMaskLo = EXTRACT_16(cutMask, 0);
261 auto cutMaskHi = EXTRACT_16(cutMask, 1);
262 STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
263 STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2}));
264 }
265 else if (mVWidth == 8)
266 {
267 STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
268 }
269 }
270
271 // Fetch attributes from memory and output to a simdvertex struct
272 JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
273
274 RET_VOID();
275
276 JitManager::DumpToFile(fetch, "src");
277
278 #if defined(_DEBUG)
279 verifyFunction(*fetch);
280 #endif
281
282 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
283
284 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
285 setupPasses.add(createBreakCriticalEdgesPass());
286 setupPasses.add(createCFGSimplificationPass());
287 setupPasses.add(createEarlyCSEPass());
288 setupPasses.add(createPromoteMemoryToRegisterPass());
289
290 setupPasses.run(*fetch);
291
292 JitManager::DumpToFile(fetch, "se");
293
294 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
295
296 ///@todo Haven't touched these either. Need to remove some of these and add others.
297 optPasses.add(createCFGSimplificationPass());
298 optPasses.add(createEarlyCSEPass());
299 optPasses.add(createInstructionCombiningPass());
300 optPasses.add(createConstantPropagationPass());
301 optPasses.add(createSCCPPass());
302 optPasses.add(createAggressiveDCEPass());
303
304 optPasses.run(*fetch);
305
306 optPasses.add(createLowerX86Pass(this));
307 optPasses.run(*fetch);
308
309 JitManager::DumpToFile(fetch, "opt");
310
311
312 // Revert 16-wide override
313 #if USE_SIMD16_SHADERS
314 SetTargetWidth(baseWidth);
315 #endif
316
317 return fetch;
318 }
319
320 // returns true for odd formats (component sizes other than 8/16/32/64 bits) that require special gather handling
321 bool FetchJit::IsOddFormat(SWR_FORMAT format)
322 {
323 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
324 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
325 {
326 return true;
327 }
328 return false;
329 }
330
331 // format is uniform if all components are the same size and type
332 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
333 {
334 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
335 uint32_t bpc0 = info.bpc[0];
336 uint32_t type0 = info.type[0];
337
338 for (uint32_t c = 1; c < info.numComps; ++c)
339 {
340 if (bpc0 != info.bpc[c] || type0 != info.type[c])
341 {
342 return false;
343 }
344 }
345 return true;
346 }
347
348 // unpacks components based on format
349 // foreach component in the pixel
350 // mask off everything but this component
351 // shift component to LSB
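// e.g. (illustrative) for a packed 10:10:10:2 layout, component 0 is extracted with mask 0x3FF
// at bit offset 0, component 1 with mask 0x3FF << 10, and so on; each result lands in the slot
// given by the format's swizzle.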
352 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
353 {
354 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
355
356 uint32_t bitOffset = 0;
357 for (uint32_t c = 0; c < info.numComps; ++c)
358 {
359 uint32_t swizzledIndex = info.swizzle[c];
360 uint32_t compBits = info.bpc[c];
361 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
362 Value* comp = AND(vInput, bitmask);
363 comp = LSHR(comp, bitOffset);
364
365 result[swizzledIndex] = comp;
366 bitOffset += compBits;
367 }
368 }
369
370 // gather for odd component size formats
371 // gather SIMD full pixels per lane then shift/mask to move each component to its
372 // own vector
373 void FetchJit::CreateGatherOddFormats(
374 SWR_FORMAT format, Value* pMask, Value* xpBase, Value* pOffsets, Value* pResult[4])
375 {
376 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
377
378 // only works if pixel size is <= 32bits
379 SWR_ASSERT(info.bpp <= 32);
380
381 Value* pGather;
382 if (info.bpp == 32)
383 {
384 pGather =
385 GATHERDD(VIMMED1(0), xpBase, pOffsets, pMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
386 }
387 else
388 {
389 // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
390 Value* pMem = ALLOCA(mSimdInt32Ty);
391 STORE(VIMMED1(0u), pMem);
392
393 Value* pDstMem = POINTER_CAST(pMem, mInt32PtrTy);
394
395 for (uint32_t lane = 0; lane < mVWidth; ++lane)
396 {
397 // Get index
398 Value* index = VEXTRACT(pOffsets, C(lane));
399 Value* mask = VEXTRACT(pMask, C(lane));
400
401 // use branch around load based on mask
402 // Needed to avoid page faults on inactive (masked-off) lanes
403 BasicBlock* pCurrentBB = IRB()->GetInsertBlock();
404 BasicBlock* pMaskedLoadBlock =
405 BasicBlock::Create(JM()->mContext, "MaskedLaneLoad", pCurrentBB->getParent());
406 BasicBlock* pEndLoadBB =
407 BasicBlock::Create(JM()->mContext, "AfterMaskedLoad", pCurrentBB->getParent());
408
409 COND_BR(mask, pMaskedLoadBlock, pEndLoadBB);
410
411 JM()->mBuilder.SetInsertPoint(pMaskedLoadBlock);
412
413 switch (info.bpp)
414 {
415 case 8:
416 {
417 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
418 Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
419 STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
420 break;
421 }
422
423 case 16:
424 {
425 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
426 Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
427 STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
428 break;
429 }
431
432 case 24:
433 {
434 // First 16-bits of data
435 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
436 Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
437 STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
438
439 // Last 8-bits of data
440 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
441 xpSrc = ADD(xpSrc, C((int64_t)2));
442 STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
443 break;
444 }
445
446 default:
447 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
448 break;
449 }
450
451 BR(pEndLoadBB);
452 JM()->mBuilder.SetInsertPoint(pEndLoadBB);
453 }
454
455 pGather = LOAD(pMem);
456 }
457
458 for (uint32_t comp = 0; comp < 4; ++comp)
459 {
460 pResult[comp] = VIMMED1((int)info.defaults[comp]);
461 }
462
463 UnpackComponents(format, pGather, pResult);
464
465 // cast to fp32
466 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
467 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
468 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
469 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
470 }
471
472 void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4])
473 {
474 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
475
476 for (uint32_t c = 0; c < info.numComps; ++c)
477 {
478 uint32_t compIndex = info.swizzle[c];
479
480 // skip any conversion on UNUSED components
481 if (info.type[c] == SWR_TYPE_UNUSED)
482 {
483 continue;
484 }
485
486 if (info.isNormalized[c])
487 {
488 if (info.type[c] == SWR_TYPE_SNORM)
489 {
490 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to
491 /// -1.0f.
492
493 /// result = c * (1.0f / (2^(n-1) - 1));
494 uint32_t n = info.bpc[c];
495 uint32_t pow2 = 1 << (n - 1);
496 float scale = 1.0f / (float)(pow2 - 1);
497 Value* vScale = VIMMED1(scale);
498 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
499 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
500 texels[compIndex] = FMUL(texels[compIndex], vScale);
501 }
502 else
503 {
504 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
505
506 /// result = c * (1.0f / (2^n - 1))
507 uint32_t n = info.bpc[c];
508 uint32_t pow2 = 1 << n;
509 // special case 24bit unorm format, which requires a full divide to meet ULP
510 // requirement
511 if (n == 24)
512 {
513 float scale = (float)(pow2 - 1);
514 Value* vScale = VIMMED1(scale);
515 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
516 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
517 texels[compIndex] = FDIV(texels[compIndex], vScale);
518 }
519 else
520 {
521 float scale = 1.0f / (float)(pow2 - 1);
522 Value* vScale = VIMMED1(scale);
523 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
524 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
525 texels[compIndex] = FMUL(texels[compIndex], vScale);
526 }
527 }
528 continue;
529 }
530 }
531 }
532
533 //////////////////////////////////////////////////////////////////////////
534 /// @brief Loads attributes from memory using AVX2 GATHER(s)
535 /// @param fetchState - info about attributes to be fetched from memory
536 /// @param streams - value pointer to the current vertex stream
537 /// @param vIndices - vector value of indices to gather
538 /// @param pVtxOut - value pointer to output simdvertex struct
539 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
540 Value* streams,
541 Value* vIndices,
542 Value* pVtxOut)
543 {
544 uint32_t currentVertexElement = 0;
545 uint32_t outputElt = 0;
546 Value* vVertexElements[4];
547
548 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
549 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
550 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
551 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
552 curInstance->setName("curInstance");
553
554 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
555 {
556 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
557
558 // skip element if all components are disabled
559 if (ied.ComponentPacking == ComponentEnable::NONE)
560 {
561 continue;
562 }
563
564 const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format);
565 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
566 uint32_t bpc =
567 info.bpp /
568 info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
569
570 Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
571
572 Value* stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
573 Value* vStride = VBROADCAST(stride);
574
575 // max vertex index that is fully in bounds
576 Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
577 maxVertex = LOAD(maxVertex);
578
579 Value* minVertex = NULL;
580 if (fetchState.bPartialVertexBuffer)
581 {
582 // min vertex index for low bounds OOB checking
583 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
584 minVertex = LOAD(minVertex);
585 }
586
587 if (fetchState.bInstanceIDOffsetEnable)
588 {
589 // the InstanceID (curInstance) value is offset by StartInstanceLocation
590 curInstance = ADD(curInstance, startInstance);
591 }
592
593 Value* vCurIndices;
594 Value* startOffset;
595 Value* vInstanceStride = VIMMED1(0);
596
597 if (ied.InstanceEnable)
598 {
599 Value* stepRate = C(ied.InstanceAdvancementState);
600
601 // prevent a div by 0 for 0 step rate
602 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
603 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
604
605 // calc the current offset into instanced data buffer
606 Value* calcInstance = UDIV(curInstance, stepRate);
607
608 // if step rate is 0, every instance gets instance 0
609 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
610
611 vCurIndices = VBROADCAST(calcInstance);
612 startOffset = startInstance;
613 }
614 else if (ied.InstanceStrideEnable)
615 {
616 // grab the instance advancement state, which determines the stride in bytes from one
617 // instance to the next
618 Value* stepRate = C(ied.InstanceAdvancementState);
619 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
620
621 // offset indices by baseVertex
622 vCurIndices = ADD(vIndices, vBaseVertex);
623
624 startOffset = startVertex;
625 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
626 }
627 else
628 {
629 // offset indices by baseVertex
630 vCurIndices = ADD(vIndices, vBaseVertex);
631 startOffset = startVertex;
632 }
633
634 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
635 // do 64bit address offset calculations.
636
637 // calculate byte offset to the start of the VB
638 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
639
640 // VGATHER* takes an *i8 src pointer so that's what stream is
641 Value* pStreamBaseGFX = ADD(stream, baseOffset);
642
643 // if we have a start offset, subtract from max vertex. Used for OOB check
644 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
645 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
646 // if we have a negative value, we're already OOB. clamp at 0.
647 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
648
649 if (fetchState.bPartialVertexBuffer)
650 {
651 // similarly for min vertex
652 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
653 Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0));
654 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
655 }
656
657 // Load the in bounds size of a partially valid vertex
658 Value* partialInboundsSize =
659 GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
660 partialInboundsSize = LOAD(partialInboundsSize);
661 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
662 Value* vBpp = VBROADCAST(C(info.bpp));
663 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
664
665 // is the element <= the partially valid size
666 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
667
668 // override cur indices with 0 if pitch is 0
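// (a zero-pitch stream effectively holds a single vertex, so every lane reads element 0)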
669 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
670 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
671
672 // are vertices partially OOB?
673 Value* vMaxVertex = VBROADCAST(maxVertex);
674 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
675
676 // are vertices fully in bounds?
677 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
678
679 Value* vGatherMask;
680 if (fetchState.bPartialVertexBuffer)
681 {
682 // are vertices below minVertex limit?
683 Value* vMinVertex = VBROADCAST(minVertex);
684 Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
685
686 // only fetch lanes that pass both tests
687 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
688 }
689 else
690 {
691 vGatherMask = vMaxGatherMask;
692 }
693
694 // blend in any partially OOB indices that have valid elements
695 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
696
697 // calculate the actual offsets into the VB
698 Value* vOffsets = MUL(vCurIndices, vStride);
699 vOffsets = ADD(vOffsets, vAlignmentOffsets);
700
701 // if instance stride enable is:
702 // true - add the product of the instanceID and advancement state to the offset into the VB
703 // false - vInstanceStride has been initialized to zero
704 vOffsets = ADD(vOffsets, vInstanceStride);
705
706 // Packing and component control
707 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
708 const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0,
709 (ComponentControl)ied.ComponentControl1,
710 (ComponentControl)ied.ComponentControl2,
711 (ComponentControl)ied.ComponentControl3};
712
713 // Special gather/conversion for formats without equal component sizes
714 if (IsOddFormat((SWR_FORMAT)ied.Format))
715 {
716 Value* pResults[4];
717 CreateGatherOddFormats(
718 (SWR_FORMAT)ied.Format, vGatherMask, pStreamBaseGFX, vOffsets, pResults);
719 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
720
721 for (uint32_t c = 0; c < 4; c += 1)
722 {
723 if (isComponentEnabled(compMask, c))
724 {
725 vVertexElements[currentVertexElement++] = pResults[c];
726 if (currentVertexElement > 3)
727 {
728 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
729 // reset to the next vVertexElement to output
730 currentVertexElement = 0;
731 }
732 }
733 }
734 }
735 else if (info.type[0] == SWR_TYPE_FLOAT)
736 {
737 ///@todo: support 64 bit vb accesses
738 Value* gatherSrc = VIMMED1(0.0f);
739
740 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
741 "Unsupported format for standard gather fetch.");
742
743 // Gather components from memory to store in a simdvertex structure
744 switch (bpc)
745 {
746 case 16:
747 {
748 Value* vGatherResult[2];
749
750 // if we have at least one component out of x or y to fetch
751 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
752 {
753 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
754 // e.g. result of first 8x32bit integer gather for 16bit components
755 // 256i - 0 1 2 3 4 5 6 7
756 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
757 //
758 }
759
760 // if we have at least one component out of z or w to fetch
761 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
762 {
763 // offset base to the next components(zw) in the vertex to gather
764 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
765
766 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
767 // e.g. result of second 8x32bit integer gather for 16bit components
768 // 256i - 0 1 2 3 4 5 6 7
769 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
770 //
771 }
772
773 // if we have at least one component to shuffle into place
774 if (compMask)
775 {
776 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
777 pVtxOut,
778 Instruction::CastOps::FPExt,
779 CONVERT_NONE,
780 currentVertexElement,
781 outputElt,
782 compMask,
783 compCtrl,
784 vVertexElements);
785
786 // Shuffle gathered components into place in simdvertex struct
787 mVWidth == 16 ? Shuffle16bpcGather16(args)
788 : Shuffle16bpcGather(args); // outputs to vVertexElements ref
789 }
790 }
791 break;
792 case 32:
793 {
794 for (uint32_t i = 0; i < 4; i += 1)
795 {
796 if (isComponentEnabled(compMask, i))
797 {
798 // if we need to gather the component
799 if (compCtrl[i] == StoreSrc)
800 {
801 // Gather a SIMD of vertices
802 // APIs allow a 4GB range for offsets
803 // However, GATHERPS uses signed 32-bit offsets, so +/- 2GB range :(
804 // Add 2GB to the base pointer and 2GB to the offsets. This makes
805 // "negative" (large) offsets into positive offsets and small offsets
806 // into negative offsets.
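// e.g. (sketch) a small offset 0x00000010 becomes 0x80000010, i.e. -0x7FFFFFF0 as a signed
// 32-bit offset; with the base advanced by 2GB the effective address is unchanged, while a
// large offset such as 0xC0000000 becomes a positive 0x40000000.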
807 Value* vNewOffsets = ADD(vOffsets, VIMMED1(0x80000000));
808 vVertexElements[currentVertexElement++] =
809 GATHERPS(gatherSrc,
810 ADD(pStreamBaseGFX, C((uintptr_t)0x80000000U)),
811 vNewOffsets,
812 vGatherMask,
813 1,
814 MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
815 }
816 else
817 {
818 vVertexElements[currentVertexElement++] =
819 GenerateCompCtrlVector(compCtrl[i]);
820 }
821
822 if (currentVertexElement > 3)
823 {
824 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
825 // reset to the next vVertexElement to output
826 currentVertexElement = 0;
827 }
828 }
829
830 // offset base to the next component in the vertex to gather
831 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
832 }
833 }
834 break;
835 case 64:
836 {
837 for (uint32_t i = 0; i < 4; i += 1)
838 {
839 if (isComponentEnabled(compMask, i))
840 {
841 // if we need to gather the component
842 if (compCtrl[i] == StoreSrc)
843 {
844 Value* vShufLo;
845 Value* vShufHi;
846 Value* vShufAll;
847
848 if (mVWidth == 8)
849 {
850 vShufLo = C({0, 1, 2, 3});
851 vShufHi = C({4, 5, 6, 7});
852 vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7});
853 }
854 else
855 {
856 SWR_ASSERT(mVWidth == 16);
857 vShufLo = C({0, 1, 2, 3, 4, 5, 6, 7});
858 vShufHi = C({8, 9, 10, 11, 12, 13, 14, 15});
859 vShufAll =
860 C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
861 }
862
863 Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
864 Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
865
866 Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
867 Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
868
869 Value* vZeroDouble = VECTOR_SPLAT(
870 mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
871
872 Value* pGatherLo =
873 GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsLo, vMaskLo);
874 Value* pGatherHi =
875 GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsHi, vMaskHi);
876
877 pGatherLo = VCVTPD2PS(pGatherLo);
878 pGatherHi = VCVTPD2PS(pGatherHi);
879
880 Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
881
882 vVertexElements[currentVertexElement++] = pGather;
883 }
884 else
885 {
886 vVertexElements[currentVertexElement++] =
887 GenerateCompCtrlVector(compCtrl[i]);
888 }
889
890 if (currentVertexElement > 3)
891 {
892 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
893 // reset to the next vVertexElement to output
894 currentVertexElement = 0;
895 }
896 }
897
898 // offset base to the next component in the vertex to gather
899 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)8));
900 }
901 }
902 break;
903 default:
904 SWR_INVALID("Tried to fetch invalid FP format");
905 break;
906 }
907 }
908 else
909 {
910 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
911 ConversionType conversionType = CONVERT_NONE;
912
913 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
914 "Unsupported format for standard gather fetch.");
915
916 switch (info.type[0])
917 {
918 case SWR_TYPE_UNORM:
919 conversionType = CONVERT_NORMALIZED;
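// deliberate fall-through: UNORM is zero-extended like UINT before normalization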
920 case SWR_TYPE_UINT:
921 extendCastType = Instruction::CastOps::ZExt;
922 break;
923 case SWR_TYPE_SNORM:
924 conversionType = CONVERT_NORMALIZED;
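// deliberate fall-through: SNORM is sign-extended like SINT before normalization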
925 case SWR_TYPE_SINT:
926 extendCastType = Instruction::CastOps::SExt;
927 break;
928 case SWR_TYPE_USCALED:
929 conversionType = CONVERT_USCALED;
930 extendCastType = Instruction::CastOps::UIToFP;
931 break;
932 case SWR_TYPE_SSCALED:
933 conversionType = CONVERT_SSCALED;
934 extendCastType = Instruction::CastOps::SIToFP;
935 break;
936 case SWR_TYPE_SFIXED:
937 conversionType = CONVERT_SFIXED;
938 extendCastType = Instruction::CastOps::SExt;
939 break;
940 default:
941 break;
942 }
943
944 // value substituted when component of gather is masked
945 Value* gatherSrc = VIMMED1(0);
946
947 // Gather components from memory to store in a simdvertex structure
948 switch (bpc)
949 {
950 case 8:
951 {
952 // if we have at least one component to fetch
953 if (compMask)
954 {
955 Value* vGatherResult = GATHERDD(gatherSrc,
956 pStreamBaseGFX,
957 vOffsets,
958 vGatherMask,
959 1,
960 MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
961 // e.g. result of an 8x32bit integer gather for 8bit components
962 // 256i - 0 1 2 3 4 5 6 7
963 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
964
965 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult,
966 pVtxOut,
967 extendCastType,
968 conversionType,
969 currentVertexElement,
970 outputElt,
971 compMask,
972 compCtrl,
973 vVertexElements,
974 info.swizzle);
975
976 // Shuffle gathered components into place in simdvertex struct
977 mVWidth == 16 ? Shuffle8bpcGatherd16(args)
978 : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
979 }
980 }
981 break;
982 case 16:
983 {
984 Value* vGatherResult[2];
985
986 // if we have at least one component out of x or y to fetch
987 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
988 {
989 vGatherResult[0] = GATHERDD(gatherSrc,
990 pStreamBaseGFX,
991 vOffsets,
992 vGatherMask,
993 1,
994 MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
995 // e.g. result of first 8x32bit integer gather for 16bit components
996 // 256i - 0 1 2 3 4 5 6 7
997 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
998 //
999 }
1000
1001 // if we have at least one component out of z or w to fetch
1002 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1003 {
1004 // offset base to the next components(zw) in the vertex to gather
1005 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
1006
1007 vGatherResult[1] = GATHERDD(gatherSrc,
1008 pStreamBaseGFX,
1009 vOffsets,
1010 vGatherMask,
1011 1,
1012 MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1013 // e.g. result of second 8x32bit integer gather for 16bit components
1014 // 256i - 0 1 2 3 4 5 6 7
1015 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1016 //
1017 }
1018
1019 // if we have at least one component to shuffle into place
1020 if (compMask)
1021 {
1022 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
1023 pVtxOut,
1024 extendCastType,
1025 conversionType,
1026 currentVertexElement,
1027 outputElt,
1028 compMask,
1029 compCtrl,
1030 vVertexElements);
1031
1032 // Shuffle gathered components into place in simdvertex struct
1033 mVWidth == 16 ? Shuffle16bpcGather16(args)
1034 : Shuffle16bpcGather(args); // outputs to vVertexElements ref
1035 }
1036 }
1037 break;
1038 case 32:
1039 {
1040 // Gathered components into place in simdvertex struct
1041 for (uint32_t i = 0; i < 4; i++)
1042 {
1043 if (isComponentEnabled(compMask, i))
1044 {
1045 // if we need to gather the component
1046 if (compCtrl[i] == StoreSrc)
1047 {
1048 Value* pGather = GATHERDD(gatherSrc,
1049 pStreamBaseGFX,
1050 vOffsets,
1051 vGatherMask,
1052 1,
1053 MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1054
1055 if (conversionType == CONVERT_USCALED)
1056 {
1057 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1058 }
1059 else if (conversionType == CONVERT_SSCALED)
1060 {
1061 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1062 }
1063 else if (conversionType == CONVERT_SFIXED)
1064 {
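// SFIXED is treated as 16.16 fixed point; scaling by 1/65536 recovers the real value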
1065 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty),
1066 VBROADCAST(C(1 / 65536.0f)));
1067 }
1068
1069 vVertexElements[currentVertexElement++] = pGather;
1070
1071 // e.g. result of a single 8x32bit integer gather for 32bit components
1072 // 256i - 0 1 2 3 4 5 6 7
1073 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1074 }
1075 else
1076 {
1077 vVertexElements[currentVertexElement++] =
1078 GenerateCompCtrlVector(compCtrl[i]);
1079 }
1080
1081 if (currentVertexElement > 3)
1082 {
1083 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1084
1085 // reset to the next vVertexElement to output
1086 currentVertexElement = 0;
1087 }
1088 }
1089
1090 // offset base to the next component in the vertex to gather
1091 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
1092 }
1093 }
1094 break;
1095 }
1096 }
1097 }
1098
1099 // if we have a partially filled vVertexElement struct, output it
1100 if (currentVertexElement > 0)
1101 {
1102 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1103 }
1104 }
1105
1106 template <typename T>
1107 Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
1108 {
1109 SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty,
1110 "Function expects gfxptr_t for both input parameters.");
1111
1112 Type* Ty = nullptr;
1113
1114 static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t),
1115 "Unsupported type for use with GetSimdValidIndicesHelper<T>");
1116 constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
1117 if (bSize)
1118 {
1119 Ty = mInt16PtrTy;
1120 }
1121 else if (sizeof(T) == sizeof(uint8_t))
1122 {
1123 Ty = mInt8PtrTy;
1124 }
1125 else
1126 {
1127 SWR_ASSERT(false, "This should never happen as per static_assert above.");
1128 }
1129
1130 Value* vIndices = VUNDEF_I();
1131
1132 {
1133 // store a zero index on the stack to conditionally load from if the index address is OOB
1134 Value* pZeroIndex = ALLOCA(Ty->getPointerElementType());
1135 STORE(C((T)0), pZeroIndex);
1136
1137 // Load a SIMD of index pointers
1138 for (int64_t lane = 0; lane < mVWidth; lane++)
1139 {
1140 // Calculate the address of the requested index
1141 Value* pIndex = GEP(pIndices, C(lane), Ty);
1142
1143 pLastIndex = INT_TO_PTR(pLastIndex, Ty);
1144
1145 // check if the address is less than the max index
1146 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1147
1148 // if valid, load the index. if not, load 0 from the stack
1149 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1150 Value* index = LOAD(pValid, "valid index", Ty, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1151
1152 // zero extend index to 32 bits and insert into the correct simd lane
1153 index = Z_EXT(index, mInt32Ty);
1154 vIndices = VINSERT(vIndices, index, lane);
1155 }
1156 }
1157
1158 return vIndices;
1159 }
1160
1161 //////////////////////////////////////////////////////////////////////////
1162 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1163 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1164 /// support
1165 /// @param pIndices - pointer to 8 bit indices
1166 /// @param pLastIndex - pointer to last valid index
1167 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1168 {
1169 return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);
1170 }
1171
1172 //////////////////////////////////////////////////////////////////////////
1173 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1174 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1175 /// support
1176 /// @param pIndices - pointer to 16 bit indices
1177 /// @param pLastIndex - pointer to last valid index
1178 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1179 {
1180 return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);
1181 }
1182
1183 //////////////////////////////////////////////////////////////////////////
1184 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1185 /// @param pIndices - pointer to 32 bit indices
1186 /// @param pLastIndex - pointer to last valid index
1187 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1188 {
1189 DataLayout dL(JM()->mpCurrentModule);
1190 Value* iLastIndex = pLastIndex;
1191 Value* iIndices = pIndices;
1192
1193 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1194 Value* numIndicesLeft = SUB(iLastIndex, iIndices);
1195 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1196 numIndicesLeft = SDIV(numIndicesLeft, C(4));
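// each R32_UINT index is 4 bytes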
1197
1198 // create a vector of index counts from the base index ptr passed into the fetch
1199 Constant* vIndexOffsets;
1200 if (mVWidth == 8)
1201 {
1202 vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7});
1203 }
1204 else
1205 {
1206 vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
1207 }
1208
1209 // compare index count to the max valid index
1210 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1211 // vIndexOffsets 0 1 2 3 4 5 6 7
1212 // ------------------------------
1213 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1214 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1215 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1216 Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
1217
1218 // Load the indices; OOB loads 0
1219 return MASKED_LOAD(pIndices,
1220 4,
1221 vIndexMask,
1222 VIMMED1(0),
1223 "vIndices",
1224 PointerType::get(mSimdInt32Ty, 0),
1225 MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1226 }
1227
1228 //////////////////////////////////////////////////////////////////////////
1229 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1230 /// denormalizes if needed, converts to F32 if needed, and positions in
1231 /// the proper SIMD rows to be output to the simdvertex structure
1232 /// @param args: (tuple of args, listed below)
1233 /// @param vGatherResult - 8 gathered 8bpc vertices
1234 /// @param pVtxOut - base pointer to output simdvertex struct
1235 /// @param extendType - sign extend or zero extend
1236 /// @param conversionType - conversion to apply (none, normalized, scaled, fixed)
1237 /// @param currentVertexElement - reference to the current vVertexElement
1238 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1239 /// @param compMask - component packing mask
1240 /// @param compCtrl - component control val
1241 /// @param vVertexElements[4] - vertex components to output
1242 /// @param swizzle[4] - component swizzle location
1243 void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args)
1244 {
1245 // Unpack tuple args
1246 Value*& vGatherResult = std::get<0>(args);
1247 Value* pVtxOut = std::get<1>(args);
1248 const Instruction::CastOps extendType = std::get<2>(args);
1249 const ConversionType conversionType = std::get<3>(args);
1250 uint32_t& currentVertexElement = std::get<4>(args);
1251 uint32_t& outputElt = std::get<5>(args);
1252 const ComponentEnable compMask = std::get<6>(args);
1253 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1254 Value*(&vVertexElements)[4] = std::get<8>(args);
1255 const uint32_t(&swizzle)[4] = std::get<9>(args);
1256
1257 // cast types
1258 Type* vGatherTy = VectorType::get(mInt32Ty, 8);
1259 Type* v32x8Ty = VectorType::get(mInt8Ty, 32);
1260
1261 // have to do extra work for sign extending
1262 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1263 {
1264 Type* v16x8Ty = VectorType::get(mInt8Ty, 16); // 16x8bit ints in a 128bit lane
1265 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1266
1267 // shuffle mask, including any swizzling
1268 const char x = (char)swizzle[0];
1269 const char y = (char)swizzle[1];
1270 const char z = (char)swizzle[2];
1271 const char w = (char)swizzle[3];
1272 Value* vConstMask = C<char>(
1273 {char(x), char(x + 4), char(x + 8), char(x + 12), char(y), char(y + 4),
1274 char(y + 8), char(y + 12), char(z), char(z + 4), char(z + 8), char(z + 12),
1275 char(w), char(w + 4), char(w + 8), char(w + 12), char(x), char(x + 4),
1276 char(x + 8), char(x + 12), char(y), char(y + 4), char(y + 8), char(y + 12),
1277 char(z), char(z + 4), char(z + 8), char(z + 12), char(w), char(w + 4),
1278 char(w + 8), char(w + 12)});
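// e.g. with the identity swizzle (x=0, y=1, z=2, w=3) this mask is
// {0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15}, repeated for the upper 128-bit half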
1279
1280 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now...
1281
1282 Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1283 Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1284
1285 Value* vShufResult_lo =
1286 BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1287 Value* vShufResult_hi =
1288 BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1289
1290 // after pshufb: group components together in each 128bit lane
1291 // 256i - 0 1 2 3 4 5 6 7
1292 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1293
1294 Value* vi128XY_lo = nullptr;
1295 Value* vi128XY_hi = nullptr;
1296 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1297 {
1298 vi128XY_lo = BITCAST(
1299 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
1300 v128Ty);
1301 vi128XY_hi = BITCAST(
1302 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
1303 v128Ty);
1304
1305 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1306 // 256i - 0 1 2 3 4 5 6 7
1307 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1308 }
1309
1310 // do the same for zw components
1311 Value* vi128ZW_lo = nullptr;
1312 Value* vi128ZW_hi = nullptr;
1313 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1314 {
1315 vi128ZW_lo = BITCAST(
1316 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
1317 v128Ty);
1318 vi128ZW_hi = BITCAST(
1319 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
1320 v128Ty);
1321 }
1322
1323 // init denormalize variables if needed
1324 Instruction::CastOps fpCast;
1325 Value* conversionFactor;
1326
1327 switch (conversionType)
1328 {
1329 case CONVERT_NORMALIZED:
1330 fpCast = Instruction::CastOps::SIToFP;
1331 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1332 break;
1333 case CONVERT_SSCALED:
1334 fpCast = Instruction::CastOps::SIToFP;
1335 conversionFactor = VIMMED1((float)(1.0));
1336 break;
1337 case CONVERT_USCALED:
1338 SWR_INVALID("Type should not be sign extended!");
1339 conversionFactor = nullptr;
1340 break;
1341 default:
1342 SWR_ASSERT(conversionType == CONVERT_NONE);
1343 conversionFactor = nullptr;
1344 break;
1345 }
1346
1347 // sign extend all enabled components. If we have a full vVertexElements, output to current
1348 // simdvertex
1349 for (uint32_t i = 0; i < 4; i++)
1350 {
1351 if (isComponentEnabled(compMask, i))
1352 {
1353 if (compCtrl[i] == ComponentControl::StoreSrc)
1354 {
1355 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1356 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1357 // if x or y, use vi128XY permute result, else use vi128ZW
1358 Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1359 Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1360
1361 // sign extend
1362 Value* temp_lo =
1363 PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
1364 Value* temp_hi =
1365 PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
1366
1367 Value* temp = JOIN_16(temp_lo, temp_hi);
1368
1369 // denormalize if needed
1370 if (conversionType != CONVERT_NONE)
1371 {
1372 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1373 }
1374
1375 vVertexElements[currentVertexElement] = temp;
1376
1377 currentVertexElement += 1;
1378 }
1379 else
1380 {
1381 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1382 }
1383
1384 if (currentVertexElement > 3)
1385 {
1386 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1387 // reset to the next vVertexElement to output
1388 currentVertexElement = 0;
1389 }
1390 }
1391 }
1392 }
1393 // else zero extend
1394 else if ((extendType == Instruction::CastOps::ZExt) ||
1395 (extendType == Instruction::CastOps::UIToFP))
1396 {
1397 // init denormalize variables if needed
1398 Instruction::CastOps fpCast;
1399 Value* conversionFactor;
1400
1401 switch (conversionType)
1402 {
1403 case CONVERT_NORMALIZED:
1404 fpCast = Instruction::CastOps::UIToFP;
1405 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1406 break;
1407 case CONVERT_USCALED:
1408 fpCast = Instruction::CastOps::UIToFP;
1409 conversionFactor = VIMMED1((float)(1.0));
1410 break;
1411 case CONVERT_SSCALED:
1412 SWR_INVALID("Type should not be zero extended!");
1413 conversionFactor = nullptr;
1414 break;
1415 default:
1416 SWR_ASSERT(conversionType == CONVERT_NONE);
1417 conversionFactor = nullptr;
1418 break;
1419 }
1420
1421 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1422 for (uint32_t i = 0; i < 4; i++)
1423 {
1424 if (isComponentEnabled(compMask, i))
1425 {
1426 if (compCtrl[i] == ComponentControl::StoreSrc)
1427 {
1428 // pshufb masks for each component
1429 Value* vConstMask;
1430 switch (swizzle[i])
1431 {
1432 case 0:
1433 // x shuffle mask
1434 vConstMask =
1435 C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1436 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1437 break;
1438 case 1:
1439 // y shuffle mask
1440 vConstMask =
1441 C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1442 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1443 break;
1444 case 2:
1445 // z shuffle mask
1446 vConstMask =
1447 C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1448 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1449 break;
1450 case 3:
1451 // w shuffle mask
1452 vConstMask =
1453 C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1454 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1455 break;
1456 default:
1457 vConstMask = nullptr;
1458 break;
1459 }
1460
1461 Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1462 Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1463
1464 Value* temp_lo =
1465 BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1466 Value* temp_hi =
1467 BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1468
1469 // after pshufb for x channel
1470 // 256i - 0 1 2 3 4 5 6 7
1471 // x000 x000 x000 x000 x000 x000 x000 x000
1472
1473 Value* temp = JOIN_16(temp_lo, temp_hi);
1474
1475 // denormalize if needed
1476 if (conversionType != CONVERT_NONE)
1477 {
1478 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1479 }
1480
1481 vVertexElements[currentVertexElement] = temp;
1482
1483 currentVertexElement += 1;
1484 }
1485 else
1486 {
1487 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1488 }
1489
1490 if (currentVertexElement > 3)
1491 {
1492 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1493 // reset to the next vVertexElement to output
1494 currentVertexElement = 0;
1495 }
1496 }
1497 }
1498 }
1499 else
1500 {
1501 SWR_INVALID("Unsupported conversion type");
1502 }
1503 }
1504
1505 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args)
1506 {
1507 // Unpack tuple args
1508 Value*& vGatherResult = std::get<0>(args);
1509 Value* pVtxOut = std::get<1>(args);
1510 const Instruction::CastOps extendType = std::get<2>(args);
1511 const ConversionType conversionType = std::get<3>(args);
1512 uint32_t& currentVertexElement = std::get<4>(args);
1513 uint32_t& outputElt = std::get<5>(args);
1514 const ComponentEnable compMask = std::get<6>(args);
1515 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1516 Value*(&vVertexElements)[4] = std::get<8>(args);
1517 const uint32_t(&swizzle)[4] = std::get<9>(args);
1518
1519 // cast types
1520 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1521
1522 for (uint32_t i = 0; i < 4; i++)
1523 {
1524 if (!isComponentEnabled(compMask, i))
1525 continue;
1526
1527 if (compCtrl[i] == ComponentControl::StoreSrc)
1528 {
1529 std::vector<uint32_t> vShuffleMasks[4] = {
1530 {0, 4, 8, 12, 16, 20, 24, 28}, // x
1531 {1, 5, 9, 13, 17, 21, 25, 29}, // y
1532 {2, 6, 10, 14, 18, 22, 26, 30}, // z
1533 {3, 7, 11, 15, 19, 23, 27, 31}, // w
1534 };
1535
1536 Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
1537 UndefValue::get(v32x8Ty),
1538 vShuffleMasks[swizzle[i]]);
1539
1540 if ((extendType == Instruction::CastOps::SExt) ||
1541 (extendType == Instruction::CastOps::SIToFP))
1542 {
1543 switch (conversionType)
1544 {
1545 case CONVERT_NORMALIZED:
1546 val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
1547 break;
1548 case CONVERT_SSCALED:
1549 val = SI_TO_FP(val, mSimdFP32Ty);
1550 break;
1551 case CONVERT_USCALED:
1552 SWR_INVALID("Type should not be sign extended!");
1553 break;
1554 default:
1555 SWR_ASSERT(conversionType == CONVERT_NONE);
1556 val = S_EXT(val, mSimdInt32Ty);
1557 break;
1558 }
1559 }
1560 else if ((extendType == Instruction::CastOps::ZExt) ||
1561 (extendType == Instruction::CastOps::UIToFP))
1562 {
1563 switch (conversionType)
1564 {
1565 case CONVERT_NORMALIZED:
1566 val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
1567 break;
1568 case CONVERT_SSCALED:
1569 SWR_INVALID("Type should not be zero extended!");
1570 break;
1571 case CONVERT_USCALED:
1572 val = UI_TO_FP(val, mSimdFP32Ty);
1573 break;
1574 default:
1575 SWR_ASSERT(conversionType == CONVERT_NONE);
1576 val = Z_EXT(val, mSimdInt32Ty);
1577 break;
1578 }
1579 }
1580 else
1581 {
1582 SWR_INVALID("Unsupported conversion type");
1583 }
1584
1585 vVertexElements[currentVertexElement++] = val;
1586 }
1587 else
1588 {
1589 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1590 }
1591
1592 if (currentVertexElement > 3)
1593 {
1594 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1595 // reset to the next vVertexElement to output
1596 currentVertexElement = 0;
1597 }
1598 }
1599 }
1600
1601 //////////////////////////////////////////////////////////////////////////
1602 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1603 /// denormalizes if needed, converts to F32 if needed, and positions in
1604 /// the proper SIMD rows to be output to the simdvertex structure
1605 /// @param args: (tuple of args, listed below)
1606 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1607 /// @param pVtxOut - base pointer to output simdvertex struct
1608 /// @param extendType - sign extend or zero extend
1609 /// @param conversionType - conversion to apply (none, normalized, scaled, fixed)
1610 /// @param currentVertexElement - reference to the current vVertexElement
1611 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1612 /// @param compMask - component packing mask
1613 /// @param compCtrl - component control val
1614 /// @param vVertexElements[4] - vertex components to output
1615 void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args)
1616 {
1617 // Unpack tuple args
1618 Value*(&vGatherResult)[2] = std::get<0>(args);
1619 Value* pVtxOut = std::get<1>(args);
1620 const Instruction::CastOps extendType = std::get<2>(args);
1621 const ConversionType conversionType = std::get<3>(args);
1622 uint32_t& currentVertexElement = std::get<4>(args);
1623 uint32_t& outputElt = std::get<5>(args);
1624 const ComponentEnable compMask = std::get<6>(args);
1625 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1626 Value*(&vVertexElements)[4] = std::get<8>(args);
1627
1628 // cast types
1629 Type* vGatherTy = VectorType::get(mInt32Ty, 8);
1630 Type* v32x8Ty = VectorType::get(mInt8Ty, 32);
1631
1632 // have to do extra work for sign extending
1633 if ((extendType == Instruction::CastOps::SExt) ||
1634 (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1635 {
1636 // is this a half-precision (FP16) float?
1637 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1638
1639 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1640 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1641
1642 // shuffle mask
1643 Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1644 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1645 Value* vi128XY_lo = nullptr;
1646 Value* vi128XY_hi = nullptr;
1647 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1648 {
1649 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for
1650 // now...
1651
1652 Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
1653 Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
1654
1655 Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1656 Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1657
1658 // after pshufb: group components together in each 128bit lane
1659 // 256i - 0 1 2 3 4 5 6 7
1660 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1661
1662 vi128XY_lo = BITCAST(
1663 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1664 v128bitTy);
1665 vi128XY_hi = BITCAST(
1666 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1667 v128bitTy);
1668
1669 // after PERMD: move and pack xy components into each 128bit lane
1670 // 256i - 0 1 2 3 4 5 6 7
1671 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1672 }
1673
1674 // do the same for zw components
1675 Value* vi128ZW_lo = nullptr;
1676 Value* vi128ZW_hi = nullptr;
1677 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1678 {
1679 Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
1680 Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
1681
1682 Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1683 Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1684
1685 vi128ZW_lo = BITCAST(
1686 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1687 v128bitTy);
1688 vi128ZW_hi = BITCAST(
1689 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1690 v128bitTy);
1691 }
1692
1693 // init denormalize variables if needed
1694 Instruction::CastOps IntToFpCast;
1695 Value* conversionFactor;
1696
1697 switch (conversionType)
1698 {
1699 case CONVERT_NORMALIZED:
1700 IntToFpCast = Instruction::CastOps::SIToFP;
1701 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1702 break;
1703 case CONVERT_SSCALED:
1704 IntToFpCast = Instruction::CastOps::SIToFP;
1705 conversionFactor = VIMMED1((float)(1.0));
1706 break;
1707 case CONVERT_USCALED:
1708 SWR_INVALID("Type should not be sign extended!");
1709 conversionFactor = nullptr;
1710 break;
1711 default:
1712 SWR_ASSERT(conversionType == CONVERT_NONE);
1713 conversionFactor = nullptr;
1714 break;
1715 }
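// Note: the 1.0/32767.0 factor selected above for CONVERT_NORMALIZED maps the
// sign-extended SNORM16 range [-32767, 32767] onto [-1.0, 1.0] once applied
// after the SIToFP cast below.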
1716
1717         // sign extend all enabled components. If vVertexElements fills up, output to the
1718         // current simdvertex
1719 for (uint32_t i = 0; i < 4; i++)
1720 {
1721 if (isComponentEnabled(compMask, i))
1722 {
1723 if (compCtrl[i] == ComponentControl::StoreSrc)
1724 {
1725 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1726 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1727 // if x or y, use vi128XY permute result, else use vi128ZW
1728 Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1729 Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1730
1731 if (bFP)
1732 {
1733 // extract 128 bit lanes to sign extend each component
1734 Value* temp_lo =
1735 CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1736 Value* temp_hi =
1737 CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1738
1739 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
1740 }
1741 else
1742 {
1743 // extract 128 bit lanes to sign extend each component
1744 Value* temp_lo =
1745 PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1746 Value* temp_hi =
1747 PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1748
1749 Value* temp = JOIN_16(temp_lo, temp_hi);
1750
1751 // denormalize if needed
1752 if (conversionType != CONVERT_NONE)
1753 {
1754 temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
1755 }
1756
1757 vVertexElements[currentVertexElement] = temp;
1758 }
1759
1760 currentVertexElement += 1;
1761 }
1762 else
1763 {
1764 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1765 }
1766
1767 if (currentVertexElement > 3)
1768 {
1769 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1770 // reset to the next vVertexElement to output
1771 currentVertexElement = 0;
1772 }
1773 }
1774 }
1775 }
1776 // else zero extend
1777 else if ((extendType == Instruction::CastOps::ZExt) ||
1778 (extendType == Instruction::CastOps::UIToFP))
1779 {
1780 // pshufb masks for each component
1781 Value* vConstMask[2];
1782
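// Mask bytes with the high bit set (the -1 entries below) make PSHUFB write a
// zero byte, so each selected 16-bit word lands in the low half of its 32-bit
// lane with the upper half already cleared; the zero extension comes for free.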
1783 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
1784 {
1785 // x/z shuffle mask
1786 vConstMask[0] = C<char>({
1787 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1788 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1789 });
1790 }
1791
1792 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
1793 {
1794 // y/w shuffle mask
1795 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1796 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1797 }
1798
1799 // init denormalize variables if needed
1800 Instruction::CastOps fpCast;
1801 Value* conversionFactor;
1802
1803 switch (conversionType)
1804 {
1805 case CONVERT_NORMALIZED:
1806 fpCast = Instruction::CastOps::UIToFP;
1807 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1808 break;
1809 case CONVERT_USCALED:
1810 fpCast = Instruction::CastOps::UIToFP;
1811 conversionFactor = VIMMED1((float)(1.0f));
1812 break;
1813 case CONVERT_SSCALED:
1814 SWR_INVALID("Type should not be zero extended!");
1815 conversionFactor = nullptr;
1816 break;
1817 default:
1818 SWR_ASSERT(conversionType == CONVERT_NONE);
1819 conversionFactor = nullptr;
1820 break;
1821 }
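// Note: the 1.0/65535.0 factor selected above for CONVERT_NORMALIZED maps the
// zero-extended UNORM16 range [0, 65535] onto [0.0, 1.0] once applied after
// the UIToFP cast below.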
1822
1823 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1824 for (uint32_t i = 0; i < 4; i++)
1825 {
1826 if (isComponentEnabled(compMask, i))
1827 {
1828 if (compCtrl[i] == ComponentControl::StoreSrc)
1829 {
1830 // select correct constMask for x/z or y/w pshufb
1831 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1832                 // if x or y, use the first gather result (xy), else use the second (zw)
1833 uint32_t selectedGather = (i < 2) ? 0 : 1;
1834
1835                 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL,
1836                 // for now.
1837
1838 Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
1839 Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
1840
1841 Value* temp_lo = BITCAST(
1842 PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]),
1843 vGatherTy);
1844 Value* temp_hi = BITCAST(
1845 PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]),
1846 vGatherTy);
1847
1848                 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1849                 // 256i - 0 1 2 3 4 5 6 7
1850                 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1851
1852 Value* temp = JOIN_16(temp_lo, temp_hi);
1853
1854 // denormalize if needed
1855 if (conversionType != CONVERT_NONE)
1856 {
1857 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1858 }
1859
1860 vVertexElements[currentVertexElement] = temp;
1861
1862 currentVertexElement += 1;
1863 }
1864 else
1865 {
1866 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1867 }
1868
1869 if (currentVertexElement > 3)
1870 {
1871 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1872 // reset to the next vVertexElement to output
1873 currentVertexElement = 0;
1874 }
1875 }
1876 }
1877 }
1878 else
1879 {
1880 SWR_INVALID("Unsupported conversion type");
1881 }
1882 }
1883
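//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
/// denormalizes if needed, converts to F32 if needed, and positions the
/// components in the proper SIMD rows for output to the simdvertex struct.
/// Non-SIMD16 counterpart of Shuffle16bpcGather16 above; it works on full
/// mVWidth-wide vectors (VPERMD) instead of splitting into SIMD8 halves.
/// @param args - tuple of arguments, unpacked below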
1884 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args)
1885 {
1886 // Unpack tuple args
1887 Value*(&vGatherResult)[2] = std::get<0>(args);
1888 Value* pVtxOut = std::get<1>(args);
1889 const Instruction::CastOps extendType = std::get<2>(args);
1890 const ConversionType conversionType = std::get<3>(args);
1891 uint32_t& currentVertexElement = std::get<4>(args);
1892 uint32_t& outputElt = std::get<5>(args);
1893 const ComponentEnable compMask = std::get<6>(args);
1894 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1895 Value*(&vVertexElements)[4] = std::get<8>(args);
1896
1897 // cast types
1898 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1899 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1900
1901 // have to do extra work for sign extending
1902 if ((extendType == Instruction::CastOps::SExt) ||
1903 (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1904 {
1905         // is this a half-precision (FP16) float? (expanded via CVTPH2PS below)
1906         bool bFP = (extendType == Instruction::CastOps::FPExt);
1907
1908 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1909 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
1910 mVWidth / 4); // vwidth is units of 32 bits
1911
1912 // shuffle mask
1913 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1914 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1915 Value* vi128XY = nullptr;
1916 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1917 {
1918 Value* vShufResult =
1919 BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1920 // after pshufb: group components together in each 128bit lane
1921 // 256i - 0 1 2 3 4 5 6 7
1922 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1923
1924 vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1925 // after PERMD: move and pack xy components into each 128bit lane
1926 // 256i - 0 1 2 3 4 5 6 7
1927 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1928 }
1929
1930 // do the same for zw components
1931 Value* vi128ZW = nullptr;
1932 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1933 {
1934 Value* vShufResult =
1935 BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1936 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1937 }
1938
1939 // init denormalize variables if needed
1940 Instruction::CastOps IntToFpCast;
1941 Value* conversionFactor;
1942
1943 switch (conversionType)
1944 {
1945 case CONVERT_NORMALIZED:
1946 IntToFpCast = Instruction::CastOps::SIToFP;
1947 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1948 break;
1949 case CONVERT_SSCALED:
1950 IntToFpCast = Instruction::CastOps::SIToFP;
1951 conversionFactor = VIMMED1((float)(1.0));
1952 break;
1953 case CONVERT_USCALED:
1954 SWR_INVALID("Type should not be sign extended!");
1955 conversionFactor = nullptr;
1956 break;
1957 default:
1958 SWR_ASSERT(conversionType == CONVERT_NONE);
1959 conversionFactor = nullptr;
1960 break;
1961 }
1962
1963         // sign extend all enabled components. If vVertexElements fills up, output to the
1964         // current simdvertex
1965 for (uint32_t i = 0; i < 4; i++)
1966 {
1967 if (isComponentEnabled(compMask, i))
1968 {
1969 if (compCtrl[i] == ComponentControl::StoreSrc)
1970 {
1971 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1972 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1973 // if x or y, use vi128XY permute result, else use vi128ZW
1974 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1975
1976 if (bFP)
1977 {
1978 // extract 128 bit lanes to sign extend each component
1979 vVertexElements[currentVertexElement] =
1980 CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1981 }
1982 else
1983 {
1984 // extract 128 bit lanes to sign extend each component
1985 vVertexElements[currentVertexElement] =
1986 PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1987
1988 // denormalize if needed
1989 if (conversionType != CONVERT_NONE)
1990 {
1991 vVertexElements[currentVertexElement] =
1992 FMUL(CAST(IntToFpCast,
1993 vVertexElements[currentVertexElement],
1994 mSimdFP32Ty),
1995 conversionFactor);
1996 }
1997 }
1998 currentVertexElement++;
1999 }
2000 else
2001 {
2002 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2003 }
2004
2005 if (currentVertexElement > 3)
2006 {
2007 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2008 // reset to the next vVertexElement to output
2009 currentVertexElement = 0;
2010 }
2011 }
2012 }
2013 }
2014 // else zero extend
2015 else if ((extendType == Instruction::CastOps::ZExt) ||
2016 (extendType == Instruction::CastOps::UIToFP))
2017 {
2018 // pshufb masks for each component
2019 Value* vConstMask[2];
2020 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
2021 {
2022 // x/z shuffle mask
2023 vConstMask[0] = C<char>({
2024 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2025 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2026 });
2027 }
2028
2029 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
2030 {
2031 // y/w shuffle mask
2032 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2033 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
2034 }
2035
2036 // init denormalize variables if needed
2037 Instruction::CastOps fpCast;
2038 Value* conversionFactor;
2039
2040 switch (conversionType)
2041 {
2042 case CONVERT_NORMALIZED:
2043 fpCast = Instruction::CastOps::UIToFP;
2044 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2045 break;
2046 case CONVERT_USCALED:
2047 fpCast = Instruction::CastOps::UIToFP;
2048 conversionFactor = VIMMED1((float)(1.0f));
2049 break;
2050 case CONVERT_SSCALED:
2051 SWR_INVALID("Type should not be zero extended!");
2052 conversionFactor = nullptr;
2053 break;
2054 default:
2055 SWR_ASSERT(conversionType == CONVERT_NONE);
2056 conversionFactor = nullptr;
2057 break;
2058 }
2059
2060 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2061 for (uint32_t i = 0; i < 4; i++)
2062 {
2063 if (isComponentEnabled(compMask, i))
2064 {
2065 if (compCtrl[i] == ComponentControl::StoreSrc)
2066 {
2067 // select correct constMask for x/z or y/w pshufb
2068 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2069                 // if x or y, use the first gather result (xy), else use the second (zw)
2070 uint32_t selectedGather = (i < 2) ? 0 : 1;
2071
2072 vVertexElements[currentVertexElement] =
2073 BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty),
2074 vConstMask[selectedMask]),
2075 vGatherTy);
2076                 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2077                 // 256i - 0 1 2 3 4 5 6 7
2078                 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2079
2080 // denormalize if needed
2081 if (conversionType != CONVERT_NONE)
2082 {
2083 vVertexElements[currentVertexElement] =
2084 FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty),
2085 conversionFactor);
2086 }
2087 currentVertexElement++;
2088 }
2089 else
2090 {
2091 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2092 }
2093
2094 if (currentVertexElement > 3)
2095 {
2096 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2097 // reset to the next vVertexElement to output
2098 currentVertexElement = 0;
2099 }
2100 }
2101 }
2102 }
2103 else
2104 {
2105 SWR_INVALID("Unsupported conversion type");
2106 }
2107 }
2108
2109 //////////////////////////////////////////////////////////////////////////
2110 /// @brief Output a simdvertex worth of elements to the current outputElt
2111 /// @param pVtxOut - base address of VIN output struct
2112 /// @param outputElt - simdvertex offset in VIN to write to
2113 /// @param numEltsToStore - number of simdvertex rows to write out
2114 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2115 void FetchJit::StoreVertexElements(Value* pVtxOut,
2116 const uint32_t outputElt,
2117 const uint32_t numEltsToStore,
2118 Value* (&vVertexElements)[4])
2119 {
2120 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2121
2122 for (uint32_t c = 0; c < numEltsToStore; ++c)
2123 {
2124 // STORE expects FP32 x vWidth type, just bitcast if needed
2125 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2126 {
2127 #if FETCH_DUMP_VERTEX
2128 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
2129 #endif
2130 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2131 }
2132 #if FETCH_DUMP_VERTEX
2133 else
2134 {
2135 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
2136 }
2137 #endif
2138 // outputElt * 4 = offsetting by the size of a simdvertex
2139 // + c offsets to a 32bit x vWidth row within the current vertex
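// e.g. outputElt = 2, c = 1 selects row 2 * 4 + 1 = 9, the second component
// row of the third output element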
2140 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
2141 STORE(vVertexElements[c], dest);
2142 }
2143 }
2144
2145 //////////////////////////////////////////////////////////////////////////
2146 /// @brief Generates a constant vector of values based on the
2147 /// ComponentControl value
2148 /// @param ctrl - ComponentControl value
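/// Example (illustrative): compCtrl = { StoreSrc, StoreSrc, Store0, Store1Fp }
/// keeps the fetched X/Y components and fills Z with 0 and W with 1.0f, the
/// usual defaults for a two-component attribute.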
2149 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
2150 {
2151 switch (ctrl)
2152 {
2153 case NoStore:
2154 return VUNDEF_I();
2155 case Store0:
2156 return VIMMED1(0);
2157 case Store1Fp:
2158 return VIMMED1(1.0f);
2159 case Store1Int:
2160 return VIMMED1(1);
2161 case StoreVertexId:
2162 {
2163 if (mVWidth == 16)
2164 {
2165 Type* pSimd8FPTy = VectorType::get(mFP32Ty, 8);
2166 Value* pIdLo =
2167 BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy);
2168 Value* pIdHi =
2169 BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy);
2170 return JOIN_16(pIdLo, pIdHi);
2171 }
2172 else
2173 {
2174 return BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty);
2175 }
2176 }
2177 case StoreInstanceId:
2178 {
2179 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty);
2180 return VBROADCAST(pId);
2181 }
2182
2183
2184 case StoreSrc:
2185 default:
2186 SWR_INVALID("Invalid component control");
2187 return VUNDEF_I();
2188 }
2189 }
2190
2191 //////////////////////////////////////////////////////////////////////////
2192 /// @brief Returns the enable mask for the specified component.
2193 /// @param enableMask - enable bits
2194 /// @param component - component to check if enabled.
2195 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2196 {
2197 switch (component)
2198 {
2199 // X
2200 case 0:
2201 return (enableMask & ComponentEnable::X);
2202 // Y
2203 case 1:
2204 return (enableMask & ComponentEnable::Y);
2205 // Z
2206 case 2:
2207 return (enableMask & ComponentEnable::Z);
2208 // W
2209 case 3:
2210 return (enableMask & ComponentEnable::W);
2211
2212 default:
2213 return false;
2214 }
2215 }
2216
2217 // Don't allow two threads to compile the same fetch shader simultaneously;
2218 // the JIT cache implementation has problems with concurrent compiles.
2219 // This is currently only a problem for fetch.
2220 static std::mutex gFetchCodegenMutex;
2221
2222 //////////////////////////////////////////////////////////////////////////
2223 /// @brief JITs from fetch shader IR
2224 /// @param hJitMgr - JitManager handle
2225 /// @param func - LLVM function IR
2226 /// @return PFN_FETCH_FUNC - pointer to fetch code
2227 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2228 {
2229 const llvm::Function* func = (const llvm::Function*)hFunc;
2230 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2231 PFN_FETCH_FUNC pfnFetch;
2232
2233 gFetchCodegenMutex.lock();
2234 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2235     // MCJIT finalizes modules the first time code is JITed from them. Once finalized,
2236     // no new IR can be added to the module.
2237 pJitMgr->mIsModuleFinalized = true;
2238
2239 #if defined(KNOB_SWRC_TRACING)
2240 char fName[1024];
2241 const char* funcName = func->getName().data();
2242 sprintf(fName, "%s.bin", funcName);
2243 FILE* fd = fopen(fName, "wb");
2244 fwrite((void*)pfnFetch, 1, 2048, fd);
2245 fclose(fd);
2246 #endif
2247
2248 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2249 gFetchCodegenMutex.unlock();
2250
2251
2252 return pfnFetch;
2253 }
2254
2255 //////////////////////////////////////////////////////////////////////////
2256 /// @brief JIT compiles fetch shader
2257 /// @param hJitMgr - JitManager handle
2258 /// @param state - fetch state to build function from
2259 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2260 {
2261 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2262
2263 pJitMgr->SetupNewModule();
2264
2265 FetchJit theJit(pJitMgr);
2266 HANDLE hFunc = theJit.Create(state);
2267
2268 return JitFetchFunc(hJitMgr, hFunc);
2269 }
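
// Example driver-side usage (sketch; assumes a valid JitManager handle and a
// populated FETCH_COMPILE_STATE, here called "fetchState" for illustration):
//
//     PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, fetchState);
//     // pfnFetch is then invoked per SIMD of vertices with a populated
//     // SWR_FETCH_CONTEXT and a pointer to the simdvertex output area.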