1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder_gfx_mem.h"
32 #include "jit_api.h"
33 #include "fetch_jit.h"
34 #include "gen_state_llvm.h"
35 #include "functionpasses/passes.h"
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public BuilderGfxMem
56 {
57 FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr) {}
58
59 Function* Create(const FETCH_COMPILE_STATE& fetchState);
60
61 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
62 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
63 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
64 template <typename T>
65 Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);
66
67 // package up Shuffle*bpcGatherd args into a tuple for convenience
68 typedef std::tuple<Value*&,
69 Value*,
70 const Instruction::CastOps,
71 const ConversionType,
72 uint32_t&,
73 uint32_t&,
74 const ComponentEnable,
75 const ComponentControl (&)[4],
76 Value* (&)[4],
77 const uint32_t (&)[4]>
78 Shuffle8bpcArgs;
79
80 void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args);
81 void Shuffle8bpcGatherd(Shuffle8bpcArgs& args);
82
83 typedef std::tuple<Value* (&)[2],
84 Value*,
85 const Instruction::CastOps,
86 const ConversionType,
87 uint32_t&,
88 uint32_t&,
89 const ComponentEnable,
90 const ComponentControl (&)[4],
91 Value* (&)[4]>
92 Shuffle16bpcArgs;
93
94 void Shuffle16bpcGather16(Shuffle16bpcArgs& args);
95 void Shuffle16bpcGather(Shuffle16bpcArgs& args);
96
97 void StoreVertexElements(Value* pVtxOut,
98 const uint32_t outputElt,
99 const uint32_t numEltsToStore,
100 Value* (&vVertexElements)[4]);
101
102 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
103
104 void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
105 Value* streams,
106 Value* vIndices,
107 Value* pVtxOut);
108
109 bool IsOddFormat(SWR_FORMAT format);
110 bool IsUniformFormat(SWR_FORMAT format);
111 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
112 void CreateGatherOddFormats(
113 SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
114 void ConvertFormat(SWR_FORMAT format, Value* texels[4]);
115
116 Value* mpFetchInfo;
117 };
118
119 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
120 {
121 std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
122 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
123
124 Function* fetch = Function::Create(
125 JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
126 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
127
128 fetch->getParent()->setModuleIdentifier(fetch->getName());
129
130 IRB()->SetInsertPoint(entry);
131
132 auto argitr = fetch->arg_begin();
133
134 // Fetch shader arguments
135 Value* privateContext = &*argitr;
136 ++argitr;
137 privateContext->setName("privateContext");
138 SetPrivateContext(privateContext);
139
140 mpWorkerData = &*argitr;
141 ++argitr;
142 mpWorkerData->setName("pWorkerData");
143
144 mpFetchInfo = &*argitr;
145 ++argitr;
146 mpFetchInfo->setName("fetchInfo");
147 Value* pVtxOut = &*argitr;
148 pVtxOut->setName("vtxOutput");
149
150 uint32_t baseWidth = mVWidth;
151
152 SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);
153
154 // Override builder target width to force 16-wide SIMD
155 #if USE_SIMD16_SHADERS
156 SetTargetWidth(16);
157 #endif
158
159 pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));
160
161 // SWR_FETCH_CONTEXT::pStreams
162 Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams});
163 streams->setName("pStreams");
164
165 // SWR_FETCH_CONTEXT::pIndices
166 Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices});
167 indices->setName("pIndices");
168
169 // SWR_FETCH_CONTEXT::pLastIndex
170 Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex});
171 pLastIndex->setName("pLastIndex");
172
173 Value* vIndices;
174 switch (fetchState.indexType)
175 {
176 case R8_UINT:
177 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
178 if (fetchState.bDisableIndexOOBCheck)
179 {
180 vIndices = LOAD(
181 BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)),
182 {(uint32_t)0});
183 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
184 }
185 else
186 {
187 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
188 }
189 break;
190 case R16_UINT:
191 if (fetchState.bDisableIndexOOBCheck)
192 {
193 vIndices = LOAD(
194 BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)),
195 {(uint32_t)0});
196 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
197 }
198 else
199 {
200 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
201 }
202 break;
203 case R32_UINT:
204 (fetchState.bDisableIndexOOBCheck)
205 ? vIndices = LOAD(indices,
206 "",
207 PointerType::get(mSimdInt32Ty, 0),
208 MEM_CLIENT::GFX_MEM_CLIENT_FETCH)
209 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
210 break; // incoming type is already 32bit int
211 default:
212 SWR_INVALID("Unsupported index type");
213 vIndices = nullptr;
214 break;
215 }
216
217 if (fetchState.bForceSequentialAccessEnable)
218 {
219 Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7})
220 : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
221
222 // VertexData buffers are accessed sequentially, the index is equal to the vertex number
223 vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
224 vIndices = ADD(vIndices, pOffsets);
225 }
226
227 Value* vVertexId = vIndices;
228 if (fetchState.bVertexIDOffsetEnable)
229 {
230 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally
231 // correct
232 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
233 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
234 vVertexId = ADD(vIndices, vBaseVertex);
235 vVertexId = ADD(vVertexId, vStartVertex);
236 }
237
238 // store out vertex IDs
239 if (mVWidth == 16)
240 {
241 // store out in simd8 halves until core supports 16-wide natively
242 auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
243 auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
244 STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
245 STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2}));
246 }
247 else if (mVWidth == 8)
248 {
249 STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
250 }
251
252 // store out cut mask if enabled
253 if (fetchState.bEnableCutIndex)
254 {
255 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
256 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
257
258 if (mVWidth == 16)
259 {
260 auto cutMaskLo = EXTRACT_16(cutMask, 0);
261 auto cutMaskHi = EXTRACT_16(cutMask, 1);
262 STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
263 STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2}));
264 }
265 else if (mVWidth == 8)
266 {
267 STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
268 }
269 }
270
271 // Fetch attributes from memory and output to a simdvertex struct
272 JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
273
274 RET_VOID();
275
276 JitManager::DumpToFile(fetch, "src");
277
278 #if defined(_DEBUG)
279 verifyFunction(*fetch);
280 #endif
281
282 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
283
284 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
285 setupPasses.add(createBreakCriticalEdgesPass());
286 setupPasses.add(createCFGSimplificationPass());
287 setupPasses.add(createEarlyCSEPass());
288 setupPasses.add(createPromoteMemoryToRegisterPass());
289
290 setupPasses.run(*fetch);
291
292 JitManager::DumpToFile(fetch, "se");
293
294 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
295
296 ///@todo Haven't touched these either. Need to remove some of these and add others.
297 optPasses.add(createCFGSimplificationPass());
298 optPasses.add(createEarlyCSEPass());
299 optPasses.add(createInstructionCombiningPass());
300 optPasses.add(createConstantPropagationPass());
301 optPasses.add(createSCCPPass());
302 optPasses.add(createAggressiveDCEPass());
303
304 optPasses.run(*fetch);
305
306 optPasses.add(createLowerX86Pass(this));
307 optPasses.run(*fetch);
308
309 JitManager::DumpToFile(fetch, "opt");
310
311
312 // Revert 16-wide override
313 #if USE_SIMD16_SHADERS
314 SetTargetWidth(baseWidth);
315 #endif
316
317 return fetch;
318 }
319
320 // returns true for odd formats that require special gather handling
321 bool FetchJit::IsOddFormat(SWR_FORMAT format)
322 {
323 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
324 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
325 {
326 return true;
327 }
328 return false;
329 }
330
331 // format is uniform if all components are the same size and type
332 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
333 {
334 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
335 uint32_t bpc0 = info.bpc[0];
336 uint32_t type0 = info.type[0];
337
338 for (uint32_t c = 1; c < info.numComps; ++c)
339 {
340 if (bpc0 != info.bpc[c] || type0 != info.type[c])
341 {
342 return false;
343 }
344 }
345 return true;
346 }
347
348 // unpacks components based on format
349 // foreach component in the pixel
350 // mask off everything but this component
351 // shift component to LSB
352 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
353 {
354 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
355
356 uint32_t bitOffset = 0;
357 for (uint32_t c = 0; c < info.numComps; ++c)
358 {
359 uint32_t swizzledIndex = info.swizzle[c];
360 uint32_t compBits = info.bpc[c];
361 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
362 Value* comp = AND(vInput, bitmask);
363 comp = LSHR(comp, bitOffset);
364
365 result[swizzledIndex] = comp;
366 bitOffset += compBits;
367 }
368 }
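// Illustrative scalar sketch of the unpack above (kept disabled; not part of the
// jitter). The 5:6:5 example below is illustration-only; the JIT emits the same
// mask/shift sequence as SIMD AND/LSHR on the gathered pixels.
#if 0
static void UnpackComponentsScalar(uint32_t packedPixel,
                                   const uint32_t bpc[4],     // bits per component
                                   const uint32_t swizzle[4], // destination slot per component
                                   uint32_t       numComps,
                                   uint32_t       result[4])
{
    uint32_t bitOffset = 0;
    for (uint32_t c = 0; c < numComps; ++c)
    {
        uint32_t bitmask = ((1u << bpc[c]) - 1u) << bitOffset;
        result[swizzle[c]] = (packedPixel & bitmask) >> bitOffset;
        bitOffset += bpc[c];
    }
}
// e.g. a 5:6:5 pixel 0xF800 with an identity swizzle unpacks to {0, 0, 31}.
#endif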
369
370 // gather for odd component size formats
371 // gather SIMD full pixels per lane then shift/mask to move each component to their
372 // own vector
373 void FetchJit::CreateGatherOddFormats(
374 SWR_FORMAT format, Value* pMask, Value* xpBase, Value* pOffsets, Value* pResult[4])
375 {
376 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
377
378 // only works if pixel size is <= 32bits
379 SWR_ASSERT(info.bpp <= 32);
380
381 Value* pGather;
382 if (info.bpp == 32)
383 {
384 pGather =
385 GATHERDD(VIMMED1(0), xpBase, pOffsets, pMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
386 }
387 else
388 {
389 // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
390 Value* pMem = ALLOCA(mSimdInt32Ty);
391 STORE(VIMMED1(0u), pMem);
392
393 Value* pDstMem = POINTER_CAST(pMem, mInt32PtrTy);
394
395 for (uint32_t lane = 0; lane < mVWidth; ++lane)
396 {
397 // Get index
398 Value* index = VEXTRACT(pOffsets, C(lane));
399 Value* mask = VEXTRACT(pMask, C(lane));
400
401 // use branch around load based on mask
402 // Needed to avoid page faults on lanes whose mask bit is clear
403 BasicBlock* pCurrentBB = IRB()->GetInsertBlock();
404 BasicBlock* pMaskedLoadBlock =
405 BasicBlock::Create(JM()->mContext, "MaskedLaneLoad", pCurrentBB->getParent());
406 BasicBlock* pEndLoadBB =
407 BasicBlock::Create(JM()->mContext, "AfterMaskedLoad", pCurrentBB->getParent());
408
409 COND_BR(mask, pMaskedLoadBlock, pEndLoadBB);
410
411 JM()->mBuilder.SetInsertPoint(pMaskedLoadBlock);
412
413 switch (info.bpp)
414 {
415 case 8:
416 {
417 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
418 Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
419 STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
420 break;
421 }
422
423 case 16:
424 {
425 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
426 Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
427 STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
428 break;
429 }
431
432 case 24:
433 {
434 // First 16-bits of data
435 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
436 Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
437 STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
438
439 // Last 8-bits of data
440 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
441 xpSrc = ADD(xpSrc, C((int64_t)2));
442 STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
443 break;
444 }
445
446 default:
447 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
448 break;
449 }
450
451 BR(pEndLoadBB);
452 JM()->mBuilder.SetInsertPoint(pEndLoadBB);
453 }
454
455 pGather = LOAD(pMem);
456 }
457
458 for (uint32_t comp = 0; comp < 4; ++comp)
459 {
460 pResult[comp] = VIMMED1((int)info.defaults[comp]);
461 }
462
463 UnpackComponents(format, pGather, pResult);
464
465 // cast to fp32
466 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
467 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
468 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
469 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
470 }
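// Illustrative scalar model of the branch-around-load above (kept disabled; not part
// of the jitter). The generated code reads 24bpp pixels as a 16-bit plus an 8-bit
// load; this sketch simply reads bpp/8 bytes per active lane to show the intent:
// lanes whose mask bit is clear never touch memory.
#if 0
static void GatherOddFormatScalar(const uint8_t* pBase,
                                  const uint32_t offsets[],
                                  const bool     mask[],
                                  uint32_t       bpp,       // 8, 16 or 24
                                  uint32_t       laneCount,
                                  uint32_t       packed[])  // one packed pixel per lane
{
    for (uint32_t lane = 0; lane < laneCount; ++lane)
    {
        packed[lane] = 0;
        if (!mask[lane])
        {
            continue; // masked-off lane: no memory access at all
        }
        const uint8_t* pSrc = pBase + offsets[lane];
        for (uint32_t byte = 0; byte < bpp / 8; ++byte)
        {
            packed[lane] |= uint32_t(pSrc[byte]) << (8 * byte);
        }
    }
}
#endif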
471
472 void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4])
473 {
474 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
475
476 for (uint32_t c = 0; c < info.numComps; ++c)
477 {
478 uint32_t compIndex = info.swizzle[c];
479
480 // skip any conversion on UNUSED components
481 if (info.type[c] == SWR_TYPE_UNUSED)
482 {
483 continue;
484 }
485
486 if (info.isNormalized[c])
487 {
488 if (info.type[c] == SWR_TYPE_SNORM)
489 {
490 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to
491 /// -1.0f.
492
493 /// result = c * (1.0f / (2^(n-1) - 1);
494 uint32_t n = info.bpc[c];
495 uint32_t pow2 = 1 << (n - 1);
496 float scale = 1.0f / (float)(pow2 - 1);
497 Value* vScale = VIMMED1(scale);
498 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
499 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
500 texels[compIndex] = FMUL(texels[compIndex], vScale);
501 }
502 else
503 {
504 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
505
506 /// result = c * (1.0f / (2^n - 1))
507 uint32_t n = info.bpc[c];
508 uint32_t pow2 = 1 << n;
509 // special case 24bit unorm format, which requires a full divide to meet ULP
510 // requirement
511 if (n == 24)
512 {
513 float scale = (float)(pow2 - 1);
514 Value* vScale = VIMMED1(scale);
515 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
516 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
517 texels[compIndex] = FDIV(texels[compIndex], vScale);
518 }
519 else
520 {
521 float scale = 1.0f / (float)(pow2 - 1);
522 Value* vScale = VIMMED1(scale);
523 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
524 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
525 texels[compIndex] = FMUL(texels[compIndex], vScale);
526 }
527 }
528 continue;
529 }
530 }
531 }
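// Illustrative scalar model of the normalization above (kept disabled; not part of
// the jitter). UNORM maps [0, 2^n - 1] onto [0.0, 1.0]; SNORM scales by
// 1/(2^(n-1) - 1). The clamp below covers the most-negative code called out in the
// @todo above; the generated code currently just scales.
#if 0
static float UnormToFloat(uint32_t v, uint32_t bits)
{
    return (float)v * (1.0f / (float)((1u << bits) - 1u));
}

static float SnormToFloat(int32_t v, uint32_t bits)
{
    float scaled = (float)v * (1.0f / (float)((1u << (bits - 1u)) - 1u));
    return scaled < -1.0f ? -1.0f : scaled; // e.g. 8-bit -128 clamps to -1.0f
}
#endif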
532
533 //////////////////////////////////////////////////////////////////////////
534 /// @brief Loads attributes from memory using AVX2 GATHER(s)
535 /// @param fetchState - info about attributes to be fetched from memory
536 /// @param streams - value pointer to the current vertex stream
537 /// @param vIndices - vector value of indices to gather
538 /// @param pVtxOut - value pointer to output simdvertex struct
539 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
540 Value* streams,
541 Value* vIndices,
542 Value* pVtxOut)
543 {
544 uint32_t currentVertexElement = 0;
545 uint32_t outputElt = 0;
546 Value* vVertexElements[4];
547
548 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
549 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
550 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
551 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
552 curInstance->setName("curInstance");
553
554 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
555 {
556 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
557
558 // skip element if all components are disabled
559 if (ied.ComponentPacking == ComponentEnable::NONE)
560 {
561 continue;
562 }
563
564 const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format);
565 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
566 uint32_t bpc =
567 info.bpp /
568 info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
569
570 Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
571
572 Value* stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
573 Value* vStride = VBROADCAST(stride);
574
575 // max vertex index that is fully in bounds
576 Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
577 maxVertex = LOAD(maxVertex);
578
579 Value* minVertex = NULL;
580 if (fetchState.bPartialVertexBuffer)
581 {
582 // min vertex index for low bounds OOB checking
583 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
584 minVertex = LOAD(minVertex);
585 }
586
587 if (fetchState.bInstanceIDOffsetEnable)
588 {
589 // the InstanceID (curInstance) value is offset by StartInstanceLocation
590 curInstance = ADD(curInstance, startInstance);
591 }
592
593 Value* vCurIndices;
594 Value* startOffset;
595 Value* vInstanceStride = VIMMED1(0);
596
597 if (ied.InstanceEnable)
598 {
599 Value* stepRate = C(ied.InstanceAdvancementState);
600
601 // prevent a div by 0 for 0 step rate
602 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
603 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
604
605 // calc the current offset into instanced data buffer
606 Value* calcInstance = UDIV(curInstance, stepRate);
607
608 // if step rate is 0, every instance gets instance 0
609 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
610
611 vCurIndices = VBROADCAST(calcInstance);
612 startOffset = startInstance;
613 }
614 else if (ied.InstanceStrideEnable)
615 {
616 // grab the instance advancement state, which determines the stride in bytes from
617 // one instance to the next
618 Value* stepRate = C(ied.InstanceAdvancementState);
619 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
620
621 // offset indices by baseVertex
622 vCurIndices = ADD(vIndices, vBaseVertex);
623
624 startOffset = startVertex;
625 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
626 }
627 else
628 {
629 // offset indices by baseVertex
630 vCurIndices = ADD(vIndices, vBaseVertex);
631 startOffset = startVertex;
632 }
633
634 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
635 // do 64bit address offset calculations.
636
637 // calculate byte offset to the start of the VB
638 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
639
640 // VGATHER* takes an *i8 src pointer so that's what stream is
641 Value* pStreamBaseGFX = ADD(stream, baseOffset);
642
643 // if we have a start offset, subtract from max vertex. Used for OOB check
644 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
645 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
646 // if we have a negative value, we're already OOB. clamp at 0.
647 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
648
649 if (fetchState.bPartialVertexBuffer)
650 {
651 // similarly for min vertex
652 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
653 Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0));
654 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
655 }
656
657 // Load the in bounds size of a partially valid vertex
658 Value* partialInboundsSize =
659 GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
660 partialInboundsSize = LOAD(partialInboundsSize);
661 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
662 Value* vBpp = VBROADCAST(C(info.Bpp));
663 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
664
665 // is the element <= the partially valid size
666 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
667
668 // override cur indices with 0 if pitch is 0
669 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
670 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
671
672 // are vertices partially OOB?
673 Value* vMaxVertex = VBROADCAST(maxVertex);
674 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
675
676 // are vertices fully in bounds?
677 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
678
679 Value* vGatherMask;
680 if (fetchState.bPartialVertexBuffer)
681 {
682 // are vertices below minVertex limit?
683 Value* vMinVertex = VBROADCAST(minVertex);
684 Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
685
686 // only fetch lanes that pass both tests
687 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
688 }
689 else
690 {
691 vGatherMask = vMaxGatherMask;
692 }
693
694 // blend in any partially OOB indices that have valid elements
695 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
696
697 // calculate the actual offsets into the VB
698 Value* vOffsets = MUL(vCurIndices, vStride);
699 vOffsets = ADD(vOffsets, vAlignmentOffsets);
700
701 // if instance stride enable is:
702 // true - add product of the instanceID and advancement state to the offset into the VB
703 // false - value of vInstanceStride has been initialized to zero
704 vOffsets = ADD(vOffsets, vInstanceStride);
705
706 // Packing and component control
707 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
708 const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0,
709 (ComponentControl)ied.ComponentControl1,
710 (ComponentControl)ied.ComponentControl2,
711 (ComponentControl)ied.ComponentControl3};
712
713 // Special gather/conversion for formats without equal component sizes
714 if (IsOddFormat((SWR_FORMAT)ied.Format))
715 {
716 Value* pResults[4];
717 CreateGatherOddFormats(
718 (SWR_FORMAT)ied.Format, vGatherMask, pStreamBaseGFX, vOffsets, pResults);
719 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
720
721 for (uint32_t c = 0; c < 4; c += 1)
722 {
723 if (isComponentEnabled(compMask, c))
724 {
725 vVertexElements[currentVertexElement++] = pResults[c];
726 if (currentVertexElement > 3)
727 {
728 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
729 // reset to the next vVertexElement to output
730 currentVertexElement = 0;
731 }
732 }
733 }
734 }
735 else if (info.type[0] == SWR_TYPE_FLOAT)
736 {
737 ///@todo: support 64 bit vb accesses
738 Value* gatherSrc = VIMMED1(0.0f);
739
740 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
741 "Unsupported format for standard gather fetch.");
742
743 // Gather components from memory to store in a simdvertex structure
744 switch (bpc)
745 {
746 case 16:
747 {
748 Value* vGatherResult[2];
749
750 // if we have at least one component out of x or y to fetch
751 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
752 {
753 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
754 // e.g. result of first 8x32bit integer gather for 16bit components
755 // 256i - 0 1 2 3 4 5 6 7
756 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
757 //
758 }
759
760 // if we have at least one component out of z or w to fetch
761 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
762 {
763 // offset base to the next components(zw) in the vertex to gather
764 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
765
766 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
767 // e.g. result of second 8x32bit integer gather for 16bit components
768 // 256i - 0 1 2 3 4 5 6 7
769 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
770 //
771 }
772
773 // if we have at least one component to shuffle into place
774 if (compMask)
775 {
776 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
777 pVtxOut,
778 Instruction::CastOps::FPExt,
779 CONVERT_NONE,
780 currentVertexElement,
781 outputElt,
782 compMask,
783 compCtrl,
784 vVertexElements);
785
786 // Shuffle gathered components into place in simdvertex struct
787 mVWidth == 16 ? Shuffle16bpcGather16(args)
788 : Shuffle16bpcGather(args); // outputs to vVertexElements ref
789 }
790 }
791 break;
792 case 32:
793 {
794 for (uint32_t i = 0; i < 4; i += 1)
795 {
796 if (isComponentEnabled(compMask, i))
797 {
798 // if we need to gather the component
799 if (compCtrl[i] == StoreSrc)
800 {
801 // Gather a SIMD of vertices
802 // APIs allow a 4GB range for offsets
803 // However, GATHERPS uses signed 32-bit offsets, so +/- 2GB range :(
804 // Add 2GB to the base pointer and 2GB to the offsets. This makes
805 // "negative" (large) offsets into positive offsets and small offsets
806 // into negative offsets.
807 Value* vNewOffsets = ADD(vOffsets, VIMMED1(0x80000000));
808 vVertexElements[currentVertexElement++] =
809 GATHERPS(gatherSrc,
810 ADD(pStreamBaseGFX, C((uintptr_t)0x80000000U)),
811 vNewOffsets,
812 vGatherMask,
813 1,
814 MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
815 }
816 else
817 {
818 vVertexElements[currentVertexElement++] =
819 GenerateCompCtrlVector(compCtrl[i]);
820 }
821
822 if (currentVertexElement > 3)
823 {
824 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
825 // reset to the next vVertexElement to output
826 currentVertexElement = 0;
827 }
828 }
829
830 // offset base to the next component in the vertex to gather
831 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
832 }
833 }
834 break;
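// Worked example of the 2GB bias used in the 32-bit gather above (kept disabled;
// not part of the jitter). GATHERPS treats its 32-bit offsets as signed, so an API
// offset in [2GB, 4GB) would otherwise act as a negative displacement; biasing both
// the base and the offsets by 2GB cancels out (two's-complement wrap assumed, and
// the values below are hypothetical).
#if 0
{
    uint64_t base         = 0x200000000ull;                    // some 64-bit stream base
    uint32_t apiOffset    = 0xC0000000u;                       // 3GB: negative if read as int32_t
    uint64_t biasedBase   = base + 0x80000000ull;              // base + 2GB
    int32_t  biasedOffset = (int32_t)(apiOffset + 0x80000000u); // wraps to +1GB
    uint64_t addr         = biasedBase + (int64_t)biasedOffset; // == base + apiOffset (base + 3GB)
    (void)addr;
}
#endif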
835 case 64:
836 {
837 for (uint32_t i = 0; i < 4; i += 1)
838 {
839 if (isComponentEnabled(compMask, i))
840 {
841 // if we need to gather the component
842 if (compCtrl[i] == StoreSrc)
843 {
844 Value* vShufLo;
845 Value* vShufHi;
846 Value* vShufAll;
847
848 if (mVWidth == 8)
849 {
850 vShufLo = C({0, 1, 2, 3});
851 vShufHi = C({4, 5, 6, 7});
852 vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7});
853 }
854 else
855 {
856 SWR_ASSERT(mVWidth == 16);
857 vShufLo = C({0, 1, 2, 3, 4, 5, 6, 7});
858 vShufHi = C({8, 9, 10, 11, 12, 13, 14, 15});
859 vShufAll =
860 C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
861 }
862
863 Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
864 Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
865
866 Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
867 Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
868
869 Value* vZeroDouble = VECTOR_SPLAT(
870 mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
871
872 Value* pGatherLo =
873 GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsLo, vMaskLo);
874 Value* pGatherHi =
875 GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsHi, vMaskHi);
876
877 pGatherLo = VCVTPD2PS(pGatherLo);
878 pGatherHi = VCVTPD2PS(pGatherHi);
879
880 Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
881
882 vVertexElements[currentVertexElement++] = pGather;
883 }
884 else
885 {
886 vVertexElements[currentVertexElement++] =
887 GenerateCompCtrlVector(compCtrl[i]);
888 }
889
890 if (currentVertexElement > 3)
891 {
892 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
893 // reset to the next vVertexElement to output
894 currentVertexElement = 0;
895 }
896 }
897
898 // offset base to the next component in the vertex to gather
899 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)8));
900 }
901 }
902 break;
903 default:
904 SWR_INVALID("Tried to fetch invalid FP format");
905 break;
906 }
907 }
908 else
909 {
910 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
911 ConversionType conversionType = CONVERT_NONE;
912
913 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
914 "Unsupported format for standard gather fetch.");
915
916 switch (info.type[0])
917 {
918 case SWR_TYPE_UNORM:
919 conversionType = CONVERT_NORMALIZED;
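// deliberate fall-through: normalized formats also pick up the integer extend below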
920 case SWR_TYPE_UINT:
921 extendCastType = Instruction::CastOps::ZExt;
922 break;
923 case SWR_TYPE_SNORM:
924 conversionType = CONVERT_NORMALIZED;
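// deliberate fall-through: normalized formats also pick up the integer extend below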
925 case SWR_TYPE_SINT:
926 extendCastType = Instruction::CastOps::SExt;
927 break;
928 case SWR_TYPE_USCALED:
929 conversionType = CONVERT_USCALED;
930 extendCastType = Instruction::CastOps::UIToFP;
931 break;
932 case SWR_TYPE_SSCALED:
933 conversionType = CONVERT_SSCALED;
934 extendCastType = Instruction::CastOps::SIToFP;
935 break;
936 case SWR_TYPE_SFIXED:
937 conversionType = CONVERT_SFIXED;
938 extendCastType = Instruction::CastOps::SExt;
939 break;
940 default:
941 break;
942 }
943
944 // value substituted when component of gather is masked
945 Value* gatherSrc = VIMMED1(0);
946
947 // Gather components from memory to store in a simdvertex structure
948 switch (bpc)
949 {
950 case 8:
951 {
952 // if we have at least one component to fetch
953 if (compMask)
954 {
955 Value* vGatherResult = GATHERDD(gatherSrc,
956 pStreamBaseGFX,
957 vOffsets,
958 vGatherMask,
959 1,
960 MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
961 // e.g. result of an 8x32bit integer gather for 8bit components
962 // 256i - 0 1 2 3 4 5 6 7
963 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
964
965 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult,
966 pVtxOut,
967 extendCastType,
968 conversionType,
969 currentVertexElement,
970 outputElt,
971 compMask,
972 compCtrl,
973 vVertexElements,
974 info.swizzle);
975
976 // Shuffle gathered components into place in simdvertex struct
977 mVWidth == 16 ? Shuffle8bpcGatherd16(args)
978 : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
979 }
980 }
981 break;
982 case 16:
983 {
984 Value* vGatherResult[2];
985
986 // if we have at least one component out of x or y to fetch
987 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
988 {
989 vGatherResult[0] = GATHERDD(gatherSrc,
990 pStreamBaseGFX,
991 vOffsets,
992 vGatherMask,
993 1,
994 MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
995 // e.g. result of first 8x32bit integer gather for 16bit components
996 // 256i - 0 1 2 3 4 5 6 7
997 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
998 //
999 }
1000
1001 // if we have at least one component out of z or w to fetch
1002 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1003 {
1004 // offset base to the next components(zw) in the vertex to gather
1005 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
1006
1007 vGatherResult[1] = GATHERDD(gatherSrc,
1008 pStreamBaseGFX,
1009 vOffsets,
1010 vGatherMask,
1011 1,
1012 MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1013 // e.g. result of second 8x32bit integer gather for 16bit components
1014 // 256i - 0 1 2 3 4 5 6 7
1015 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1016 //
1017 }
1018
1019 // if we have at least one component to shuffle into place
1020 if (compMask)
1021 {
1022 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
1023 pVtxOut,
1024 extendCastType,
1025 conversionType,
1026 currentVertexElement,
1027 outputElt,
1028 compMask,
1029 compCtrl,
1030 vVertexElements);
1031
1032 // Shuffle gathered components into place in simdvertex struct
1033 mVWidth == 16 ? Shuffle16bpcGather16(args)
1034 : Shuffle16bpcGather(args); // outputs to vVertexElements ref
1035 }
1036 }
1037 break;
1038 case 32:
1039 {
1040 // Gather components into place in the simdvertex struct
1041 for (uint32_t i = 0; i < 4; i++)
1042 {
1043 if (isComponentEnabled(compMask, i))
1044 {
1045 // if we need to gather the component
1046 if (compCtrl[i] == StoreSrc)
1047 {
1048 Value* pGather = GATHERDD(gatherSrc,
1049 pStreamBaseGFX,
1050 vOffsets,
1051 vGatherMask,
1052 1,
1053 MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1054
1055 if (conversionType == CONVERT_USCALED)
1056 {
1057 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1058 }
1059 else if (conversionType == CONVERT_SSCALED)
1060 {
1061 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1062 }
1063 else if (conversionType == CONVERT_SFIXED)
1064 {
1065 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty),
1066 VBROADCAST(C(1 / 65536.0f)));
1067 }
1068
1069 vVertexElements[currentVertexElement++] = pGather;
1070
1071 // e.g. result of a single 8x32bit integer gather for 32bit components
1072 // 256i - 0 1 2 3 4 5 6 7
1073 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1074 }
1075 else
1076 {
1077 vVertexElements[currentVertexElement++] =
1078 GenerateCompCtrlVector(compCtrl[i]);
1079 }
1080
1081 if (currentVertexElement > 3)
1082 {
1083 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1084
1085 // reset to the next vVertexElement to output
1086 currentVertexElement = 0;
1087 }
1088 }
1089
1090 // offset base to the next component in the vertex to gather
1091 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
1092 }
1093 }
1094 break;
1095 }
1096 }
1097 }
1098
1099 // if we have a partially filled vVertexElement struct, output it
1100 if (currentVertexElement > 0)
1101 {
1102 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1103 }
1104 }
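// Illustrative scalar model of the per-lane offset and bounds logic generated above
// (kept disabled; not part of the jitter). Parameter names are simplified stand-ins
// for the vertex buffer state fields used by JitGatherVertices.
#if 0
static bool ComputeLaneFetch(uint32_t  index,             // already offset by BaseVertex, or
                                                          // curInstance / stepRate when instanced
                             uint32_t  maxVertex,         // adjusted for the start offset
                             uint32_t  stride,            // SWR_VERTEX_BUFFER_STATE pitch
                             uint32_t  alignedByteOffset,
                             uint32_t  instanceStride,    // 0 unless InstanceStrideEnable
                             uint32_t* pByteOffset)
{
    if (stride == 0)
    {
        index = 0; // zero-pitch streams always fetch element 0
    }
    *pByteOffset = index * stride + alignedByteOffset + instanceStride;

    // fully in-bounds lanes gather; lanes at exactly maxVertex may still be blended
    // back in by the partialInboundsSize test, and partial-VB fetches additionally
    // require index >= minVertex
    return index < maxVertex;
}
#endif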
1105
1106
1107 typedef void* (*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va, bool* out_pbNullTileAccessed, void* pWorkerData);
1108
1109 template <typename T>
1110 void GetSimdValidIndicesGfx(gfxptr_t indices,
1111 gfxptr_t lastIndex,
1112 uint32_t vWidth,
1113 PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
1114 void* pdc,
1115 uint32_t* outIndices,
1116 void* pWorkerData)
1117 {
1118 SWR_ASSERT(outIndices != nullptr);
1119
1120 gfxptr_t indexPtr = indices;
1121 for (int64_t lane = 0; lane < vWidth; lane++)
1122 {
1123 uint32_t index = 0;
1124
1125 if (indexPtr < lastIndex)
1126 {
1127 // translate indexPtr and load from it
1128 T* addr = (T*)pfnTranslate(pdc, indexPtr, nullptr, pWorkerData);
1129 SWR_ASSERT(addr != nullptr);
1130 index = *addr;
1131 }
1132
1133 // store the 32-bit index into the correct simd lane
1134 outIndices[lane] = index;
1135
1136 indexPtr += sizeof(T);
1137 }
1138 }
1139
1140 void GetSimdValid8bitIndicesGfx(gfxptr_t indices,
1141 gfxptr_t lastIndex,
1142 uint32_t vWidth,
1143 PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
1144 void* pdc,
1145 uint32_t* outIndices,
1146 void* pWorkerData)
1147 {
1148 GetSimdValidIndicesGfx<uint8_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);
1149 }
1150
1151 void GetSimdValid16bitIndicesGfx(gfxptr_t indices,
1152 gfxptr_t lastIndex,
1153 uint32_t vWidth,
1154 PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
1155 void* pdc,
1156 uint32_t* outIndices,
1157 void* pWorkerData)
1158 {
1159 GetSimdValidIndicesGfx<uint16_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);
1160 }
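// Usage sketch for the scalar index helpers above (kept disabled; not part of the
// jitter). 'IdentityTranslate' is a hypothetical callback that assumes gfxptr_t is a
// plain CPU-visible 64-bit address; the real translation comes from the driver via
// PFN_TRANSLATEGFXADDRESS_FUNC.
#if 0
static void* IdentityTranslate(void* /*pdc*/, gfxptr_t va, bool* /*out_pbNullTileAccessed*/, void* /*pWorkerData*/)
{
    return (void*)va;
}

static void Example16bitIndexFetch(gfxptr_t indices, gfxptr_t lastIndex, uint32_t simdWidth)
{
    uint32_t laneIndices[16] = {}; // large enough for SIMD16
    GetSimdValid16bitIndicesGfx(indices, lastIndex, simdWidth, &IdentityTranslate, nullptr, laneIndices, nullptr);
    // laneIndices[lane] now holds the zero-extended index, or 0 for lanes at or past lastIndex
}
#endif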
1161
1162
1163 template <typename T>
1164 Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
1165 {
1166 SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty,
1167 "Function expects gfxptr_t for both input parameters.");
1168
1169 Type* Ty = nullptr;
1170
1171 static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t),
1172 "Unsupported type for use with GetSimdValidIndicesHelper<T>");
1173 constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
1174 if (bSize)
1175 {
1176 Ty = mInt16PtrTy;
1177 }
1178 else if (sizeof(T) == sizeof(uint8_t))
1179 {
1180 Ty = mInt8PtrTy;
1181 }
1182 else
1183 {
1184 SWR_ASSERT(false, "This should never happen as per static_assert above.");
1185 }
1186
1187 Value* vIndices = VUNDEF_I();
1188
1189 {
1190 // store 0 index on stack to be used to conditionally load from if index address is OOB
1191 Value* pZeroIndex = ALLOCA(Ty->getPointerElementType());
1192 STORE(C((T)0), pZeroIndex);
1193
1194 // Load a SIMD of index pointers
1195 for (int64_t lane = 0; lane < mVWidth; lane++)
1196 {
1197 // Calculate the address of the requested index
1198 Value* pIndex = GEP(pIndices, C(lane), Ty);
1199
1200 pLastIndex = INT_TO_PTR(pLastIndex, Ty);
1201
1202 // check if the address is less than the max index,
1203 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1204
1205 // if valid, load the index. if not, load 0 from the stack
1206 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1207 Value* index = LOAD(pValid, "valid index", Ty, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1208
1209 // zero extended index to 32 bits and insert into the correct simd lane
1210 index = Z_EXT(index, mInt32Ty);
1211 vIndices = VINSERT(vIndices, index, lane);
1212 }
1213 }
1214
1215 return vIndices;
1216 }
1217
1218 //////////////////////////////////////////////////////////////////////////
1219 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1220 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1221 /// support
1222 /// @param pIndices - pointer to 8 bit indices
1223 /// @param pLastIndex - pointer to last valid index
1224 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1225 {
1226 return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);
1227 }
1228
1229 //////////////////////////////////////////////////////////////////////////
1230 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1231 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1232 /// support
1233 /// @param pIndices - pointer to 16 bit indices
1234 /// @param pLastIndex - pointer to last valid index
1235 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1236 {
1237 return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);
1238 }
1239
1240 //////////////////////////////////////////////////////////////////////////
1241 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1242 /// @param pIndices - pointer to 32 bit indices
1243 /// @param pLastIndex - pointer to last valid index
1244 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1245 {
1246 DataLayout dL(JM()->mpCurrentModule);
1247 Value* iLastIndex = pLastIndex;
1248 Value* iIndices = pIndices;
1249
1250 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1251 Value* numIndicesLeft = SUB(iLastIndex, iIndices);
1252 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1253 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1254
1255 // create a vector of index counts from the base index ptr passed into the fetch
1256 Constant* vIndexOffsets;
1257 if (mVWidth == 8)
1258 {
1259 vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7});
1260 }
1261 else
1262 {
1263 vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
1264 }
1265
1266 // compare index count to the max valid index
1267 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1268 // vIndexOffsets 0 1 2 3 4 5 6 7
1269 // ------------------------------
1270 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1271 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1272 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1273 Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
1274
1275 // Load the indices; OOB loads 0
1276 return MASKED_LOAD(pIndices,
1277 4,
1278 vIndexMask,
1279 VIMMED1(0),
1280 "vIndices",
1281 PointerType::get(mSimdInt32Ty, 0),
1282 MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1283 }
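// Illustrative scalar model of the masked 32-bit index load above (kept disabled;
// not part of the jitter): lanes past the end of the index buffer read 0 instead of
// touching out-of-bounds memory.
#if 0
static void LoadValid32bitIndices(const uint32_t* pIndices,
                                  const uint32_t* pLastIndex,
                                  uint32_t        simdWidth,
                                  uint32_t        outIndices[])
{
    // number of indices left in the buffer: (endPtr - curPtr) / sizeof(uint32_t)
    int64_t numLeft = pLastIndex - pIndices; // pointer difference is already in elements
    for (uint32_t lane = 0; lane < simdWidth; ++lane)
    {
        outIndices[lane] = ((int64_t)lane < numLeft) ? pIndices[lane] : 0;
    }
}
#endif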
1284
1285 //////////////////////////////////////////////////////////////////////////
1286 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1287 /// denormalizes if needed, converts to F32 if needed, and positions in
1288 /// the proper SIMD rows to be output to the simdvertex structure
1289 /// @param args: (tuple of args, listed below)
1290 /// @param vGatherResult - 8 gathered 8bpc vertices
1291 /// @param pVtxOut - base pointer to output simdvertex struct
1292 /// @param extendType - sign extend or zero extend
1293 /// @param conversionType - normalization/scaling conversion to apply, if any
1294 /// @param currentVertexElement - reference to the current vVertexElement
1295 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1296 /// @param compMask - component packing mask
1297 /// @param compCtrl - component control val
1298 /// @param vVertexElements[4] - vertex components to output
1299 /// @param swizzle[4] - component swizzle location
1300 void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args)
1301 {
1302 // Unpack tuple args
1303 Value*& vGatherResult = std::get<0>(args);
1304 Value* pVtxOut = std::get<1>(args);
1305 const Instruction::CastOps extendType = std::get<2>(args);
1306 const ConversionType conversionType = std::get<3>(args);
1307 uint32_t& currentVertexElement = std::get<4>(args);
1308 uint32_t& outputElt = std::get<5>(args);
1309 const ComponentEnable compMask = std::get<6>(args);
1310 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1311 Value*(&vVertexElements)[4] = std::get<8>(args);
1312 const uint32_t(&swizzle)[4] = std::get<9>(args);
1313
1314 // cast types
1315 Type* vGatherTy = VectorType::get(mInt32Ty, 8);
1316 Type* v32x8Ty = VectorType::get(mInt8Ty, 32);
1317
1318 // have to do extra work for sign extending
1319 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1320 {
1321 Type* v16x8Ty = VectorType::get(mInt8Ty, 16); // 8x16bit ints in a 128bit lane
1322 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1323
1324 // shuffle mask, including any swizzling
1325 const char x = (char)swizzle[0];
1326 const char y = (char)swizzle[1];
1327 const char z = (char)swizzle[2];
1328 const char w = (char)swizzle[3];
1329 Value* vConstMask = C<char>(
1330 {char(x), char(x + 4), char(x + 8), char(x + 12), char(y), char(y + 4),
1331 char(y + 8), char(y + 12), char(z), char(z + 4), char(z + 8), char(z + 12),
1332 char(w), char(w + 4), char(w + 8), char(w + 12), char(x), char(x + 4),
1333 char(x + 8), char(x + 12), char(y), char(y + 4), char(y + 8), char(y + 12),
1334 char(z), char(z + 4), char(z + 8), char(z + 12), char(w), char(w + 4),
1335 char(w + 8), char(w + 12)});
1336
1337 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1338
1339 Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1340 Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1341
1342 Value* vShufResult_lo =
1343 BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1344 Value* vShufResult_hi =
1345 BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1346
1347 // after pshufb: group components together in each 128bit lane
1348 // 256i - 0 1 2 3 4 5 6 7
1349 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1350
1351 Value* vi128XY_lo = nullptr;
1352 Value* vi128XY_hi = nullptr;
1353 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1354 {
1355 vi128XY_lo = BITCAST(
1356 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
1357 v128Ty);
1358 vi128XY_hi = BITCAST(
1359 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
1360 v128Ty);
1361
1362 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1363 // 256i - 0 1 2 3 4 5 6 7
1364 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1365 }
1366
1367 // do the same for zw components
1368 Value* vi128ZW_lo = nullptr;
1369 Value* vi128ZW_hi = nullptr;
1370 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1371 {
1372 vi128ZW_lo = BITCAST(
1373 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
1374 v128Ty);
1375 vi128ZW_hi = BITCAST(
1376 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
1377 v128Ty);
1378 }
1379
1380 // init denormalize variables if needed
1381 Instruction::CastOps fpCast;
1382 Value* conversionFactor;
1383
1384 switch (conversionType)
1385 {
1386 case CONVERT_NORMALIZED:
1387 fpCast = Instruction::CastOps::SIToFP;
1388 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1389 break;
1390 case CONVERT_SSCALED:
1391 fpCast = Instruction::CastOps::SIToFP;
1392 conversionFactor = VIMMED1((float)(1.0));
1393 break;
1394 case CONVERT_USCALED:
1395 SWR_INVALID("Type should not be sign extended!");
1396 conversionFactor = nullptr;
1397 break;
1398 default:
1399 SWR_ASSERT(conversionType == CONVERT_NONE);
1400 conversionFactor = nullptr;
1401 break;
1402 }
1403
1404 // sign extend all enabled components. If we have a full vVertexElements, output to the
1405 // current simdvertex
1406 for (uint32_t i = 0; i < 4; i++)
1407 {
1408 if (isComponentEnabled(compMask, i))
1409 {
1410 if (compCtrl[i] == ComponentControl::StoreSrc)
1411 {
1412 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1413 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1414 // if x or y, use vi128XY permute result, else use vi128ZW
1415 Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1416 Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1417
1418 // sign extend
1419 Value* temp_lo =
1420 PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
1421 Value* temp_hi =
1422 PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
1423
1424 Value* temp = JOIN_16(temp_lo, temp_hi);
1425
1426 // denormalize if needed
1427 if (conversionType != CONVERT_NONE)
1428 {
1429 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1430 }
1431
1432 vVertexElements[currentVertexElement] = temp;
1433
1434 currentVertexElement += 1;
1435 }
1436 else
1437 {
1438 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1439 }
1440
1441 if (currentVertexElement > 3)
1442 {
1443 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1444 // reset to the next vVertexElement to output
1445 currentVertexElement = 0;
1446 }
1447 }
1448 }
1449 }
1450 // else zero extend
1451 else if ((extendType == Instruction::CastOps::ZExt) ||
1452 (extendType == Instruction::CastOps::UIToFP))
1453 {
1454 // init denormalize variables if needed
1455 Instruction::CastOps fpCast;
1456 Value* conversionFactor;
1457
1458 switch (conversionType)
1459 {
1460 case CONVERT_NORMALIZED:
1461 fpCast = Instruction::CastOps::UIToFP;
1462 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1463 break;
1464 case CONVERT_USCALED:
1465 fpCast = Instruction::CastOps::UIToFP;
1466 conversionFactor = VIMMED1((float)(1.0));
1467 break;
1468 case CONVERT_SSCALED:
1469 SWR_INVALID("Type should not be zero extended!");
1470 conversionFactor = nullptr;
1471 break;
1472 default:
1473 SWR_ASSERT(conversionType == CONVERT_NONE);
1474 conversionFactor = nullptr;
1475 break;
1476 }
1477
1478 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1479 for (uint32_t i = 0; i < 4; i++)
1480 {
1481 if (isComponentEnabled(compMask, i))
1482 {
1483 if (compCtrl[i] == ComponentControl::StoreSrc)
1484 {
1485 // pshufb masks for each component
1486 Value* vConstMask;
1487 switch (swizzle[i])
1488 {
1489 case 0:
1490 // x shuffle mask
1491 vConstMask =
1492 C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1493 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1494 break;
1495 case 1:
1496 // y shuffle mask
1497 vConstMask =
1498 C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1499 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1500 break;
1501 case 2:
1502 // z shuffle mask
1503 vConstMask =
1504 C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1505 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1506 break;
1507 case 3:
1508 // w shuffle mask
1509 vConstMask =
1510 C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1511 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1512 break;
1513 default:
1514 vConstMask = nullptr;
1515 break;
1516 }
1517
1518 Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1519 Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1520
1521 Value* temp_lo =
1522 BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1523 Value* temp_hi =
1524 BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1525
1526 // after pshufb for x channel
1527 // 256i - 0 1 2 3 4 5 6 7
1528 // x000 x000 x000 x000 x000 x000 x000 x000
1529
1530 Value* temp = JOIN_16(temp_lo, temp_hi);
1531
1532 // denormalize if needed
1533 if (conversionType != CONVERT_NONE)
1534 {
1535 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1536 }
1537
1538 vVertexElements[currentVertexElement] = temp;
1539
1540 currentVertexElement += 1;
1541 }
1542 else
1543 {
1544 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1545 }
1546
1547 if (currentVertexElement > 3)
1548 {
1549 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1550 // reset to the next vVertexElement to output
1551 currentVertexElement = 0;
1552 }
1553 }
1554 }
1555 }
1556 else
1557 {
1558 SWR_INVALID("Unsupported conversion type");
1559 }
1560 }
1561
1562 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args)
1563 {
1564 // Unpack tuple args
1565 Value*& vGatherResult = std::get<0>(args);
1566 Value* pVtxOut = std::get<1>(args);
1567 const Instruction::CastOps extendType = std::get<2>(args);
1568 const ConversionType conversionType = std::get<3>(args);
1569 uint32_t& currentVertexElement = std::get<4>(args);
1570 uint32_t& outputElt = std::get<5>(args);
1571 const ComponentEnable compMask = std::get<6>(args);
1572 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1573 Value*(&vVertexElements)[4] = std::get<8>(args);
1574 const uint32_t(&swizzle)[4] = std::get<9>(args);
1575
1576 // cast types
1577 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1578
1579 for (uint32_t i = 0; i < 4; i++)
1580 {
1581 if (!isComponentEnabled(compMask, i))
1582 continue;
1583
1584 if (compCtrl[i] == ComponentControl::StoreSrc)
1585 {
1586 std::vector<uint32_t> vShuffleMasks[4] = {
1587 {0, 4, 8, 12, 16, 20, 24, 28}, // x
1588 {1, 5, 9, 13, 17, 21, 25, 29}, // y
1589 {2, 6, 10, 14, 18, 22, 26, 30}, // z
1590 {3, 7, 11, 15, 19, 23, 27, 31}, // w
1591 };
1592
1593 Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
1594 UndefValue::get(v32x8Ty),
1595 vShuffleMasks[swizzle[i]]);
1596
1597 if ((extendType == Instruction::CastOps::SExt) ||
1598 (extendType == Instruction::CastOps::SIToFP))
1599 {
1600 switch (conversionType)
1601 {
1602 case CONVERT_NORMALIZED:
1603 val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
1604 break;
1605 case CONVERT_SSCALED:
1606 val = SI_TO_FP(val, mSimdFP32Ty);
1607 break;
1608 case CONVERT_USCALED:
1609 SWR_INVALID("Type should not be sign extended!");
1610 break;
1611 default:
1612 SWR_ASSERT(conversionType == CONVERT_NONE);
1613 val = S_EXT(val, mSimdInt32Ty);
1614 break;
1615 }
1616 }
1617 else if ((extendType == Instruction::CastOps::ZExt) ||
1618 (extendType == Instruction::CastOps::UIToFP))
1619 {
1620 switch (conversionType)
1621 {
1622 case CONVERT_NORMALIZED:
1623 val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
1624 break;
1625 case CONVERT_SSCALED:
1626 SWR_INVALID("Type should not be zero extended!");
1627 break;
1628 case CONVERT_USCALED:
1629 val = UI_TO_FP(val, mSimdFP32Ty);
1630 break;
1631 default:
1632 SWR_ASSERT(conversionType == CONVERT_NONE);
1633 val = Z_EXT(val, mSimdInt32Ty);
1634 break;
1635 }
1636 }
1637 else
1638 {
1639 SWR_INVALID("Unsupported conversion type");
1640 }
1641
1642 vVertexElements[currentVertexElement++] = val;
1643 }
1644 else
1645 {
1646 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1647 }
1648
1649 if (currentVertexElement > 3)
1650 {
1651 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1652 // reset to the next vVertexElement to output
1653 currentVertexElement = 0;
1654 }
1655 }
1656 }
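// Illustrative scalar model of the 8bpc fetch above (kept disabled; not part of the
// jitter). Each gathered 32-bit lane holds the packed x,y,z,w bytes of one vertex;
// the byte shuffles isolate one component per lane before extension. This sketch
// covers the float conversions; CONVERT_NONE keeps the sign- or zero-extended integer.
#if 0
static float Convert8bpcComponent(uint32_t gatheredDword,
                                  uint32_t byteSwizzle,       // 0..3: which byte holds this component
                                  bool     isSigned,
                                  float    conversionFactor)  // e.g. 1/127.0f, 1/255.0f or 1.0f
{
    uint32_t byteVal = (gatheredDword >> (8 * byteSwizzle)) & 0xFF;
    float    val     = isSigned ? (float)(int8_t)byteVal : (float)byteVal;
    return val * conversionFactor;
}
#endif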
1657
1658 //////////////////////////////////////////////////////////////////////////
1659 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1660 /// denormalizes if needed, converts to F32 if needed, and positions in
1661 /// the proper SIMD rows to be output to the simdvertex structure
1662 /// @param args: (tuple of args, listed below)
1663 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1664 /// @param pVtxOut - base pointer to output simdvertex struct
1665 /// @param extendType - sign extend or zero extend
1666 /// @param conversionType - normalization/scaling conversion to apply, if any
1667 /// @param currentVertexElement - reference to the current vVertexElement
1668 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1669 /// @param compMask - component packing mask
1670 /// @param compCtrl - component control val
1671 /// @param vVertexElements[4] - vertex components to output
1672 void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args)
1673 {
1674 // Unpack tuple args
1675 Value*(&vGatherResult)[2] = std::get<0>(args);
1676 Value* pVtxOut = std::get<1>(args);
1677 const Instruction::CastOps extendType = std::get<2>(args);
1678 const ConversionType conversionType = std::get<3>(args);
1679 uint32_t& currentVertexElement = std::get<4>(args);
1680 uint32_t& outputElt = std::get<5>(args);
1681 const ComponentEnable compMask = std::get<6>(args);
1682 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1683 Value*(&vVertexElements)[4] = std::get<8>(args);
1684
1685 // cast types
1686 Type* vGatherTy = VectorType::get(mInt32Ty, 8);
1687 Type* v32x8Ty = VectorType::get(mInt8Ty, 32);
1688
1689 // have to do extra work for sign extending
1690 if ((extendType == Instruction::CastOps::SExt) ||
1691 (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1692 {
1693 // is the source half-precision (FP16) float data that needs an FP extend?
1694 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1695
1696 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1697 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1698
1699 // shuffle mask
1700 Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1701 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1702 Value* vi128XY_lo = nullptr;
1703 Value* vi128XY_hi = nullptr;
1704 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1705 {
1706 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for
1707 // now.
1708
1709 Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
1710 Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
1711
1712 Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1713 Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1714
1715 // after pshufb: group components together in each 128bit lane
1716 // 256i - 0 1 2 3 4 5 6 7
1717 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1718
1719 vi128XY_lo = BITCAST(
1720 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1721 v128bitTy);
1722 vi128XY_hi = BITCAST(
1723 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1724 v128bitTy);
1725
1726 // after PERMD: move and pack xy components into each 128bit lane
1727 // 256i - 0 1 2 3 4 5 6 7
1728 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1729 }
1730
1731 // do the same for zw components
1732 Value* vi128ZW_lo = nullptr;
1733 Value* vi128ZW_hi = nullptr;
1734 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1735 {
1736 Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
1737 Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
1738
1739 Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1740 Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1741
1742 vi128ZW_lo = BITCAST(
1743 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1744 v128bitTy);
1745 vi128ZW_hi = BITCAST(
1746 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1747 v128bitTy);
1748 }
1749
1750 // init denormalize variables if needed
1751 Instruction::CastOps IntToFpCast;
1752 Value* conversionFactor;
1753
1754 switch (conversionType)
1755 {
1756 case CONVERT_NORMALIZED:
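            // SNORM16: convert the sign-extended integer to float and scale by 1/32767 to
            // normalize into the [-1.0f, 1.0f] range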
1757 IntToFpCast = Instruction::CastOps::SIToFP;
1758 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1759 break;
1760 case CONVERT_SSCALED:
1761 IntToFpCast = Instruction::CastOps::SIToFP;
1762 conversionFactor = VIMMED1((float)(1.0));
1763 break;
1764 case CONVERT_USCALED:
1765 SWR_INVALID("Type should not be sign extended!");
1766 conversionFactor = nullptr;
1767 break;
1768 default:
1769 SWR_ASSERT(conversionType == CONVERT_NONE);
1770 conversionFactor = nullptr;
1771 break;
1772 }
1773
1774         // sign extend all enabled components. If we have filled vVertexElements, output to
1775         // the current simdvertex
1776 for (uint32_t i = 0; i < 4; i++)
1777 {
1778 if (isComponentEnabled(compMask, i))
1779 {
1780 if (compCtrl[i] == ComponentControl::StoreSrc)
1781 {
1782 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1783 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1784 // if x or y, use vi128XY permute result, else use vi128ZW
1785 Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1786 Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1787
1788 if (bFP)
1789 {
1790 // extract 128 bit lanes to sign extend each component
1791 Value* temp_lo =
1792 CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1793 Value* temp_hi =
1794 CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1795
1796 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
1797 }
1798 else
1799 {
1800 // extract 128 bit lanes to sign extend each component
1801 Value* temp_lo =
1802 PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1803 Value* temp_hi =
1804 PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1805
1806 Value* temp = JOIN_16(temp_lo, temp_hi);
1807
1808 // denormalize if needed
1809 if (conversionType != CONVERT_NONE)
1810 {
1811 temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
1812 }
1813
1814 vVertexElements[currentVertexElement] = temp;
1815 }
1816
1817 currentVertexElement += 1;
1818 }
1819 else
1820 {
1821 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1822 }
1823
1824 if (currentVertexElement > 3)
1825 {
1826 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1827 // reset to the next vVertexElement to output
1828 currentVertexElement = 0;
1829 }
1830 }
1831 }
1832 }
1833 // else zero extend
1834 else if ((extendType == Instruction::CastOps::ZExt) ||
1835 (extendType == Instruction::CastOps::UIToFP))
1836 {
1837 // pshufb masks for each component
1838 Value* vConstMask[2];
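        // -1 byte indices have the high bit set, so PSHUFB writes zero to those destination
        // bytes, providing the zero extension to 32 bits for free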
1839
1840 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
1841 {
1842 // x/z shuffle mask
1843 vConstMask[0] = C<char>({
1844 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1845 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1846 });
1847 }
1848
1849 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
1850 {
1851 // y/w shuffle mask
1852 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1853 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1854 }
1855
1856 // init denormalize variables if needed
1857 Instruction::CastOps fpCast;
1858 Value* conversionFactor;
1859
1860 switch (conversionType)
1861 {
1862 case CONVERT_NORMALIZED:
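            // UNORM16: convert the zero-extended value to float and scale by 1/65535 to
            // normalize into the [0.0f, 1.0f] range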
1863 fpCast = Instruction::CastOps::UIToFP;
1864 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1865 break;
1866 case CONVERT_USCALED:
1867 fpCast = Instruction::CastOps::UIToFP;
1868 conversionFactor = VIMMED1((float)(1.0f));
1869 break;
1870 case CONVERT_SSCALED:
1871 SWR_INVALID("Type should not be zero extended!");
1872 conversionFactor = nullptr;
1873 break;
1874 default:
1875 SWR_ASSERT(conversionType == CONVERT_NONE);
1876 conversionFactor = nullptr;
1877 break;
1878 }
1879
1880 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1881 for (uint32_t i = 0; i < 4; i++)
1882 {
1883 if (isComponentEnabled(compMask, i))
1884 {
1885 if (compCtrl[i] == ComponentControl::StoreSrc)
1886 {
1887 // select correct constMask for x/z or y/w pshufb
1888 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1889                     // if x or y, use the first (XY) gather result, else use the second (ZW)
1890 uint32_t selectedGather = (i < 2) ? 0 : 1;
1891
1892                     // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the
1893                     // sake of KNL, for now.
1894
1895 Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
1896 Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
1897
1898 Value* temp_lo = BITCAST(
1899 PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]),
1900 vGatherTy);
1901 Value* temp_hi = BITCAST(
1902 PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]),
1903 vGatherTy);
1904
1905                     // after pshufb mask for x channel; z uses the same shuffle from the second gather
1906                     // 256i - 0    1    2    3    4    5    6    7
1907                     //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1908
1909 Value* temp = JOIN_16(temp_lo, temp_hi);
1910
1911 // denormalize if needed
1912 if (conversionType != CONVERT_NONE)
1913 {
1914 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1915 }
1916
1917 vVertexElements[currentVertexElement] = temp;
1918
1919 currentVertexElement += 1;
1920 }
1921 else
1922 {
1923 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1924 }
1925
1926 if (currentVertexElement > 3)
1927 {
1928 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1929 // reset to the next vVertexElement to output
1930 currentVertexElement = 0;
1931 }
1932 }
1933 }
1934 }
1935 else
1936 {
1937 SWR_INVALID("Unsupported conversion type");
1938 }
1939 }
1940
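//////////////////////////////////////////////////////////////////////////
/// @brief Native-SIMD-width variant of Shuffle16bpcGather16: takes a SIMD of
/// gathered 16bpc verts, zero or sign extends, denormalizes if needed,
/// converts to F32 if needed, and positions them in the proper SIMD rows to
/// be output to the simdvertex structure. Takes the same Shuffle16bpcArgs tuple.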
1941 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args)
1942 {
1943 // Unpack tuple args
1944 Value*(&vGatherResult)[2] = std::get<0>(args);
1945 Value* pVtxOut = std::get<1>(args);
1946 const Instruction::CastOps extendType = std::get<2>(args);
1947 const ConversionType conversionType = std::get<3>(args);
1948 uint32_t& currentVertexElement = std::get<4>(args);
1949 uint32_t& outputElt = std::get<5>(args);
1950 const ComponentEnable compMask = std::get<6>(args);
1951 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1952 Value*(&vVertexElements)[4] = std::get<8>(args);
1953
1954 // cast types
1955 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1956 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1957
1958 // have to do extra work for sign extending
1959 if ((extendType == Instruction::CastOps::SExt) ||
1960 (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1961 {
1962         // is this a partial-precision (16-bit) float?
1963         bool bFP = (extendType == Instruction::CastOps::FPExt);
1964
1965 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1966 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
1967 mVWidth / 4); // vwidth is units of 32 bits
1968
1969 // shuffle mask
1970 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1971 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1972 Value* vi128XY = nullptr;
1973 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1974 {
1975 Value* vShufResult =
1976 BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1977 // after pshufb: group components together in each 128bit lane
1978 // 256i - 0 1 2 3 4 5 6 7
1979 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1980
1981 vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1982 // after PERMD: move and pack xy components into each 128bit lane
1983 // 256i - 0 1 2 3 4 5 6 7
1984 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1985 }
1986
1987 // do the same for zw components
1988 Value* vi128ZW = nullptr;
1989 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1990 {
1991 Value* vShufResult =
1992 BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1993 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1994 }
1995
1996 // init denormalize variables if needed
1997 Instruction::CastOps IntToFpCast;
1998 Value* conversionFactor;
1999
2000 switch (conversionType)
2001 {
2002 case CONVERT_NORMALIZED:
2003 IntToFpCast = Instruction::CastOps::SIToFP;
2004 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2005 break;
2006 case CONVERT_SSCALED:
2007 IntToFpCast = Instruction::CastOps::SIToFP;
2008 conversionFactor = VIMMED1((float)(1.0));
2009 break;
2010 case CONVERT_USCALED:
2011 SWR_INVALID("Type should not be sign extended!");
2012 conversionFactor = nullptr;
2013 break;
2014 default:
2015 SWR_ASSERT(conversionType == CONVERT_NONE);
2016 conversionFactor = nullptr;
2017 break;
2018 }
2019
2020         // sign extend all enabled components. If we have filled vVertexElements, output to
2021         // the current simdvertex
2022 for (uint32_t i = 0; i < 4; i++)
2023 {
2024 if (isComponentEnabled(compMask, i))
2025 {
2026 if (compCtrl[i] == ComponentControl::StoreSrc)
2027 {
2028 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2029 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2030 // if x or y, use vi128XY permute result, else use vi128ZW
2031 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2032
2033 if (bFP)
2034 {
2035 // extract 128 bit lanes to sign extend each component
2036 vVertexElements[currentVertexElement] =
2037 CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2038 }
2039 else
2040 {
2041 // extract 128 bit lanes to sign extend each component
2042 vVertexElements[currentVertexElement] =
2043 PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2044
2045 // denormalize if needed
2046 if (conversionType != CONVERT_NONE)
2047 {
2048 vVertexElements[currentVertexElement] =
2049 FMUL(CAST(IntToFpCast,
2050 vVertexElements[currentVertexElement],
2051 mSimdFP32Ty),
2052 conversionFactor);
2053 }
2054 }
2055 currentVertexElement++;
2056 }
2057 else
2058 {
2059 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2060 }
2061
2062 if (currentVertexElement > 3)
2063 {
2064 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2065 // reset to the next vVertexElement to output
2066 currentVertexElement = 0;
2067 }
2068 }
2069 }
2070 }
2071 // else zero extend
2072 else if ((extendType == Instruction::CastOps::ZExt) ||
2073 (extendType == Instruction::CastOps::UIToFP))
2074 {
2075 // pshufb masks for each component
2076 Value* vConstMask[2];
2077 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
2078 {
2079 // x/z shuffle mask
2080 vConstMask[0] = C<char>({
2081 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2082 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2083 });
2084 }
2085
2086 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
2087 {
2088 // y/w shuffle mask
2089 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2090 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
2091 }
2092
2093 // init denormalize variables if needed
2094 Instruction::CastOps fpCast;
2095 Value* conversionFactor;
2096
2097 switch (conversionType)
2098 {
2099 case CONVERT_NORMALIZED:
2100 fpCast = Instruction::CastOps::UIToFP;
2101 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2102 break;
2103 case CONVERT_USCALED:
2104 fpCast = Instruction::CastOps::UIToFP;
2105 conversionFactor = VIMMED1((float)(1.0f));
2106 break;
2107 case CONVERT_SSCALED:
2108 SWR_INVALID("Type should not be zero extended!");
2109 conversionFactor = nullptr;
2110 break;
2111 default:
2112 SWR_ASSERT(conversionType == CONVERT_NONE);
2113 conversionFactor = nullptr;
2114 break;
2115 }
2116
2117 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2118 for (uint32_t i = 0; i < 4; i++)
2119 {
2120 if (isComponentEnabled(compMask, i))
2121 {
2122 if (compCtrl[i] == ComponentControl::StoreSrc)
2123 {
2124 // select correct constMask for x/z or y/w pshufb
2125 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2126                 // if x or y, use the first (XY) gather result, else use the second (ZW)
2127 uint32_t selectedGather = (i < 2) ? 0 : 1;
2128
2129 vVertexElements[currentVertexElement] =
2130 BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty),
2131 vConstMask[selectedMask]),
2132 vGatherTy);
2133                 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2134                 // 256i - 0    1    2    3    4    5    6    7
2135                 //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2136
2137 // denormalize if needed
2138 if (conversionType != CONVERT_NONE)
2139 {
2140 vVertexElements[currentVertexElement] =
2141 FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty),
2142 conversionFactor);
2143 }
2144 currentVertexElement++;
2145 }
2146 else
2147 {
2148 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2149 }
2150
2151 if (currentVertexElement > 3)
2152 {
2153 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2154 // reset to the next vVertexElement to output
2155 currentVertexElement = 0;
2156 }
2157 }
2158 }
2159 }
2160 else
2161 {
2162 SWR_INVALID("Unsupported conversion type");
2163 }
2164 }
2165
2166 //////////////////////////////////////////////////////////////////////////
2167 /// @brief Output a simdvertex worth of elements to the current outputElt
2168 /// @param pVtxOut - base address of VIN output struct
2169 /// @param outputElt - simdvertex offset in VIN to write to
2170 /// @param numEltsToStore - number of simdvertex rows to write out
2171 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2172 void FetchJit::StoreVertexElements(Value* pVtxOut,
2173 const uint32_t outputElt,
2174 const uint32_t numEltsToStore,
2175 Value* (&vVertexElements)[4])
2176 {
2177 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2178
2179 for (uint32_t c = 0; c < numEltsToStore; ++c)
2180 {
2181 // STORE expects FP32 x vWidth type, just bitcast if needed
2182 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2183 {
2184 #if FETCH_DUMP_VERTEX
2185 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
2186 #endif
2187 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2188 }
2189 #if FETCH_DUMP_VERTEX
2190 else
2191 {
2192 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
2193 }
2194 #endif
2195 // outputElt * 4 = offsetting by the size of a simdvertex
2196 // + c offsets to a 32bit x vWidth row within the current vertex
2197 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
2198 STORE(vVertexElements[c], dest);
2199 }
2200 }
2201
2202 //////////////////////////////////////////////////////////////////////////
2203 /// @brief Generates a constant vector of values based on the
2204 /// ComponentControl value
2205 /// @param ctrl - ComponentControl value
2206 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
2207 {
2208 switch (ctrl)
2209 {
2210 case NoStore:
2211 return VUNDEF_I();
2212 case Store0:
2213 return VIMMED1(0);
2214 case Store1Fp:
2215 return VIMMED1(1.0f);
2216 case Store1Int:
2217 return VIMMED1(1);
2218 case StoreVertexId:
2219 {
2220 if (mVWidth == 16)
2221 {
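        // SIMD16: the fetch context carries VertexID as two SIMD8 halves (VertexID and
        // VertexID2); reinterpret each as floats and join them into one SIMD16 register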
2222 Type* pSimd8FPTy = VectorType::get(mFP32Ty, 8);
2223 Value* pIdLo =
2224 BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy);
2225 Value* pIdHi =
2226 BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy);
2227 return JOIN_16(pIdLo, pIdHi);
2228 }
2229 else
2230 {
2231 return BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty);
2232 }
2233 }
2234 case StoreInstanceId:
2235 {
2236 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty);
2237 return VBROADCAST(pId);
2238 }
2239
2240
2241 case StoreSrc:
2242 default:
2243 SWR_INVALID("Invalid component control");
2244 return VUNDEF_I();
2245 }
2246 }
2247
2248 //////////////////////////////////////////////////////////////////////////
2249 /// @brief Returns the enable mask for the specified component.
2250 /// @param enableMask - enable bits
2251 /// @param component - component to check if enabled.
2252 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2253 {
2254 switch (component)
2255 {
2256 // X
2257 case 0:
2258 return (enableMask & ComponentEnable::X);
2259 // Y
2260 case 1:
2261 return (enableMask & ComponentEnable::Y);
2262 // Z
2263 case 2:
2264 return (enableMask & ComponentEnable::Z);
2265 // W
2266 case 3:
2267 return (enableMask & ComponentEnable::W);
2268
2269 default:
2270 return false;
2271 }
2272 }
2273
2274 // Don't want two threads compiling the same fetch shader simultaneously
2275 // Has problems in the JIT cache implementation
2276 // This is only a problem for fetch right now.
2277 static std::mutex gFetchCodegenMutex;
2278
2279 //////////////////////////////////////////////////////////////////////////
2280 /// @brief JITs from fetch shader IR
2281 /// @param hJitMgr - JitManager handle
2282 /// @param func - LLVM function IR
2283 /// @return PFN_FETCH_FUNC - pointer to fetch code
2284 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2285 {
2286     const llvm::Function* func    = reinterpret_cast<const llvm::Function*>(hFunc);
2287 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2288 PFN_FETCH_FUNC pfnFetch;
2289
2290 gFetchCodegenMutex.lock();
2291 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2292 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
2293 // add new IR to the module
2294 pJitMgr->mIsModuleFinalized = true;
2295
2296 #if defined(KNOB_SWRC_TRACING)
2297 char fName[1024];
2298 const char* funcName = func->getName().data();
2299     snprintf(fName, sizeof(fName), "%s.bin", funcName);
2300 FILE* fd = fopen(fName, "wb");
2301 fwrite((void*)pfnFetch, 1, 2048, fd);
2302 fclose(fd);
2303 #endif
2304
2305 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2306 gFetchCodegenMutex.unlock();
2307
2308
2309 return pfnFetch;
2310 }
2311
2312 //////////////////////////////////////////////////////////////////////////
2313 /// @brief JIT compiles fetch shader
2314 /// @param hJitMgr - JitManager handle
2315 /// @param state - fetch state to build function from
2316 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2317 {
2318 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2319
2320 pJitMgr->SetupNewModule();
2321
2322 FetchJit theJit(pJitMgr);
2323 HANDLE hFunc = theJit.Create(state);
2324
2325 return JitFetchFunc(hJitMgr, hFunc);
2326 }
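
// Illustrative usage sketch (not part of the driver): a caller hands JitCompileFetch the jit
// manager handle and a populated FETCH_COMPILE_STATE, then invokes the returned PFN_FETCH_FUNC
// with the per-draw SWR_FETCH_CONTEXT and output vertex storage. The callback argument order
// below is an assumption for illustration only; the authoritative signature is the
// PFN_FETCH_FUNC typedef in fetch_jit.h.
//
//   PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, fetchState);
//   pfnFetch(pWorkerData, fetchCtx, pVout);   // args per PFN_FETCH_FUNC in fetch_jit.h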