1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder_gfx_mem.h"
32 #include "jit_api.h"
33 #include "fetch_jit.h"
34 #include "gen_state_llvm.h"
35 #include "functionpasses/passes.h"
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public BuilderGfxMem
56 {
57 FetchJit(JitManager* pJitMgr) :
58 BuilderGfxMem(pJitMgr)
59 {}
60
61 Function* Create(const FETCH_COMPILE_STATE& fetchState);
62
63 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
64 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
65 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
66
67 // package up Shuffle*bpcGatherd args into a tuple for convenience
68 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
69 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
70 const uint32_t(&)[4]> Shuffle8bpcArgs;
71
72 void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args);
73 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
74
75 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
76 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
77
78 void Shuffle16bpcGather16(Shuffle16bpcArgs &args);
79 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
80
81 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
82
83 Value *GenerateCompCtrlVector(const ComponentControl ctrl);
84
85 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
86
87 bool IsOddFormat(SWR_FORMAT format);
88 bool IsUniformFormat(SWR_FORMAT format);
89 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
90 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
91 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
92
93 Value* mpFetchInfo;
94 };
95
96 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
97 {
98 std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
99 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
100
101 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
102 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
103
104 fetch->getParent()->setModuleIdentifier(fetch->getName());
105
106 IRB()->SetInsertPoint(entry);
107
108 auto argitr = fetch->arg_begin();
109
110 // Fetch shader arguments
111 Value* privateContext = &*argitr; ++argitr;
112 privateContext->setName("privateContext");
113 SetPrivateContext(privateContext);
114
115 mpFetchInfo = &*argitr; ++argitr;
116 mpFetchInfo->setName("fetchInfo");
117 Value* pVtxOut = &*argitr;
118 pVtxOut->setName("vtxOutput");
119
120 uint32_t baseWidth = mVWidth;
121
122 SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);
123
124 // Override builder target width to force 16-wide SIMD
125 #if USE_SIMD16_SHADERS
126 SetTargetWidth(16);
127 #endif
128
129 pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));
130
131 // SWR_FETCH_CONTEXT::pStreams
132 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
133 streams->setName("pStreams");
134
135 // SWR_FETCH_CONTEXT::pIndices
136 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_xpIndices});
137 indices->setName("pIndices");
138
139 // SWR_FETCH_CONTEXT::pLastIndex
140 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_xpLastIndex});
141 pLastIndex->setName("pLastIndex");
142
143 Value* vIndices;
144 switch(fetchState.indexType)
145 {
146 case R8_UINT:
147 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
148 if(fetchState.bDisableIndexOOBCheck)
149 {
150 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
151 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
152 }
153 else
154 {
155 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
156 }
157 break;
158 case R16_UINT:
159 if(fetchState.bDisableIndexOOBCheck)
160 {
161 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
162 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
163 }
164 else
165 {
166 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
167 }
168 break;
169 case R32_UINT:
170 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(indices, "", PointerType::get(mSimdInt32Ty, 0), GFX_MEM_CLIENT_FETCH)
171 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
172 break; // incoming type is already 32bit int
173 default:
174 SWR_INVALID("Unsupported index type");
175 vIndices = nullptr;
176 break;
177 }
178
179 if(fetchState.bForceSequentialAccessEnable)
180 {
181 Value* pOffsets = mVWidth == 8 ? C({ 0, 1, 2, 3, 4, 5, 6, 7 }) :
182 C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
183
184 // VertexData buffers are accessed sequentially; the index is equal to the vertex number
185 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
186 vIndices = ADD(vIndices, pOffsets);
187 }
188
189 Value* vVertexId = vIndices;
190 if (fetchState.bVertexIDOffsetEnable)
191 {
192 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
193 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
194 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
195 vVertexId = ADD(vIndices, vBaseVertex);
196 vVertexId = ADD(vVertexId, vStartVertex);
197 }
198
199 // store out vertex IDs
200 if (mVWidth == 16)
201 {
202 // store out in simd8 halves until core supports 16-wide natively
203 auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
204 auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
205 STORE(vVertexIdLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
206 STORE(vVertexIdHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
207 }
208 else if (mVWidth == 8)
209 {
210 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
211 }
212
213 // store out cut mask if enabled
214 if (fetchState.bEnableCutIndex)
215 {
216 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
217 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
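// each lane of cutMask is all ones where the fetched index equals the cut (primitive restart) index, all zeros otherwise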
218
219 if (mVWidth == 16)
220 {
221 auto cutMaskLo = EXTRACT_16(cutMask, 0);
222 auto cutMaskHi = EXTRACT_16(cutMask, 1);
223 STORE(cutMaskLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
224 STORE(cutMaskHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
225 }
226 else if (mVWidth == 8)
227 {
228 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
229 }
230 }
231
232 // Fetch attributes from memory and output to a simdvertex struct
233 JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
234
235 RET_VOID();
236
237 JitManager::DumpToFile(fetch, "src");
238
239 #if defined(_DEBUG)
240 verifyFunction(*fetch);
241 #endif
242
243 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
244
245 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
246 setupPasses.add(createBreakCriticalEdgesPass());
247 setupPasses.add(createCFGSimplificationPass());
248 setupPasses.add(createEarlyCSEPass());
249 setupPasses.add(createPromoteMemoryToRegisterPass());
250
251 setupPasses.run(*fetch);
252
253 JitManager::DumpToFile(fetch, "se");
254
255 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
256
257 ///@todo Haven't touched these either. Need to remove some of these and add others.
258 optPasses.add(createCFGSimplificationPass());
259 optPasses.add(createEarlyCSEPass());
260 optPasses.add(createInstructionCombiningPass());
261 optPasses.add(createInstructionSimplifierPass());
262 optPasses.add(createConstantPropagationPass());
263 optPasses.add(createSCCPPass());
264 optPasses.add(createAggressiveDCEPass());
265
266 optPasses.run(*fetch);
267
268 optPasses.add(createLowerX86Pass(JM(), this));
269 optPasses.run(*fetch);
270
271 JitManager::DumpToFile(fetch, "opt");
272
273
274 // Revert 16-wide override
275 #if USE_SIMD16_SHADERS
276 SetTargetWidth(baseWidth);
277 #endif
278
279 return fetch;
280 }
281
282 // returns true for odd formats that require special gather handling
283 bool FetchJit::IsOddFormat(SWR_FORMAT format)
284 {
285 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
286 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
287 {
288 return true;
289 }
290 return false;
291 }
292
293 // format is uniform if all components are the same size and type
294 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
295 {
296 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
297 uint32_t bpc0 = info.bpc[0];
298 uint32_t type0 = info.type[0];
299
300 for (uint32_t c = 1; c < info.numComps; ++c)
301 {
302 if (bpc0 != info.bpc[c] || type0 != info.type[c])
303 {
304 return false;
305 }
306 }
307 return true;
308 }
309
310 // unpacks components based on format
311 // foreach component in the pixel
312 // mask off everything but this component
313 // shift component to LSB
314 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
315 {
316 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
317
318 uint32_t bitOffset = 0;
319 for (uint32_t c = 0; c < info.numComps; ++c)
320 {
321 uint32_t swizzledIndex = info.swizzle[c];
322 uint32_t compBits = info.bpc[c];
323 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
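// e.g. a 10-bit component at bitOffset 10 uses bitmask = 0x3FF << 10 = 0x000FFC00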
324 Value* comp = AND(vInput, bitmask);
325 comp = LSHR(comp, bitOffset);
326
327 result[swizzledIndex] = comp;
328 bitOffset += compBits;
329 }
330 }
331
332 // gather for odd component size formats
333 // gather a SIMD of full pixels per lane, then shift/mask to move each component into its
334 // own vector
335 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
336 {
337 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
338
339 // only works if pixel size is <= 32bits
340 SWR_ASSERT(info.bpp <= 32);
341
342 Value *pGather;
343 if (info.bpp == 32)
344 {
345 pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
346 }
347 else
348 {
349 // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
350 Value *pMem = ALLOCA(mSimdInt32Ty);
351 STORE(VIMMED1(0u), pMem);
352
353 pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
354 Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
355
356 for (uint32_t lane = 0; lane < mVWidth; ++lane)
357 {
358 // Get index
359 Value* index = VEXTRACT(pOffsets, C(lane));
360 Value* mask = VEXTRACT(pMask, C(lane));
361 switch (info.bpp)
362 {
363 case 8:
364 {
365 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
366 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
367 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
368 break;
369 }
370
371 case 16:
372 {
373 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
374 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
375 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
376 break;
377 }
378 break;
379
380 case 24:
381 {
382 // First 16-bits of data
383 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
384 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
385 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
386
387 // Last 8-bits of data
388 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
389 pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
390 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
391 break;
392 }
393
394 default:
395 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
396 break;
397 }
398 }
399
400 pGather = LOAD(pMem);
401 }
402
403 for (uint32_t comp = 0; comp < 4; ++comp)
404 {
405 pResult[comp] = VIMMED1((int)info.defaults[comp]);
406 }
407
408 UnpackComponents(format, pGather, pResult);
409
410 // cast to fp32
411 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
412 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
413 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
414 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
415 }
416
417 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
418 {
419 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
420
421 for (uint32_t c = 0; c < info.numComps; ++c)
422 {
423 uint32_t compIndex = info.swizzle[c];
424
425 // skip any conversion on UNUSED components
426 if (info.type[c] == SWR_TYPE_UNUSED)
427 {
428 continue;
429 }
430
431 if (info.isNormalized[c])
432 {
433 if (info.type[c] == SWR_TYPE_SNORM)
434 {
435 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
436
437 /// result = c * (1.0f / (2^(n-1) - 1));
438 uint32_t n = info.bpc[c];
439 uint32_t pow2 = 1 << (n - 1);
440 float scale = 1.0f / (float)(pow2 - 1);
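// e.g. for 8-bit SNORM, scale = 1/127, so a raw value of 127 becomes 1.0f and -127 becomes -1.0f (-128 is not clamped here, per the todo above)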
441 Value *vScale = VIMMED1(scale);
442 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
443 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
444 texels[compIndex] = FMUL(texels[compIndex], vScale);
445 }
446 else
447 {
448 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
449
450 /// result = c * (1.0f / (2^n - 1))
451 uint32_t n = info.bpc[c];
452 uint32_t pow2 = 1 << n;
453 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
454 if (n == 24)
455 {
456 float scale = (float)(pow2 - 1);
457 Value* vScale = VIMMED1(scale);
458 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
459 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
460 texels[compIndex] = FDIV(texels[compIndex], vScale);
461 }
462 else
463 {
464 float scale = 1.0f / (float)(pow2 - 1);
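// e.g. for 8-bit UNORM, scale = 1/255, so a raw value of 255 becomes 1.0f and 0 stays 0.0f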
465 Value *vScale = VIMMED1(scale);
466 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
467 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
468 texels[compIndex] = FMUL(texels[compIndex], vScale);
469 }
470 }
471 continue;
472 }
473 }
474 }
475
476 //////////////////////////////////////////////////////////////////////////
477 /// @brief Loads attributes from memory using AVX2 GATHER(s)
478 /// @param fetchState - info about attributes to be fetched from memory
479 /// @param streams - value pointer to the current vertex stream
480 /// @param vIndices - vector value of indices to gather
481 /// @param pVtxOut - value pointer to output simdvertex struct
482 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
483 Value* streams, Value* vIndices, Value* pVtxOut)
484 {
485 uint32_t currentVertexElement = 0;
486 uint32_t outputElt = 0;
487 Value* vVertexElements[4];
488
489 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
490 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
491 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
492 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
493 curInstance->setName("curInstance");
494
495 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
496 {
497 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
498
499 // skip element if all components are disabled
500 if (ied.ComponentPacking == ComponentEnable::NONE)
501 {
502 continue;
503 }
504
505 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
506 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
507 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
508
509 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
510
511 // VGATHER* takes an *i8 src pointer
512 Value *pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));
513
514 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
515 Value *vStride = VBROADCAST(stride);
516
517 // max vertex index that is fully in bounds
518 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
519 maxVertex = LOAD(maxVertex);
520
521 Value *minVertex = NULL;
522 if (fetchState.bPartialVertexBuffer)
523 {
524 // min vertex index for low bounds OOB checking
525 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
526 minVertex = LOAD(minVertex);
527 }
528
529 if (fetchState.bInstanceIDOffsetEnable)
530 {
531 // the InstanceID (curInstance) value is offset by StartInstanceLocation
532 curInstance = ADD(curInstance, startInstance);
533 }
534
535 Value *vCurIndices;
536 Value *startOffset;
537 Value *vInstanceStride = VIMMED1(0);
538
539 if (ied.InstanceEnable)
540 {
541 Value* stepRate = C(ied.InstanceAdvancementState);
542
543 // prevent a div by 0 for 0 step rate
544 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
545 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
546
547 // calc the current offset into instanced data buffer
548 Value* calcInstance = UDIV(curInstance, stepRate);
549
550 // if step rate is 0, every instance gets instance 0
551 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
552
553 vCurIndices = VBROADCAST(calcInstance);
554 startOffset = startInstance;
555 }
556 else if (ied.InstanceStrideEnable)
557 {
558 // grab the instance advancement state, determines stride in bytes from one instance to the next
559 Value* stepRate = C(ied.InstanceAdvancementState);
560 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
561
562 // offset indices by baseVertex
563 vCurIndices = ADD(vIndices, vBaseVertex);
564
565 startOffset = startVertex;
566 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
567 }
568 else
569 {
570 // offset indices by baseVertex
571 vCurIndices = ADD(vIndices, vBaseVertex);
572 startOffset = startVertex;
573 }
574
575 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
576 // do 64bit address offset calculations.
577
578 // calculate byte offset to the start of the VB
579 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
580 pStreamBase = GEP(pStreamBase, baseOffset);
581 Value* pStreamBaseGFX = ADD(stream, baseOffset);
582
583 // if we have a start offset, subtract from max vertex. Used for OOB check
584 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
585 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
586 // if we have a negative value, we're already OOB. clamp at 0.
587 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
588
589 if (fetchState.bPartialVertexBuffer)
590 {
591 // similarly for min vertex
592 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
593 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
594 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
595 }
596
597 // Load the in bounds size of a partially valid vertex
598 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
599 partialInboundsSize = LOAD(partialInboundsSize);
600 Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
601 Value *vBpp = VBROADCAST(C(info.Bpp));
602 Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
603
604 // is the element <= the partially valid size
605 Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
606
607 // override cur indices with 0 if pitch is 0
608 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
609 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
610
611 // are vertices partially OOB?
612 Value* vMaxVertex = VBROADCAST(maxVertex);
613 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
614
615 // are vertices fully in bounds?
616 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
617
618 Value *vGatherMask;
619 if (fetchState.bPartialVertexBuffer)
620 {
621 // are vertices below minVertex limit?
622 Value *vMinVertex = VBROADCAST(minVertex);
623 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
624
625 // only fetch lanes that pass both tests
626 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
627 }
628 else
629 {
630 vGatherMask = vMaxGatherMask;
631 }
632
633 // blend in any partially OOB indices that have valid elements
634 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
635
636 // calculate the actual offsets into the VB
637 Value* vOffsets = MUL(vCurIndices, vStride);
638 vOffsets = ADD(vOffsets, vAlignmentOffsets);
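// e.g. a vertex index of 5 with a 32-byte pitch and an AlignedByteOffset of 12 yields a byte offset of 5 * 32 + 12 = 172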
639
640 // if instance stride enable is:
641 // true - add the product of the instanceID and advancement state to the offset into the VB
642 // false - value of vInstanceStride has been initialized to zero
643 vOffsets = ADD(vOffsets, vInstanceStride);
644
645 // Packing and component control
646 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
647 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
648 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
649
650 // Special gather/conversion for formats without equal component sizes
651 if (IsOddFormat((SWR_FORMAT)ied.Format))
652 {
653 Value *pResults[4];
654 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
655 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
656
657 for (uint32_t c = 0; c < 4; c += 1)
658 {
659 if (isComponentEnabled(compMask, c))
660 {
661 vVertexElements[currentVertexElement++] = pResults[c];
662 if (currentVertexElement > 3)
663 {
664 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
665 // reset to the next vVertexElement to output
666 currentVertexElement = 0;
667 }
668 }
669 }
670 }
671 else if(info.type[0] == SWR_TYPE_FLOAT)
672 {
673 ///@todo: support 64 bit vb accesses
674 Value *gatherSrc = VIMMED1(0.0f);
675
676 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
677 "Unsupported format for standard gather fetch.");
678
679 // Gather components from memory to store in a simdvertex structure
680 switch (bpc)
681 {
682 case 16:
683 {
684 Value *vGatherResult[2];
685
686 // if we have at least one component out of x or y to fetch
687 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
688 {
689 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
690 // e.g. result of first 8x32bit integer gather for 16bit components
691 // 256i - 0 1 2 3 4 5 6 7
692 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
693 //
694 }
695
696 // if we have at least one component out of z or w to fetch
697 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
698 {
699 // offset base to the next components(zw) in the vertex to gather
700 pStreamBase = GEP(pStreamBase, C((char)4));
701
702 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
703 // e.g. result of second 8x32bit integer gather for 16bit components
704 // 256i - 0 1 2 3 4 5 6 7
705 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
706 //
707 }
708
709 // if we have at least one component to shuffle into place
710 if (compMask)
711 {
712 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
713 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
714
715 // Shuffle gathered components into place in simdvertex struct
716 mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args); // outputs to vVertexElements ref
717 }
718 }
719 break;
720 case 32:
721 {
722 for (uint32_t i = 0; i < 4; i += 1)
723 {
724 if (isComponentEnabled(compMask, i))
725 {
726 // if we need to gather the component
727 if (compCtrl[i] == StoreSrc)
728 {
729 // Gather a SIMD of vertices
730 // APIs allow a 4GB range for offsets
731 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
732 // But, we know that elements must be aligned for FETCH. :)
733 // Right shift the offset by one bit and gather with a scale of 2, keeping the offset in the positive signed range.
734 Value *vShiftedOffsets = LSHR(vOffsets, 1);
735 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBaseGFX, vShiftedOffsets, vGatherMask, 2, GFX_MEM_CLIENT_FETCH);
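// e.g. a byte offset of 0xF0000000 (~3.75GB) would sign-extend as a negative index; 0xF0000000 >> 1 = 0x78000000 is positive, and the gather scale of 2 restores the original address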
736 }
737 else
738 {
739 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
740 }
741
742 if (currentVertexElement > 3)
743 {
744 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
745 // reset to the next vVertexElement to output
746 currentVertexElement = 0;
747 }
748 }
749
750 // offset base to the next component in the vertex to gather
751 pStreamBase = GEP(pStreamBase, C((char)4));
752 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
753 }
754 }
755 break;
756 case 64:
757 {
758 for (uint32_t i = 0; i < 4; i += 1)
759 {
760 if (isComponentEnabled(compMask, i))
761 {
762 // if we need to gather the component
763 if (compCtrl[i] == StoreSrc)
764 {
765 Value* vShufLo;
766 Value* vShufHi;
767 Value* vShufAll;
768
769 if (mVWidth == 8)
770 {
771 vShufLo = C({ 0, 1, 2, 3 });
772 vShufHi = C({ 4, 5, 6, 7 });
773 vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
774 }
775 else
776 {
777 SWR_ASSERT(mVWidth == 16);
778 vShufLo = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
779 vShufHi = C({ 8, 9, 10, 11, 12, 13, 14, 15 });
780 vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
781 }
782
783 Value *vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
784 Value *vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
785
786 Value *vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
787 Value *vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
788
789 Value *vZeroDouble = VECTOR_SPLAT(mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
790
791 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
792 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
793
794 pGatherLo = VCVTPD2PS(pGatherLo);
795 pGatherHi = VCVTPD2PS(pGatherHi);
796
797 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
798
799 vVertexElements[currentVertexElement++] = pGather;
800 }
801 else
802 {
803 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
804 }
805
806 if (currentVertexElement > 3)
807 {
808 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
809 // reset to the next vVertexElement to output
810 currentVertexElement = 0;
811 }
812 }
813
814 // offset base to the next component in the vertex to gather
815 pStreamBase = GEP(pStreamBase, C((char)8));
816 }
817 }
818 break;
819 default:
820 SWR_INVALID("Tried to fetch invalid FP format");
821 break;
822 }
823 }
824 else
825 {
826 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
827 ConversionType conversionType = CONVERT_NONE;
828
829 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
830 "Unsupported format for standard gather fetch.");
831
832 switch(info.type[0])
833 {
834 case SWR_TYPE_UNORM:
835 conversionType = CONVERT_NORMALIZED;
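// deliberate fall-through: UNORM components are zero extended like UINT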
836 case SWR_TYPE_UINT:
837 extendCastType = Instruction::CastOps::ZExt;
838 break;
839 case SWR_TYPE_SNORM:
840 conversionType = CONVERT_NORMALIZED;
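// deliberate fall-through: SNORM components are sign extended like SINT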
841 case SWR_TYPE_SINT:
842 extendCastType = Instruction::CastOps::SExt;
843 break;
844 case SWR_TYPE_USCALED:
845 conversionType = CONVERT_USCALED;
846 extendCastType = Instruction::CastOps::UIToFP;
847 break;
848 case SWR_TYPE_SSCALED:
849 conversionType = CONVERT_SSCALED;
850 extendCastType = Instruction::CastOps::SIToFP;
851 break;
852 case SWR_TYPE_SFIXED:
853 conversionType = CONVERT_SFIXED;
854 extendCastType = Instruction::CastOps::SExt;
855 break;
856 default:
857 break;
858 }
859
860 // value substituted when component of gather is masked
861 Value* gatherSrc = VIMMED1(0);
862
863 // Gather components from memory to store in a simdvertex structure
864 switch (bpc)
865 {
866 case 8:
867 {
868 // if we have at least one component to fetch
869 if (compMask)
870 {
871 Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
872 // e.g. result of an 8x32bit integer gather for 8bit components
873 // 256i - 0 1 2 3 4 5 6 7
874 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
875
876 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
877 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
878
879 // Shuffle gathered components into place in simdvertex struct
880 mVWidth == 16 ? Shuffle8bpcGatherd16(args) : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
881 }
882 }
883 break;
884 case 16:
885 {
886 Value *vGatherResult[2];
887
888 // if we have at least one component out of x or y to fetch
889 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
890 {
891 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
892 // e.g. result of first 8x32bit integer gather for 16bit components
893 // 256i - 0 1 2 3 4 5 6 7
894 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
895 //
896 }
897
898 // if we have at least one component out of z or w to fetch
899 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
900 {
901 // offset base to the next components(zw) in the vertex to gather
902 pStreamBase = GEP(pStreamBase, C((char)4));
903
904 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
905 // e.g. result of second 8x32bit integer gather for 16bit components
906 // 256i - 0 1 2 3 4 5 6 7
907 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
908 //
909 }
910
911 // if we have at least one component to shuffle into place
912 if (compMask)
913 {
914 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
915 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
916
917 // Shuffle gathered components into place in simdvertex struct
918 mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args); // outputs to vVertexElements ref
919 }
920 }
921 break;
922 case 32:
923 {
924 // Gather components and store them in the simdvertex struct
925 for (uint32_t i = 0; i < 4; i++)
926 {
927 if (isComponentEnabled(compMask, i))
928 {
929 // if we need to gather the component
930 if (compCtrl[i] == StoreSrc)
931 {
932 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
933
934 if (conversionType == CONVERT_USCALED)
935 {
936 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
937 }
938 else if (conversionType == CONVERT_SSCALED)
939 {
940 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
941 }
942 else if (conversionType == CONVERT_SFIXED)
943 {
944 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
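// SFIXED is treated as 16.16 fixed point here; e.g. a raw value of 98304 (0x18000) * (1/65536) becomes 1.5f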
945 }
946
947 vVertexElements[currentVertexElement++] = pGather;
948
949 // e.g. result of a single 8x32bit integer gather for 32bit components
950 // 256i - 0 1 2 3 4 5 6 7
951 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
952 }
953 else
954 {
955 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
956 }
957
958 if (currentVertexElement > 3)
959 {
960 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
961
962 // reset to the next vVertexElement to output
963 currentVertexElement = 0;
964 }
965
966 }
967
968 // offset base to the next component in the vertex to gather
969 pStreamBase = GEP(pStreamBase, C((char)4));
970 }
971 }
972 break;
973 }
974 }
975 }
976
977 // if we have a partially filled vVertexElement struct, output it
978 if (currentVertexElement > 0)
979 {
980 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
981 }
982 }
983
984 typedef void*(*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va);
985 extern "C" void GetSimdValid8bitIndicesGfx(gfxptr_t indices, gfxptr_t lastIndex, uint32_t vWidth, PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, void* pdc, uint32_t* outIndices);
986 extern "C" void GetSimdValid16bitIndicesGfx(gfxptr_t indices, gfxptr_t lastIndex, uint32_t vWidth, PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, void* pdc, uint32_t* outIndices);
987
988 //////////////////////////////////////////////////////////////////////////
989 /// @brief Loads a simd of valid indices. OOB indices are set to 0
990 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
991 /// support
992 /// @param pIndices - pointer to 8 bit indices
993 /// @param pLastIndex - pointer to last valid index
994 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
995 {
996 SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty, "Function expects gfxptr_t for both input parameters.");
997
998 Value* vIndices = VUNDEF_I();
999
1000 {
1001 // store 0 index on stack to be used to conditionally load from if index address is OOB
1002 Value* pZeroIndex = ALLOCA(mInt8Ty);
1003 STORE(C((uint8_t)0), pZeroIndex);
1004
1005 // Load a SIMD of index pointers
1006 for (int64_t lane = 0; lane < mVWidth; lane++)
1007 {
1008 // Calculate the address of the requested index
1009 Value *pIndex = GEP(pIndices, C(lane), mInt8PtrTy);
1010
1011 pLastIndex = INT_TO_PTR(pLastIndex, mInt8PtrTy);
1012
1013 // check if the address is less than the max index,
1014 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1015
1016 // if valid, load the index. if not, load 0 from the stack
1017 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1018 Value *index = LOAD(pValid, "valid index", PointerType::get(mInt8Ty, 0), GFX_MEM_CLIENT_FETCH);
1019
1020 // zero extend index to 32 bits and insert into the correct simd lane
1021 index = Z_EXT(index, mInt32Ty);
1022 vIndices = VINSERT(vIndices, index, lane);
1023 }
1024 }
1025
1026 return vIndices;
1027 }
1028
1029 //////////////////////////////////////////////////////////////////////////
1030 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1031 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1032 /// support
1033 /// @param pIndices - pointer to 16 bit indices
1034 /// @param pLastIndex - pointer to last valid index
1035 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1036 {
1037 SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty, "Function expects gfxptr_t for both input parameters.");
1038
1039 Value* vIndices = VUNDEF_I();
1040
1041 {
1042 // store 0 index on stack to be used to conditionally load from if index address is OOB
1043 Value* pZeroIndex = ALLOCA(mInt16Ty);
1044 STORE(C((uint16_t)0), pZeroIndex);
1045
1046 // Load a SIMD of index pointers
1047 for (int64_t lane = 0; lane < mVWidth; lane++)
1048 {
1049 // Calculate the address of the requested index
1050 Value *pIndex = GEP(pIndices, C(lane), mInt16PtrTy);
1051
1052 pLastIndex = INT_TO_PTR(pLastIndex, mInt16PtrTy);
1053
1054 // check if the address is less than the max index,
1055 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1056
1057 // if valid, load the index. if not, load 0 from the stack
1058 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1059 Value *index = LOAD(pValid, "valid index", PointerType::get(mInt16Ty, 0), GFX_MEM_CLIENT_FETCH);
1060
1061 // zero extend index to 32 bits and insert into the correct simd lane
1062 index = Z_EXT(index, mInt32Ty);
1063 vIndices = VINSERT(vIndices, index, lane);
1064 }
1065 }
1066
1067 return vIndices;
1068 }
1069
1070 //////////////////////////////////////////////////////////////////////////
1071 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1072 /// @param pIndices - pointer to 32 bit indices
1073 /// @param pLastIndex - pointer to last valid index
1074 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1075 {
1076 DataLayout dL(JM()->mpCurrentModule);
1077 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1078 Value* iLastIndex = pLastIndex;
1079 Value* iIndices = pIndices;
1080
1081 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1082 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1083 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1084 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1085
1086 // create a vector of index counts from the base index ptr passed into the fetch
1087 Constant* vIndexOffsets;
1088 if (mVWidth == 8)
1089 {
1090 vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
1091 }
1092 else
1093 {
1094 vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
1095 }
1096
1097 // compare index count to the max valid index
1098 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1099 // vIndexOffsets 0 1 2 3 4 5 6 7
1100 // ------------------------------
1101 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1102 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1103 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1104 Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
1105
1106 // Load the indices; OOB loads 0
1107 pIndices = BITCAST(pIndices, PointerType::get(mSimdInt32Ty, 0));
1108 return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0));
1109 }
1110
1111 //////////////////////////////////////////////////////////////////////////
1112 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1113 /// denormalizes if needed, converts to F32 if needed, and positions in
1114 /// the proper SIMD rows to be output to the simdvertex structure
1115 /// @param args: (tuple of args, listed below)
1116 /// @param vGatherResult - 8 gathered 8bpc vertices
1117 /// @param pVtxOut - base pointer to output simdvertex struct
1118 /// @param extendType - sign extend or zero extend
1119 /// @param conversionType - type of conversion to apply to the components
1120 /// @param currentVertexElement - reference to the current vVertexElement
1121 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1122 /// @param compMask - component packing mask
1123 /// @param compCtrl - component control val
1124 /// @param vVertexElements[4] - vertex components to output
1125 /// @param swizzle[4] - component swizzle location
1126 void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
1127 {
1128 // Unpack tuple args
1129 Value*& vGatherResult = std::get<0>(args);
1130 Value* pVtxOut = std::get<1>(args);
1131 const Instruction::CastOps extendType = std::get<2>(args);
1132 const ConversionType conversionType = std::get<3>(args);
1133 uint32_t &currentVertexElement = std::get<4>(args);
1134 uint32_t &outputElt = std::get<5>(args);
1135 const ComponentEnable compMask = std::get<6>(args);
1136 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1137 Value* (&vVertexElements)[4] = std::get<8>(args);
1138 const uint32_t(&swizzle)[4] = std::get<9>(args);
1139
1140 // cast types
1141 Type *vGatherTy = VectorType::get(mInt32Ty, 8);
1142 Type *v32x8Ty = VectorType::get(mInt8Ty, 32);
1143
1144 // have to do extra work for sign extending
1145 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1146 {
1147 Type *v16x8Ty = VectorType::get(mInt8Ty, 16); // 8x16bit ints in a 128bit lane
1148 Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1149
1150 // shuffle mask, including any swizzling
1151 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1152 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1153 Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
1154 char(y), char(y + 4), char(y + 8), char(y + 12),
1155 char(z), char(z + 4), char(z + 8), char(z + 12),
1156 char(w), char(w + 4), char(w + 8), char(w + 12),
1157 char(x), char(x + 4), char(x + 8), char(x + 12),
1158 char(y), char(y + 4), char(y + 8), char(y + 12),
1159 char(z), char(z + 4), char(z + 8), char(z + 12),
1160 char(w), char(w + 4), char(w + 8), char(w + 12) });
1161
1162 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now...
1163
1164 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1165 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1166
1167 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1168 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1169
1170 // after pshufb: group components together in each 128bit lane
1171 // 256i - 0 1 2 3 4 5 6 7
1172 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1173
1174 Value *vi128XY_lo = nullptr;
1175 Value *vi128XY_hi = nullptr;
1176 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1177 {
1178 vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1179 vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1180
1181 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1182 // 256i - 0 1 2 3 4 5 6 7
1183 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1184 }
1185
1186 // do the same for zw components
1187 Value *vi128ZW_lo = nullptr;
1188 Value *vi128ZW_hi = nullptr;
1189 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1190 {
1191 vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1192 vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1193 }
1194
1195 // init denormalize variables if needed
1196 Instruction::CastOps fpCast;
1197 Value *conversionFactor;
1198
1199 switch (conversionType)
1200 {
1201 case CONVERT_NORMALIZED:
1202 fpCast = Instruction::CastOps::SIToFP;
1203 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1204 break;
1205 case CONVERT_SSCALED:
1206 fpCast = Instruction::CastOps::SIToFP;
1207 conversionFactor = VIMMED1((float)(1.0));
1208 break;
1209 case CONVERT_USCALED:
1210 SWR_INVALID("Type should not be sign extended!");
1211 conversionFactor = nullptr;
1212 break;
1213 default:
1214 SWR_ASSERT(conversionType == CONVERT_NONE);
1215 conversionFactor = nullptr;
1216 break;
1217 }
1218
1219 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1220 for (uint32_t i = 0; i < 4; i++)
1221 {
1222 if (isComponentEnabled(compMask, i))
1223 {
1224 if (compCtrl[i] == ComponentControl::StoreSrc)
1225 {
1226 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1227 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1228 // if x or y, use vi128XY permute result, else use vi128ZW
1229 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1230 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1231
1232 // sign extend
1233 Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
1234 Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
1235
1236 Value* temp = JOIN_16(temp_lo, temp_hi);
1237
1238 // denormalize if needed
1239 if (conversionType != CONVERT_NONE)
1240 {
1241 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1242 }
1243
1244 vVertexElements[currentVertexElement] = temp;
1245
1246 currentVertexElement += 1;
1247 }
1248 else
1249 {
1250 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1251 }
1252
1253 if (currentVertexElement > 3)
1254 {
1255 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1256 // reset to the next vVertexElement to output
1257 currentVertexElement = 0;
1258 }
1259 }
1260 }
1261 }
1262 // else zero extend
1263 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1264 {
1265 // init denormalize variables if needed
1266 Instruction::CastOps fpCast;
1267 Value *conversionFactor;
1268
1269 switch (conversionType)
1270 {
1271 case CONVERT_NORMALIZED:
1272 fpCast = Instruction::CastOps::UIToFP;
1273 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1274 break;
1275 case CONVERT_USCALED:
1276 fpCast = Instruction::CastOps::UIToFP;
1277 conversionFactor = VIMMED1((float)(1.0));
1278 break;
1279 case CONVERT_SSCALED:
1280 SWR_INVALID("Type should not be zero extended!");
1281 conversionFactor = nullptr;
1282 break;
1283 default:
1284 SWR_ASSERT(conversionType == CONVERT_NONE);
1285 conversionFactor = nullptr;
1286 break;
1287 }
1288
1289 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1290 for (uint32_t i = 0; i < 4; i++)
1291 {
1292 if (isComponentEnabled(compMask, i))
1293 {
1294 if (compCtrl[i] == ComponentControl::StoreSrc)
1295 {
1296 // pshufb masks for each component
1297 Value *vConstMask;
1298 switch (swizzle[i])
1299 {
1300 case 0:
1301 // x shuffle mask
1302 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1303 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1304 break;
1305 case 1:
1306 // y shuffle mask
1307 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1308 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1309 break;
1310 case 2:
1311 // z shuffle mask
1312 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1313 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1314 break;
1315 case 3:
1316 // w shuffle mask
1317 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1318 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1319 break;
1320 default:
1321 vConstMask = nullptr;
1322 break;
1323 }
1324
1325 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1326 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1327
1328 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1329 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1330
1331 // after pshufb for x channel
1332 // 256i - 0 1 2 3 4 5 6 7
1333 // x000 x000 x000 x000 x000 x000 x000 x000
1334
1335 Value* temp = JOIN_16(temp_lo, temp_hi);
1336
1337 // denormalize if needed
1338 if (conversionType != CONVERT_NONE)
1339 {
1340 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1341 }
1342
1343 vVertexElements[currentVertexElement] = temp;
1344
1345 currentVertexElement += 1;
1346 }
1347 else
1348 {
1349 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1350 }
1351
1352 if (currentVertexElement > 3)
1353 {
1354 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1355 // reset to the next vVertexElement to output
1356 currentVertexElement = 0;
1357 }
1358 }
1359 }
1360 }
1361 else
1362 {
1363 SWR_INVALID("Unsupported conversion type");
1364 }
1365 }
1366
1367 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1368 {
1369 // Unpack tuple args
1370 Value*& vGatherResult = std::get<0>(args);
1371 Value* pVtxOut = std::get<1>(args);
1372 const Instruction::CastOps extendType = std::get<2>(args);
1373 const ConversionType conversionType = std::get<3>(args);
1374 uint32_t &currentVertexElement = std::get<4>(args);
1375 uint32_t &outputElt = std::get<5>(args);
1376 const ComponentEnable compMask = std::get<6>(args);
1377 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1378 Value* (&vVertexElements)[4] = std::get<8>(args);
1379 const uint32_t(&swizzle)[4] = std::get<9>(args);
1380
1381 // cast types
1382 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1383
1384 for (uint32_t i = 0; i < 4; i++)
1385 {
1386 if (!isComponentEnabled(compMask, i))
1387 continue;
1388
1389 if (compCtrl[i] == ComponentControl::StoreSrc)
1390 {
1391 std::vector<uint32_t> vShuffleMasks[4] = {
1392 { 0, 4, 8, 12, 16, 20, 24, 28 }, // x
1393 { 1, 5, 9, 13, 17, 21, 25, 29 }, // y
1394 { 2, 6, 10, 14, 18, 22, 26, 30 }, // z
1395 { 3, 7, 11, 15, 19, 23, 27, 31 }, // w
1396 };
1397
1398 Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
1399 UndefValue::get(v32x8Ty),
1400 vShuffleMasks[swizzle[i]]);
1401
1402 if ((extendType == Instruction::CastOps::SExt) ||
1403 (extendType == Instruction::CastOps::SIToFP)) {
1404 switch (conversionType)
1405 {
1406 case CONVERT_NORMALIZED:
1407 val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
1408 break;
1409 case CONVERT_SSCALED:
1410 val = SI_TO_FP(val, mSimdFP32Ty);
1411 break;
1412 case CONVERT_USCALED:
1413 SWR_INVALID("Type should not be sign extended!");
1414 break;
1415 default:
1416 SWR_ASSERT(conversionType == CONVERT_NONE);
1417 val = S_EXT(val, mSimdInt32Ty);
1418 break;
1419 }
1420 }
1421 else if ((extendType == Instruction::CastOps::ZExt) ||
1422 (extendType == Instruction::CastOps::UIToFP)) {
1423 switch (conversionType)
1424 {
1425 case CONVERT_NORMALIZED:
1426 val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
1427 break;
1428 case CONVERT_SSCALED:
1429 SWR_INVALID("Type should not be zero extended!");
1430 break;
1431 case CONVERT_USCALED:
1432 val = UI_TO_FP(val, mSimdFP32Ty);
1433 break;
1434 default:
1435 SWR_ASSERT(conversionType == CONVERT_NONE);
1436 val = Z_EXT(val, mSimdInt32Ty);
1437 break;
1438 }
1439 }
1440 else
1441 {
1442 SWR_INVALID("Unsupported conversion type");
1443 }
1444
1445 vVertexElements[currentVertexElement++] = val;
1446 }
1447 else
1448 {
1449 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1450 }
1451
1452 if (currentVertexElement > 3)
1453 {
1454 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1455 // reset to the next vVertexElement to output
1456 currentVertexElement = 0;
1457 }
1458 }
1459 }
1460
1461 //////////////////////////////////////////////////////////////////////////
1462 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1463 /// denormalizes if needed, converts to F32 if needed, and positions in
1464 /// the proper SIMD rows to be output to the simdvertex structure
1465 /// @param args: (tuple of args, listed below)
1466 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1467 /// @param pVtxOut - base pointer to output simdvertex struct
1468 /// @param extendType - sign extend or zero extend
1469 /// @param conversionType - type of conversion to apply to the components
1470 /// @param currentVertexElement - reference to the current vVertexElement
1471 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1472 /// @param compMask - component packing mask
1473 /// @param compCtrl - component control val
1474 /// @param vVertexElements[4] - vertex components to output
1475 void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
1476 {
1477 // Unpack tuple args
1478 Value* (&vGatherResult)[2] = std::get<0>(args);
1479 Value* pVtxOut = std::get<1>(args);
1480 const Instruction::CastOps extendType = std::get<2>(args);
1481 const ConversionType conversionType = std::get<3>(args);
1482 uint32_t &currentVertexElement = std::get<4>(args);
1483 uint32_t &outputElt = std::get<5>(args);
1484 const ComponentEnable compMask = std::get<6>(args);
1485 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1486 Value* (&vVertexElements)[4] = std::get<8>(args);
1487
1488 // cast types
1489 Type *vGatherTy = VectorType::get(mInt32Ty, 8);
1490 Type *v32x8Ty = VectorType::get(mInt8Ty, 32);
1491
1492 // have to do extra work for sign extending
1493 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1494 {
1495 // is this a half-precision (16-bit) float?
1496 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1497
1498 Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1499 Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1500
1501 // shuffle mask
1502 Value *vConstMask = C<uint8_t>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1503 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
1504 Value *vi128XY_lo = nullptr;
1505 Value *vi128XY_hi = nullptr;
1506 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1507 {
1508 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now...
1509
1510 Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
1511 Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
1512
1513 Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1514 Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1515
1516 // after pshufb: group components together in each 128bit lane
1517 // 256i - 0 1 2 3 4 5 6 7
1518 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1519
1520 vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1521 vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1522
1523 // after PERMD: move and pack xy components into each 128bit lane
1524 // 256i - 0 1 2 3 4 5 6 7
1525 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1526 }
1527
1528 // do the same for zw components
1529 Value *vi128ZW_lo = nullptr;
1530 Value *vi128ZW_hi = nullptr;
1531 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1532 {
1533 Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
1534 Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
1535
1536 Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1537 Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1538
1539 vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1540 vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1541 }
1542
1543 // init denormalize variables if needed
1544 Instruction::CastOps IntToFpCast;
1545 Value *conversionFactor;
1546
1547 switch (conversionType)
1548 {
1549 case CONVERT_NORMALIZED:
1550 IntToFpCast = Instruction::CastOps::SIToFP;
1551 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1552 break;
1553 case CONVERT_SSCALED:
1554 IntToFpCast = Instruction::CastOps::SIToFP;
1555 conversionFactor = VIMMED1((float)(1.0));
1556 break;
1557 case CONVERT_USCALED:
1558 SWR_INVALID("Type should not be sign extended!");
1559 conversionFactor = nullptr;
1560 break;
1561 default:
1562 SWR_ASSERT(conversionType == CONVERT_NONE);
1563 conversionFactor = nullptr;
1564 break;
1565 }
1566
1567 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1568 for (uint32_t i = 0; i < 4; i++)
1569 {
1570 if (isComponentEnabled(compMask, i))
1571 {
1572 if (compCtrl[i] == ComponentControl::StoreSrc)
1573 {
1574 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1575 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1576 // if x or y, use vi128XY permute result, else use vi128ZW
1577 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1578 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1579
1580 if (bFP)
1581 {
1582 // extract 128 bit lanes and convert each half-float component to full float
1583 Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1584 Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1585
1586 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
1587 }
1588 else
1589 {
1590 // extract 128 bit lanes to sign extend each component
1591 Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1592 Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1593
1594 Value* temp = JOIN_16(temp_lo, temp_hi);
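// JOIN_16 recombines the two SIMD8 halves into a single SIMD16 result.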
1595
1596 // denormalize if needed
1597 if (conversionType != CONVERT_NONE)
1598 {
1599 temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
1600 }
1601
1602 vVertexElements[currentVertexElement] = temp;
1603 }
1604
1605 currentVertexElement += 1;
1606 }
1607 else
1608 {
1609 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1610 }
1611
1612 if (currentVertexElement > 3)
1613 {
1614 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1615 // reset to begin filling the next simdvertex output
1616 currentVertexElement = 0;
1617 }
1618 }
1619 }
1620 }
1621 // else zero extend
1622 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1623 {
1624 // pshufb masks for each component
1625 Value *vConstMask[2];
1626
1627 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
1628 {
1629 // x/z shuffle mask
1630 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1631 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1632 }
1633
1634 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
1635 {
1636 // y/w shuffle mask
1637 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1638 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
1639 }
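// PSHUFB zeroes any destination byte whose mask byte has its high bit set, so the -1
// entries above clear the upper word of each 32-bit lane; moving the selected 16-bit
// component into the low word therefore zero extends it to 32 bits.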
1640
1641 // init denormalize variables if needed
1642 Instruction::CastOps fpCast;
1643 Value* conversionFactor;
1644
1645 switch (conversionType)
1646 {
1647 case CONVERT_NORMALIZED:
1648 fpCast = Instruction::CastOps::UIToFP;
1649 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1650 break;
1651 case CONVERT_USCALED:
1652 fpCast = Instruction::CastOps::UIToFP;
1653 conversionFactor = VIMMED1((float)(1.0f));
1654 break;
1655 case CONVERT_SSCALED:
1656 SWR_INVALID("Type should not be zero extended!");
1657 conversionFactor = nullptr;
1658 break;
1659 default:
1660 SWR_ASSERT(conversionType == CONVERT_NONE);
1661 conversionFactor = nullptr;
1662 break;
1663 }
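// For CONVERT_NORMALIZED the UIToFP result is later multiplied by 1/65535, mapping the
// unsigned 16-bit range to [0.0, 1.0].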
1664
1665 // shuffle enabled components into the lower word of each 32-bit lane, zero-extending to 32 bits
1666 for (uint32_t i = 0; i < 4; i++)
1667 {
1668 if (isComponentEnabled(compMask, i))
1669 {
1670 if (compCtrl[i] == ComponentControl::StoreSrc)
1671 {
1672 // select correct constMask for x/z or y/w pshufb
1673 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1674 // if x or y, use the first gather result (xy), else use the second (zw)
1675 uint32_t selectedGather = (i < 2) ? 0 : 1;
1676
1677 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 halves for the sake of KNL, for now.
1678
1679 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
1680 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
1681
1682 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1683 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1684
1685 // after applying the pshufb mask for the x channel; z uses the same shuffle on the second gather result
1686 // 256i - 0 1 2 3 4 5 6 7
1687 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1688
1689 Value* temp = JOIN_16(temp_lo, temp_hi);
1690
1691 // denormalize if needed
1692 if (conversionType != CONVERT_NONE)
1693 {
1694 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1695 }
1696
1697 vVertexElements[currentVertexElement] = temp;
1698
1699 currentVertexElement += 1;
1700 }
1701 else
1702 {
1703 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1704 }
1705
1706 if (currentVertexElement > 3)
1707 {
1708 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1709 // reset to begin filling the next simdvertex output
1710 currentVertexElement = 0;
1711 }
1712 }
1713 }
1714 }
1715 else
1716 {
1717 SWR_INVALID("Unsupported conversion type");
1718 }
1719 }
1720
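//////////////////////////////////////////////////////////////////////////
/// @brief Takes two SIMDs of gathered 16bpc vertex data, sign or zero extends
/// (or converts from half float), denormalizes if needed, and shuffles the
/// enabled components into vVertexElements. SIMD8 variant of the routine above.
/// @param args - Shuffle16bpcArgs tuple; see the typedef in FetchJit.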
1721 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1722 {
1723 // Unpack tuple args
1724 Value* (&vGatherResult)[2] = std::get<0>(args);
1725 Value* pVtxOut = std::get<1>(args);
1726 const Instruction::CastOps extendType = std::get<2>(args);
1727 const ConversionType conversionType = std::get<3>(args);
1728 uint32_t &currentVertexElement = std::get<4>(args);
1729 uint32_t &outputElt = std::get<5>(args);
1730 const ComponentEnable compMask = std::get<6>(args);
1731 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1732 Value* (&vVertexElements)[4] = std::get<8>(args);
1733
1734 // cast types
1735 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1736 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1737
1738 // have to do extra work for sign extending
1739 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
1740 (extendType == Instruction::CastOps::FPExt))
1741 {
1742 // is this a 16-bit (partial-precision) float that needs FPExt to full float?
1743 bool bFP = (extendType == Instruction::CastOps::FPExt);
1744
1745 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1746 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1747
1748 // shuffle mask
1749 Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1750 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
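// Same per-128-bit-lane grouping as the SIMD16 path above: low (x) words first, then
// high (y) words.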
1751 Value* vi128XY = nullptr;
1752 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) {
1753 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1754 // after pshufb: group components together in each 128bit lane
1755 // 256i - 0 1 2 3 4 5 6 7
1756 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1757
1758 vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1759 // after PERMD: move and pack xy components into each 128bit lane
1760 // 256i - 0 1 2 3 4 5 6 7
1761 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1762 }
1763
1764 // do the same for zw components
1765 Value* vi128ZW = nullptr;
1766 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
1767 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1768 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1769 }
1770
1771 // init denormalize variables if needed
1772 Instruction::CastOps IntToFpCast;
1773 Value* conversionFactor;
1774
1775 switch (conversionType)
1776 {
1777 case CONVERT_NORMALIZED:
1778 IntToFpCast = Instruction::CastOps::SIToFP;
1779 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1780 break;
1781 case CONVERT_SSCALED:
1782 IntToFpCast = Instruction::CastOps::SIToFP;
1783 conversionFactor = VIMMED1((float)(1.0));
1784 break;
1785 case CONVERT_USCALED:
1786 SWR_INVALID("Type should not be sign extended!");
1787 conversionFactor = nullptr;
1788 break;
1789 default:
1790 SWR_ASSERT(conversionType == CONVERT_NONE);
1791 conversionFactor = nullptr;
1792 break;
1793 }
1794
1795 // sign extend all enabled components; once vVertexElements is full, output it to the current simdvertex
1796 for (uint32_t i = 0; i < 4; i++)
1797 {
1798 if (isComponentEnabled(compMask, i))
1799 {
1800 if (compCtrl[i] == ComponentControl::StoreSrc)
1801 {
1802 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1803 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1804 // if x or y, use vi128XY permute result, else use vi128ZW
1805 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1806
1807 if (bFP) {
1808 // extract 128 bit lanes and convert each half-float component to full float
1809 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1810 }
1811 else {
1812 // extract 128 bit lanes to sign extend each component
1813 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1814
1815 // denormalize if needed
1816 if (conversionType != CONVERT_NONE) {
1817 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1818 }
1819 }
1820 currentVertexElement++;
1821 }
1822 else
1823 {
1824 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1825 }
1826
1827 if (currentVertexElement > 3)
1828 {
1829 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1830 // reset to begin filling the next simdvertex output
1831 currentVertexElement = 0;
1832 }
1833 }
1834 }
1835 }
1836 // else zero extend
1837 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1838 {
1839 // pshufb masks for each component
1840 Value* vConstMask[2];
1841 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) {
1842 // x/z shuffle mask
1843 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1844 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1845 }
1846
1847 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) {
1848 // y/w shuffle mask
1849 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1850 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
1851 }
1852
1853 // init denormalize variables if needed
1854 Instruction::CastOps fpCast;
1855 Value* conversionFactor;
1856
1857 switch (conversionType)
1858 {
1859 case CONVERT_NORMALIZED:
1860 fpCast = Instruction::CastOps::UIToFP;
1861 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1862 break;
1863 case CONVERT_USCALED:
1864 fpCast = Instruction::CastOps::UIToFP;
1865 conversionFactor = VIMMED1((float)(1.0f));
1866 break;
1867 case CONVERT_SSCALED:
1868 SWR_INVALID("Type should not be zero extended!");
1869 conversionFactor = nullptr;
1870 break;
1871 default:
1872 SWR_ASSERT(conversionType == CONVERT_NONE);
1873 conversionFactor = nullptr;
1874 break;
1875 }
1876
1877 // shuffle enabled components into the lower word of each 32-bit lane, zero-extending to 32 bits
1878 for (uint32_t i = 0; i < 4; i++)
1879 {
1880 if (isComponentEnabled(compMask, i))
1881 {
1882 if (compCtrl[i] == ComponentControl::StoreSrc)
1883 {
1884 // select correct constMask for x/z or y/w pshufb
1885 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1886 // if x or y, use the first gather result (xy), else use the second (zw)
1887 uint32_t selectedGather = (i < 2) ? 0 : 1;
1888
1889 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1890 // after applying the pshufb mask for the x channel; z uses the same shuffle on the second gather result
1891 // 256i - 0 1 2 3 4 5 6 7
1892 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1893
1894 // denormalize if needed
1895 if (conversionType != CONVERT_NONE)
1896 {
1897 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1898 }
1899 currentVertexElement++;
1900 }
1901 else
1902 {
1903 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1904 }
1905
1906 if (currentVertexElement > 3)
1907 {
1908 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1909 // reset to begin filling the next simdvertex output
1910 currentVertexElement = 0;
1911 }
1912 }
1913 }
1914 }
1915 else
1916 {
1917 SWR_INVALID("Unsupported conversion type");
1918 }
1919 }
1920
1921 //////////////////////////////////////////////////////////////////////////
1922 /// @brief Output a simdvertex worth of elements to the current outputElt
1923 /// @param pVtxOut - base address of VIN output struct
1924 /// @param outputElt - simdvertex offset in VIN to write to
1925 /// @param numEltsToStore - number of simdvertex rows to write out
1926 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1927 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1928 {
1929 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
1930
1931 for (uint32_t c = 0; c < numEltsToStore; ++c)
1932 {
1933 // STORE expects FP32 x vWidth type, just bitcast if needed
1934 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
1935 {
1936 #if FETCH_DUMP_VERTEX
1937 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
1938 #endif
1939 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1940 }
1941 #if FETCH_DUMP_VERTEX
1942 else
1943 {
1944 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
1945 }
1946 #endif
1947 // outputElt * 4 = offsetting by the size of a simdvertex
1948 // + c offsets to a 32bit x vWidth row within the current vertex
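// e.g. with vWidth == 8, each simdvertex occupies 4 consecutive rows of 8 floats,
// so element c of output slot outputElt lands at row (outputElt * 4 + c) of pVtxOut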
1949 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
1950 STORE(vVertexElements[c], dest);
1951 }
1952 }
1953
1954 //////////////////////////////////////////////////////////////////////////
1955 /// @brief Generates a constant vector of values based on the
1956 /// ComponentControl value
1957 /// @param ctrl - ComponentControl value
1958 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1959 {
1960 switch (ctrl)
1961 {
1962 case NoStore:
1963 return VUNDEF_I();
1964 case Store0:
1965 return VIMMED1(0);
1966 case Store1Fp:
1967 return VIMMED1(1.0f);
1968 case Store1Int:
1969 return VIMMED1(1);
1970 case StoreVertexId:
1971 {
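// For SIMD16 the vertex IDs arrive as two SIMD8 halves (VertexID and VertexID2),
// which are bitcast to float vectors and joined into one SIMD16 register.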
1972 if (mVWidth == 16)
1973 {
1974 Type* pSimd8FPTy = VectorType::get(mFP32Ty, 8);
1975 Value *pIdLo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), pSimd8FPTy);
1976 Value *pIdHi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), pSimd8FPTy);
1977 return JOIN_16(pIdLo, pIdHi);
1978 }
1979 else
1980 {
1981 return BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1982 }
1983 }
1984 case StoreInstanceId:
1985 {
1986 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
1987 return VBROADCAST(pId);
1988 }
1989
1990
1991 case StoreSrc:
1992 default:
1993 SWR_INVALID("Invalid component control");
1994 return VUNDEF_I();
1995 }
1996 }
1997
1998 //////////////////////////////////////////////////////////////////////////
1999 /// @brief Returns the enable mask for the specified component.
2000 /// @param enableMask - enable bits
2001 /// @param component - component to check if enabled.
2002 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2003 {
2004 switch (component)
2005 {
2006 // X
2007 case 0: return (enableMask & ComponentEnable::X);
2008 // Y
2009 case 1: return (enableMask & ComponentEnable::Y);
2010 // Z
2011 case 2: return (enableMask & ComponentEnable::Z);
2012 // W
2013 case 3: return (enableMask & ComponentEnable::W);
2014
2015 default: return false;
2016 }
2017 }
2018
2019 // We don't want two threads compiling the same fetch shader simultaneously;
2020 // concurrent compilation causes problems in the JIT cache implementation.
2021 // This is currently only an issue for fetch.
2022 static std::mutex gFetchCodegenMutex;
2023
2024 //////////////////////////////////////////////////////////////////////////
2025 /// @brief JITs from fetch shader IR
2026 /// @param hJitMgr - JitManager handle
2027 /// @param func - LLVM function IR
2028 /// @return PFN_FETCH_FUNC - pointer to fetch code
2029 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2030 {
2031 const llvm::Function* func = (const llvm::Function*)hFunc;
2032 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2033 PFN_FETCH_FUNC pfnFetch;
2034
2035 gFetchCodegenMutex.lock();
2036 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2037 // MCJIT finalizes modules the first time you JIT code from them. Once finalized, you cannot add new IR to the module.
2038 pJitMgr->mIsModuleFinalized = true;
2039
2040 #if defined(KNOB_SWRC_TRACING)
2041 char fName[1024];
2042 const char *funcName = func->getName().data();
2043 snprintf(fName, sizeof(fName), "%s.bin", funcName);
2044 FILE *fd = fopen(fName, "wb");
2045 fwrite((void *)pfnFetch, 1, 2048, fd);
2046 fclose(fd);
2047 #endif
2048
2049 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2050 gFetchCodegenMutex.unlock();
2051
2052
2053
2054 return pfnFetch;
2055 }
2056
2057 //////////////////////////////////////////////////////////////////////////
2058 /// @brief JIT compiles fetch shader
2059 /// @param hJitMgr - JitManager handle
2060 /// @param state - fetch state to build function from
2061 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2062 {
2063 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2064
2065 pJitMgr->SetupNewModule();
2066
2067 FetchJit theJit(pJitMgr);
2068 HANDLE hFunc = theJit.Create(state);
2069
2070 return JitFetchFunc(hJitMgr, hFunc);
2071 }
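//////////////////////////////////////////////////////////////////////////
// Illustrative usage (hypothetical caller, not part of this file): a driver builds a
// FETCH_COMPILE_STATE describing the vertex layout, compiles it once, and caches the
// returned function pointer, e.g.:
//
//     FETCH_COMPILE_STATE state;     // assumed to be populated by the driver
//     // ... fill in vertex layout, index type, instancing info ...
//     PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, state);
//     // pfnFetch can now be invoked per draw to fetch a simd-width of vertices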