1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder_gfx_mem.h"
32 #include "jit_api.h"
33 #include "fetch_jit.h"
34 #include "gen_state_llvm.h"
35 #include "functionpasses/passes.h"
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
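// How gathered data is converted before being handed to the shader (as used below):
//   CONVERT_NONE       - leave the raw bits untouched
//   CONVERT_NORMALIZED - scale UNORM/SNORM data into [0,1] / [-1,1] (see ConvertFormat and the Shuffle* helpers)
//   CONVERT_USCALED    - unsigned integer converted directly to float
//   CONVERT_SSCALED    - signed integer converted directly to float
//   CONVERT_SFIXED     - 16.16 fixed point, scaled by 1/65536 into float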
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public BuilderGfxMem
56 {
57 FetchJit(JitManager* pJitMgr) :
58 BuilderGfxMem(pJitMgr)
59 {}
60
61 Function* Create(const FETCH_COMPILE_STATE& fetchState);
62
63 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
64 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
65 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
66 template<typename T> Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);
67
68 // package up Shuffle*bpcGatherd args into a tuple for convenience
69 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
70 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
71 const uint32_t(&)[4]> Shuffle8bpcArgs;
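// Tuple layout (matches the std::get<> unpacking in Shuffle8bpcGatherd*):
//   0: gathered result, 1: pVtxOut, 2: extend cast op, 3: conversion type,
//   4: currentVertexElement (ref), 5: outputElt (ref), 6: component enable mask,
//   7: component controls, 8: vVertexElements out, 9: component swizzle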
72
73 void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args);
74 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
75
76 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
77 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
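// Same layout as Shuffle8bpcArgs, minus the trailing swizzle; element 0 is the
// pair of gathered xy/zw results (see the unpacking in Shuffle16bpcGather*).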
78
79 void Shuffle16bpcGather16(Shuffle16bpcArgs &args);
80 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
81
82 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
83
84 Value *GenerateCompCtrlVector(const ComponentControl ctrl);
85
86 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
87
88 bool IsOddFormat(SWR_FORMAT format);
89 bool IsUniformFormat(SWR_FORMAT format);
90 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
91 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
92 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
93
94 Value* mpFetchInfo;
95 };
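//////////////////////////////////////////////////////////////////////////
/// High-level flow of the jitted fetch shader built by FetchJit::Create:
///   1. load/validate a SIMD of indices (8/16/32 bit, optional OOB check)
///   2. compute and store out VertexID (and the cut mask, if enabled)
///   3. JitGatherVertices: per input element, gather from the vertex buffers,
///      convert/shuffle into place, and store to the output simdvertex
//////////////////////////////////////////////////////////////////////////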
96
97 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
98 {
99 std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
100 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
101
102 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
103 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
104
105 fetch->getParent()->setModuleIdentifier(fetch->getName());
106
107 IRB()->SetInsertPoint(entry);
108
109 auto argitr = fetch->arg_begin();
110
111 // Fetch shader arguments
112 Value* privateContext = &*argitr; ++argitr;
113 privateContext->setName("privateContext");
114 SetPrivateContext(privateContext);
115
116 mpFetchInfo = &*argitr; ++argitr;
117 mpFetchInfo->setName("fetchInfo");
118 Value* pVtxOut = &*argitr;
119 pVtxOut->setName("vtxOutput");
120
121 uint32_t baseWidth = mVWidth;
122
123 SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);
124
125 // Override builder target width to force 16-wide SIMD
126 #if USE_SIMD16_SHADERS
127 SetTargetWidth(16);
128 #endif
129
130 pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));
131
132 // SWR_FETCH_CONTEXT::pStreams
133 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
134 streams->setName("pStreams");
135
136 // SWR_FETCH_CONTEXT::pIndices
137 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_xpIndices});
138 indices->setName("pIndices");
139
140 // SWR_FETCH_CONTEXT::pLastIndex
141 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_xpLastIndex});
142 pLastIndex->setName("pLastIndex");
143
144 Value* vIndices;
145 switch(fetchState.indexType)
146 {
147 case R8_UINT:
148 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
149 if(fetchState.bDisableIndexOOBCheck)
150 {
151 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
152 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
153 }
154 else
155 {
156 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
157 }
158 break;
159 case R16_UINT:
160 if(fetchState.bDisableIndexOOBCheck)
161 {
162 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
163 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
164 }
165 else
166 {
167 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
168 }
169 break;
170 case R32_UINT:
171 vIndices = fetchState.bDisableIndexOOBCheck ? LOAD(indices, "", PointerType::get(mSimdInt32Ty, 0), GFX_MEM_CLIENT_FETCH)
172 : GetSimdValid32bitIndices(indices, pLastIndex);
173 break; // incoming type is already 32bit int
174 default:
175 SWR_INVALID("Unsupported index type");
176 vIndices = nullptr;
177 break;
178 }
179
180 if(fetchState.bForceSequentialAccessEnable)
181 {
182 Value* pOffsets = mVWidth == 8 ? C({ 0, 1, 2, 3, 4, 5, 6, 7 }) :
183 C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
184
185 // VertexData buffers are accessed sequentially; the index is equal to the vertex number
186 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
187 vIndices = ADD(vIndices, pOffsets);
188 }
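// e.g. with mVWidth == 8 and StartVertex == 100, vIndices ends up as
// { 100, 101, 102, 103, 104, 105, 106, 107 }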
189
190 Value* vVertexId = vIndices;
191 if (fetchState.bVertexIDOffsetEnable)
192 {
193 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
194 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
195 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
196 vVertexId = ADD(vIndices, vBaseVertex);
197 vVertexId = ADD(vVertexId, vStartVertex);
198 }
199
200 // store out vertex IDs
201 if (mVWidth == 16)
202 {
203 // store out in simd8 halves until core supports 16-wide natively
204 auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
205 auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
206 STORE(vVertexIdLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
207 STORE(vVertexIdHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
208 }
209 else if (mVWidth == 8)
210 {
211 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
212 }
213
214 // store out cut mask if enabled
215 if (fetchState.bEnableCutIndex)
216 {
217 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
218 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
219
220 if (mVWidth == 16)
221 {
222 auto cutMaskLo = EXTRACT_16(cutMask, 0);
223 auto cutMaskHi = EXTRACT_16(cutMask, 1);
224 STORE(cutMaskLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
225 STORE(cutMaskHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
226 }
227 else if (mVWidth == 8)
228 {
229 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
230 }
231 }
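// The cut mask marks lanes whose index matched the cut index; it is consumed
// downstream by primitive assembly to restart strip primitives.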
232
233 // Fetch attributes from memory and output to a simdvertex struct
234 JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
235
236 RET_VOID();
237
238 JitManager::DumpToFile(fetch, "src");
239
240 #if defined(_DEBUG)
241 verifyFunction(*fetch);
242 #endif
243
244 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
245
246 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
247 setupPasses.add(createBreakCriticalEdgesPass());
248 setupPasses.add(createCFGSimplificationPass());
249 setupPasses.add(createEarlyCSEPass());
250 setupPasses.add(createPromoteMemoryToRegisterPass());
251
252 setupPasses.run(*fetch);
253
254 JitManager::DumpToFile(fetch, "se");
255
256 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
257
258 ///@todo Haven't touched these either. Need to remove some of these and add others.
259 optPasses.add(createCFGSimplificationPass());
260 optPasses.add(createEarlyCSEPass());
261 optPasses.add(createInstructionCombiningPass());
262 optPasses.add(createInstructionSimplifierPass());
263 optPasses.add(createConstantPropagationPass());
264 optPasses.add(createSCCPPass());
265 optPasses.add(createAggressiveDCEPass());
266
267 optPasses.run(*fetch);
268
269 optPasses.add(createLowerX86Pass(JM(), this));
270 optPasses.run(*fetch);
271
272 JitManager::DumpToFile(fetch, "opt");
273
274
275 // Revert 16-wide override
276 #if USE_SIMD16_SHADERS
277 SetTargetWidth(baseWidth);
278 #endif
279
280 return fetch;
281 }
282
283 // returns true for odd formats that require special gather handling
284 bool FetchJit::IsOddFormat(SWR_FORMAT format)
285 {
286 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
287 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
288 {
289 return true;
290 }
291 return false;
292 }
293
294 // format is uniform if all components are the same size and type
295 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
296 {
297 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
298 uint32_t bpc0 = info.bpc[0];
299 uint32_t type0 = info.type[0];
300
301 for (uint32_t c = 1; c < info.numComps; ++c)
302 {
303 if (bpc0 != info.bpc[c] || type0 != info.type[c])
304 {
305 return false;
306 }
307 }
308 return true;
309 }
310
311 // unpacks components based on format
312 // foreach component in the pixel
313 // mask off everything but this component
314 // shift component to LSB
315 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
316 {
317 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
318
319 uint32_t bitOffset = 0;
320 for (uint32_t c = 0; c < info.numComps; ++c)
321 {
322 uint32_t swizzledIndex = info.swizzle[c];
323 uint32_t compBits = info.bpc[c];
324 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
325 Value* comp = AND(vInput, bitmask);
326 comp = LSHR(comp, bitOffset);
327
328 result[swizzledIndex] = comp;
329 bitOffset += compBits;
330 }
331 }
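// Worked example for a 10-10-10-2 layout: component 0 is AND'ed with 0x3FF and
// shifted right by 0, component 1 with 0xFFC00 then >> 10, component 2 with
// 0x3FF00000 then >> 20, and component 3 with 0xC0000000 then >> 30, each
// landing in result[info.swizzle[c]].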
332
333 // gather for odd component size formats
334 // gather a SIMD of full pixels per lane, then shift/mask to move each component into its
335 // own vector
336 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
337 {
338 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
339
340 // only works if pixel size is <= 32bits
341 SWR_ASSERT(info.bpp <= 32);
342
343 Value *pGather;
344 if (info.bpp == 32)
345 {
346 pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
347 }
348 else
349 {
350 // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
351 Value *pMem = ALLOCA(mSimdInt32Ty);
352 STORE(VIMMED1(0u), pMem);
353
354 pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
355 Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
356
357 for (uint32_t lane = 0; lane < mVWidth; ++lane)
358 {
359 // Get index
360 Value* index = VEXTRACT(pOffsets, C(lane));
361 Value* mask = VEXTRACT(pMask, C(lane));
362 switch (info.bpp)
363 {
364 case 8:
365 {
366 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
367 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
368 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
369 break;
370 }
371
372 case 16:
373 {
374 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
375 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
376 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
377 break;
378 }
379 break;
380
381 case 24:
382 {
383 // First 16-bits of data
384 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
385 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
386 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
387
388 // Last 8-bits of data
389 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
390 pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
391 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
392 break;
393 }
394
395 default:
396 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
397 break;
398 }
399 }
400
401 pGather = LOAD(pMem);
402 }
403
404 for (uint32_t comp = 0; comp < 4; ++comp)
405 {
406 pResult[comp] = VIMMED1((int)info.defaults[comp]);
407 }
408
409 UnpackComponents(format, pGather, pResult);
410
411 // cast to fp32
412 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
413 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
414 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
415 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
416 }
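// e.g. a 32bpp odd format such as R10G10B10A2 takes the single GATHERDD path above,
// while sub-32bpp odd formats such as B5G6R5 fall back to the per-lane scalar loads
// so the gather never reads past the end of the buffer.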
417
418 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
419 {
420 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
421
422 for (uint32_t c = 0; c < info.numComps; ++c)
423 {
424 uint32_t compIndex = info.swizzle[c];
425
426 // skip any conversion on UNUSED components
427 if (info.type[c] == SWR_TYPE_UNUSED)
428 {
429 continue;
430 }
431
432 if (info.isNormalized[c])
433 {
434 if (info.type[c] == SWR_TYPE_SNORM)
435 {
436 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
437
438 /// result = c * (1.0f / (2^(n-1) - 1))
439 uint32_t n = info.bpc[c];
440 uint32_t pow2 = 1 << (n - 1);
441 float scale = 1.0f / (float)(pow2 - 1);
442 Value *vScale = VIMMED1(scale);
443 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
444 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
445 texels[compIndex] = FMUL(texels[compIndex], vScale);
446 }
447 else
448 {
449 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
450
451 /// result = c * (1.0f / (2^n - 1))
452 uint32_t n = info.bpc[c];
453 uint32_t pow2 = 1 << n;
454 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
455 if (n == 24)
456 {
457 float scale = (float)(pow2 - 1);
458 Value* vScale = VIMMED1(scale);
459 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
460 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
461 texels[compIndex] = FDIV(texels[compIndex], vScale);
462 }
463 else
464 {
465 float scale = 1.0f / (float)(pow2 - 1);
466 Value *vScale = VIMMED1(scale);
467 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
468 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
469 texels[compIndex] = FMUL(texels[compIndex], vScale);
470 }
471 }
472 continue;
473 }
474 }
475 }
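// e.g. for an 8-bit UNORM component the scale is 1/255, so raw 255 -> 1.0f;
// for 8-bit SNORM the scale is 1/127, so +127 -> 1.0f and -127 -> -1.0f
// (the @todo above notes the most-negative encoding should also map to -1.0f).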
476
477 //////////////////////////////////////////////////////////////////////////
478 /// @brief Loads attributes from memory using AVX2 GATHER(s)
479 /// @param fetchState - info about attributes to be fetched from memory
480 /// @param streams - value pointer to the current vertex stream
481 /// @param vIndices - vector value of indices to gather
482 /// @param pVtxOut - value pointer to output simdvertex struct
483 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
484 Value* streams, Value* vIndices, Value* pVtxOut)
485 {
486 uint32_t currentVertexElement = 0;
487 uint32_t outputElt = 0;
488 Value* vVertexElements[4];
489
490 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
491 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
492 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
493 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
494 curInstance->setName("curInstance");
495
496 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
497 {
498 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
499
500 // skip element if all components are disabled
501 if (ied.ComponentPacking == ComponentEnable::NONE)
502 {
503 continue;
504 }
505
506 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
507 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
508 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
509
510 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
511
512 // VGATHER* takes an *i8 src pointer
513 Value *pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));
514
515 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
516 Value *vStride = VBROADCAST(stride);
517
518 // max vertex index that is fully in bounds
519 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
520 maxVertex = LOAD(maxVertex);
521
522 Value *minVertex = NULL;
523 if (fetchState.bPartialVertexBuffer)
524 {
525 // min vertex index for low bounds OOB checking
526 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
527 minVertex = LOAD(minVertex);
528 }
529
530 if (fetchState.bInstanceIDOffsetEnable)
531 {
532 // the InstanceID (curInstance) value is offset by StartInstanceLocation
533 curInstance = ADD(curInstance, startInstance);
534 }
535
536 Value *vCurIndices;
537 Value *startOffset;
538 Value *vInstanceStride = VIMMED1(0);
539
540 if (ied.InstanceEnable)
541 {
542 Value* stepRate = C(ied.InstanceAdvancementState);
543
544 // prevent a div by 0 for 0 step rate
545 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
546 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
547
548 // calc the current offset into instanced data buffer
549 Value* calcInstance = UDIV(curInstance, stepRate);
550
551 // if step rate is 0, every instance gets instance 0
552 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
553
554 vCurIndices = VBROADCAST(calcInstance);
555 startOffset = startInstance;
556 }
557 else if (ied.InstanceStrideEnable)
558 {
559 // grab the instance advancement state, which determines the stride in bytes from one instance to the next
560 Value* stepRate = C(ied.InstanceAdvancementState);
561 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
562
563 // offset indices by baseVertex
564 vCurIndices = ADD(vIndices, vBaseVertex);
565
566 startOffset = startVertex;
567 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
568 }
569 else
570 {
571 // offset indices by baseVertex
572 vCurIndices = ADD(vIndices, vBaseVertex);
573 startOffset = startVertex;
574 }
575
576 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
577 // do 64bit address offset calculations.
578
579 // calculate byte offset to the start of the VB
580 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
581 pStreamBase = GEP(pStreamBase, baseOffset);
582 Value* pStreamBaseGFX = ADD(stream, baseOffset);
583
584 // if we have a start offset, subtract from max vertex. Used for OOB check
585 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
586 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
587 // if we have a negative value, we're already OOB. clamp at 0.
588 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
589
590 if (fetchState.bPartialVertexBuffer)
591 {
592 // similarly for min vertex
593 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
594 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
595 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
596 }
597
598 // Load the in bounds size of a partially valid vertex
599 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
600 partialInboundsSize = LOAD(partialInboundsSize);
601 Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
602 Value *vBpp = VBROADCAST(C(info.bpp));
603 Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
604
605 // is the element <= the partially valid size?
606 Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
607
608 // override cur indices with 0 if pitch is 0
609 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
610 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
611
612 // are vertices partially OOB?
613 Value* vMaxVertex = VBROADCAST(maxVertex);
614 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
615
616 // are vertices fully in bounds?
617 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
618
619 Value *vGatherMask;
620 if (fetchState.bPartialVertexBuffer)
621 {
622 // are vertices below minVertex limit?
623 Value *vMinVertex = VBROADCAST(minVertex);
624 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
625
626 // only fetch lanes that pass both tests
627 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
628 }
629 else
630 {
631 vGatherMask = vMaxGatherMask;
632 }
633
634 // blend in any partially OOB indices that have valid elements
635 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
636
637 // calculate the actual offsets into the VB
638 Value* vOffsets = MUL(vCurIndices, vStride);
639 vOffsets = ADD(vOffsets, vAlignmentOffsets);
640
641 // if instance stride enable is:
642 // true - add product of the instanceID and advancement state to the offset into the VB
643 // false - value of vInstanceStride has been initialized to zero
644 vOffsets = ADD(vOffsets, vInstanceStride);
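// e.g. a non-instanced element with index 5, pitch 32 and AlignedByteOffset 12
// gathers from byte offset 5 * 32 + 12 = 172 within the stream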
645
646 // Packing and component control
647 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
648 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
649 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
650
651 // Special gather/conversion for formats without equal component sizes
652 if (IsOddFormat((SWR_FORMAT)ied.Format))
653 {
654 Value *pResults[4];
655 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
656 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
657
658 for (uint32_t c = 0; c < 4; c += 1)
659 {
660 if (isComponentEnabled(compMask, c))
661 {
662 vVertexElements[currentVertexElement++] = pResults[c];
663 if (currentVertexElement > 3)
664 {
665 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
666 // reset to the next vVertexElement to output
667 currentVertexElement = 0;
668 }
669 }
670 }
671 }
672 else if(info.type[0] == SWR_TYPE_FLOAT)
673 {
674 ///@todo: support 64 bit vb accesses
675 Value *gatherSrc = VIMMED1(0.0f);
676
677 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
678 "Unsupported format for standard gather fetch.");
679
680 // Gather components from memory to store in a simdvertex structure
681 switch (bpc)
682 {
683 case 16:
684 {
685 Value *vGatherResult[2];
686
687 // if we have at least one component out of x or y to fetch
688 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
689 {
690 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
691 // e.g. result of first 8x32bit integer gather for 16bit components
692 // 256i - 0 1 2 3 4 5 6 7
693 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
694 //
695 }
696
697 // if we have at least one component out of z or w to fetch
698 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
699 {
700 // offset base to the next components(zw) in the vertex to gather
701 pStreamBase = GEP(pStreamBase, C((char)4));
702
703 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
704 // e.g. result of second 8x32bit integer gather for 16bit components
705 // 256i - 0 1 2 3 4 5 6 7
706 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
707 //
708 }
709
710 // if we have at least one component to shuffle into place
711 if (compMask)
712 {
713 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
714 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
715
716 // Shuffle gathered components into place in simdvertex struct
717 mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args); // outputs to vVertexElements ref
718 }
719 }
720 break;
721 case 32:
722 {
723 for (uint32_t i = 0; i < 4; i += 1)
724 {
725 if (isComponentEnabled(compMask, i))
726 {
727 // if we need to gather the component
728 if (compCtrl[i] == StoreSrc)
729 {
730 // Gather a SIMD of vertices
731 // APIs allow a 4GB range for offsets
732 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
733 // But, we know that elements must be aligned for FETCH. :)
734 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
735 Value *vShiftedOffsets = LSHR(vOffsets, 1);
736 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBaseGFX, vShiftedOffsets, vGatherMask, 2, GFX_MEM_CLIENT_FETCH);
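// e.g. a byte offset of 0x80000004 would sign-extend negative as a 32-bit gather
// index; LSHR by 1 gives 0x40000002, and the scale of 2 passed to GATHERPS
// rebuilds the original offset without the sign-extension problem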
737 }
738 else
739 {
740 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
741 }
742
743 if (currentVertexElement > 3)
744 {
745 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
746 // reset to the next vVertexElement to output
747 currentVertexElement = 0;
748 }
749 }
750
751 // offset base to the next component in the vertex to gather
752 pStreamBase = GEP(pStreamBase, C((char)4));
753 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
754 }
755 }
756 break;
757 case 64:
758 {
759 for (uint32_t i = 0; i < 4; i += 1)
760 {
761 if (isComponentEnabled(compMask, i))
762 {
763 // if we need to gather the component
764 if (compCtrl[i] == StoreSrc)
765 {
766 Value* vShufLo;
767 Value* vShufHi;
768 Value* vShufAll;
769
770 if (mVWidth == 8)
771 {
772 vShufLo = C({ 0, 1, 2, 3 });
773 vShufHi = C({ 4, 5, 6, 7 });
774 vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
775 }
776 else
777 {
778 SWR_ASSERT(mVWidth == 16);
779 vShufLo = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
780 vShufHi = C({ 8, 9, 10, 11, 12, 13, 14, 15 });
781 vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
782 }
783
784 Value *vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
785 Value *vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
786
787 Value *vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
788 Value *vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
789
790 Value *vZeroDouble = VECTOR_SPLAT(mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
791
792 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
793 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
794
795 pGatherLo = VCVTPD2PS(pGatherLo);
796 pGatherHi = VCVTPD2PS(pGatherHi);
797
798 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
799
800 vVertexElements[currentVertexElement++] = pGather;
801 }
802 else
803 {
804 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
805 }
806
807 if (currentVertexElement > 3)
808 {
809 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
810 // reset to the next vVertexElement to output
811 currentVertexElement = 0;
812 }
813 }
814
815 // offset base to the next component in the vertex to gather
816 pStreamBase = GEP(pStreamBase, C((char)8));
817 }
818 }
819 break;
820 default:
821 SWR_INVALID("Tried to fetch invalid FP format");
822 break;
823 }
824 }
825 else
826 {
827 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
828 ConversionType conversionType = CONVERT_NONE;
829
830 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
831 "Unsupported format for standard gather fetch.");
832
833 switch(info.type[0])
834 {
835 case SWR_TYPE_UNORM:
836 conversionType = CONVERT_NORMALIZED;
837 case SWR_TYPE_UINT:
838 extendCastType = Instruction::CastOps::ZExt;
839 break;
840 case SWR_TYPE_SNORM:
841 conversionType = CONVERT_NORMALIZED;
842 case SWR_TYPE_SINT:
843 extendCastType = Instruction::CastOps::SExt;
844 break;
845 case SWR_TYPE_USCALED:
846 conversionType = CONVERT_USCALED;
847 extendCastType = Instruction::CastOps::UIToFP;
848 break;
849 case SWR_TYPE_SSCALED:
850 conversionType = CONVERT_SSCALED;
851 extendCastType = Instruction::CastOps::SIToFP;
852 break;
853 case SWR_TYPE_SFIXED:
854 conversionType = CONVERT_SFIXED;
855 extendCastType = Instruction::CastOps::SExt;
856 break;
857 default:
858 break;
859 }
860
861 // value substituted when component of gather is masked
862 Value* gatherSrc = VIMMED1(0);
863
864 // Gather components from memory to store in a simdvertex structure
865 switch (bpc)
866 {
867 case 8:
868 {
869 // if we have at least one component to fetch
870 if (compMask)
871 {
872 Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
873 // e.g. result of an 8x32bit integer gather for 8bit components
874 // 256i - 0 1 2 3 4 5 6 7
875 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
876
877 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
878 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
879
880 // Shuffle gathered components into place in simdvertex struct
881 mVWidth == 16 ? Shuffle8bpcGatherd16(args) : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
882 }
883 }
884 break;
885 case 16:
886 {
887 Value *vGatherResult[2];
888
889 // if we have at least one component out of x or y to fetch
890 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
891 {
892 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
893 // e.g. result of first 8x32bit integer gather for 16bit components
894 // 256i - 0 1 2 3 4 5 6 7
895 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
896 //
897 }
898
899 // if we have at least one component out of z or w to fetch
900 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
901 {
902 // offset base to the next components(zw) in the vertex to gather
903 pStreamBase = GEP(pStreamBase, C((char)4));
904
905 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
906 // e.g. result of second 8x32bit integer gather for 16bit components
907 // 256i - 0 1 2 3 4 5 6 7
908 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
909 //
910 }
911
912 // if we have at least one component to shuffle into place
913 if (compMask)
914 {
915 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
916 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
917
918 // Shuffle gathered components into place in simdvertex struct
919 mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args); // outputs to vVertexElements ref
920 }
921 }
922 break;
923 case 32:
924 {
925 // Gather components from memory and shuffle into place in the simdvertex struct
926 for (uint32_t i = 0; i < 4; i++)
927 {
928 if (isComponentEnabled(compMask, i))
929 {
930 // if we need to gather the component
931 if (compCtrl[i] == StoreSrc)
932 {
933 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
934
935 if (conversionType == CONVERT_USCALED)
936 {
937 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
938 }
939 else if (conversionType == CONVERT_SSCALED)
940 {
941 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
942 }
943 else if (conversionType == CONVERT_SFIXED)
944 {
945 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
946 }
947
948 vVertexElements[currentVertexElement++] = pGather;
949
950 // e.g. result of a single 8x32bit integer gather for 32bit components
951 // 256i - 0 1 2 3 4 5 6 7
952 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
953 }
954 else
955 {
956 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
957 }
958
959 if (currentVertexElement > 3)
960 {
961 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
962
963 // reset to the next vVertexElement to output
964 currentVertexElement = 0;
965 }
966
967 }
968
969 // offset base to the next component in the vertex to gather
970 pStreamBase = GEP(pStreamBase, C((char)4));
971 }
972 }
973 break;
974 }
975 }
976 }
977
978 // if we have a partially filled vVertexElement struct, output it
979 if (currentVertexElement > 0)
980 {
981 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
982 }
983 }
984
985 typedef void*(*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va);
986 extern "C" void GetSimdValid8bitIndicesGfx(gfxptr_t indices, gfxptr_t lastIndex, uint32_t vWidth, PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, void* pdc, uint32_t* outIndices);
987 extern "C" void GetSimdValid16bitIndicesGfx(gfxptr_t indices, gfxptr_t lastIndex, uint32_t vWidth, PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, void* pdc, uint32_t* outIndices);
988
989 template<typename T> Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
990 {
991 SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty, "Function expects gfxptr_t for both input parameters.");
992
993 Type* Ty = nullptr;
994
995 static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t), "Unsupported type for use with GetSimdValidIndicesHelper<T>");
996 constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
997 if (bSize)
998 {
999 Ty = mInt16PtrTy;
1000 }
1001 else if (sizeof(T) == sizeof(uint8_t))
1002 {
1003 Ty = mInt8PtrTy;
1004 }
1005 else
1006 {
1007 SWR_ASSERT(false, "This should never happen as per static_assert above.");
1008 }
1009
1010 Value* vIndices = VUNDEF_I();
1011
1012 {
1013 // store 0 index on stack to be used to conditionally load from if index address is OOB
1014 Value* pZeroIndex = ALLOCA(Ty);
1015 STORE(C((T)0), pZeroIndex);
1016
1017 // Load a SIMD of index pointers
1018 for (int64_t lane = 0; lane < mVWidth; lane++)
1019 {
1020 // Calculate the address of the requested index
1021 Value *pIndex = GEP(pIndices, C(lane), Ty);
1022
1023 pLastIndex = INT_TO_PTR(pLastIndex, Ty);
1024
1025 // check if the address is less than the max index,
1026 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1027
1028 // if valid, load the index. if not, load 0 from the stack
1029 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1030 Value *index = LOAD(pValid, "valid index", Ty, GFX_MEM_CLIENT_FETCH);
1031
1032 // zero extended index to 32 bits and insert into the correct simd lane
1033 index = Z_EXT(index, mInt32Ty);
1034 vIndices = VINSERT(vIndices, index, lane);
1035 }
1036 }
1037
1038 return vIndices;
1039 }
1040
1041 //////////////////////////////////////////////////////////////////////////
1042 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1043 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1044 /// support
1045 /// @param pIndices - pointer to 8 bit indices
1046 /// @param pLastIndex - pointer to last valid index
1047 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1048 {
1049 return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);
1050 }
1051
1052 //////////////////////////////////////////////////////////////////////////
1053 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1054 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1055 /// support
1056 /// @param pIndices - pointer to 16 bit indices
1057 /// @param pLastIndex - pointer to last valid index
1058 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1059 {
1060 return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);
1061 }
1062
1063 //////////////////////////////////////////////////////////////////////////
1064 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1065 /// @param pIndices - pointer to 32 bit indices
1066 /// @param pLastIndex - pointer to last valid index
1067 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1068 {
1069 DataLayout dL(JM()->mpCurrentModule);
1070 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1071 Value* iLastIndex = pLastIndex;
1072 Value* iIndices = pIndices;
1073
1074 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1075 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1076 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1077 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1078
1079 // create a vector of index counts from the base index ptr passed into the fetch
1080 Constant* vIndexOffsets;
1081 if (mVWidth == 8)
1082 {
1083 vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
1084 }
1085 else
1086 {
1087 vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
1088 }
1089
1090 // compare index count to the max valid index
1091 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1092 // vIndexOffsets 0 1 2 3 4 5 6 7
1093 // ------------------------------
1094 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1095 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1096 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1097 Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
1098
1099 // Load the indices; OOB loads 0
1100 pIndices = BITCAST(pIndices, PointerType::get(mSimdInt32Ty, 0));
1101 return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0));
1102 }
1103
1104 //////////////////////////////////////////////////////////////////////////
1105 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1106 /// denormalizes if needed, converts to F32 if needed, and positions in
1107 /// the proper SIMD rows to be output to the simdvertex structure
1108 /// @param args: (tuple of args, listed below)
1109 /// @param vGatherResult - 8 gathered 8bpc vertices
1110 /// @param pVtxOut - base pointer to output simdvertex struct
1111 /// @param extendType - sign extend or zero extend
1112 /// @param conversionType - type of conversion to apply to the gathered data
1113 /// @param currentVertexElement - reference to the current vVertexElement
1114 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1115 /// @param compMask - component packing mask
1116 /// @param compCtrl - component control val
1117 /// @param vVertexElements[4] - vertex components to output
1118 /// @param swizzle[4] - component swizzle location
1119 void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
1120 {
1121 // Unpack tuple args
1122 Value*& vGatherResult = std::get<0>(args);
1123 Value* pVtxOut = std::get<1>(args);
1124 const Instruction::CastOps extendType = std::get<2>(args);
1125 const ConversionType conversionType = std::get<3>(args);
1126 uint32_t &currentVertexElement = std::get<4>(args);
1127 uint32_t &outputElt = std::get<5>(args);
1128 const ComponentEnable compMask = std::get<6>(args);
1129 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1130 Value* (&vVertexElements)[4] = std::get<8>(args);
1131 const uint32_t(&swizzle)[4] = std::get<9>(args);
1132
1133 // cast types
1134 Type *vGatherTy = VectorType::get(mInt32Ty, 8);
1135 Type *v32x8Ty = VectorType::get(mInt8Ty, 32);
1136
1137 // have to do extra work for sign extending
1138 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1139 {
1140 Type *v16x8Ty = VectorType::get(mInt8Ty, 16); // 16x8bit ints in a 128bit lane
1141 Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1142
1143 // shuffle mask, including any swizzling
1144 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1145 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1146 Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
1147 char(y), char(y + 4), char(y + 8), char(y + 12),
1148 char(z), char(z + 4), char(z + 8), char(z + 12),
1149 char(w), char(w + 4), char(w + 8), char(w + 12),
1150 char(x), char(x + 4), char(x + 8), char(x + 12),
1151 char(y), char(y + 4), char(y + 8), char(y + 12),
1152 char(z), char(z + 4), char(z + 8), char(z + 12),
1153 char(w), char(w + 4), char(w + 8), char(w + 12) });
1154
1155 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1156
1157 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1158 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1159
1160 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1161 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1162
1163 // after pshufb: group components together in each 128bit lane
1164 // 256i - 0 1 2 3 4 5 6 7
1165 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1166
1167 Value *vi128XY_lo = nullptr;
1168 Value *vi128XY_hi = nullptr;
1169 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1170 {
1171 vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1172 vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1173
1174 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1175 // 256i - 0 1 2 3 4 5 6 7
1176 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1177 }
1178
1179 // do the same for zw components
1180 Value *vi128ZW_lo = nullptr;
1181 Value *vi128ZW_hi = nullptr;
1182 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1183 {
1184 vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1185 vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1186 }
1187
1188 // init denormalize variables if needed
1189 Instruction::CastOps fpCast;
1190 Value *conversionFactor;
1191
1192 switch (conversionType)
1193 {
1194 case CONVERT_NORMALIZED:
1195 fpCast = Instruction::CastOps::SIToFP;
1196 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1197 break;
1198 case CONVERT_SSCALED:
1199 fpCast = Instruction::CastOps::SIToFP;
1200 conversionFactor = VIMMED1((float)(1.0));
1201 break;
1202 case CONVERT_USCALED:
1203 SWR_INVALID("Type should not be sign extended!");
1204 conversionFactor = nullptr;
1205 break;
1206 default:
1207 SWR_ASSERT(conversionType == CONVERT_NONE);
1208 conversionFactor = nullptr;
1209 break;
1210 }
1211
1212 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1213 for (uint32_t i = 0; i < 4; i++)
1214 {
1215 if (isComponentEnabled(compMask, i))
1216 {
1217 if (compCtrl[i] == ComponentControl::StoreSrc)
1218 {
1219 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1220 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1221 // if x or y, use vi128XY permute result, else use vi128ZW
1222 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1223 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1224
1225 // sign extend
1226 Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
1227 Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
1228
1229 Value* temp = JOIN_16(temp_lo, temp_hi);
1230
1231 // denormalize if needed
1232 if (conversionType != CONVERT_NONE)
1233 {
1234 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1235 }
1236
1237 vVertexElements[currentVertexElement] = temp;
1238
1239 currentVertexElement += 1;
1240 }
1241 else
1242 {
1243 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1244 }
1245
1246 if (currentVertexElement > 3)
1247 {
1248 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1249 // reset to the next vVertexElement to output
1250 currentVertexElement = 0;
1251 }
1252 }
1253 }
1254 }
1255 // else zero extend
1256 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1257 {
1258 // init denormalize variables if needed
1259 Instruction::CastOps fpCast;
1260 Value *conversionFactor;
1261
1262 switch (conversionType)
1263 {
1264 case CONVERT_NORMALIZED:
1265 fpCast = Instruction::CastOps::UIToFP;
1266 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1267 break;
1268 case CONVERT_USCALED:
1269 fpCast = Instruction::CastOps::UIToFP;
1270 conversionFactor = VIMMED1((float)(1.0));
1271 break;
1272 case CONVERT_SSCALED:
1273 SWR_INVALID("Type should not be zero extended!");
1274 conversionFactor = nullptr;
1275 break;
1276 default:
1277 SWR_ASSERT(conversionType == CONVERT_NONE);
1278 conversionFactor = nullptr;
1279 break;
1280 }
1281
1282 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1283 for (uint32_t i = 0; i < 4; i++)
1284 {
1285 if (isComponentEnabled(compMask, i))
1286 {
1287 if (compCtrl[i] == ComponentControl::StoreSrc)
1288 {
1289 // pshufb masks for each component
1290 Value *vConstMask;
1291 switch (swizzle[i])
1292 {
1293 case 0:
1294 // x shuffle mask
1295 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1296 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1297 break;
1298 case 1:
1299 // y shuffle mask
1300 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1301 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1302 break;
1303 case 2:
1304 // z shuffle mask
1305 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1306 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1307 break;
1308 case 3:
1309 // w shuffle mask
1310 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1311 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1312 break;
1313 default:
1314 vConstMask = nullptr;
1315 break;
1316 }
1317
1318 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1319 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1320
1321 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1322 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1323
1324 // after pshufb for x channel
1325 // 256i - 0 1 2 3 4 5 6 7
1326 // x000 x000 x000 x000 x000 x000 x000 x000
1327
1328 Value* temp = JOIN_16(temp_lo, temp_hi);
1329
1330 // denormalize if needed
1331 if (conversionType != CONVERT_NONE)
1332 {
1333 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1334 }
1335
1336 vVertexElements[currentVertexElement] = temp;
1337
1338 currentVertexElement += 1;
1339 }
1340 else
1341 {
1342 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1343 }
1344
1345 if (currentVertexElement > 3)
1346 {
1347 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1348 // reset to the next vVertexElement to output
1349 currentVertexElement = 0;
1350 }
1351 }
1352 }
1353 }
1354 else
1355 {
1356 SWR_INVALID("Unsupported conversion type");
1357 }
1358 }
1359
1360 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1361 {
1362 // Unpack tuple args
1363 Value*& vGatherResult = std::get<0>(args);
1364 Value* pVtxOut = std::get<1>(args);
1365 const Instruction::CastOps extendType = std::get<2>(args);
1366 const ConversionType conversionType = std::get<3>(args);
1367 uint32_t &currentVertexElement = std::get<4>(args);
1368 uint32_t &outputElt = std::get<5>(args);
1369 const ComponentEnable compMask = std::get<6>(args);
1370 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1371 Value* (&vVertexElements)[4] = std::get<8>(args);
1372 const uint32_t(&swizzle)[4] = std::get<9>(args);
1373
1374 // cast types
1375 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1376
1377 for (uint32_t i = 0; i < 4; i++)
1378 {
1379 if (!isComponentEnabled(compMask, i))
1380 continue;
1381
1382 if (compCtrl[i] == ComponentControl::StoreSrc)
1383 {
1384 std::vector<uint32_t> vShuffleMasks[4] = {
1385 { 0, 4, 8, 12, 16, 20, 24, 28 }, // x
1386 { 1, 5, 9, 13, 17, 21, 25, 29 }, // y
1387 { 2, 6, 10, 14, 18, 22, 26, 30 }, // z
1388 { 3, 7, 11, 15, 19, 23, 27, 31 }, // w
1389 };
1390
1391 Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
1392 UndefValue::get(v32x8Ty),
1393 vShuffleMasks[swizzle[i]]);
1394
1395 if ((extendType == Instruction::CastOps::SExt) ||
1396 (extendType == Instruction::CastOps::SIToFP)) {
1397 switch (conversionType)
1398 {
1399 case CONVERT_NORMALIZED:
1400 val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
1401 break;
1402 case CONVERT_SSCALED:
1403 val = SI_TO_FP(val, mSimdFP32Ty);
1404 break;
1405 case CONVERT_USCALED:
1406 SWR_INVALID("Type should not be sign extended!");
1407 break;
1408 default:
1409 SWR_ASSERT(conversionType == CONVERT_NONE);
1410 val = S_EXT(val, mSimdInt32Ty);
1411 break;
1412 }
1413 }
1414 else if ((extendType == Instruction::CastOps::ZExt) ||
1415 (extendType == Instruction::CastOps::UIToFP)) {
1416 switch (conversionType)
1417 {
1418 case CONVERT_NORMALIZED:
1419 val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
1420 break;
1421 case CONVERT_SSCALED:
1422 SWR_INVALID("Type should not be zero extended!");
1423 break;
1424 case CONVERT_USCALED:
1425 val = UI_TO_FP(val, mSimdFP32Ty);
1426 break;
1427 default:
1428 SWR_ASSERT(conversionType == CONVERT_NONE);
1429 val = Z_EXT(val, mSimdInt32Ty);
1430 break;
1431 }
1432 }
1433 else
1434 {
1435 SWR_INVALID("Unsupported conversion type");
1436 }
1437
1438 vVertexElements[currentVertexElement++] = val;
1439 }
1440 else
1441 {
1442 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1443 }
1444
1445 if (currentVertexElement > 3)
1446 {
1447 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1448 // reset to the next vVertexElement to output
1449 currentVertexElement = 0;
1450 }
1451 }
1452 }
1453
1454 //////////////////////////////////////////////////////////////////////////
1455 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1456 /// denormalizes if needed, converts to F32 if needed, and positions in
1457 /// the proper SIMD rows to be output to the simdvertex structure
1458 /// @param args: (tuple of args, listed below)
1459 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1460 /// @param pVtxOut - base pointer to output simdvertex struct
1461 /// @param extendType - sign extend or zero extend
1462 /// @param conversionType - type of conversion to apply to the gathered data
1463 /// @param currentVertexElement - reference to the current vVertexElement
1464 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1465 /// @param compMask - component packing mask
1466 /// @param compCtrl - component control val
1467 /// @param vVertexElements[4] - vertex components to output
1468 void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
1469 {
1470 // Unpack tuple args
1471 Value* (&vGatherResult)[2] = std::get<0>(args);
1472 Value* pVtxOut = std::get<1>(args);
1473 const Instruction::CastOps extendType = std::get<2>(args);
1474 const ConversionType conversionType = std::get<3>(args);
1475 uint32_t &currentVertexElement = std::get<4>(args);
1476 uint32_t &outputElt = std::get<5>(args);
1477 const ComponentEnable compMask = std::get<6>(args);
1478 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1479 Value* (&vVertexElements)[4] = std::get<8>(args);
1480
1481 // cast types
1482 Type *vGatherTy = VectorType::get(mInt32Ty, 8);
1483 Type *v32x8Ty = VectorType::get(mInt8Ty, 32);
1484
1485 // have to do extra work for sign extending
1486 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1487 {
1488 // is this a packed half-precision (FP16) float that needs FPExt?
1489 bool bFP = (extendType == Instruction::CastOps::FPExt);
1490
1491 Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1492 Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1493
1494 // shuffle mask
1495 Value *vConstMask = C<uint8_t>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1496 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
1497 Value *vi128XY_lo = nullptr;
1498 Value *vi128XY_hi = nullptr;
1499 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1500 {
1501 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1502
1503 Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
1504 Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
1505
1506 Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1507 Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1508
1509 // after pshufb: group components together in each 128bit lane
1510 // 256i - 0 1 2 3 4 5 6 7
1511 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1512
1513 vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1514 vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1515
1516 // after PERMD: move and pack xy components into each 128bit lane
1517 // 256i - 0 1 2 3 4 5 6 7
1518 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1519 }
1520
1521 // do the same for zw components
1522 Value *vi128ZW_lo = nullptr;
1523 Value *vi128ZW_hi = nullptr;
1524 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1525 {
1526 Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
1527 Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
1528
1529 Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1530 Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1531
1532 vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1533 vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1534 }
1535
1536 // init denormalize variables if needed
1537 Instruction::CastOps IntToFpCast;
1538 Value *conversionFactor;
1539
1540 switch (conversionType)
1541 {
1542 case CONVERT_NORMALIZED:
1543 IntToFpCast = Instruction::CastOps::SIToFP;
1544 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1545 break;
1546 case CONVERT_SSCALED:
1547 IntToFpCast = Instruction::CastOps::SIToFP;
1548 conversionFactor = VIMMED1((float)(1.0));
1549 break;
1550 case CONVERT_USCALED:
1551 SWR_INVALID("Type should not be sign extended!");
1552 conversionFactor = nullptr;
1553 break;
1554 default:
1555 SWR_ASSERT(conversionType == CONVERT_NONE);
1556 conversionFactor = nullptr;
1557 break;
1558 }
1559
1560 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1561 for (uint32_t i = 0; i < 4; i++)
1562 {
1563 if (isComponentEnabled(compMask, i))
1564 {
1565 if (compCtrl[i] == ComponentControl::StoreSrc)
1566 {
1567 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1568 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1569 // if x or y, use vi128XY permute result, else use vi128ZW
1570 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1571 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1572
1573 if (bFP)
1574 {
1575 // extract 128 bit lanes and convert each half-precision component to float
1576 Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1577 Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1578
1579 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
1580 }
1581 else
1582 {
1583 // extract 128 bit lanes to sign extend each component
1584 Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1585 Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1586
1587 Value* temp = JOIN_16(temp_lo, temp_hi);
1588
1589 // denormalize if needed
1590 if (conversionType != CONVERT_NONE)
1591 {
1592 temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
1593 }
1594
1595 vVertexElements[currentVertexElement] = temp;
1596 }
1597
1598 currentVertexElement += 1;
1599 }
1600 else
1601 {
1602 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1603 }
1604
1605 if (currentVertexElement > 3)
1606 {
1607 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1608 // reset to the next vVertexElement to output
1609 currentVertexElement = 0;
1610 }
1611 }
1612 }
1613 }
1614 // else zero extend
1615 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1616 {
1617 // pshufb masks for each component
1618 Value *vConstMask[2];
1619
1620 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
1621 {
1622 // x/z shuffle mask
1623 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1624 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1625 }
1626
1627 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
1628 {
1629 // y/w shuffle mask
1630 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1631 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
1632 }
1633
1634 // init denormalize variables if needed
1635 Instruction::CastOps fpCast;
1636 Value* conversionFactor;
1637
1638 switch (conversionType)
1639 {
1640 case CONVERT_NORMALIZED:
1641 fpCast = Instruction::CastOps::UIToFP;
1642 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1643 break;
1644 case CONVERT_USCALED:
1645 fpCast = Instruction::CastOps::UIToFP;
1646 conversionFactor = VIMMED1((float)(1.0f));
1647 break;
1648 case CONVERT_SSCALED:
1649 SWR_INVALID("Type should not be zero extended!");
1650 conversionFactor = nullptr;
1651 break;
1652 default:
1653 SWR_ASSERT(conversionType == CONVERT_NONE);
1654 conversionFactor = nullptr;
1655 break;
1656 }
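// Worked example of the denormalization selected above: with CONVERT_NORMALIZED
// (UNORM16), a raw component of 65535 is zero extended, converted via UIToFP and
// multiplied by 1/65535, giving exactly 1.0f; 32768 maps to
// 32768.0f * (1.0f / 65535.0f) ~= 0.5f.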
1657
1658 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1659 for (uint32_t i = 0; i < 4; i++)
1660 {
1661 if (isComponentEnabled(compMask, i))
1662 {
1663 if (compCtrl[i] == ComponentControl::StoreSrc)
1664 {
1665 // select correct constMask for x/z or y/w pshufb
1666 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1667 // if x or y, use the first (xy) gather result, else use the second (zw) gather result
1668 uint32_t selectedGather = (i < 2) ? 0 : 1;
1669
1670 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1671
1672 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
1673 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
1674
1675 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1676 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1677
1678 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1679 // 256i - 0 1 2 3 4 5 6 7
1680 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1681
1682 Value* temp = JOIN_16(temp_lo, temp_hi);
1683
1684 // denormalize if needed
1685 if (conversionType != CONVERT_NONE)
1686 {
1687 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1688 }
1689
1690 vVertexElements[currentVertexElement] = temp;
1691
1692 currentVertexElement += 1;
1693 }
1694 else
1695 {
1696 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1697 }
1698
1699 if (currentVertexElement > 3)
1700 {
1701 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1702 // reset to the next vVertexElement to output
1703 currentVertexElement = 0;
1704 }
1705 }
1706 }
1707 }
1708 else
1709 {
1710 SWR_INVALID("Unsupported conversion type");
1711 }
1712 }
1713
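// A minimal scalar sketch of the per-lane PSHUFB semantics relied on above (the
// helper name is illustrative and the function is not used by the jitter): each
// destination byte is either zeroed (mask byte has bit 7 set) or copies the byte
// selected by the low four mask bits from the same 128-bit lane of the source.
// With the mask { 0,1,4,5,8,9,12,13, 2,3,6,7,10,11,14,15 } this regroups the
// interleaved 16-bit xyxyxyxy data of each lane into xxxxyyyy, matching the
// diagrams above.
static inline void ScalarPshufbLanes(const uint8_t* pSrc, const uint8_t* pMask,
                                     uint8_t* pDst, size_t numBytes)
{
    // numBytes must be a multiple of 16: one 128-bit lane per 16 bytes
    for (size_t i = 0; i < numBytes; ++i)
    {
        size_t laneBase = i & ~size_t(15);
        pDst[i] = (pMask[i] & 0x80) ? 0 : pSrc[laneBase + (pMask[i] & 0x0F)];
    }
}
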
1714 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1715 {
1716 // Unpack tuple args
1717 Value* (&vGatherResult)[2] = std::get<0>(args);
1718 Value* pVtxOut = std::get<1>(args);
1719 const Instruction::CastOps extendType = std::get<2>(args);
1720 const ConversionType conversionType = std::get<3>(args);
1721 uint32_t &currentVertexElement = std::get<4>(args);
1722 uint32_t &outputElt = std::get<5>(args);
1723 const ComponentEnable compMask = std::get<6>(args);
1724 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1725 Value* (&vVertexElements)[4] = std::get<8>(args);
1726
1727 // cast types
1728 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1729 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1730
1731 // have to do extra work for sign extending
1732 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
1733 (extendType == Instruction::CastOps::FPExt))
1734 {
1735 // is this a half-precision (FP16) float?
1736 bool bFP = (extendType == Instruction::CastOps::FPExt);
1737
1738 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1739 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1740
1741 // shuffle mask
1742 Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1743 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
1744 Value* vi128XY = nullptr;
1745 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) {
1746 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1747 // after pshufb: group components together in each 128bit lane
1748 // 256i - 0 1 2 3 4 5 6 7
1749 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1750
1751 vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1752 // after PERMD: move and pack xy components into each 128bit lane
1753 // 256i - 0 1 2 3 4 5 6 7
1754 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1755 }
1756
1757 // do the same for zw components
1758 Value* vi128ZW = nullptr;
1759 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
1760 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1761 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1762 }
1763
1764 // init denormalize variables if needed
1765 Instruction::CastOps IntToFpCast;
1766 Value* conversionFactor;
1767
1768 switch (conversionType)
1769 {
1770 case CONVERT_NORMALIZED:
1771 IntToFpCast = Instruction::CastOps::SIToFP;
1772 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1773 break;
1774 case CONVERT_SSCALED:
1775 IntToFpCast = Instruction::CastOps::SIToFP;
1776 conversionFactor = VIMMED1((float)(1.0));
1777 break;
1778 case CONVERT_USCALED:
1779 SWR_INVALID("Type should not be sign extended!");
1780 conversionFactor = nullptr;
1781 break;
1782 default:
1783 SWR_ASSERT(conversionType == CONVERT_NONE);
1784 conversionFactor = nullptr;
1785 break;
1786 }
1787
1788 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1789 for (uint32_t i = 0; i < 4; i++)
1790 {
1791 if (isComponentEnabled(compMask, i))
1792 {
1793 if (compCtrl[i] == ComponentControl::StoreSrc)
1794 {
1795 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1796 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1797 // if x or y, use vi128XY permute result, else use vi128ZW
1798 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1799
1800 if (bFP) {
1801 // extract 128 bit lanes and convert each half-precision component to float
1802 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1803 }
1804 else {
1805 // extract 128 bit lanes to sign extend each component
1806 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1807
1808 // denormalize if needed
1809 if (conversionType != CONVERT_NONE) {
1810 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1811 }
1812 }
1813 currentVertexElement++;
1814 }
1815 else
1816 {
1817 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1818 }
1819
1820 if (currentVertexElement > 3)
1821 {
1822 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1823 // reset to the next vVertexElement to output
1824 currentVertexElement = 0;
1825 }
1826 }
1827 }
1828 }
1829 // else zero extend
1830 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1831 {
1832 // pshufb masks for each component
1833 Value* vConstMask[2];
1834 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) {
1835 // x/z shuffle mask
1836 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1837 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1838 }
1839
1840 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) {
1841 // y/w shuffle mask
1842 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1843 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
1844 }
1845
1846 // init denormalize variables if needed
1847 Instruction::CastOps fpCast;
1848 Value* conversionFactor;
1849
1850 switch (conversionType)
1851 {
1852 case CONVERT_NORMALIZED:
1853 fpCast = Instruction::CastOps::UIToFP;
1854 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1855 break;
1856 case CONVERT_USCALED:
1857 fpCast = Instruction::CastOps::UIToFP;
1858 conversionFactor = VIMMED1((float)(1.0f));
1859 break;
1860 case CONVERT_SSCALED:
1861 SWR_INVALID("Type should not be zero extended!");
1862 conversionFactor = nullptr;
1863 break;
1864 default:
1865 SWR_ASSERT(conversionType == CONVERT_NONE);
1866 conversionFactor = nullptr;
1867 break;
1868 }
1869
1870 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1871 for (uint32_t i = 0; i < 4; i++)
1872 {
1873 if (isComponentEnabled(compMask, i))
1874 {
1875 if (compCtrl[i] == ComponentControl::StoreSrc)
1876 {
1877 // select correct constMask for x/z or y/w pshufb
1878 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1879 // if x or y, use the first (xy) gather result, else use the second (zw) gather result
1880 uint32_t selectedGather = (i < 2) ? 0 : 1;
1881
1882 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1883 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1884 // 256i - 0 1 2 3 4 5 6 7
1885 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1886
1887 // denormalize if needed
1888 if (conversionType != CONVERT_NONE)
1889 {
1890 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1891 }
1892 currentVertexElement++;
1893 }
1894 else
1895 {
1896 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1897 }
1898
1899 if (currentVertexElement > 3)
1900 {
1901 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1902 // reset to the next vVertexElement to output
1903 currentVertexElement = 0;
1904 }
1905 }
1906 }
1907 }
1908 else
1909 {
1910 SWR_INVALID("Unsupported conversion type");
1911 }
1912 }
1913
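// Scalar sketch of the 16bpc conversion paths chosen in the two functions above
// (illustrative only; the helper is not called by the jitter and models just the
// NORMALIZED and SCALED cases, not CONVERT_NONE or the FP16 path): the raw 16-bit
// component is sign or zero extended and then scaled by the factor picked in the
// switch statements (1/32767 for SNORM, 1/65535 for UNORM, 1.0 for scaled formats).
static inline float ScalarConvert16bpc(uint16_t raw, bool isSigned, ConversionType conversionType)
{
    // sign or zero extend to 32 bits, mirroring PMOVSXWD / the zero-extending pshufb
    int32_t extended = isSigned ? (int32_t)(int16_t)raw : (int32_t)raw;

    // pick the conversion factor exactly as the switches above do
    float factor = 1.0f;
    if (conversionType == CONVERT_NORMALIZED)
    {
        factor = isSigned ? (float)(1.0 / 32767.0) : (float)(1.0 / 65535.0);
    }

    return (float)extended * factor;
}
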
1914 //////////////////////////////////////////////////////////////////////////
1915 /// @brief Output a simdvertex worth of elements to the current outputElt
1916 /// @param pVtxOut - base address of VIN output struct
1917 /// @param outputElt - simdvertex offset in VIN to write to
1918 /// @param numEltsToStore - number of simdvertex rows to write out
1919 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1920 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1921 {
1922 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
1923
1924 for (uint32_t c = 0; c < numEltsToStore; ++c)
1925 {
1926 // STORE expects FP32 x vWidth type, just bitcast if needed
1927 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
1928 {
1929 #if FETCH_DUMP_VERTEX
1930 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
1931 #endif
1932 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1933 }
1934 #if FETCH_DUMP_VERTEX
1935 else
1936 {
1937 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
1938 }
1939 #endif
1940 // outputElt * 4 = offsetting by the size of a simdvertex
1941 // + c offsets to a 32bit x vWidth row within the current vertex
1942 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
1943 STORE(vVertexElements[c], dest);
1944 }
1945 }
1946
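// Illustrative address computation matching the GEP comment above (the helper
// name and the flat view of the output as packed floats are assumptions for this
// sketch; the jitter emits the GEP/STORE directly): a simdvertex is 4 consecutive
// rows of vWidth floats (x, y, z, w for every SIMD lane), so component c of
// simdvertex outputElt lives at row outputElt * 4 + c.
static inline float* SimdVertexRow(float* pVtxBase, uint32_t vWidth,
                                   uint32_t outputElt, uint32_t component)
{
    return pVtxBase + ((size_t)outputElt * 4 + component) * (size_t)vWidth;
}
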
1947 //////////////////////////////////////////////////////////////////////////
1948 /// @brief Generates a constant vector of values based on the
1949 /// ComponentControl value
1950 /// @param ctrl - ComponentControl value
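/// Per-lane values produced: Store0 -> 0, Store1Fp -> 1.0f, Store1Int -> 1,
/// StoreVertexId -> the FetchInfo VertexID for that lane (VertexID2 supplies
/// lanes 8..15 when mVWidth == 16), StoreInstanceId -> CurInstance broadcast
/// to every lane, NoStore -> undefined.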
1951 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1952 {
1953 switch (ctrl)
1954 {
1955 case NoStore:
1956 return VUNDEF_I();
1957 case Store0:
1958 return VIMMED1(0);
1959 case Store1Fp:
1960 return VIMMED1(1.0f);
1961 case Store1Int:
1962 return VIMMED1(1);
1963 case StoreVertexId:
1964 {
1965 if (mVWidth == 16)
1966 {
1967 Type* pSimd8FPTy = VectorType::get(mFP32Ty, 8);
1968 Value *pIdLo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), pSimd8FPTy);
1969 Value *pIdHi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), pSimd8FPTy);
1970 return JOIN_16(pIdLo, pIdHi);
1971 }
1972 else
1973 {
1974 return BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1975 }
1976 }
1977 case StoreInstanceId:
1978 {
1979 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
1980 return VBROADCAST(pId);
1981 }
1982
1983
1984 case StoreSrc:
1985 default:
1986 SWR_INVALID("Invalid component control");
1987 return VUNDEF_I();
1988 }
1989 }
1990
1991 //////////////////////////////////////////////////////////////////////////
1992 /// @brief Returns true if the specified component is enabled in the mask.
1993 /// @param enableMask - enable bits
1994 /// @param component - component to check if enabled.
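/// e.g. an enableMask of (ComponentEnable::X | ComponentEnable::Z) reports
/// components 0 and 2 as enabled and components 1 and 3 as disabled.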
1995 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1996 {
1997 switch (component)
1998 {
1999 // X
2000 case 0: return (enableMask & ComponentEnable::X);
2001 // Y
2002 case 1: return (enableMask & ComponentEnable::Y);
2003 // Z
2004 case 2: return (enableMask & ComponentEnable::Z);
2005 // W
2006 case 3: return (enableMask & ComponentEnable::W);
2007
2008 default: return false;
2009 }
2010 }
2011
2012 // Don't allow two threads to compile the same fetch shader simultaneously;
2013 // the JIT cache implementation has problems with concurrent compilation.
2014 // Right now this is only an issue for fetch shaders.
2015 static std::mutex gFetchCodegenMutex;
2016
2017 //////////////////////////////////////////////////////////////////////////
2018 /// @brief JITs from fetch shader IR
2019 /// @param hJitMgr - JitManager handle
2020 /// @param func - LLVM function IR
2021 /// @return PFN_FETCH_FUNC - pointer to fetch code
2022 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2023 {
2024 const llvm::Function* func = (const llvm::Function*)hFunc;
2025 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2026 PFN_FETCH_FUNC pfnFetch;
2027
2028 gFetchCodegenMutex.lock();
2029 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2030 // MCJIT finalizes modules the first time code is JITted from them. Once finalized, no new IR can be added to the module.
2031 pJitMgr->mIsModuleFinalized = true;
2032
2033 #if defined(KNOB_SWRC_TRACING)
2034 char fName[1024];
2035 const char *funcName = func->getName().data();
2036 snprintf(fName, sizeof(fName), "%s.bin", funcName);
2037 FILE *fd = fopen(fName, "wb");
2038 if (fd) { fwrite((void *)pfnFetch, 1, 2048, fd); fclose(fd); }
2039
2040 #endif
2041
2042 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2043 gFetchCodegenMutex.unlock();
2044
2045
2046
2047 return pfnFetch;
2048 }
2049
2050 //////////////////////////////////////////////////////////////////////////
2051 /// @brief JIT compiles fetch shader
2052 /// @param hJitMgr - JitManager handle
2053 /// @param state - fetch state to build function from
2054 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2055 {
2056 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2057
2058 pJitMgr->SetupNewModule();
2059
2060 FetchJit theJit(pJitMgr);
2061 HANDLE hFunc = theJit.Create(state);
2062
2063 return JitFetchFunc(hJitMgr, hFunc);
2064 }
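
// Hypothetical driver-side usage sketch; aside from FETCH_COMPILE_STATE,
// PFN_FETCH_FUNC and JitCompileFetch, the setup shown is elided and not
// asserted by this file:
//
//   FETCH_COMPILE_STATE fetchState = {};
//   // ...describe the vertex layout, index type and conversions...
//   PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, fetchState);
//   // the SWR core then calls pfnFetch to gather, convert and store vertices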