1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "jit_api.h"
32 #include "fetch_jit.h"
33 #include "gen_state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public Builder
56 {
57 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
58
59 Function* Create(const FETCH_COMPILE_STATE& fetchState);
60 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
61 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
62 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
63
64 // package up Shuffle*bpcGatherd args into a tuple for convenience
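// the tuple members, in the order they are packed by the forward_as_tuple call sites below:
// gathered result(s), destination pointer, extend cast op, conversion type,
// current vertex element (in/out), output element (in/out), component enable mask,
// component controls, output vertex elements, and (8bpc only) the format swizzle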
65 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
66 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
67 const uint32_t(&)[4]> Shuffle8bpcArgs;
68 #if USE_SIMD16_SHADERS
69 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
70 #else
71 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
72 #endif
73
74 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
75 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
76 #if USE_SIMD16_SHADERS
77 void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
78 #else
79 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
80 #endif
81
82 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
83 #if USE_SIMD16_BUILDER
84 void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
85 #endif
86
87 #if USE_SIMD16_SHADERS
88 Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
89 #else
90 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
91 #endif
92
93 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
94 #if USE_SIMD16_SHADERS
95 #define USE_SIMD16_GATHERS 0
96
97 #if USE_SIMD16_GATHERS
98 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
99 #else
100 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
101 #endif
102 #else
103 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
104 #endif
105
106 bool IsOddFormat(SWR_FORMAT format);
107 bool IsUniformFormat(SWR_FORMAT format);
108 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
109 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
110 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
111
112 Value* mpFetchInfo;
113 };
114
115 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
116 {
117 std::stringstream fnName("FetchShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
118 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
119
120 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
121 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
122
123 fetch->getParent()->setModuleIdentifier(fetch->getName());
124
125 IRB()->SetInsertPoint(entry);
126
127 auto argitr = fetch->arg_begin();
128
129 // Fetch shader arguments
130 mpFetchInfo = &*argitr; ++argitr;
131 mpFetchInfo->setName("fetchInfo");
132 Value* pVtxOut = &*argitr;
133 pVtxOut->setName("vtxOutput");
134 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex:
135 // index 0 (just the pointer to the simdvertex structure),
136 // index 1 (which element of the simdvertex structure to offset to, in this case 0),
137 // so the indices being i32's doesn't matter
138 // TODO: generate this GEP with a VECTOR structure type so this makes sense
139 std::vector<Value*> vtxInputIndices(2, C(0));
140 // GEP
141 pVtxOut = GEP(pVtxOut, C(0));
142 #if USE_SIMD16_SHADERS
143 #if 0// USE_SIMD16_BUILDER
144 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
145 #else
146 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
147 #endif
148 #else
149 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
150 #endif
151
152 // SWR_FETCH_CONTEXT::pStreams
153 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
154 streams->setName("pStreams");
155
156 // SWR_FETCH_CONTEXT::pIndices
157 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
158 indices->setName("pIndices");
159
160 // SWR_FETCH_CONTEXT::pLastIndex
161 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
162 pLastIndex->setName("pLastIndex");
163
164
165 Value* vIndices;
166 #if USE_SIMD16_SHADERS
167 Value* indices2;
168 Value* vIndices2;
169 #endif
170 switch(fetchState.indexType)
171 {
172 case R8_UINT:
173 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
174 #if USE_SIMD16_SHADERS
175 indices2 = GEP(indices, C(8));
176 #endif
177 if(fetchState.bDisableIndexOOBCheck)
178 {
179 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
180 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
181 #if USE_SIMD16_SHADERS
182 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
183 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
184 #endif
185 }
186 else
187 {
188 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
189 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
190 #if USE_SIMD16_SHADERS
191 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
192 vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
193 #endif
194 }
195 break;
196 case R16_UINT:
197 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
198 #if USE_SIMD16_SHADERS
199 indices2 = GEP(indices, C(8));
200 #endif
201 if(fetchState.bDisableIndexOOBCheck)
202 {
203 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
204 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
205 #if USE_SIMD16_SHADERS
206 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
207 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
208 #endif
209 }
210 else
211 {
212 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
213 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
214 #if USE_SIMD16_SHADERS
215 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
216 vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
217 #endif
218 }
219 break;
220 case R32_UINT:
221 #if USE_SIMD16_SHADERS
222 indices2 = GEP(indices, C(8));
223 #endif
224 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
225 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
226 #if USE_SIMD16_SHADERS
227 (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
228 : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
229 #endif
230 break; // incoming type is already 32bit int
231 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
232 }
233
234 if(fetchState.bForceSequentialAccessEnable)
235 {
236 Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
237
238 // VertexData buffers are accessed sequentially; the index is equal to the vertex number
239 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
240 vIndices = ADD(vIndices, pOffsets);
241 #if USE_SIMD16_SHADERS
242 vIndices2 = ADD(vIndices, VIMMED1(8));
243 #endif
244 }
245
246 Value* vVertexId = vIndices;
247 #if USE_SIMD16_SHADERS
248 Value* vVertexId2 = vIndices2;
249 #endif
250 if (fetchState.bVertexIDOffsetEnable)
251 {
252 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
253 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
254 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
255 vVertexId = ADD(vIndices, vBaseVertex);
256 vVertexId = ADD(vVertexId, vStartVertex);
257 #if USE_SIMD16_SHADERS
258 vVertexId2 = ADD(vIndices2, vBaseVertex);
259 vVertexId2 = ADD(vVertexId2, vStartVertex);
260 #endif
261 }
262
263 // store out vertex IDs
264 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
265 #if USE_SIMD16_SHADERS
266 STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
267 #endif
268
269 // store out cut mask if enabled
270 if (fetchState.bEnableCutIndex)
271 {
272 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
273 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
274 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
275 #if USE_SIMD16_SHADERS
276 Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
277 STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
278 #endif
279 }
280
281 // Fetch attributes from memory and output to a simdvertex struct
282 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
283 #if USE_SIMD16_SHADERS
284 if (fetchState.bDisableVGATHER)
285 {
286 JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
287 JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
288 }
289 else
290 {
291 #if USE_SIMD16_GATHERS
292 JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false);
293 #else
294 JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
295 JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
296 #endif
297 }
298 #else
299 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
300 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
301 #endif
302
303 RET_VOID();
304
305 JitManager::DumpToFile(fetch, "src");
306
307 #if defined(_DEBUG)
308 verifyFunction(*fetch);
309 #endif
310
311 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
312
313 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
314 setupPasses.add(createBreakCriticalEdgesPass());
315 setupPasses.add(createCFGSimplificationPass());
316 setupPasses.add(createEarlyCSEPass());
317 setupPasses.add(createPromoteMemoryToRegisterPass());
318
319 setupPasses.run(*fetch);
320
321 JitManager::DumpToFile(fetch, "se");
322
323 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
324
325 ///@todo Haven't touched these either. Need to remove some of these and add others.
326 optPasses.add(createCFGSimplificationPass());
327 optPasses.add(createEarlyCSEPass());
328 optPasses.add(createInstructionCombiningPass());
329 optPasses.add(createInstructionSimplifierPass());
330 optPasses.add(createConstantPropagationPass());
331 optPasses.add(createSCCPPass());
332 optPasses.add(createAggressiveDCEPass());
333
334 optPasses.run(*fetch);
335 optPasses.run(*fetch);
336
337 JitManager::DumpToFile(fetch, "opt");
338
339 return fetch;
340 }
341
342 //////////////////////////////////////////////////////////////////////////
343 /// @brief Loads attributes from memory using LOADs, shuffling the
344 /// components into SOA form.
345 /// *Note* currently does not support component control,
346 /// component packing, instancing
347 /// @param fetchState - info about attributes to be fetched from memory
348 /// @param streams - value pointer to the current vertex stream
349 /// @param vIndices - vector value of indices to load
350 /// @param pVtxOut - value pointer to output simdvertex struct
351 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
352 {
353 // Zack shuffles; a variant of the Charleston.
354
355 std::vector<Value*> vectors(16);
356 std::vector<Constant*> pMask(mVWidth);
357 for(uint32_t i = 0; i < mVWidth; ++i)
358 {
359 pMask[i] = (C(i < 4 ? i : 4));
360 }
361 Constant* promoteMask = ConstantVector::get(pMask);
362 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
363
364 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
365 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
366 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
367 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
368 curInstance->setName("curInstance");
369
370 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
371 {
372 Value* elements[4] = {0};
373 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
374 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
375 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
376 uint32_t numComponents = info.numComps;
377 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
378
379 // load path doesn't support component packing
380 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
381
382 vectors.clear();
383
384 if (fetchState.bInstanceIDOffsetEnable)
385 {
386 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down");
387 }
388
389 Value *vCurIndices;
390 Value *startOffset;
391 if(ied.InstanceEnable)
392 {
393 Value* stepRate = C(ied.InstanceAdvancementState);
394
395 // prevent a div by 0 for 0 step rate
396 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
397 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
398
399 // calc the current offset into instanced data buffer
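// e.g. with a step rate of 3, instances 0..2 read element 0 of the instanced buffer,
// instances 3..5 read element 1, and so on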
400 Value* calcInstance = UDIV(curInstance, stepRate);
401
402 // if step rate is 0, every instance gets instance 0
403 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
404
405 vCurIndices = VBROADCAST(calcInstance);
406
407 startOffset = startInstance;
408 }
409 else if (ied.InstanceStrideEnable)
410 {
411 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
412 }
413 else
414 {
415 // offset indices by baseVertex
416 vCurIndices = ADD(vIndices, vBaseVertex);
417
418 startOffset = startVertex;
419 }
420
421 // load SWR_VERTEX_BUFFER_STATE::pData
422 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
423
424 // load SWR_VERTEX_BUFFER_STATE::pitch
425 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
426 stride = Z_EXT(stride, mInt64Ty);
427
428 // load SWR_VERTEX_BUFFER_STATE::size
429 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
430 size = Z_EXT(size, mInt64Ty);
431
432 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
433
434 Value *minVertex = NULL;
435 Value *minVertexOffset = NULL;
436 if (fetchState.bPartialVertexBuffer) {
437 // fetch min index for low bounds checking
438 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
439 minVertex = LOAD(minVertex);
440 if (!fetchState.bDisableIndexOOBCheck) {
441 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
442 }
443 }
444
445 // Load from the stream.
446 for(uint32_t lane = 0; lane < mVWidth; ++lane)
447 {
448 // Get index
449 Value* index = VEXTRACT(vCurIndices, C(lane));
450
451 if (fetchState.bPartialVertexBuffer) {
452 // clamp below minvertex
453 Value *isBelowMin = ICMP_SLT(index, minVertex);
454 index = SELECT(isBelowMin, minVertex, index);
455 }
456
457 index = Z_EXT(index, mInt64Ty);
458
459 Value* offset = MUL(index, stride);
460 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
461 offset = ADD(offset, startVertexOffset);
462
463 if (!fetchState.bDisableIndexOOBCheck) {
464 // check for out of bound access, including partial OOB, and replace them with minVertex
465 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
466 Value *oob = ICMP_ULE(endOffset, size);
467 if (fetchState.bPartialVertexBuffer) {
468 offset = SELECT(oob, offset, minVertexOffset);
469 } else {
470 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
471 }
472 }
473
474 Value* pointer = GEP(stream, offset);
475 // We use a full-lane, but don't actually care.
476 Value* vptr = 0;
477
478 // get a pointer to a 4 component attrib in default address space
479 switch(bpc)
480 {
481 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
482 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
483 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
484 default: SWR_INVALID("Unsupported underlying bpp!");
485 }
486
487 // load 4 components of attribute
488 Value* vec = ALIGNED_LOAD(vptr, 1, false);
489
490 // Convert To FP32 internally
491 switch(info.type[0])
492 {
493 case SWR_TYPE_UNORM:
494 switch(bpc)
495 {
496 case 8:
497 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
498 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
499 break;
500 case 16:
501 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
502 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
503 break;
504 default:
505 SWR_INVALID("Unsupported underlying type!");
506 break;
507 }
508 break;
509 case SWR_TYPE_SNORM:
510 switch(bpc)
511 {
512 case 8:
513 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
514 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
515 break;
516 case 16:
517 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
518 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
519 break;
520 default:
521 SWR_INVALID("Unsupported underlying type!");
522 break;
523 }
524 break;
525 case SWR_TYPE_UINT:
526 // Zero extend uint32_t types.
527 switch(bpc)
528 {
529 case 8:
530 case 16:
531 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
532 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
533 break;
534 case 32:
535 break; // Pass through unchanged.
536 default:
537 SWR_INVALID("Unsupported underlying type!");
538 break;
539 }
540 break;
541 case SWR_TYPE_SINT:
542 // Sign extend SINT types.
543 switch(bpc)
544 {
545 case 8:
546 case 16:
547 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
548 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
549 break;
550 case 32:
551 break; // Pass through unchanged.
552 default:
553 SWR_INVALID("Unsupported underlying type!");
554 break;
555 }
556 break;
557 case SWR_TYPE_FLOAT:
558 switch(bpc)
559 {
560 case 32:
561 break; // Pass through unchanged.
562 default:
563 SWR_INVALID("Unsupported underlying type!");
564 }
565 break;
566 case SWR_TYPE_USCALED:
567 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
568 break;
569 case SWR_TYPE_SSCALED:
570 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
571 break;
572 case SWR_TYPE_SFIXED:
573 vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
574 break;
575 case SWR_TYPE_UNKNOWN:
576 case SWR_TYPE_UNUSED:
577 SWR_INVALID("Unsupported type %d!", info.type[0]);
578 }
579
580 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
581 // uwvec: 4 x F32, undef value
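// for mVWidth == 8 this widens the 4-wide attribute to 8 lanes: the low 4 lanes keep
// x,y,z,w and the upper 4 lanes replicate an undef element that is never selected later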
582 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
583 vectors.push_back(wvec);
584 }
585
586 std::vector<Constant*> v01Mask(mVWidth);
587 std::vector<Constant*> v23Mask(mVWidth);
588 std::vector<Constant*> v02Mask(mVWidth);
589 std::vector<Constant*> v13Mask(mVWidth);
590
591 // Concatenate the vectors together.
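// This sequence transposes the per-vertex (AOS) loads into SOA form: for each 4-wide
// block b, the insert mask places vertex (4*b + c)'s xyzw into lanes [4b..4b+3] of
// elements[c]; the v01/v23 shuffles then interleave the xy and zw pairs, and the
// v02/v13 shuffles split them so elements[0..3] end up holding all x, y, z and w values.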
592 elements[0] = VUNDEF_F();
593 elements[1] = VUNDEF_F();
594 elements[2] = VUNDEF_F();
595 elements[3] = VUNDEF_F();
596 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
597 {
598 v01Mask[4 * b + 0] = C(0 + 4 * b);
599 v01Mask[4 * b + 1] = C(1 + 4 * b);
600 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
601 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
602
603 v23Mask[4 * b + 0] = C(2 + 4 * b);
604 v23Mask[4 * b + 1] = C(3 + 4 * b);
605 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
606 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
607
608 v02Mask[4 * b + 0] = C(0 + 4 * b);
609 v02Mask[4 * b + 1] = C(2 + 4 * b);
610 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
611 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
612
613 v13Mask[4 * b + 0] = C(1 + 4 * b);
614 v13Mask[4 * b + 1] = C(3 + 4 * b);
615 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
616 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
617
618 std::vector<Constant*> iMask(mVWidth);
619 for(uint32_t i = 0; i < mVWidth; ++i)
620 {
621 if(((4 * b) <= i) && (i < (4 * (b + 1))))
622 {
623 iMask[i] = C(i % 4 + mVWidth);
624 }
625 else
626 {
627 iMask[i] = C(i);
628 }
629 }
630 Constant* insertMask = ConstantVector::get(iMask);
631 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
632 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
633 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
634 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
635 }
636
637 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
638 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
639 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
640 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
641 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
642 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
643 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
644 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
645
646 switch(numComponents + 1)
647 {
648 case 1: elements[0] = VIMMED1(0.0f);
649 case 2: elements[1] = VIMMED1(0.0f);
650 case 3: elements[2] = VIMMED1(0.0f);
651 case 4: elements[3] = VIMMED1(1.0f);
652 }
653
654 for(uint32_t c = 0; c < 4; ++c)
655 {
656 #if USE_SIMD16_SHADERS
657 Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
658 #else
659 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
660 #endif
661 STORE(elements[c], dest);
662 }
663 }
664 }
665
666 // returns true for odd formats that require special gather handling
667 bool FetchJit::IsOddFormat(SWR_FORMAT format)
668 {
669 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
670 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
671 {
672 return true;
673 }
674 return false;
675 }
676
677 // format is uniform if all components are the same size and type
678 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
679 {
680 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
681 uint32_t bpc0 = info.bpc[0];
682 uint32_t type0 = info.type[0];
683
684 for (uint32_t c = 1; c < info.numComps; ++c)
685 {
686 if (bpc0 != info.bpc[c] || type0 != info.type[c])
687 {
688 return false;
689 }
690 }
691 return true;
692 }
693
694 // unpacks components based on format
695 // foreach component in the pixel
696 // mask off everything but this component
697 // shift component to LSB
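// e.g. for a 5:6:5 packed pixel the loop masks bits [4:0], [10:5] and [15:11] in turn,
// shifts each down to bit 0, and stores it in result[] at the format's swizzle position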
698 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
699 {
700 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
701
702 uint32_t bitOffset = 0;
703 for (uint32_t c = 0; c < info.numComps; ++c)
704 {
705 uint32_t swizzledIndex = info.swizzle[c];
706 uint32_t compBits = info.bpc[c];
707 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
708 Value* comp = AND(vInput, bitmask);
709 comp = LSHR(comp, bitOffset);
710
711 result[swizzledIndex] = comp;
712 bitOffset += compBits;
713 }
714 }
715
716 // gather for odd component size formats
717 // gather SIMD full pixels per lane, then shift/mask to move each component into its
718 // own vector
719 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
720 {
721 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
722
723 // only works if pixel size is <= 32bits
724 SWR_ASSERT(info.bpp <= 32);
725
726 Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
727
728 for (uint32_t comp = 0; comp < 4; ++comp)
729 {
730 pResult[comp] = VIMMED1((int)info.defaults[comp]);
731 }
732
733 UnpackComponents(format, pGather, pResult);
734
735 // cast to fp32
736 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
737 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
738 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
739 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
740 }
741
742 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
743 {
744 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
745
746 for (uint32_t c = 0; c < info.numComps; ++c)
747 {
748 uint32_t compIndex = info.swizzle[c];
749
750 // skip any conversion on UNUSED components
751 if (info.type[c] == SWR_TYPE_UNUSED)
752 {
753 continue;
754 }
755
756 if (info.isNormalized[c])
757 {
758 if (info.type[c] == SWR_TYPE_SNORM)
759 {
760 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
761
762 /// result = c * (1.0f / (2^(n-1) - 1))
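/// e.g. an 8-bit SNORM component uses scale = 1.0f / 127.0f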
763 uint32_t n = info.bpc[c];
764 uint32_t pow2 = 1 << (n - 1);
765 float scale = 1.0f / (float)(pow2 - 1);
766 Value *vScale = VIMMED1(scale);
767 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
768 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
769 texels[compIndex] = FMUL(texels[compIndex], vScale);
770 }
771 else
772 {
773 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
774
775 /// result = c * (1.0f / (2^n - 1))
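/// e.g. an 8-bit UNORM component uses scale = 1.0f / 255.0f, a 16-bit one 1.0f / 65535.0f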
776 uint32_t n = info.bpc[c];
777 uint32_t pow2 = 1 << n;
778 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
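// (presumably because 1/(2^24 - 1) is not exactly representable in fp32, so a
// reciprocal multiply can miss the ULP tolerance while FDIV is correctly rounded)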
779 if (n == 24)
780 {
781 float scale = (float)(pow2 - 1);
782 Value* vScale = VIMMED1(scale);
783 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
784 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
785 texels[compIndex] = FDIV(texels[compIndex], vScale);
786 }
787 else
788 {
789 float scale = 1.0f / (float)(pow2 - 1);
790 Value *vScale = VIMMED1(scale);
791 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
792 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
793 texels[compIndex] = FMUL(texels[compIndex], vScale);
794 }
795 }
796 continue;
797 }
798 }
799 }
800
801 //////////////////////////////////////////////////////////////////////////
802 /// @brief Loads attributes from memory using AVX2 GATHER(s)
803 /// @param fetchState - info about attributes to be fetched from memory
804 /// @param streams - value pointer to the current vertex stream
805 /// @param vIndices - vector value of indices to gather
806 /// @param pVtxOut - value pointer to output simdvertex struct
807 #if USE_SIMD16_SHADERS
808 #if USE_SIMD16_GATHERS
809 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
810 Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2)
811 #else
812 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
813 Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
814 #endif
815 #else
816 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
817 Value* streams, Value* vIndices, Value* pVtxOut)
818 #endif
819 {
820 uint32_t currentVertexElement = 0;
821 uint32_t outputElt = 0;
822 Value* vVertexElements[4];
823 #if USE_SIMD16_GATHERS
824 Value* vVertexElements2[4];
825 #endif
826
827 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
828 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
829 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
830 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
831 curInstance->setName("curInstance");
832
833 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
834 {
835 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
836
837 // skip element if all components are disabled
838 if (ied.ComponentPacking == ComponentEnable::NONE)
839 {
840 continue;
841 }
842
843 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
844 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
845 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
846
847 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
848
849 // VGATHER* takes an *i8 src pointer
850 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
851
852 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
853 Value *vStride = VBROADCAST(stride);
854
855 // max vertex index that is fully in bounds
856 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
857 maxVertex = LOAD(maxVertex);
858
859 Value *minVertex = NULL;
860 if (fetchState.bPartialVertexBuffer)
861 {
862 // min vertex index for low bounds OOB checking
863 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
864 minVertex = LOAD(minVertex);
865 }
866
867 if (fetchState.bInstanceIDOffsetEnable)
868 {
869 // the InstanceID (curInstance) value is offset by StartInstanceLocation
870 curInstance = ADD(curInstance, startInstance);
871 }
872
873 Value *vCurIndices;
874 #if USE_SIMD16_GATHERS
875 Value *vCurIndices2;
876 #endif
877 Value *startOffset;
878 Value *vInstanceStride = VIMMED1(0);
879
880 if (ied.InstanceEnable)
881 {
882 Value* stepRate = C(ied.InstanceAdvancementState);
883
884 // prevent a div by 0 for 0 step rate
885 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
886 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
887
888 // calc the current offset into instanced data buffer
889 Value* calcInstance = UDIV(curInstance, stepRate);
890
891 // if step rate is 0, every instance gets instance 0
892 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
893
894 vCurIndices = VBROADCAST(calcInstance);
895 #if USE_SIMD16_GATHERS
896 vCurIndices2 = VBROADCAST(calcInstance);
897 #endif
898
899 startOffset = startInstance;
900 }
901 else if (ied.InstanceStrideEnable)
902 {
903 // grab the instance advancement state, which determines the stride in bytes from one instance to the next
904 Value* stepRate = C(ied.InstanceAdvancementState);
905 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
906
907 // offset indices by baseVertex
908 vCurIndices = ADD(vIndices, vBaseVertex);
909 #if USE_SIMD16_GATHERS
910 vCurIndices2 = ADD(vIndices2, vBaseVertex);
911 #endif
912
913 startOffset = startVertex;
914 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
915 }
916 else
917 {
918 // offset indices by baseVertex
919 vCurIndices = ADD(vIndices, vBaseVertex);
920 #if USE_SIMD16_GATHERS
921 vCurIndices2 = ADD(vIndices2, vBaseVertex);
922 #endif
923
924 startOffset = startVertex;
925 }
926
927 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
928 // do 64bit address offset calculations.
929
930 // calculate byte offset to the start of the VB
931 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
932 pStreamBase = GEP(pStreamBase, baseOffset);
933
934 // if we have a start offset, subtract from max vertex. Used for OOB check
935 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
936 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
937 // if we have a negative value, we're already OOB. clamp at 0.
938 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
939
940 if (fetchState.bPartialVertexBuffer)
941 {
942 // similarly for min vertex
943 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
944 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
945 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
946 }
947
948 // Load the in bounds size of a partially valid vertex
949 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
950 partialInboundsSize = LOAD(partialInboundsSize);
951 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
952 Value* vBpp = VBROADCAST(C(info.Bpp));
953 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
954
955 // is the element <= the partially valid size
956 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
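// e.g. if 8 bytes of the last vertex are in bounds, a 4-byte element at AlignedByteOffset 4
// still passes (4 <= 8 - 4), while one at AlignedByteOffset 8 does not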
957
958 #if USE_SIMD16_GATHERS
959 // override cur indices with 0 if pitch is 0
960 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
961 vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
962
963 // are vertices partially OOB?
964 Value* vMaxVertex = VBROADCAST(maxVertex);
965 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
966 Value* vPartialOOBMask2 = ICMP_EQ(vCurIndices2, vMaxVertex);
967
968 // are vertices fully in bounds?
969 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
970 Value* vMaxGatherMask2 = ICMP_ULT(vCurIndices2, vMaxVertex);
971
972 Value *vGatherMask;
973 Value *vGatherMask2;
974 if (fetchState.bPartialVertexBuffer)
975 {
976 // are vertices below minVertex limit?
977 Value *vMinVertex = VBROADCAST(minVertex);
978 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
979 Value *vMinGatherMask2 = ICMP_UGE(vCurIndices2, vMinVertex);
980
981 // only fetch lanes that pass both tests
982 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
983 vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
984 }
985 else
986 {
987 vGatherMask = vMaxGatherMask;
988 vGatherMask2 = vMaxGatherMask2;
989 }
990
991 // blend in any partially OOB indices that have valid elements
992 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
993 vGatherMask2 = SELECT(vPartialOOBMask2, vElementInBoundsMask, vGatherMask2);
994 Value *pMask = vGatherMask;
995 Value *pMask2 = vGatherMask2;
996 vGatherMask = VMASK(vGatherMask);
997 vGatherMask2 = VMASK(vGatherMask2);
998
999 // calculate the actual offsets into the VB
1000 Value* vOffsets = MUL(vCurIndices, vStride);
1001 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1002
1003 Value* vOffsets2 = MUL(vCurIndices2, vStride);
1004 vOffsets2 = ADD(vOffsets2, vAlignmentOffsets);
1005
1006 // if instance stride enable is:
1007 // true - add product of the instanceID and advancement state to the offset into the VB
1008 // false - value of vInstanceStride has been initialized to zero
1009 vOffsets = ADD(vOffsets, vInstanceStride);
1010 vOffsets2 = ADD(vOffsets2, vInstanceStride);
1011
1012 #else
1013 // override cur indices with 0 if pitch is 0
1014 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
1015 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
1016
1017 // are vertices partially OOB?
1018 Value* vMaxVertex = VBROADCAST(maxVertex);
1019 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
1020
1021 // are vertices fully in bounds?
1022 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
1023
1024 Value *vGatherMask;
1025 if (fetchState.bPartialVertexBuffer)
1026 {
1027 // are vertices below minVertex limit?
1028 Value *vMinVertex = VBROADCAST(minVertex);
1029 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
1030
1031 // only fetch lanes that pass both tests
1032 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
1033 }
1034 else
1035 {
1036 vGatherMask = vMaxGatherMask;
1037 }
1038
1039 // blend in any partially OOB indices that have valid elements
1040 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1041 Value* pMask = vGatherMask;
1042 vGatherMask = VMASK(vGatherMask);
1043
1044 // calculate the actual offsets into the VB
1045 Value* vOffsets = MUL(vCurIndices, vStride);
1046 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1047
1048 // if instance stride enable is:
1049 // true - add product of the instanceID and advancement state to the offset into the VB
1050 // false - value of vInstanceStride has been initialized to zero
1051 vOffsets = ADD(vOffsets, vInstanceStride);
1052
1053 #endif
1054 // Packing and component control
1055 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
1056 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
1057 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
1058
1059 // Special gather/conversion for formats without equal component sizes
1060 if (IsOddFormat((SWR_FORMAT)ied.Format))
1061 {
1062 #if USE_SIMD16_GATHERS
1063 Value *pResults[4];
1064 Value *pResults2[4];
1065 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1066 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
1067 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1068 ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
1069
1070 for (uint32_t c = 0; c < 4; c += 1)
1071 {
1072 if (isComponentEnabled(compMask, c))
1073 {
1074 vVertexElements[currentVertexElement] = pResults[c];
1075 vVertexElements2[currentVertexElement] = pResults2[c];
1076 currentVertexElement++;
1077
1078 if (currentVertexElement > 3)
1079 {
1080 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1081 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1082
1083 outputElt += 1;
1084
1085 // reset to the next vVertexElement to output
1086 currentVertexElement = 0;
1087 }
1088 }
1089 }
1090 #else
1091 Value* pResults[4];
1092 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1093 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1094
1095 for (uint32_t c = 0; c < 4; ++c)
1096 {
1097 if (isComponentEnabled(compMask, c))
1098 {
1099 vVertexElements[currentVertexElement++] = pResults[c];
1100 if (currentVertexElement > 3)
1101 {
1102 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1103 // reset to the next vVertexElement to output
1104 currentVertexElement = 0;
1105 }
1106 }
1107 }
1108 #endif
1109 }
1110 else if(info.type[0] == SWR_TYPE_FLOAT)
1111 {
1112 ///@todo: support 64 bit vb accesses
1113 Value* gatherSrc = VIMMED1(0.0f);
1114 #if USE_SIMD16_GATHERS
1115 Value* gatherSrc2 = VIMMED1(0.0f);
1116 #endif
1117
1118 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1119 "Unsupported format for standard gather fetch.");
1120
1121 // Gather components from memory to store in a simdvertex structure
1122 switch (bpc)
1123 {
1124 case 16:
1125 {
1126 #if USE_SIMD16_GATHERS
1127 Value* vGatherResult[2];
1128 Value* vGatherResult2[2];
1129 Value *vMask;
1130 Value *vMask2;
1131
1132 // if we have at least one component out of x or y to fetch
1133 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1134 {
1135 // save mask as it is zero'd out after each gather
1136 vMask = vGatherMask;
1137 vMask2 = vGatherMask2;
1138
1139 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
1140 vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2);
1141 // e.g. result of first 8x32bit integer gather for 16bit components
1142 // 256i - 0 1 2 3 4 5 6 7
1143 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1144 //
1145 }
1146
1147 // if we have at least one component out of z or w to fetch
1148 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1149 {
1150 // offset base to the next components(zw) in the vertex to gather
1151 pStreamBase = GEP(pStreamBase, C((char)4));
1152 vMask = vGatherMask;
1153 vMask2 = vGatherMask2;
1154
1155 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
1156 vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2);
1157 // e.g. result of second 8x32bit integer gather for 16bit components
1158 // 256i - 0 1 2 3 4 5 6 7
1159 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1160 //
1161 }
1162
1163
1164 // if we have at least one component to shuffle into place
1165 if (compMask)
1166 {
1167 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1168 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1169 Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE,
1170 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1171
1172 // Shuffle gathered components into place in simdvertex struct
1173 Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
1174 Shuffle16bpcGather(args2, true); // outputs to vVertexElements2 ref
1175 }
1176 #else
1177 Value* vGatherResult[2];
1178 Value *vMask;
1179
1180 // if we have at least one component out of x or y to fetch
1181 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1182 // save mask as it is zero'd out after each gather
1183 vMask = vGatherMask;
1184
1185 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
1186 // e.g. result of first 8x32bit integer gather for 16bit components
1187 // 256i - 0 1 2 3 4 5 6 7
1188 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1189 //
1190 }
1191
1192 // if we have at least one component out of z or w to fetch
1193 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1194 // offset base to the next components(zw) in the vertex to gather
1195 pStreamBase = GEP(pStreamBase, C((char)4));
1196 vMask = vGatherMask;
1197
1198 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
1199 // e.g. result of second 8x32bit integer gather for 16bit components
1200 // 256i - 0 1 2 3 4 5 6 7
1201 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1202 //
1203 }
1204
1205 // if we have at least one component to shuffle into place
1206 if(compMask){
1207 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1208 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1209
1210 // Shuffle gathered components into place in simdvertex struct
1211 #if USE_SIMD16_SHADERS
1212 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1213 #else
1214 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1215 #endif
1216 }
1217 #endif
1218 }
1219 break;
1220 case 32:
1221 {
1222 for (uint32_t i = 0; i < 4; i += 1)
1223 {
1224 #if USE_SIMD16_GATHERS
1225 if (isComponentEnabled(compMask, i))
1226 {
1227 // if we need to gather the component
1228 if (compCtrl[i] == StoreSrc)
1229 {
1230 // save mask as it is zero'd out after each gather
1231 Value *vMask = vGatherMask;
1232 Value *vMask2 = vGatherMask2;
1233
1234 // Gather a SIMD of vertices
1235 // APIs allow a 4GB range for offsets
1236 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1237 // But, we know that elements must be aligned for FETCH. :)
1238 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
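// Since offsets are element-aligned (even), 2 * (offset >> 1) == offset, and
// offset >> 1 always fits in 31 bits so it is never interpreted as negative.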
1239 Value *vShiftedOffsets = VPSRLI(vOffsets, C(1));
1240 Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
1241 vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, 2);
1242 vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vMask2, 2);
1243
1244 currentVertexElement += 1;
1245 }
1246 else
1247 {
1248 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1249 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1250
1251 currentVertexElement += 1;
1252 }
1253
1254 if (currentVertexElement > 3)
1255 {
1256 #if USE_SIMD16_BUILDER
1257 Value *pVtxSrc2[4];
1258
1259 // pack adjacent pairs of SIMD8s into SIMD16s
1260 for (uint32_t i = 0; i < 4; i += 1)
1261 {
1262 pVtxSrc2[i] = VUNDEF2_F();
1263
1264 pVtxSrc2[i] = INSERT(pVtxSrc2[i], vVertexElements[i], 0);
1265 pVtxSrc2[i] = INSERT(pVtxSrc2[i], vVertexElements2[i], 1);
1266 }
1267
1268 // store SIMD16s
1269 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1270 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1271
1272 #else
1273 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1274 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1275
1276 #endif
1277 outputElt += 1;
1278
1279 // reset to the next vVertexElement to output
1280 currentVertexElement = 0;
1281 }
1282 }
1283
1284 // offset base to the next component in the vertex to gather
1285 pStreamBase = GEP(pStreamBase, C((char)4));
1286 #else
1287 if (isComponentEnabled(compMask, i))
1288 {
1289 // if we need to gather the component
1290 if (compCtrl[i] == StoreSrc)
1291 {
1292 // save mask as it is zero'd out after each gather
1293 Value *vMask = vGatherMask;
1294
1295 // Gather a SIMD of vertices
1296 // APIs allow a 4GB range for offsets
1297 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1298 // But, we know that elements must be aligned for FETCH. :)
1299 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
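// For element-aligned (even) offsets, 2 * (offset >> 1) == offset, and the shifted
// value is always non-negative when treated as a signed 32-bit offset.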
1300 Value* vShiftedOffsets = VPSRLI(vOffsets, C(1));
1301 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, 2);
1302 }
1303 else
1304 {
1305 #if USE_SIMD16_SHADERS
1306 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1307 #else
1308 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1309 #endif
1310 }
1311
1312 if (currentVertexElement > 3)
1313 {
1314 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1315 // reset to the next vVertexElement to output
1316 currentVertexElement = 0;
1317 }
1318 }
1319
1320 // offset base to the next component in the vertex to gather
1321 pStreamBase = GEP(pStreamBase, C((char)4));
1322 #endif
1323 }
1324 }
1325 break;
1326 case 64:
1327 {
1328 for (uint32_t i = 0; i < 4; i += 1)
1329 {
1330 #if USE_SIMD16_GATHERS
1331 if (isComponentEnabled(compMask, i))
1332 {
1333 // if we need to gather the component
1334 if (compCtrl[i] == StoreSrc)
1335 {
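// split each 8-wide mask/offset set into two 4-wide halves, gather doubles with
// GATHERPD, convert to single precision with VCVTPD2PS, then reassemble each
// pair of halves into an 8 x float SIMD8 result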
1336 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1337 Value *vMaskLo2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1338 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1339 Value *vMaskHi2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1340 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
1341 vMaskLo2 = S_EXT(vMaskLo2, VectorType::get(mInt64Ty, 4));
1342 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
1343 vMaskHi2 = S_EXT(vMaskHi2, VectorType::get(mInt64Ty, 4));
1344 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
1345 vMaskLo2 = BITCAST(vMaskLo2, VectorType::get(mDoubleTy, 4));
1346 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
1347 vMaskHi2 = BITCAST(vMaskHi2, VectorType::get(mDoubleTy, 4));
1348
1349 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1350 Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
1351 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1352 Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
1353
1354 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1355
1356 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1357 Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2);
1358 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1359 Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2);
1360
1361 pGatherLo = VCVTPD2PS(pGatherLo);
1362 pGatherLo2 = VCVTPD2PS(pGatherLo2);
1363 pGatherHi = VCVTPD2PS(pGatherHi);
1364 pGatherHi2 = VCVTPD2PS(pGatherHi2);
1365
1366 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1367 Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1368
1369 vVertexElements[currentVertexElement] = pGather;
1370 vVertexElements2[currentVertexElement] = pGather2;
1371
1372 currentVertexElement += 1;
1373 }
1374 else
1375 {
1376 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1377 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1378
1379 currentVertexElement += 1;
1380 }
1381
1382 if (currentVertexElement > 3)
1383 {
1384 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1385 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1386
1387 outputElt += 1;
1388
1389 // reset to the next vVertexElement to output
1390 currentVertexElement = 0;
1391 }
1392 }
1393
1394 // offset base to the next component in the vertex to gather
1395 pStreamBase = GEP(pStreamBase, C((char)8));
1396 #else
1397 if (isComponentEnabled(compMask, i))
1398 {
1399 // if we need to gather the component
1400 if (compCtrl[i] == StoreSrc)
1401 {
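// 64-bit path: split the 8-wide mask and offsets into two 4-wide halves, gather
// doubles with GATHERPD, convert each half to single precision with VCVTPD2PS,
// then shuffle the halves back together into one 8 x float result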
1402 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
1403 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
1404 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
1405 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
1406 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
1407 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
1408
1409 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1410 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1411
1412 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1413
1414 Value* pGatherLo = GATHERPD(vZeroDouble,
1415 pStreamBase, vOffsetsLo, vMaskLo);
1416 Value* pGatherHi = GATHERPD(vZeroDouble,
1417 pStreamBase, vOffsetsHi, vMaskHi);
1418
1419 pGatherLo = VCVTPD2PS(pGatherLo);
1420 pGatherHi = VCVTPD2PS(pGatherHi);
1421
1422 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
1423
1424 vVertexElements[currentVertexElement++] = pGather;
1425 }
1426 else
1427 {
1428 #if USE_SIMD16_SHADERS
1429 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1430 #else
1431 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1432 #endif
1433 }
1434
1435 if (currentVertexElement > 3)
1436 {
1437 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1438 // reset to the next vVertexElement to output
1439 currentVertexElement = 0;
1440 }
1441 }
1442
1443 // offset base to the next component in the vertex to gather
1444 pStreamBase = GEP(pStreamBase, C((char)8));
1445 #endif
1446 }
1447 }
1448 break;
1449 default:
1450 SWR_INVALID("Tried to fetch invalid FP format");
1451 break;
1452 }
1453 }
1454 else
1455 {
1456 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
1457 ConversionType conversionType = CONVERT_NONE;
1458
1459 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1460 "Unsupported format for standard gather fetch.");
1461
1462 switch(info.type[0])
1463 {
1464 case SWR_TYPE_UNORM:
1465 conversionType = CONVERT_NORMALIZED;
1466 case SWR_TYPE_UINT:
1467 extendCastType = Instruction::CastOps::ZExt;
1468 break;
1469 case SWR_TYPE_SNORM:
1470 conversionType = CONVERT_NORMALIZED;
1471 case SWR_TYPE_SINT:
1472 extendCastType = Instruction::CastOps::SExt;
1473 break;
1474 case SWR_TYPE_USCALED:
1475 conversionType = CONVERT_USCALED;
1476 extendCastType = Instruction::CastOps::UIToFP;
1477 break;
1478 case SWR_TYPE_SSCALED:
1479 conversionType = CONVERT_SSCALED;
1480 extendCastType = Instruction::CastOps::SIToFP;
1481 break;
1482 case SWR_TYPE_SFIXED:
1483 conversionType = CONVERT_SFIXED;
1484 extendCastType = Instruction::CastOps::SExt;
1485 break;
1486 default:
1487 break;
1488 }
1489
1490 // value substituted when component of gather is masked
1491 Value* gatherSrc = VIMMED1(0);
1492 #if USE_SIMD16_GATHERS
1493 Value* gatherSrc2 = VIMMED1(0);
1494 #endif
1495
1496 // Gather components from memory to store in a simdvertex structure
1497 switch (bpc)
1498 {
1499 case 8:
1500 {
1501 // if we have at least one component to fetch
1502 if (compMask)
1503 {
1504 #if USE_SIMD16_GATHERS
1505 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1506 Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1507 // e.g. result of an 8x32bit integer gather for 8bit components
1508 // 256i - 0 1 2 3 4 5 6 7
1509 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1510
1511 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1512 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1513 Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1514 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2, info.swizzle);
1515
1516 // Shuffle gathered components into place in simdvertex struct
1517 Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
1518 Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements2 ref
1519 #else
1520 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1521 // e.g. result of an 8x32bit integer gather for 8bit components
1522 // 256i - 0 1 2 3 4 5 6 7
1523 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1524
1525 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1526 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1527
1528 // Shuffle gathered components into place in simdvertex struct
1529 #if USE_SIMD16_SHADERS
1530 Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
1531 #else
1532 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
1533 #endif
1534 #endif
1535 }
1536 }
1537 break;
1538 case 16:
1539 {
1540 #if USE_SIMD16_GATHERS
1541 Value* vGatherResult[2];
1542 Value *vMask;
1543 Value* vGatherResult2[2];
1544 Value *vMask2;
1545
1546 // if we have at least one component out of x or y to fetch
1547 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1548 {
1549 // save mask as it is zero'd out after each gather
1550 vMask = vGatherMask;
1551 vMask2 = vGatherMask2;
1552
1553 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1554 vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2);
1555 // e.g. result of first 8x32bit integer gather for 16bit components
1556 // 256i - 0 1 2 3 4 5 6 7
1557 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1558 //
1559 }
1560
1561 // if we have at least one component out of z or w to fetch
1562 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1563 {
1564 // offset base to the next components(zw) in the vertex to gather
1565 pStreamBase = GEP(pStreamBase, C((char)4));
1566 vMask = vGatherMask;
1567 vMask2 = vGatherMask2;
1568
1569 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1570 vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2);
1571 // e.g. result of second 8x32bit integer gather for 16bit components
1572 // 256i - 0 1 2 3 4 5 6 7
1573 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1574 //
1575 }
1576
1577 // if we have at least one component to shuffle into place
1578 if (compMask)
1579 {
1580 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1581 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1582 Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1583 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1584
1585 // Shuffle gathered components into place in simdvertex struct
1586 Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
1587 Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
1588 }
1589 #else
1590 Value* vGatherResult[2];
1591 Value *vMask;
1592
1593 // if we have at least one component out of x or y to fetch
1594 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1595 // save mask as it is zero'd out after each gather
1596 vMask = vGatherMask;
1597
1598 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1599 // e.g. result of first 8x32bit integer gather for 16bit components
1600 // 256i - 0 1 2 3 4 5 6 7
1601 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1602 //
1603 }
1604
1605 // if we have at least one component out of z or w to fetch
1606 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1607 // offset base to the next components(zw) in the vertex to gather
1608 pStreamBase = GEP(pStreamBase, C((char)4));
1609 vMask = vGatherMask;
1610
1611 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1612 // e.g. result of second 8x32bit integer gather for 16bit components
1613 // 256i - 0 1 2 3 4 5 6 7
1614 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1615 //
1616 }
1617
1618 // if we have at least one component to shuffle into place
1619 if(compMask){
1620 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1621 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1622
1623 // Shuffle gathered components into place in simdvertex struct
1624 #if USE_SIMD16_SHADERS
1625 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1626 #else
1627 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1628 #endif
1629 }
1630 #endif
1631 }
1632 break;
1633 case 32:
1634 {
1635 // Gather components from memory into place in the simdvertex struct
1636 for (uint32_t i = 0; i < 4; i++)
1637 {
1638 if (isComponentEnabled(compMask, i))
1639 {
1640 // if we need to gather the component
1641 if (compCtrl[i] == StoreSrc)
1642 {
1643 #if USE_SIMD16_GATHERS
1644 // save mask as it is zero'd out after each gather
1645 Value *vMask = vGatherMask;
1646 Value *vMask2 = vGatherMask2;
1647
1648 Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1649 Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2);
1650
1651 if (conversionType == CONVERT_USCALED)
1652 {
1653 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1654 pGather2 = UI_TO_FP(pGather2, mSimdFP32Ty);
1655 }
1656 else if (conversionType == CONVERT_SSCALED)
1657 {
1658 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1659 pGather2 = SI_TO_FP(pGather2, mSimdFP32Ty);
1660 }
1661 else if (conversionType == CONVERT_SFIXED)
1662 {
1663 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1664 pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1665 }
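// e.g. for SFIXED, the 16.16 fixed point value 1.5 is stored as 0x00018000 (98304),
// and 98304 * (1 / 65536.0f) recovers 1.5f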
1666
1667 vVertexElements[currentVertexElement] = pGather;
1668 vVertexElements2[currentVertexElement] = pGather2;
1669 // e.g. result of a single 8x32bit integer gather for 32bit components
1670 // 256i - 0 1 2 3 4 5 6 7
1671 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1672
1673 currentVertexElement += 1;
1674 #else
1675 // save mask as it is zero'd out after each gather
1676 Value *vMask = vGatherMask;
1677
1678 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1679
1680 if (conversionType == CONVERT_USCALED)
1681 {
1682 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1683 }
1684 else if (conversionType == CONVERT_SSCALED)
1685 {
1686 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1687 }
1688 else if (conversionType == CONVERT_SFIXED)
1689 {
1690 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1691 }
1692
1693 vVertexElements[currentVertexElement++] = pGather;
1694 // e.g. result of a single 8x32bit integer gather for 32bit components
1695 // 256i - 0 1 2 3 4 5 6 7
1696 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1697 #endif
1698 }
1699 else
1700 {
1701 #if USE_SIMD16_SHADERS
1702 #if USE_SIMD16_GATHERS
1703 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1704 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1705
1706 currentVertexElement += 1;
1707 #else
1708 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1709 #endif
1710 #else
1711 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1712 #endif
1713 }
1714
1715 if (currentVertexElement > 3)
1716 {
1717 #if USE_SIMD16_GATHERS
1718 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1719 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1720
1721 outputElt += 1;
1722 #else
1723 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1724 #endif
1725
1726 // reset to the next vVertexElement to output
1727 currentVertexElement = 0;
1728 }
1729
1730 }
1731
1732 // offset base to the next component in the vertex to gather
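// note: pStreamBase is treated as a byte pointer here, so +4 skips one 32bit component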
1733 pStreamBase = GEP(pStreamBase, C((char)4));
1734 }
1735 }
1736 break;
1737 }
1738 }
1739 }
1740
1741 // if we have a partially filled vVertexElement struct, output it
1742 if (currentVertexElement > 0)
1743 {
1744 #if USE_SIMD16_GATHERS
1745 StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements);
1746 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2);
1747
1748 outputElt += 1;
1749 #else
1750 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1751 #endif
1752 }
1753 }
1754
1755 //////////////////////////////////////////////////////////////////////////
1756 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1757 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1758 /// support
1759 /// @param pIndices - pointer to 8 bit indices
1760 /// @param pLastIndex - pointer to last valid index
1761 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1762 {
1763 // can fit 4 8bit integers per vWidth lane
1764 Value* vIndices = VUNDEF_I();
1765
1766 // store 0 index on stack to be used to conditionally load from if index address is OOB
1767 Value* pZeroIndex = ALLOCA(mInt8Ty);
1768 STORE(C((uint8_t)0), pZeroIndex);
1769
1770 // Load a SIMD of index pointers
1771 for(int64_t lane = 0; lane < mVWidth; lane++)
1772 {
1773 // Calculate the address of the requested index
1774 Value *pIndex = GEP(pIndices, C(lane));
1775
1776 // check if the address is less than the max index,
1777 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1778
1779 // if valid, load the index. if not, load 0 from the stack
1780 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1781 Value *index = LOAD(pValid, "valid index");
1782
1783 // zero extend the index to 32 bits and insert it into the correct simd lane
1784 index = Z_EXT(index, mInt32Ty);
1785 vIndices = VINSERT(vIndices, index, lane);
1786 }
1787 return vIndices;
1788 }
1789
1790 //////////////////////////////////////////////////////////////////////////
1791 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1792 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1793 /// support
1794 /// @param pIndices - pointer to 16 bit indices
1795 /// @param pLastIndex - pointer to last valid index
1796 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1797 {
1798 // can fit 2 16 bit integers per vWidth lane
1799 Value* vIndices = VUNDEF_I();
1800
1801 // store 0 index on stack to be used to conditionally load from if index address is OOB
1802 Value* pZeroIndex = ALLOCA(mInt16Ty);
1803 STORE(C((uint16_t)0), pZeroIndex);
1804
1805 // Load a SIMD of index pointers
1806 for(int64_t lane = 0; lane < mVWidth; lane++)
1807 {
1808 // Calculate the address of the requested index
1809 Value *pIndex = GEP(pIndices, C(lane));
1810
1811 // check if the address is less than the max index,
1812 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1813
1814 // if valid, load the index. if not, load 0 from the stack
1815 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1816 Value *index = LOAD(pValid, "valid index");
1817
1818 // zero extend the index to 32 bits and insert it into the correct simd lane
1819 index = Z_EXT(index, mInt32Ty);
1820 vIndices = VINSERT(vIndices, index, lane);
1821 }
1822 return vIndices;
1823 }
1824
1825 //////////////////////////////////////////////////////////////////////////
1826 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1827 /// @param pIndices - pointer to 32 bit indices
1828 /// @param pLastIndex - pointer to last valid index
1829 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1830 {
1831 DataLayout dL(JM()->mpCurrentModule);
1832 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1833 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1834 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1835
1836 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1837 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1838 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1839 numIndicesLeft = SDIV(numIndicesLeft, C(4));
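// 4 == sizeof(uint32_t), the size of a single 32bit index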
1840
1841 // create a vector of index counts from the base index ptr passed into the fetch
1842 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1843 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1844
1845 // compare index count to the max valid index
1846 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1847 // vIndexOffsets 0 1 2 3 4 5 6 7
1848 // ------------------------------
1849 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1850 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1851 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1852 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1853
1854 // VMASKLOAD takes an *i8 src pointer
1855 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1856
1857 // Load the indices; OOB loads 0
1858 return MASKLOADD(pIndices,vIndexMask);
1859 }
1860
1861 //////////////////////////////////////////////////////////////////////////
1862 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1863 /// denormalizes if needed, converts to F32 if needed, and positions in
1864 /// the proper SIMD rows to be output to the simdvertex structure
1865 /// @param args: (tuple of args, listed below)
1866 /// @param vGatherResult - 8 gathered 8bpc vertices
1867 /// @param pVtxOut - base pointer to output simdvertex struct
1868 /// @param extendType - sign extend or zero extend
1869 /// @param conversionType - conversion to apply to the raw data (normalized, scaled, sfixed or none)
1870 /// @param currentVertexElement - reference to the current vVertexElement
1871 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1872 /// @param compMask - component packing mask
1873 /// @param compCtrl - component control val
1874 /// @param vVertexElements[4] - vertex components to output
1875 /// @param swizzle[4] - component swizzle location
1876 #if USE_SIMD16_SHADERS
1877 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
1878 #else
1879 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1880 #endif
1881 {
1882 // Unpack tuple args
1883 Value*& vGatherResult = std::get<0>(args);
1884 Value* pVtxOut = std::get<1>(args);
1885 const Instruction::CastOps extendType = std::get<2>(args);
1886 const ConversionType conversionType = std::get<3>(args);
1887 uint32_t &currentVertexElement = std::get<4>(args);
1888 uint32_t &outputElt = std::get<5>(args);
1889 const ComponentEnable compMask = std::get<6>(args);
1890 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1891 Value* (&vVertexElements)[4] = std::get<8>(args);
1892 const uint32_t (&swizzle)[4] = std::get<9>(args);
1893
1894 // cast types
1895 Type* vGatherTy = mSimdInt32Ty;
1896 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1897
1898 // have to do extra work for sign extending
1899 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1900 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 16x8bit ints in a 128bit lane
1901 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1902
1903 // shuffle mask, including any swizzling
1904 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1905 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1906 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1907 char(y), char(y+4), char(y+8), char(y+12),
1908 char(z), char(z+4), char(z+8), char(z+12),
1909 char(w), char(w+4), char(w+8), char(w+12),
1910 char(x), char(x+4), char(x+8), char(x+12),
1911 char(y), char(y+4), char(y+8), char(y+12),
1912 char(z), char(z+4), char(z+8), char(z+12),
1913 char(w), char(w+4), char(w+8), char(w+12)});
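// e.g. for the identity swizzle (x,y,z,w) = (0,1,2,3) this mask is
// {0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15} per 128bit lane, i.e. pick byte x of each
// of the 4 gathered dwords in the lane, then byte y, byte z, byte w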
1914
1915 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1916 // after pshufb: group components together in each 128bit lane
1917 // 256i - 0 1 2 3 4 5 6 7
1918 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1919
1920 Value* vi128XY = nullptr;
1921 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1922 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1923 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1924 // 256i - 0 1 2 3 4 5 6 7
1925 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1926 }
1927
1928 // do the same for zw components
1929 Value* vi128ZW = nullptr;
1930 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1931 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1932 }
1933
1934 // init denormalize variables if needed
1935 Instruction::CastOps fpCast;
1936 Value* conversionFactor;
1937
1938 switch (conversionType)
1939 {
1940 case CONVERT_NORMALIZED:
1941 fpCast = Instruction::CastOps::SIToFP;
1942 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1943 break;
1944 case CONVERT_SSCALED:
1945 fpCast = Instruction::CastOps::SIToFP;
1946 conversionFactor = VIMMED1((float)(1.0));
1947 break;
1948 case CONVERT_USCALED:
1949 SWR_INVALID("Type should not be sign extended!");
1950 conversionFactor = nullptr;
1951 break;
1952 default:
1953 SWR_ASSERT(conversionType == CONVERT_NONE);
1954 conversionFactor = nullptr;
1955 break;
1956 }
1957
1958 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1959 for (uint32_t i = 0; i < 4; i++)
1960 {
1961 if (isComponentEnabled(compMask, i))
1962 {
1963 if (compCtrl[i] == ComponentControl::StoreSrc)
1964 {
1965 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1966 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1967 // if x or y, use vi128XY permute result, else use vi128ZW
1968 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1969
1970 // sign extend
1971 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1972
1973 // denormalize if needed
1974 if (conversionType != CONVERT_NONE)
1975 {
1976 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1977 }
1978 currentVertexElement++;
1979 }
1980 else
1981 {
1982 #if USE_SIMD16_SHADERS
1983 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1984 #else
1985 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1986 #endif
1987 }
1988
1989 if (currentVertexElement > 3)
1990 {
1991 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1992 // reset to the next vVertexElement to output
1993 currentVertexElement = 0;
1994 }
1995 }
1996 }
1997 }
1998 // else zero extend
1999 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2000 {
2001 // init denormalize variables if needed
2002 Instruction::CastOps fpCast;
2003 Value* conversionFactor;
2004
2005 switch (conversionType)
2006 {
2007 case CONVERT_NORMALIZED:
2008 fpCast = Instruction::CastOps::UIToFP;
2009 conversionFactor = VIMMED1((float)(1.0 / 255.0));
2010 break;
2011 case CONVERT_USCALED:
2012 fpCast = Instruction::CastOps::UIToFP;
2013 conversionFactor = VIMMED1((float)(1.0));
2014 break;
2015 case CONVERT_SSCALED:
2016 SWR_INVALID("Type should not be zero extended!");
2017 conversionFactor = nullptr;
2018 break;
2019 default:
2020 SWR_ASSERT(conversionType == CONVERT_NONE);
2021 conversionFactor = nullptr;
2022 break;
2023 }
2024
2025 // shuffle enabled components into the lower byte of each 32bit lane, zero extending to 32 bits
2026 for (uint32_t i = 0; i < 4; i++)
2027 {
2028 if (isComponentEnabled(compMask, i))
2029 {
2030 if (compCtrl[i] == ComponentControl::StoreSrc)
2031 {
2032 // pshufb masks for each component
2033 Value* vConstMask;
2034 switch (swizzle[i])
2035 {
2036 case 0:
2037 // x shuffle mask
2038 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2039 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2040 break;
2041 case 1:
2042 // y shuffle mask
2043 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2044 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2045 break;
2046 case 2:
2047 // z shuffle mask
2048 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2049 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2050 break;
2051 case 3:
2052 // w shuffle mask
2053 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2054 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2055 break;
2056 default:
2057 vConstMask = nullptr;
2058 break;
2059 }
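// the -1 entries have the high bit set, so PSHUFB writes 0 to those bytes,
// which performs the zero extension of each 8bit component to 32 bits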
2060
2061 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
2062 // after pshufb for x channel
2063 // 256i - 0 1 2 3 4 5 6 7
2064 // x000 x000 x000 x000 x000 x000 x000 x000
2065
2066 // denormalize if needed
2067 if (conversionType != CONVERT_NONE)
2068 {
2069 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2070 }
2071 currentVertexElement++;
2072 }
2073 else
2074 {
2075 #if USE_SIMD16_SHADERS
2076 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2077 #else
2078 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2079 #endif
2080 }
2081
2082 if (currentVertexElement > 3)
2083 {
2084 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2085 // reset to the next vVertexElement to output
2086 currentVertexElement = 0;
2087 }
2088 }
2089 }
2090 }
2091 else
2092 {
2093 SWR_INVALID("Unsupported conversion type");
2094 }
2095 }
2096
2097 //////////////////////////////////////////////////////////////////////////
2098 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
2099 /// denormalizes if needed, converts to F32 if needed, and positions in
2100 /// the proper SIMD rows to be output to the simdvertex structure
2101 /// @param args: (tuple of args, listed below)
2102 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
2103 /// @param pVtxOut - base pointer to output simdvertex struct
2104 /// @param extendType - sign extend or zero extend
2105 /// @param conversionType - conversion to apply to the raw data (normalized, scaled, sfixed or none)
2106 /// @param currentVertexElement - reference to the current vVertexElement
2107 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
2108 /// @param compMask - component packing mask
2109 /// @param compCtrl - component control val
2110 /// @param vVertexElements[4] - vertex components to output
2111 #if USE_SIMD16_SHADERS
2112 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
2113 #else
2114 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
2115 #endif
2116 {
2117 // Unpack tuple args
2118 Value* (&vGatherResult)[2] = std::get<0>(args);
2119 Value* pVtxOut = std::get<1>(args);
2120 const Instruction::CastOps extendType = std::get<2>(args);
2121 const ConversionType conversionType = std::get<3>(args);
2122 uint32_t &currentVertexElement = std::get<4>(args);
2123 uint32_t &outputElt = std::get<5>(args);
2124 const ComponentEnable compMask = std::get<6>(args);
2125 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2126 Value* (&vVertexElements)[4] = std::get<8>(args);
2127
2128 // cast types
2129 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2130 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2131
2132 // have to do extra work for sign extending
2133 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
2134 (extendType == Instruction::CastOps::FPExt))
2135 {
2136 // is this a half-precision (16bit) float?
2137 bool bFP = (extendType == Instruction::CastOps::FPExt);
2138
2139 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2140 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2141
2142 // shuffle mask
2143 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2144 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
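// within each 128bit lane, bytes {0,1 4,5 8,9 12,13} select the low 16 bits (x) of each
// gathered dword and bytes {2,3 6,7 10,11 14,15} select the high 16 bits (y)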
2145 Value* vi128XY = nullptr;
2146 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
2147 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
2148 // after pshufb: group components together in each 128bit lane
2149 // 256i - 0 1 2 3 4 5 6 7
2150 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2151
2152 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2153 // after PERMD: move and pack xy components into each 128bit lane
2154 // 256i - 0 1 2 3 4 5 6 7
2155 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2156 }
2157
2158 // do the same for zw components
2159 Value* vi128ZW = nullptr;
2160 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
2161 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
2162 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2163 }
2164
2165 // init denormalize variables if needed
2166 Instruction::CastOps IntToFpCast;
2167 Value* conversionFactor;
2168
2169 switch (conversionType)
2170 {
2171 case CONVERT_NORMALIZED:
2172 IntToFpCast = Instruction::CastOps::SIToFP;
2173 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2174 break;
2175 case CONVERT_SSCALED:
2176 IntToFpCast = Instruction::CastOps::SIToFP;
2177 conversionFactor = VIMMED1((float)(1.0));
2178 break;
2179 case CONVERT_USCALED:
2180 SWR_INVALID("Type should not be sign extended!");
2181 conversionFactor = nullptr;
2182 break;
2183 default:
2184 SWR_ASSERT(conversionType == CONVERT_NONE);
2185 conversionFactor = nullptr;
2186 break;
2187 }
2188
2189 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2190 for (uint32_t i = 0; i < 4; i++)
2191 {
2192 if (isComponentEnabled(compMask, i))
2193 {
2194 if (compCtrl[i] == ComponentControl::StoreSrc)
2195 {
2196 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2197 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2198 // if x or y, use vi128XY permute result, else use vi128ZW
2199 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2200
2201 if (bFP) {
2202 // extract 128 bit lanes and convert each component from 16bit float to 32bit float
2203 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2204 }
2205 else {
2206 // extract 128 bit lanes to sign extend each component
2207 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2208
2209 // denormalize if needed
2210 if (conversionType != CONVERT_NONE) {
2211 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2212 }
2213 }
2214 currentVertexElement++;
2215 }
2216 else
2217 {
2218 #if USE_SIMD16_SHADERS
2219 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2220 #else
2221 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2222 #endif
2223 }
2224
2225 if (currentVertexElement > 3)
2226 {
2227 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2228 // reset to the next vVertexElement to output
2229 currentVertexElement = 0;
2230 }
2231 }
2232 }
2233 }
2234 // else zero extend
2235 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2236 {
2237 // pshufb masks for each component
2238 Value* vConstMask[2];
2239 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
2240 // x/z shuffle mask
2241 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2242 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2243 }
2244
2245 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
2246 // y/w shuffle mask
2247 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2248 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
2249 }
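// the -1 mask entries are zeroed by PSHUFB, zero extending each 16bit component to 32 bits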
2250
2251 // init denormalize variables if needed
2252 Instruction::CastOps fpCast;
2253 Value* conversionFactor;
2254
2255 switch (conversionType)
2256 {
2257 case CONVERT_NORMALIZED:
2258 fpCast = Instruction::CastOps::UIToFP;
2259 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2260 break;
2261 case CONVERT_USCALED:
2262 fpCast = Instruction::CastOps::UIToFP;
2263 conversionFactor = VIMMED1((float)(1.0f));
2264 break;
2265 case CONVERT_SSCALED:
2266 SWR_INVALID("Type should not be zero extended!");
2267 conversionFactor = nullptr;
2268 break;
2269 default:
2270 SWR_ASSERT(conversionType == CONVERT_NONE);
2271 conversionFactor = nullptr;
2272 break;
2273 }
2274
2275 // shuffle enabled components into the lower word of each 32bit lane, zero extending to 32 bits
2276 for (uint32_t i = 0; i < 4; i++)
2277 {
2278 if (isComponentEnabled(compMask, i))
2279 {
2280 if (compCtrl[i] == ComponentControl::StoreSrc)
2281 {
2282 // select correct constMask for x/z or y/w pshufb
2283 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2284 // if x or y, use vi128XY permute result, else use vi128ZW
2285 uint32_t selectedGather = (i < 2) ? 0 : 1;
2286
2287 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2288 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2289 // 256i - 0 1 2 3 4 5 6 7
2290 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2291
2292 // denormalize if needed
2293 if (conversionType != CONVERT_NONE)
2294 {
2295 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2296 }
2297 currentVertexElement++;
2298 }
2299 else
2300 {
2301 #if USE_SIMD16_SHADERS
2302 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2303 #else
2304 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2305 #endif
2306 }
2307
2308 if (currentVertexElement > 3)
2309 {
2310 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2311 // reset to the next vVertexElement to output
2312 currentVertexElement = 0;
2313 }
2314 }
2315 }
2316 }
2317 else
2318 {
2319 SWR_INVALID("Unsupported conversion type");
2320 }
2321 }
2322
2323 //////////////////////////////////////////////////////////////////////////
2324 /// @brief Output a simdvertex worth of elements to the current outputElt
2325 /// @param pVtxOut - base address of VIN output struct
2326 /// @param outputElt - simdvertex offset in VIN to write to
2327 /// @param numEltsToStore - number of simdvertex rows to write out
2328 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2329 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2330 {
2331 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2332
2333 for(uint32_t c = 0; c < numEltsToStore; ++c)
2334 {
2335 // STORE expects FP32 x vWidth type, just bitcast if needed
2336 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2337 {
2338 #if FETCH_DUMP_VERTEX
2339 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
2340 #endif
2341 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2342 }
2343 #if FETCH_DUMP_VERTEX
2344 else
2345 {
2346 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
2347 }
2348 #endif
2349 // outputElt * 4 = offsetting by the size of a simdvertex
2350 // + c offsets to a 32bit x vWidth row within the current vertex (SIMD16 shaders use outputElt * 8 + c * 2 since each row is split across two vWidth rows)
2351 #if USE_SIMD16_SHADERS
2352 Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
2353 #else
2354 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2355 #endif
2356 STORE(vVertexElements[c], dest);
2357 }
2358 }
2359
2360 #if USE_SIMD16_BUILDER
2361 void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2362 {
2363 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2364
2365 for (uint32_t c = 0; c < numEltsToStore; ++c)
2366 {
2367 // STORE expects FP32 x vWidth type, just bitcast if needed
2368 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2369 {
2370 #if FETCH_DUMP_VERTEX
2371 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
2372 #endif
2373 vVertexElements[c] = BITCAST(vVertexElements[c], mSimd2FP32Ty);
2374 }
2375 #if FETCH_DUMP_VERTEX
2376 else
2377 {
2378 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
2379 }
2380 #endif
2381 // outputElt * 4 = offsetting by the size of a simdvertex
2382 // + c offsets to a 32bit x vWidth row within the current vertex
2383 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2384 STORE(vVertexElements[c], dest);
2385 }
2386 }
2387
2388 #endif
2389 //////////////////////////////////////////////////////////////////////////
2390 /// @brief Generates a constant vector of values based on the
2391 /// ComponentControl value
2392 /// @param ctrl - ComponentControl value
2393 #if USE_SIMD16_SHADERS
2394 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
2395 #else
2396 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
2397 #endif
2398 {
2399 switch(ctrl)
2400 {
2401 case NoStore: return VUNDEF_I();
2402 case Store0: return VIMMED1(0);
2403 case Store1Fp: return VIMMED1(1.0f);
2404 case Store1Int: return VIMMED1(1);
2405 case StoreVertexId:
2406 {
2407 #if USE_SIMD16_SHADERS
2408 Value* pId;
2409 if (useVertexID2)
2410 {
2411 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
2412 }
2413 else
2414 {
2415 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2416 }
2417 #else
2418 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2419 #endif
2420 return VBROADCAST(pId);
2421 }
2422 case StoreInstanceId:
2423 {
2424 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
2425 return VBROADCAST(pId);
2426 }
2427 case StoreSrc:
2428 default: SWR_INVALID("Invalid component control"); return VUNDEF_I();
2429 }
2430 }
2431
2432 //////////////////////////////////////////////////////////////////////////
2433 /// @brief Returns the enable mask for the specified component.
2434 /// @param enableMask - enable bits
2435 /// @param component - component to check if enabled.
2436 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2437 {
2438 switch (component)
2439 {
2440 // X
2441 case 0: return (enableMask & ComponentEnable::X);
2442 // Y
2443 case 1: return (enableMask & ComponentEnable::Y);
2444 // Z
2445 case 2: return (enableMask & ComponentEnable::Z);
2446 // W
2447 case 3: return (enableMask & ComponentEnable::W);
2448
2449 default: return false;
2450 }
2451 }
2452
2453
2454 //////////////////////////////////////////////////////////////////////////
2455 /// @brief JITs from fetch shader IR
2456 /// @param hJitMgr - JitManager handle
2457 /// @param func - LLVM function IR
2458 /// @return PFN_FETCH_FUNC - pointer to fetch code
2459 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2460 {
2461 const llvm::Function* func = (const llvm::Function*)hFunc;
2462 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2463 PFN_FETCH_FUNC pfnFetch;
2464
2465 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2466 // MCJIT finalizes modules the first time you JIT code from them. Once a module is finalized, you cannot add new IR to it
2467 pJitMgr->mIsModuleFinalized = true;
2468
2469 #if defined(KNOB_SWRC_TRACING)
2470 char fName[1024];
2471 const char *funcName = func->getName().data();
2472 sprintf(fName, "%s.bin", funcName);
2473 FILE *fd = fopen(fName, "wb");
2474 fwrite((void *)pfnFetch, 1, 2048, fd);
2475 fclose(fd);
2476 #endif
2477
2478 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2479
2480 return pfnFetch;
2481 }
2482
2483 //////////////////////////////////////////////////////////////////////////
2484 /// @brief JIT compiles fetch shader
2485 /// @param hJitMgr - JitManager handle
2486 /// @param state - fetch state to build function from
2487 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2488 {
2489 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2490
2491 pJitMgr->SetupNewModule();
2492
2493 FetchJit theJit(pJitMgr);
2494 HANDLE hFunc = theJit.Create(state);
2495
2496 return JitFetchFunc(hJitMgr, hFunc);
2497 }