swr/rast: Remove unneeded copy of gather mask
[mesa.git] src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "jit_api.h"
32 #include "fetch_jit.h"
33 #include "gen_state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public Builder
56 {
57 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
58
59 Function* Create(const FETCH_COMPILE_STATE& fetchState);
60 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
61 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
62 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
63
64 // package up Shuffle*bpcGatherd args into a tuple for convenience
65 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
66 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
67 const uint32_t(&)[4]> Shuffle8bpcArgs;
68 #if USE_SIMD16_SHADERS
69 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
70 #else
71 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
72 #endif
73
74 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
75 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
76 #if USE_SIMD16_SHADERS
77 void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
78 #else
79 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
80 #endif
81
82 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
83 #if USE_SIMD16_BUILDER
84 void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
85 #endif
86
87 #if USE_SIMD16_SHADERS
88 Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
89 #else
90 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
91 #endif
92 #if USE_SIMD16_BUILDER
93 Value* GenerateCompCtrlVector2(const ComponentControl ctrl);
94 #endif
95
96 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
97 #if USE_SIMD16_SHADERS
98 #define USE_SIMD16_GATHERS 0
99
100 #if USE_SIMD16_GATHERS
101 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
102 #else
103 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
104 #endif
105 #else
106 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
107 #endif
108
109 bool IsOddFormat(SWR_FORMAT format);
110 bool IsUniformFormat(SWR_FORMAT format);
111 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
112 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
113 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
114
115 Value* mpFetchInfo;
116 };
117
118 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
119 {
120 std::stringstream fnName("FetchShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
121 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
122
123 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
124 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
125
126 fetch->getParent()->setModuleIdentifier(fetch->getName());
127
128 IRB()->SetInsertPoint(entry);
129
130 auto argitr = fetch->arg_begin();
131
132 // Fetch shader arguments
133 mpFetchInfo = &*argitr; ++argitr;
134 mpFetchInfo->setName("fetchInfo");
135 Value* pVtxOut = &*argitr;
136 pVtxOut->setName("vtxOutput");
137 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex:
138 // index 0 (just the pointer to the simdvertex structure),
139 // index 1 (which element of the simdvertex structure to offset to, in this case 0),
140 // so the indices being i32's doesn't matter
141 // TODO: generate this GEP with a VECTOR structure type so this makes sense
142 std::vector<Value*> vtxInputIndices(2, C(0));
143 // GEP
144 pVtxOut = GEP(pVtxOut, C(0));
145 #if USE_SIMD16_SHADERS
146 #if 0// USE_SIMD16_BUILDER
147 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
148 #else
149 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
150 #endif
151 #else
152 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
153 #endif
154
155 // SWR_FETCH_CONTEXT::pStreams
156 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
157 streams->setName("pStreams");
158
159 // SWR_FETCH_CONTEXT::pIndices
160 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
161 indices->setName("pIndices");
162
163 // SWR_FETCH_CONTEXT::pLastIndex
164 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
165 pLastIndex->setName("pLastIndex");
166
167
168 Value* vIndices;
169 #if USE_SIMD16_SHADERS
170 Value* indices2;
171 Value* vIndices2;
172 #endif
173 switch(fetchState.indexType)
174 {
175 case R8_UINT:
176 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
177 #if USE_SIMD16_SHADERS
178 indices2 = GEP(indices, C(8));
179 #endif
180 if(fetchState.bDisableIndexOOBCheck)
181 {
182 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
183 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
184 #if USE_SIMD16_SHADERS
185 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
186 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
187 #endif
188 }
189 else
190 {
191 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
192 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
193 #if USE_SIMD16_SHADERS
194 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
195 vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
196 #endif
197 }
198 break;
199 case R16_UINT:
200 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
201 #if USE_SIMD16_SHADERS
202 indices2 = GEP(indices, C(8));
203 #endif
204 if(fetchState.bDisableIndexOOBCheck)
205 {
206 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
207 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
208 #if USE_SIMD16_SHADERS
209 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
210 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
211 #endif
212 }
213 else
214 {
215 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
216 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
217 #if USE_SIMD16_SHADERS
218 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
219 vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
220 #endif
221 }
222 break;
223 case R32_UINT:
224 #if USE_SIMD16_SHADERS
225 indices2 = GEP(indices, C(8));
226 #endif
227 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
228 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
229 #if USE_SIMD16_SHADERS
230 (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
231 : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
232 #endif
233 break; // incoming type is already 32bit int
234 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
235 }
236
237 if(fetchState.bForceSequentialAccessEnable)
238 {
239 Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
240
241 // VertexData buffers are accessed sequentially; the index is equal to the vertex number
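// e.g. with StartVertex == 100, the SIMD8 lanes fetch vertices 100..107
// (and 108..115 for the second SIMD8 when USE_SIMD16_SHADERS is enabled)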
242 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
243 vIndices = ADD(vIndices, pOffsets);
244 #if USE_SIMD16_SHADERS
245 vIndices2 = ADD(vIndices, VIMMED1(8));
246 #endif
247 }
248
249 Value* vVertexId = vIndices;
250 #if USE_SIMD16_SHADERS
251 Value* vVertexId2 = vIndices2;
252 #endif
253 if (fetchState.bVertexIDOffsetEnable)
254 {
255 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
256 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
257 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
258 vVertexId = ADD(vIndices, vBaseVertex);
259 vVertexId = ADD(vVertexId, vStartVertex);
260 #if USE_SIMD16_SHADERS
261 vVertexId2 = ADD(vIndices2, vBaseVertex);
262 vVertexId2 = ADD(vVertexId2, vStartVertex);
263 #endif
264 }
265
266 // store out vertex IDs
267 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
268 #if USE_SIMD16_SHADERS
269 STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
270 #endif
271
272 // store out cut mask if enabled
273 if (fetchState.bEnableCutIndex)
274 {
275 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
276 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
277 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
278 #if USE_SIMD16_SHADERS
279 Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
280 STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
281 #endif
282 }
283
284 // Fetch attributes from memory and output to a simdvertex struct
285 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
286 #if USE_SIMD16_SHADERS
287 if (fetchState.bDisableVGATHER)
288 {
289 JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
290 JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
291 }
292 else
293 {
294 #if USE_SIMD16_GATHERS
295 JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false);
296 #else
297 JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
298 JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
299 #endif
300 }
301 #else
302 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
303 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
304 #endif
305
306 RET_VOID();
307
308 JitManager::DumpToFile(fetch, "src");
309
310 #if defined(_DEBUG)
311 verifyFunction(*fetch);
312 #endif
313
314 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
315
316 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
317 setupPasses.add(createBreakCriticalEdgesPass());
318 setupPasses.add(createCFGSimplificationPass());
319 setupPasses.add(createEarlyCSEPass());
320 setupPasses.add(createPromoteMemoryToRegisterPass());
321
322 setupPasses.run(*fetch);
323
324 JitManager::DumpToFile(fetch, "se");
325
326 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
327
328 ///@todo Haven't touched these either. Need to remove some of these and add others.
329 optPasses.add(createCFGSimplificationPass());
330 optPasses.add(createEarlyCSEPass());
331 optPasses.add(createInstructionCombiningPass());
332 optPasses.add(createInstructionSimplifierPass());
333 optPasses.add(createConstantPropagationPass());
334 optPasses.add(createSCCPPass());
335 optPasses.add(createAggressiveDCEPass());
336
337 optPasses.run(*fetch);
338 optPasses.run(*fetch);
339
340 JitManager::DumpToFile(fetch, "opt");
341
342 return fetch;
343 }
344
345 //////////////////////////////////////////////////////////////////////////
346 /// @brief Loads attributes from memory using LOADs, shuffling the
347 /// components into SOA form.
348 /// *Note* currently does not support component control,
349 /// component packing, instancing
350 /// @param fetchState - info about attributes to be fetched from memory
351 /// @param streams - value pointer to the current vertex stream
352 /// @param vIndices - vector value of indices to load
353 /// @param pVtxOut - value pointer to output simdvertex struct
354 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
355 {
356 // Zack shuffles; a variant of the Charleston.
357
358 std::vector<Value*> vectors(16);
359 std::vector<Constant*> pMask(mVWidth);
360 for(uint32_t i = 0; i < mVWidth; ++i)
361 {
362 pMask[i] = (C(i < 4 ? i : 4));
363 }
364 Constant* promoteMask = ConstantVector::get(pMask);
365 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
366
367 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
368 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
369 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
370 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
371 curInstance->setName("curInstance");
372
373 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
374 {
375 Value* elements[4] = {0};
376 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
377 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
378 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
379 uint32_t numComponents = info.numComps;
380 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
381
382 // load path doesn't support component packing
383 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
384
385 vectors.clear();
386
387 if (fetchState.bInstanceIDOffsetEnable)
388 {
389 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down");
390 }
391
392 Value *vCurIndices;
393 Value *startOffset;
394 if(ied.InstanceEnable)
395 {
396 Value* stepRate = C(ied.InstanceAdvancementState);
397
398 // prevent a div by 0 for 0 step rate
399 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
400 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
401
402 // calc the current offset into instanced data buffer
403 Value* calcInstance = UDIV(curInstance, stepRate);
404
405 // if step rate is 0, every instance gets instance 0
406 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
407
408 vCurIndices = VBROADCAST(calcInstance);
409
410 startOffset = startInstance;
411 }
412 else if (ied.InstanceStrideEnable)
413 {
414 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
415 }
416 else
417 {
418 // offset indices by baseVertex
419 vCurIndices = ADD(vIndices, vBaseVertex);
420
421 startOffset = startVertex;
422 }
423
424 // load SWR_VERTEX_BUFFER_STATE::pData
425 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
426
427 // load SWR_VERTEX_BUFFER_STATE::pitch
428 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
429 stride = Z_EXT(stride, mInt64Ty);
430
431 // load SWR_VERTEX_BUFFER_STATE::size
432 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
433 size = Z_EXT(size, mInt64Ty);
434
435 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
436
437 Value *minVertex = NULL;
438 Value *minVertexOffset = NULL;
439 if (fetchState.bPartialVertexBuffer) {
440 // fetch min index for low bounds checking
441 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
442 minVertex = LOAD(minVertex);
443 if (!fetchState.bDisableIndexOOBCheck) {
444 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
445 }
446 }
447
448 // Load from the stream.
449 for(uint32_t lane = 0; lane < mVWidth; ++lane)
450 {
451 // Get index
452 Value* index = VEXTRACT(vCurIndices, C(lane));
453
454 if (fetchState.bPartialVertexBuffer) {
455 // clamp below minvertex
456 Value *isBelowMin = ICMP_SLT(index, minVertex);
457 index = SELECT(isBelowMin, minVertex, index);
458 }
459
460 index = Z_EXT(index, mInt64Ty);
461
462 Value* offset = MUL(index, stride);
463 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
464 offset = ADD(offset, startVertexOffset);
465
466 if (!fetchState.bDisableIndexOOBCheck) {
467 // check for out-of-bounds access, including partial OOB, and replace bad offsets with minVertexOffset (or 0)
468 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
469 Value *oob = ICMP_ULE(endOffset, size);
470 if (fetchState.bPartialVertexBuffer) {
471 offset = SELECT(oob, offset, minVertexOffset);
472 } else {
473 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
474 }
475 }
476
477 Value* pointer = GEP(stream, offset);
478 // We use a full-lane, but don't actually care.
479 Value* vptr = 0;
480
481 // get a pointer to a 4 component attrib in default address space
482 switch(bpc)
483 {
484 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
485 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
486 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
487 default: SWR_INVALID("Unsupported underlying bpp!");
488 }
489
490 // load 4 components of attribute
491 Value* vec = ALIGNED_LOAD(vptr, 1, false);
492
493 // Convert To FP32 internally
494 switch(info.type[0])
495 {
496 case SWR_TYPE_UNORM:
497 switch(bpc)
498 {
499 case 8:
500 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
501 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
502 break;
503 case 16:
504 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
505 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
506 break;
507 default:
508 SWR_INVALID("Unsupported underlying type!");
509 break;
510 }
511 break;
512 case SWR_TYPE_SNORM:
513 switch(bpc)
514 {
515 case 8:
516 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
517 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
518 break;
519 case 16:
520 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
521 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
522 break;
523 default:
524 SWR_INVALID("Unsupported underlying type!");
525 break;
526 }
527 break;
528 case SWR_TYPE_UINT:
529 // Zero extend uint32_t types.
530 switch(bpc)
531 {
532 case 8:
533 case 16:
534 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
535 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
536 break;
537 case 32:
538 break; // Pass through unchanged.
539 default:
540 SWR_INVALID("Unsupported underlying type!");
541 break;
542 }
543 break;
544 case SWR_TYPE_SINT:
545 // Sign extend SINT types.
546 switch(bpc)
547 {
548 case 8:
549 case 16:
550 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
551 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
552 break;
553 case 32:
554 break; // Pass through unchanged.
555 default:
556 SWR_INVALID("Unsupported underlying type!");
557 break;
558 }
559 break;
560 case SWR_TYPE_FLOAT:
561 switch(bpc)
562 {
563 case 32:
564 break; // Pass through unchanged.
565 default:
566 SWR_INVALID("Unsupported underlying type!");
567 }
568 break;
569 case SWR_TYPE_USCALED:
570 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
571 break;
572 case SWR_TYPE_SSCALED:
573 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
574 break;
575 case SWR_TYPE_SFIXED:
576 vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
577 break;
578 case SWR_TYPE_UNKNOWN:
579 case SWR_TYPE_UNUSED:
580 SWR_INVALID("Unsupported type %d!", info.type[0]);
581 }
582
583 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
584 // uwvec: 4 x F32, undef value
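// e.g. for avx (mVWidth == 8) the 4-component load becomes
// { x, y, z, w, u, u, u, u } where u == uwvec[0] (undef); the upper lanes are
// don't-cares and are never selected by the insert shuffles below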
585 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
586 vectors.push_back(wvec);
587 }
588
589 std::vector<Constant*> v01Mask(mVWidth);
590 std::vector<Constant*> v23Mask(mVWidth);
591 std::vector<Constant*> v02Mask(mVWidth);
592 std::vector<Constant*> v13Mask(mVWidth);
593
594 // Concatenate the vectors together.
595 elements[0] = VUNDEF_F();
596 elements[1] = VUNDEF_F();
597 elements[2] = VUNDEF_F();
598 elements[3] = VUNDEF_F();
599 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
600 {
601 v01Mask[4 * b + 0] = C(0 + 4 * b);
602 v01Mask[4 * b + 1] = C(1 + 4 * b);
603 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
604 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
605
606 v23Mask[4 * b + 0] = C(2 + 4 * b);
607 v23Mask[4 * b + 1] = C(3 + 4 * b);
608 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
609 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
610
611 v02Mask[4 * b + 0] = C(0 + 4 * b);
612 v02Mask[4 * b + 1] = C(2 + 4 * b);
613 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
614 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
615
616 v13Mask[4 * b + 0] = C(1 + 4 * b);
617 v13Mask[4 * b + 1] = C(3 + 4 * b);
618 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
619 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
620
621 std::vector<Constant*> iMask(mVWidth);
622 for(uint32_t i = 0; i < mVWidth; ++i)
623 {
624 if(((4 * b) <= i) && (i < (4 * (b + 1))))
625 {
626 iMask[i] = C(i % 4 + mVWidth);
627 }
628 else
629 {
630 iMask[i] = C(i);
631 }
632 }
633 Constant* insertMask = ConstantVector::get(iMask);
634 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
635 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
636 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
637 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
638 }
639
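// Transpose the per-vertex { x y z w } vectors gathered above into SoA form:
// after the shuffles below, elements[0..3] hold all x's, all y's, all z's and
// all w's respectively, one lane per vertex.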
640 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
641 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
642 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
643 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
644 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
645 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
646 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
647 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
648
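// Fill in components the format does not provide with defaults (0.0f for x/y/z,
// 1.0f for w). The missing breaks are intentional: e.g. a 2-component format
// (numComponents == 2) enters at case 3 and falls through to case 4, overwriting
// elements[2] and elements[3].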
649 switch(numComponents + 1)
650 {
651 case 1: elements[0] = VIMMED1(0.0f);
652 case 2: elements[1] = VIMMED1(0.0f);
653 case 3: elements[2] = VIMMED1(0.0f);
654 case 4: elements[3] = VIMMED1(1.0f);
655 }
656
657 for(uint32_t c = 0; c < 4; ++c)
658 {
659 #if USE_SIMD16_SHADERS
660 Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
661 #else
662 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
663 #endif
664 STORE(elements[c], dest);
665 }
666 }
667 }
668
669 // returns true for odd formats that require special gather handling
670 bool FetchJit::IsOddFormat(SWR_FORMAT format)
671 {
672 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
673 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
674 {
675 return true;
676 }
677 return false;
678 }
679
680 // format is uniform if all components are the same size and type
681 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
682 {
683 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
684 uint32_t bpc0 = info.bpc[0];
685 uint32_t type0 = info.type[0];
686
687 for (uint32_t c = 1; c < info.numComps; ++c)
688 {
689 if (bpc0 != info.bpc[c] || type0 != info.type[c])
690 {
691 return false;
692 }
693 }
694 return true;
695 }
696
697 // unpacks components based on format
698 // foreach component in the pixel
699 // mask off everything but this component
700 // shift component to LSB
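// e.g. (illustrative) for a 10_10_10_2 packed layout the loop computes:
// comp 0: bitmask 0x000003FF, shift 0
// comp 1: bitmask 0x000FFC00, shift 10
// comp 2: bitmask 0x3FF00000, shift 20
// comp 3: bitmask 0xC0000000, shift 30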
701 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
702 {
703 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
704
705 uint32_t bitOffset = 0;
706 for (uint32_t c = 0; c < info.numComps; ++c)
707 {
708 uint32_t swizzledIndex = info.swizzle[c];
709 uint32_t compBits = info.bpc[c];
710 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
711 Value* comp = AND(vInput, bitmask);
712 comp = LSHR(comp, bitOffset);
713
714 result[swizzledIndex] = comp;
715 bitOffset += compBits;
716 }
717 }
718
719 // gather for odd component size formats
720 // gather SIMD full pixels per lane then shift/mask to move each component to its
721 // own vector
722 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
723 {
724 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
725
726 // only works if pixel size is <= 32bits
727 SWR_ASSERT(info.bpp <= 32);
728
729 Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
730
731 for (uint32_t comp = 0; comp < 4; ++comp)
732 {
733 pResult[comp] = VIMMED1((int)info.defaults[comp]);
734 }
735
736 UnpackComponents(format, pGather, pResult);
737
738 // cast to fp32
739 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
740 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
741 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
742 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
743 }
744
745 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
746 {
747 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
748
749 for (uint32_t c = 0; c < info.numComps; ++c)
750 {
751 uint32_t compIndex = info.swizzle[c];
752
753 // skip any conversion on UNUSED components
754 if (info.type[c] == SWR_TYPE_UNUSED)
755 {
756 continue;
757 }
758
759 if (info.isNormalized[c])
760 {
761 if (info.type[c] == SWR_TYPE_SNORM)
762 {
763 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
764
765 /// result = c * (1.0f / (2^(n-1) - 1))
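/// e.g. for an 8-bit SNORM component: n == 8, scale == 1.0f / 127.0f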
766 uint32_t n = info.bpc[c];
767 uint32_t pow2 = 1 << (n - 1);
768 float scale = 1.0f / (float)(pow2 - 1);
769 Value *vScale = VIMMED1(scale);
770 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
771 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
772 texels[compIndex] = FMUL(texels[compIndex], vScale);
773 }
774 else
775 {
776 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
777
778 /// result = c * (1.0f / (2^n - 1))
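/// e.g. for an 8-bit UNORM component: n == 8, scale == 1.0f / 255.0f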
779 uint32_t n = info.bpc[c];
780 uint32_t pow2 = 1 << n;
781 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
782 if (n == 24)
783 {
784 float scale = (float)(pow2 - 1);
785 Value* vScale = VIMMED1(scale);
786 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
787 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
788 texels[compIndex] = FDIV(texels[compIndex], vScale);
789 }
790 else
791 {
792 float scale = 1.0f / (float)(pow2 - 1);
793 Value *vScale = VIMMED1(scale);
794 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
795 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
796 texels[compIndex] = FMUL(texels[compIndex], vScale);
797 }
798 }
799 continue;
800 }
801 }
802 }
803
804 //////////////////////////////////////////////////////////////////////////
805 /// @brief Loads attributes from memory using AVX2 GATHER(s)
806 /// @param fetchState - info about attributes to be fetched from memory
807 /// @param streams - value pointer to the current vertex stream
808 /// @param vIndices - vector value of indices to gather
809 /// @param pVtxOut - value pointer to output simdvertex struct
810 #if USE_SIMD16_SHADERS
811 #if USE_SIMD16_GATHERS
812 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
813 Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2)
814 #else
815 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
816 Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
817 #endif
818 #else
819 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
820 Value* streams, Value* vIndices, Value* pVtxOut)
821 #endif
822 {
823 uint32_t currentVertexElement = 0;
824 uint32_t outputElt = 0;
825 Value* vVertexElements[4];
826 #if USE_SIMD16_GATHERS
827 Value* vVertexElements2[4];
828 #endif
829
830 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
831 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
832 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
833 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
834 curInstance->setName("curInstance");
835
836 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
837 {
838 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
839
840 // skip element if all components are disabled
841 if (ied.ComponentPacking == ComponentEnable::NONE)
842 {
843 continue;
844 }
845
846 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
847 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
848 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
849
850 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
851
852 // VGATHER* takes an *i8 src pointer
853 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
854
855 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
856 Value *vStride = VBROADCAST(stride);
857
858 // max vertex index that is fully in bounds
859 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
860 maxVertex = LOAD(maxVertex);
861
862 Value *minVertex = NULL;
863 if (fetchState.bPartialVertexBuffer)
864 {
865 // min vertex index for low bounds OOB checking
866 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
867 minVertex = LOAD(minVertex);
868 }
869
870 if (fetchState.bInstanceIDOffsetEnable)
871 {
872 // the InstanceID (curInstance) value is offset by StartInstanceLocation
873 curInstance = ADD(curInstance, startInstance);
874 }
875
876 Value *vCurIndices;
877 #if USE_SIMD16_GATHERS
878 Value *vCurIndices2;
879 #endif
880 Value *startOffset;
881 Value *vInstanceStride = VIMMED1(0);
882
883 if (ied.InstanceEnable)
884 {
885 Value* stepRate = C(ied.InstanceAdvancementState);
886
887 // prevent a div by 0 for 0 step rate
888 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
889 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
890
891 // calc the current offset into instanced data buffer
892 Value* calcInstance = UDIV(curInstance, stepRate);
893
894 // if step rate is 0, every instance gets instance 0
895 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
896
897 vCurIndices = VBROADCAST(calcInstance);
898 #if USE_SIMD16_GATHERS
899 vCurIndices2 = VBROADCAST(calcInstance);
900 #endif
901
902 startOffset = startInstance;
903 }
904 else if (ied.InstanceStrideEnable)
905 {
906 // grab the instance advancement state, determines stride in bytes from one instance to the next
907 Value* stepRate = C(ied.InstanceAdvancementState);
908 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
909
910 // offset indices by baseVertex
911 vCurIndices = ADD(vIndices, vBaseVertex);
912 #if USE_SIMD16_GATHERS
913 vCurIndices2 = ADD(vIndices2, vBaseVertex);
914 #endif
915
916 startOffset = startVertex;
917 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
918 }
919 else
920 {
921 // offset indices by baseVertex
922 vCurIndices = ADD(vIndices, vBaseVertex);
923 #if USE_SIMD16_GATHERS
924 vCurIndices2 = ADD(vIndices2, vBaseVertex);
925 #endif
926
927 startOffset = startVertex;
928 }
929
930 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
931 // do 64bit address offset calculations.
932
933 // calculate byte offset to the start of the VB
934 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
935 pStreamBase = GEP(pStreamBase, baseOffset);
936
937 // if we have a start offset, subtract from max vertex. Used for OOB check
938 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
939 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
940 // if we have a negative value, we're already OOB. clamp at 0.
941 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
942
943 if (fetchState.bPartialVertexBuffer)
944 {
945 // similarly for min vertex
946 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
947 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
948 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
949 }
950
951 // Load the in bounds size of a partially valid vertex
952 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
953 partialInboundsSize = LOAD(partialInboundsSize);
954 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
955 Value* vBpp = VBROADCAST(C(info.Bpp));
956 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
957
958 // is the element <= the partially valid size?
959 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
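// (a vertex at index == maxVertex may be only partially in bounds; its element is
// fetched only when AlignedByteOffset + Bpp still fits within partialInboundsSize)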
960
961 #if USE_SIMD16_GATHERS
962 // override cur indices with 0 if pitch is 0
963 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
964 vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
965
966 // are vertices partially OOB?
967 Value* vMaxVertex = VBROADCAST(maxVertex);
968 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
969 Value* vPartialOOBMask2 = ICMP_EQ(vCurIndices2, vMaxVertex);
970
971 // are vertices fully in bounds?
972 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
973 Value* vMaxGatherMask2 = ICMP_ULT(vCurIndices2, vMaxVertex);
974
975 Value *vGatherMask;
976 Value *vGatherMask2;
977 if (fetchState.bPartialVertexBuffer)
978 {
979 // are vertices below minVertex limit?
980 Value *vMinVertex = VBROADCAST(minVertex);
981 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
982 Value *vMinGatherMask2 = ICMP_UGE(vCurIndices2, vMinVertex);
983
984 // only fetch lanes that pass both tests
985 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
986 vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
987 }
988 else
989 {
990 vGatherMask = vMaxGatherMask;
991 vGatherMask2 = vMaxGatherMask2;
992 }
993
994 // blend in any partially OOB indices that have valid elements
995 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
996 vGatherMask2 = SELECT(vPartialOOBMask2, vElementInBoundsMask, vGatherMask2);
997 Value *pMask = vGatherMask;
998 Value *pMask2 = vGatherMask2;
999 vGatherMask = VMASK(vGatherMask);
1000 vGatherMask2 = VMASK(vGatherMask2);
1001
1002 // calculate the actual offsets into the VB
1003 Value* vOffsets = MUL(vCurIndices, vStride);
1004 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1005
1006 Value* vOffsets2 = MUL(vCurIndices2, vStride);
1007 vOffsets2 = ADD(vOffsets2, vAlignmentOffsets);
1008
1009 // if instance stride enable is:
1010 // true - add product of the instanceID and advancement state to the offset into the VB
1011 // false - value of vInstanceStride has been initialized to zero
1012 vOffsets = ADD(vOffsets, vInstanceStride);
1013 vOffsets2 = ADD(vOffsets2, vInstanceStride);
1014
1015 #else
1016 // override cur indices with 0 if pitch is 0
1017 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
1018 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
1019
1020 // are vertices partially OOB?
1021 Value* vMaxVertex = VBROADCAST(maxVertex);
1022 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
1023
1024 // are vertices fully in bounds?
1025 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
1026
1027 Value *vGatherMask;
1028 if (fetchState.bPartialVertexBuffer)
1029 {
1030 // are vertices below minVertex limit?
1031 Value *vMinVertex = VBROADCAST(minVertex);
1032 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
1033
1034 // only fetch lanes that pass both tests
1035 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
1036 }
1037 else
1038 {
1039 vGatherMask = vMaxGatherMask;
1040 }
1041
1042 // blend in any partially OOB indices that have valid elements
1043 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1044 Value* pMask = vGatherMask;
1045 vGatherMask = VMASK(vGatherMask);
1046
1047 // calculate the actual offsets into the VB
1048 Value* vOffsets = MUL(vCurIndices, vStride);
1049 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1050
1051 // if instance stride enable is:
1052 // true - add product of the instanceID and advancement state to the offset into the VB
1053 // false - value of vInstanceStride has been initialized to zero
1054 vOffsets = ADD(vOffsets, vInstanceStride);
1055
1056 #endif
1057 // Packing and component control
1058 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
1059 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
1060 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
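// Each enabled output component is either gathered from memory (StoreSrc) or
// synthesized by GenerateCompCtrlVector() (constants such as 0/1, or the vertex id).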
1061
1062 // Special gather/conversion for formats without equal component sizes
1063 if (IsOddFormat((SWR_FORMAT)ied.Format))
1064 {
1065 #if USE_SIMD16_GATHERS
1066 Value *pResults[4];
1067 Value *pResults2[4];
1068 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1069 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
1070 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1071 ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
1072
1073 for (uint32_t c = 0; c < 4; c += 1)
1074 {
1075 if (isComponentEnabled(compMask, c))
1076 {
1077 vVertexElements[currentVertexElement] = pResults[c];
1078 vVertexElements2[currentVertexElement] = pResults2[c];
1079 currentVertexElement++;
1080
1081 if (currentVertexElement > 3)
1082 {
1083 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1084 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1085
1086 outputElt += 1;
1087
1088 // reset to the next vVertexElement to output
1089 currentVertexElement = 0;
1090 }
1091 }
1092 }
1093 #else
1094 Value* pResults[4];
1095 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1096 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1097
1098 for (uint32_t c = 0; c < 4; ++c)
1099 {
1100 if (isComponentEnabled(compMask, c))
1101 {
1102 vVertexElements[currentVertexElement++] = pResults[c];
1103 if (currentVertexElement > 3)
1104 {
1105 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1106 // reset to the next vVertexElement to output
1107 currentVertexElement = 0;
1108 }
1109 }
1110 }
1111 #endif
1112 }
1113 else if(info.type[0] == SWR_TYPE_FLOAT)
1114 {
1115 ///@todo: support 64 bit vb accesses
1116 Value* gatherSrc = VIMMED1(0.0f);
1117 #if USE_SIMD16_GATHERS
1118 Value* gatherSrc2 = VIMMED1(0.0f);
1119 #endif
1120
1121 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1122 "Unsupported format for standard gather fetch.");
1123
1124 // Gather components from memory to store in a simdvertex structure
1125 switch (bpc)
1126 {
1127 case 16:
1128 {
1129 #if USE_SIMD16_GATHERS
1130 Value* vGatherResult[2];
1131 Value* vGatherResult2[2];
1132
1133 // if we have at least one component out of x or y to fetch
1134 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1135 {
1136 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1137 vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1138 // e.g. result of first 8x32bit integer gather for 16bit components
1139 // 256i - 0 1 2 3 4 5 6 7
1140 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1141 //
1142 }
1143
1144 // if we have at least one component out of z or w to fetch
1145 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1146 {
1147 // offset base to the next components(zw) in the vertex to gather
1148 pStreamBase = GEP(pStreamBase, C((char)4));
1149
1150 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1151 vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1152 // e.g. result of second 8x32bit integer gather for 16bit components
1153 // 256i - 0 1 2 3 4 5 6 7
1154 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1155 //
1156 }
1157
1158
1159 // if we have at least one component to shuffle into place
1160 if (compMask)
1161 {
1162 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1163 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1164 Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE,
1165 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1166
1167 // Shuffle gathered components into place in simdvertex struct
1168 Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
1169 Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
1170 }
1171 #else
1172 Value* vGatherResult[2];
1173
1174 // if we have at least one component out of x or y to fetch
1175 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1176 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1177 // e.g. result of first 8x32bit integer gather for 16bit components
1178 // 256i - 0 1 2 3 4 5 6 7
1179 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1180 //
1181 }
1182
1183 // if we have at least one component out of z or w to fetch
1184 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1185 // offset base to the next components(zw) in the vertex to gather
1186 pStreamBase = GEP(pStreamBase, C((char)4));
1187
1188 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1189 // e.g. result of second 8x32bit integer gather for 16bit components
1190 // 256i - 0 1 2 3 4 5 6 7
1191 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1192 //
1193 }
1194
1195 // if we have at least one component to shuffle into place
1196 if(compMask){
1197 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1198 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1199
1200 // Shuffle gathered components into place in simdvertex struct
1201 #if USE_SIMD16_SHADERS
1202 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1203 #else
1204 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1205 #endif
1206 }
1207 #endif
1208 }
1209 break;
1210 case 32:
1211 {
1212 #if USE_SIMD16_GATHERS
1213 #if USE_SIMD16_BUILDER
1214 Value *pVtxSrc2[4];
1215
1216 #endif
1217 #endif
1218 for (uint32_t i = 0; i < 4; i += 1)
1219 {
1220 #if USE_SIMD16_GATHERS
1221 if (isComponentEnabled(compMask, i))
1222 {
1223 // if we need to gather the component
1224 if (compCtrl[i] == StoreSrc)
1225 {
1226 // Gather a SIMD of vertices
1227 // APIs allow a 4GB range for offsets
1228 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1229 // But, we know that elements must be aligned for FETCH. :)
1230 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
1231 Value *vShiftedOffsets = VPSRLI(vOffsets, C(1));
1232 Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
1233 #if USE_SIMD16_BUILDER
1234 Value *src = VUNDEF2_F();
1235 src = INSERT2_F(src, gatherSrc, 0);
1236 src = INSERT2_F(src, gatherSrc2, 1);
1237
1238 Value *indices = VUNDEF2_I();
1239 indices = INSERT2_I(indices, vShiftedOffsets, 0);
1240 indices = INSERT2_I(indices, vShiftedOffsets2, 1);
1241
1242 Value *mask = VUNDEF2_I();
1243 mask = INSERT2_I(mask, vGatherMask, 0);
1244 mask = INSERT2_I(mask, vGatherMask2, 1);
1245
1246 pVtxSrc2[currentVertexElement] = GATHERPS2(src, pStreamBase, indices, mask, 2);
1247 #if 1
1248
1249 vVertexElements[currentVertexElement] = EXTRACT2_F(pVtxSrc2[currentVertexElement], 0);
1250 vVertexElements2[currentVertexElement] = EXTRACT2_F(pVtxSrc2[currentVertexElement], 1);
1251 #endif
1252 #else
1253 vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
1254 vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2);
1255
1256 #if USE_SIMD16_BUILDER
1257 // pack adjacent pairs of SIMD8s into SIMD16s
1258 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1259 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements[currentVertexElement], 0);
1260 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements2[currentVertexElement], 1);
1261
1262 #endif
1263 #endif
1264 currentVertexElement += 1;
1265 }
1266 else
1267 {
1268 #if USE_SIMD16_BUILDER
1269 pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
1270 #else
1271 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1272 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1273
1274 #if USE_SIMD16_BUILDER
1275 // pack adjacent pairs of SIMD8s into SIMD16s
1276 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1277 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements[currentVertexElement], 0);
1278 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements2[currentVertexElement], 1);
1279
1280 #endif
1281 #endif
1282 currentVertexElement += 1;
1283 }
1284
1285 if (currentVertexElement > 3)
1286 {
1287 #if USE_SIMD16_BUILDER
1288 // store SIMD16s
1289 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1290
1291 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1292
1293 #else
1294 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1295 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1296
1297 #endif
1298 outputElt += 1;
1299
1300 // reset to the next vVertexElement to output
1301 currentVertexElement = 0;
1302 }
1303 }
1304
1305 // offset base to the next component in the vertex to gather
1306 pStreamBase = GEP(pStreamBase, C((char)4));
1307 #else
1308 if (isComponentEnabled(compMask, i))
1309 {
1310 // if we need to gather the component
1311 if (compCtrl[i] == StoreSrc)
1312 {
1313 // Gather a SIMD of vertices
1314 // APIs allow a 4GB range for offsets
1315 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1316 // But, we know that elements must be aligned for FETCH. :)
1317 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
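// e.g. (illustrative) a dword element at byte offset 0x80000004 would look negative
// to GATHERPS; 0x80000004 >> 1 == 0x40000002, and gathering with scale == 2 addresses
// the original byte again. Alignment guarantees the low bit shifted away is zero.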
1318 Value* vShiftedOffsets = VPSRLI(vOffsets, C(1));
1319 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
1320 }
1321 else
1322 {
1323 #if USE_SIMD16_SHADERS
1324 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1325 #else
1326 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1327 #endif
1328 }
1329
1330 if (currentVertexElement > 3)
1331 {
1332 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1333 // reset to the next vVertexElement to output
1334 currentVertexElement = 0;
1335 }
1336 }
1337
1338 // offset base to the next component in the vertex to gather
1339 pStreamBase = GEP(pStreamBase, C((char)4));
1340 #endif
1341 }
1342 }
1343 break;
1344 case 64:
1345 {
1346 for (uint32_t i = 0; i < 4; i += 1)
1347 {
1348 #if USE_SIMD16_GATHERS
1349 if (isComponentEnabled(compMask, i))
1350 {
1351 // if we need to gather the component
1352 if (compCtrl[i] == StoreSrc)
1353 {
1354 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1355 Value *vMaskLo2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1356 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1357 Value *vMaskHi2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1358 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
1359 vMaskLo2 = S_EXT(vMaskLo2, VectorType::get(mInt64Ty, 4));
1360 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
1361 vMaskHi2 = S_EXT(vMaskHi2, VectorType::get(mInt64Ty, 4));
1362 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
1363 vMaskLo2 = BITCAST(vMaskLo2, VectorType::get(mDoubleTy, 4));
1364 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
1365 vMaskHi2 = BITCAST(vMaskHi2, VectorType::get(mDoubleTy, 4));
1366
1367 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1368 Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
1369 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1370 Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
1371
1372 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1373
1374 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1375 Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2);
1376 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1377 Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2);
1378
1379 pGatherLo = VCVTPD2PS(pGatherLo);
1380 pGatherLo2 = VCVTPD2PS(pGatherLo2);
1381 pGatherHi = VCVTPD2PS(pGatherHi);
1382 pGatherHi2 = VCVTPD2PS(pGatherHi2);
1383
1384 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1385 Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1386
1387 vVertexElements[currentVertexElement] = pGather;
1388 vVertexElements2[currentVertexElement] = pGather2;
1389
1390 currentVertexElement += 1;
1391 }
1392 else
1393 {
1394 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1395 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1396
1397 currentVertexElement += 1;
1398 }
1399
1400 if (currentVertexElement > 3)
1401 {
1402 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1403 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1404
1405 outputElt += 1;
1406
1407 // reset to the next vVertexElement to output
1408 currentVertexElement = 0;
1409 }
1410 }
1411
1412 // offset base to the next component in the vertex to gather
1413 pStreamBase = GEP(pStreamBase, C((char)8));
1414 #else
1415 if (isComponentEnabled(compMask, i))
1416 {
1417 // if we need to gather the component
1418 if (compCtrl[i] == StoreSrc)
1419 {
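// 64-bit (double) components: split the 8-wide mask/offsets into two 4-wide halves,
// gather each half as doubles (GATHERPD), convert to single precision (VCVTPD2PS),
// then recombine into a single 8-wide float vector.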
1420 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
1421 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
1422 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
1423 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
1424 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
1425 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
1426
1427 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1428 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1429
1430 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1431
1432 Value* pGatherLo = GATHERPD(vZeroDouble,
1433 pStreamBase, vOffsetsLo, vMaskLo);
1434 Value* pGatherHi = GATHERPD(vZeroDouble,
1435 pStreamBase, vOffsetsHi, vMaskHi);
1436
1437 pGatherLo = VCVTPD2PS(pGatherLo);
1438 pGatherHi = VCVTPD2PS(pGatherHi);
1439
1440 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
1441
1442 vVertexElements[currentVertexElement++] = pGather;
1443 }
1444 else
1445 {
1446 #if USE_SIMD16_SHADERS
1447 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1448 #else
1449 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1450 #endif
1451 }
1452
1453 if (currentVertexElement > 3)
1454 {
1455 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1456 // reset to the next vVertexElement to output
1457 currentVertexElement = 0;
1458 }
1459 }
1460
1461 // offset base to the next component in the vertex to gather
1462 pStreamBase = GEP(pStreamBase, C((char)8));
1463 #endif
1464 }
1465 }
1466 break;
1467 default:
1468 SWR_INVALID("Tried to fetch invalid FP format");
1469 break;
1470 }
1471 }
1472 else
1473 {
1474 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
1475 ConversionType conversionType = CONVERT_NONE;
1476
1477 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1478 "Unsupported format for standard gather fetch.");
1479
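// Note: UNORM and SNORM intentionally fall through to UINT/SINT below so the
// normalized cases also pick up the matching zero/sign extension cast.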
1480 switch(info.type[0])
1481 {
1482 case SWR_TYPE_UNORM:
1483 conversionType = CONVERT_NORMALIZED;
1484 case SWR_TYPE_UINT:
1485 extendCastType = Instruction::CastOps::ZExt;
1486 break;
1487 case SWR_TYPE_SNORM:
1488 conversionType = CONVERT_NORMALIZED;
1489 case SWR_TYPE_SINT:
1490 extendCastType = Instruction::CastOps::SExt;
1491 break;
1492 case SWR_TYPE_USCALED:
1493 conversionType = CONVERT_USCALED;
1494 extendCastType = Instruction::CastOps::UIToFP;
1495 break;
1496 case SWR_TYPE_SSCALED:
1497 conversionType = CONVERT_SSCALED;
1498 extendCastType = Instruction::CastOps::SIToFP;
1499 break;
1500 case SWR_TYPE_SFIXED:
1501 conversionType = CONVERT_SFIXED;
1502 extendCastType = Instruction::CastOps::SExt;
1503 break;
1504 default:
1505 break;
1506 }
1507
1508 // value substituted when component of gather is masked
1509 Value* gatherSrc = VIMMED1(0);
1510 #if USE_SIMD16_GATHERS
1511 Value* gatherSrc2 = VIMMED1(0);
1512 #endif
1513
1514 // Gather components from memory to store in a simdvertex structure
1515 switch (bpc)
1516 {
1517 case 8:
1518 {
1519 // if we have at least one component to fetch
1520 if (compMask)
1521 {
1522 #if USE_SIMD16_GATHERS
1523 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1524 Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1525 // e.g. result of an 8x32bit integer gather for 8bit components
1526 // 256i - 0 1 2 3 4 5 6 7
1527 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1528
1529 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1530 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1531 Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1532 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2, info.swizzle);
1533
1534 // Shuffle gathered components into place in simdvertex struct
1535 Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
1536 Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref
1537 #else
1538 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1539 // e.g. result of an 8x32bit integer gather for 8bit components
1540 // 256i - 0 1 2 3 4 5 6 7
1541 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1542
1543 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1544 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1545
1546 // Shuffle gathered components into place in simdvertex struct
1547 #if USE_SIMD16_SHADERS
1548 Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
1549 #else
1550 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
1551 #endif
1552 #endif
1553 }
1554 }
1555 break;
1556 case 16:
1557 {
1558 #if USE_SIMD16_GATHERS
1559 Value* vGatherResult[2];
1560 Value* vGatherResult2[2];
1561
1562 // if we have at least one component out of x or y to fetch
1563 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1564 {
1565 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1566 vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1567 // e.g. result of first 8x32bit integer gather for 16bit components
1568 // 256i - 0 1 2 3 4 5 6 7
1569 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1570 //
1571 }
1572
1573 // if we have at least one component out of z or w to fetch
1574 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1575 {
1576 // offset base to the next components (zw) in the vertex to gather
1577 pStreamBase = GEP(pStreamBase, C((char)4));
1578
1579 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1580 vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1581 // e.g. result of second 8x32bit integer gather for 16bit components
1582 // 256i - 0 1 2 3 4 5 6 7
1583 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1584 //
1585 }
1586
1587 // if we have at least one component to shuffle into place
1588 if (compMask)
1589 {
1590 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1591 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1592 Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1593 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1594
1595 // Shuffle gathered components into place in simdvertex struct
1596 Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
1597 Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
1598 }
1599 #else
1600 Value* vGatherResult[2];
1601
1602 // if we have at least one component out of x or y to fetch
1603 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1604 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1605 // e.g. result of first 8x32bit integer gather for 16bit components
1606 // 256i - 0 1 2 3 4 5 6 7
1607 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1608 //
1609 }
1610
1611 // if we have at least one component out of z or w to fetch
1612 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1613 // offset base to the next components (zw) in the vertex to gather
1614 pStreamBase = GEP(pStreamBase, C((char)4));
1615
1616 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1617 // e.g. result of second 8x32bit integer gather for 16bit components
1618 // 256i - 0 1 2 3 4 5 6 7
1619 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1620 //
1621 }
1622
1623 // if we have at least one component to shuffle into place
1624 if(compMask){
1625 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1626 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1627
1628 // Shuffle gathered components into place in simdvertex struct
1629 #if USE_SIMD16_SHADERS
1630 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1631 #else
1632 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1633 #endif
1634 }
1635 #endif
1636 }
1637 break;
1638 case 32:
1639 {
1640 // Gather components into place in the simdvertex struct
1641 for (uint32_t i = 0; i < 4; i++)
1642 {
1643 if (isComponentEnabled(compMask, i))
1644 {
1645 // if we need to gather the component
1646 if (compCtrl[i] == StoreSrc)
1647 {
1648 #if USE_SIMD16_GATHERS
1649 Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1650 Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1651
1652 if (conversionType == CONVERT_USCALED)
1653 {
1654 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1655 pGather2 = UI_TO_FP(pGather2, mSimdFP32Ty);
1656 }
1657 else if (conversionType == CONVERT_SSCALED)
1658 {
1659 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1660 pGather2 = SI_TO_FP(pGather2, mSimdFP32Ty);
1661 }
1662 else if (conversionType == CONVERT_SFIXED)
1663 {
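// SFIXED is 16.16 fixed point: convert to float, then scale by 1/65536 to recover the real value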
1664 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1665 pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1666 }
1667
1668 vVertexElements[currentVertexElement] = pGather;
1669 vVertexElements2[currentVertexElement] = pGather2;
1670 // e.g. result of a single 8x32bit integer gather for 32bit components
1671 // 256i - 0 1 2 3 4 5 6 7
1672 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1673
1674 currentVertexElement += 1;
1675 #else
1676 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1677
1678 if (conversionType == CONVERT_USCALED)
1679 {
1680 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1681 }
1682 else if (conversionType == CONVERT_SSCALED)
1683 {
1684 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1685 }
1686 else if (conversionType == CONVERT_SFIXED)
1687 {
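// SFIXED is 16.16 fixed point: convert to float, then scale by 1/65536 to recover the real value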
1688 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1689 }
1690
1691 vVertexElements[currentVertexElement++] = pGather;
1692 // e.g. result of a single 8x32bit integer gather for 32bit components
1693 // 256i - 0 1 2 3 4 5 6 7
1694 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1695 #endif
1696 }
1697 else
1698 {
1699 #if USE_SIMD16_SHADERS
1700 #if USE_SIMD16_GATHERS
1701 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1702 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1703
1704 currentVertexElement += 1;
1705 #else
1706 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1707 #endif
1708 #else
1709 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1710 #endif
1711 }
1712
1713 if (currentVertexElement > 3)
1714 {
1715 #if USE_SIMD16_GATHERS
1716 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1717 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1718
1719 outputElt += 1;
1720 #else
1721 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1722 #endif
1723
1724 // reset to the next vVertexElement to output
1725 currentVertexElement = 0;
1726 }
1727
1728 }
1729
1730 // offset base to the next component in the vertex to gather
1731 pStreamBase = GEP(pStreamBase, C((char)4));
1732 }
1733 }
1734 break;
1735 }
1736 }
1737 }
1738
1739 // if we have a partially filled vVertexElement struct, output it
1740 if (currentVertexElement > 0)
1741 {
1742 #if USE_SIMD16_GATHERS
1743 StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements);
1744 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2);
1745
1746 outputElt += 1;
1747 #else
1748 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1749 #endif
1750 }
1751 }
1752
1753 //////////////////////////////////////////////////////////////////////////
1754 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1755 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1756 /// support
1757 /// @param pIndices - pointer to 8 bit indices
1758 /// @param pLastIndex - pointer to last valid index
1759 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1760 {
1761 // can fit 4 8 bit integers per vWidth lane
1762 Value* vIndices = VUNDEF_I();
1763
1764 // store 0 index on stack to be used to conditionally load from if index address is OOB
1765 Value* pZeroIndex = ALLOCA(mInt8Ty);
1766 STORE(C((uint8_t)0), pZeroIndex);
1767
1768 // Load a SIMD's worth of indices, one lane at a time
1769 for(int64_t lane = 0; lane < mVWidth; lane++)
1770 {
1771 // Calculate the address of the requested index
1772 Value *pIndex = GEP(pIndices, C(lane));
1773
1774 // check if the index address is below the last valid index address
1775 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1776
1777 // if valid, load the index. if not, load 0 from the stack
1778 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1779 Value *index = LOAD(pValid, "valid index");
1780
1781 // zero extend the index to 32 bits and insert it into the correct simd lane
1782 index = Z_EXT(index, mInt32Ty);
1783 vIndices = VINSERT(vIndices, index, lane);
1784 }
1785 return vIndices;
1786 }
1787
1788 //////////////////////////////////////////////////////////////////////////
1789 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1790 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1791 /// support
1792 /// @param pIndices - pointer to 16 bit indices
1793 /// @param pLastIndex - pointer to last valid index
1794 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1795 {
1796 // can fit 2 16 bit integers per vWidth lane
1797 Value* vIndices = VUNDEF_I();
1798
1799 // store 0 index on stack to be used to conditionally load from if index address is OOB
1800 Value* pZeroIndex = ALLOCA(mInt16Ty);
1801 STORE(C((uint16_t)0), pZeroIndex);
1802
1803 // Load a SIMD's worth of indices, one lane at a time
1804 for(int64_t lane = 0; lane < mVWidth; lane++)
1805 {
1806 // Calculate the address of the requested index
1807 Value *pIndex = GEP(pIndices, C(lane));
1808
1809 // check if the index address is below the last valid index address
1810 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1811
1812 // if valid, load the index. if not, load 0 from the stack
1813 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1814 Value *index = LOAD(pValid, "valid index");
1815
1816 // zero extend the index to 32 bits and insert it into the correct simd lane
1817 index = Z_EXT(index, mInt32Ty);
1818 vIndices = VINSERT(vIndices, index, lane);
1819 }
1820 return vIndices;
1821 }
1822
1823 //////////////////////////////////////////////////////////////////////////
1824 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1825 /// @param pIndices - pointer to 32 bit indices
1826 /// @param pLastIndex - pointer to last valid index
1827 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1828 {
1829 DataLayout dL(JM()->mpCurrentModule);
1830 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1831 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1832 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1833
1834 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1835 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1836 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1837 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1838
1839 // create a vector of index counts from the base index ptr passed into the fetch
1840 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1841 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1842
1843 // compare index count to the max valid index
1844 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1845 // vIndexOffsets 0 1 2 3 4 5 6 7
1846 // ------------------------------
1847 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1848 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1849 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1850 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1851
1852 // VMASKLOAD takes an *i8 src pointer
1853 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1854
1855 // Load the indices; OOB loads 0
1856 return MASKLOADD(pIndices,vIndexMask);
1857 }
1858
1859 //////////////////////////////////////////////////////////////////////////
1860 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1861 /// denormalizes if needed, converts to F32 if needed, and positions in
1862 /// the proper SIMD rows to be output to the simdvertex structure
1863 /// @param args: (tuple of args, listed below)
1864 /// @param vGatherResult - 8 gathered 8bpc vertices
1865 /// @param pVtxOut - base pointer to output simdvertex struct
1866 /// @param extendType - sign extend or zero extend
1867 /// @param conversionType - conversion to apply (normalized, scaled, or none)
1868 /// @param currentVertexElement - reference to the current vVertexElement
1869 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1870 /// @param compMask - component packing mask
1871 /// @param compCtrl - component control val
1872 /// @param vVertexElements[4] - vertex components to output
1873 /// @param swizzle[4] - component swizzle location
1874 #if USE_SIMD16_SHADERS
1875 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
1876 #else
1877 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1878 #endif
1879 {
1880 // Unpack tuple args
1881 Value*& vGatherResult = std::get<0>(args);
1882 Value* pVtxOut = std::get<1>(args);
1883 const Instruction::CastOps extendType = std::get<2>(args);
1884 const ConversionType conversionType = std::get<3>(args);
1885 uint32_t &currentVertexElement = std::get<4>(args);
1886 uint32_t &outputElt = std::get<5>(args);
1887 const ComponentEnable compMask = std::get<6>(args);
1888 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1889 Value* (&vVertexElements)[4] = std::get<8>(args);
1890 const uint32_t (&swizzle)[4] = std::get<9>(args);
1891
1892 // cast types
1893 Type* vGatherTy = mSimdInt32Ty;
1894 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1895
1896 // have to do extra work for sign extending
1897 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1898 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 16x8bit ints in a 128bit lane
1899 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1900
1901 // shuffle mask, including any swizzling
1902 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1903 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1904 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1905 char(y), char(y+4), char(y+8), char(y+12),
1906 char(z), char(z+4), char(z+8), char(z+12),
1907 char(w), char(w+4), char(w+8), char(w+12),
1908 char(x), char(x+4), char(x+8), char(x+12),
1909 char(y), char(y+4), char(y+8), char(y+12),
1910 char(z), char(z+4), char(z+8), char(z+12),
1911 char(w), char(w+4), char(w+8), char(w+12)});
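// each 32bit gather element holds one vertex's packed xyzw bytes; the +4/+8/+12 stride pulls the same component from the 4 vertices within each 128bit lane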
1912
1913 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1914 // after pshufb: group components together in each 128bit lane
1915 // 256i - 0 1 2 3 4 5 6 7
1916 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1917
1918 Value* vi128XY = nullptr;
1919 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1920 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1921 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1922 // 256i - 0 1 2 3 4 5 6 7
1923 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1924 }
1925
1926 // do the same for zw components
1927 Value* vi128ZW = nullptr;
1928 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1929 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
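// after PERMD: move and pack z and w components in low 64 bits of each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// zzzz zzzz dcdc dcdc wwww wwww dcdc dcdc (dc - don't care)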
1930 }
1931
1932 // init denormalize variables if needed
1933 Instruction::CastOps fpCast;
1934 Value* conversionFactor;
1935
1936 switch (conversionType)
1937 {
1938 case CONVERT_NORMALIZED:
1939 fpCast = Instruction::CastOps::SIToFP;
1940 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1941 break;
1942 case CONVERT_SSCALED:
1943 fpCast = Instruction::CastOps::SIToFP;
1944 conversionFactor = VIMMED1((float)(1.0));
1945 break;
1946 case CONVERT_USCALED:
1947 SWR_INVALID("Type should not be sign extended!");
1948 conversionFactor = nullptr;
1949 break;
1950 default:
1951 SWR_ASSERT(conversionType == CONVERT_NONE);
1952 conversionFactor = nullptr;
1953 break;
1954 }
1955
1956 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1957 for (uint32_t i = 0; i < 4; i++)
1958 {
1959 if (isComponentEnabled(compMask, i))
1960 {
1961 if (compCtrl[i] == ComponentControl::StoreSrc)
1962 {
1963 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1964 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1965 // if x or y, use vi128XY permute result, else use vi128ZW
1966 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1967
1968 // sign extend
1969 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1970
1971 // denormalize if needed
1972 if (conversionType != CONVERT_NONE)
1973 {
1974 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1975 }
1976 currentVertexElement++;
1977 }
1978 else
1979 {
1980 #if USE_SIMD16_SHADERS
1981 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1982 #else
1983 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1984 #endif
1985 }
1986
1987 if (currentVertexElement > 3)
1988 {
1989 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1990 // reset to the next vVertexElement to output
1991 currentVertexElement = 0;
1992 }
1993 }
1994 }
1995 }
1996 // else zero extend
1997 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1998 {
1999 // init denormalize variables if needed
2000 Instruction::CastOps fpCast;
2001 Value* conversionFactor;
2002
2003 switch (conversionType)
2004 {
2005 case CONVERT_NORMALIZED:
2006 fpCast = Instruction::CastOps::UIToFP;
2007 conversionFactor = VIMMED1((float)(1.0 / 255.0));
2008 break;
2009 case CONVERT_USCALED:
2010 fpCast = Instruction::CastOps::UIToFP;
2011 conversionFactor = VIMMED1((float)(1.0));
2012 break;
2013 case CONVERT_SSCALED:
2014 SWR_INVALID("Type should not be zero extended!");
2015 conversionFactor = nullptr;
2016 break;
2017 default:
2018 SWR_ASSERT(conversionType == CONVERT_NONE);
2019 conversionFactor = nullptr;
2020 break;
2021 }
2022
2023 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
2024 for (uint32_t i = 0; i < 4; i++)
2025 {
2026 if (isComponentEnabled(compMask, i))
2027 {
2028 if (compCtrl[i] == ComponentControl::StoreSrc)
2029 {
2030 // pshufb masks for each component
2031 Value* vConstMask;
2032 switch (swizzle[i])
2033 {
2034 case 0:
2035 // x shuffle mask
2036 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2037 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2038 break;
2039 case 1:
2040 // y shuffle mask
2041 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2042 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2043 break;
2044 case 2:
2045 // z shuffle mask
2046 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2047 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2048 break;
2049 case 3:
2050 // w shuffle mask
2051 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2052 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2053 break;
2054 default:
2055 vConstMask = nullptr;
2056 break;
2057 }
2058
2059 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
2060 // after pshufb for x channel
2061 // 256i - 0 1 2 3 4 5 6 7
2062 // x000 x000 x000 x000 x000 x000 x000 x000
2063
2064 // denormalize if needed
2065 if (conversionType != CONVERT_NONE)
2066 {
2067 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2068 }
2069 currentVertexElement++;
2070 }
2071 else
2072 {
2073 #if USE_SIMD16_SHADERS
2074 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2075 #else
2076 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2077 #endif
2078 }
2079
2080 if (currentVertexElement > 3)
2081 {
2082 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2083 // reset to the next vVertexElement to output
2084 currentVertexElement = 0;
2085 }
2086 }
2087 }
2088 }
2089 else
2090 {
2091 SWR_INVALID("Unsupported conversion type");
2092 }
2093 }
2094
2095 //////////////////////////////////////////////////////////////////////////
2096 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
2097 /// denormalizes if needed, converts to F32 if needed, and positions in
2098 /// the proper SIMD rows to be output to the simdvertex structure
2099 /// @param args: (tuple of args, listed below)
2100 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
2101 /// @param pVtxOut - base pointer to output simdvertex struct
2102 /// @param extendType - sign extend or zero extend
2103 /// @param conversionType - conversion to apply (normalized, scaled, or none)
2104 /// @param currentVertexElement - reference to the current vVertexElement
2105 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
2106 /// @param compMask - component packing mask
2107 /// @param compCtrl - component control val
2108 /// @param vVertexElements[4] - vertex components to output
2109 #if USE_SIMD16_SHADERS
2110 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
2111 #else
2112 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
2113 #endif
2114 {
2115 // Unpack tuple args
2116 Value* (&vGatherResult)[2] = std::get<0>(args);
2117 Value* pVtxOut = std::get<1>(args);
2118 const Instruction::CastOps extendType = std::get<2>(args);
2119 const ConversionType conversionType = std::get<3>(args);
2120 uint32_t &currentVertexElement = std::get<4>(args);
2121 uint32_t &outputElt = std::get<5>(args);
2122 const ComponentEnable compMask = std::get<6>(args);
2123 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2124 Value* (&vVertexElements)[4] = std::get<8>(args);
2125
2126 // cast types
2127 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2128 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2129
2130 // have to do extra work for sign extending
2131 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
2132 (extendType == Instruction::CastOps::FPExt))
2133 {
2134 // is this a partial-precision (16-bit) float that needs widening to full precision?
2135 bool bFP = (extendType == Instruction::CastOps::FPExt);
2136
2137 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2138 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2139
2140 // shuffle mask
2141 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2142 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
2143 Value* vi128XY = nullptr;
2144 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
2145 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
2146 // after pshufb: group components together in each 128bit lane
2147 // 256i - 0 1 2 3 4 5 6 7
2148 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2149
2150 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2151 // after PERMD: move and pack xy components into each 128bit lane
2152 // 256i - 0 1 2 3 4 5 6 7
2153 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2154 }
2155
2156 // do the same for zw components
2157 Value* vi128ZW = nullptr;
2158 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
2159 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
2160 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
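// after pshufb and PERMD: zw components grouped and packed per 128bit lane, mirroring the xy layout above
// 256i - 0 1 2 3 4 5 6 7
// zzzz zzzz zzzz zzzz wwww wwww wwww wwww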
2161 }
2162
2163 // init denormalize variables if needed
2164 Instruction::CastOps IntToFpCast;
2165 Value* conversionFactor;
2166
2167 switch (conversionType)
2168 {
2169 case CONVERT_NORMALIZED:
2170 IntToFpCast = Instruction::CastOps::SIToFP;
2171 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2172 break;
2173 case CONVERT_SSCALED:
2174 IntToFpCast = Instruction::CastOps::SIToFP;
2175 conversionFactor = VIMMED1((float)(1.0));
2176 break;
2177 case CONVERT_USCALED:
2178 SWR_INVALID("Type should not be sign extended!");
2179 conversionFactor = nullptr;
2180 break;
2181 default:
2182 SWR_ASSERT(conversionType == CONVERT_NONE);
2183 conversionFactor = nullptr;
2184 break;
2185 }
2186
2187 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2188 for (uint32_t i = 0; i < 4; i++)
2189 {
2190 if (isComponentEnabled(compMask, i))
2191 {
2192 if (compCtrl[i] == ComponentControl::StoreSrc)
2193 {
2194 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2195 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2196 // if x or y, use vi128XY permute result, else use vi128ZW
2197 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2198
2199 if (bFP) {
2200 // extract 128 bit lanes and convert each half-precision component to full precision
2201 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2202 }
2203 else {
2204 // extract 128 bit lanes to sign extend each component
2205 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2206
2207 // denormalize if needed
2208 if (conversionType != CONVERT_NONE) {
2209 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2210 }
2211 }
2212 currentVertexElement++;
2213 }
2214 else
2215 {
2216 #if USE_SIMD16_SHADERS
2217 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2218 #else
2219 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2220 #endif
2221 }
2222
2223 if (currentVertexElement > 3)
2224 {
2225 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2226 // reset to the next vVertexElement to output
2227 currentVertexElement = 0;
2228 }
2229 }
2230 }
2231 }
2232 // else zero extend
2233 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2234 {
2235 // pshufb masks for each component
2236 Value* vConstMask[2];
2237 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
2238 // x/z shuffle mask
2239 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2240 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2241 }
2242
2243 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
2244 // y/w shuffle mask
2245 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2246 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
2247 }
2248
2249 // init denormalize variables if needed
2250 Instruction::CastOps fpCast;
2251 Value* conversionFactor;
2252
2253 switch (conversionType)
2254 {
2255 case CONVERT_NORMALIZED:
2256 fpCast = Instruction::CastOps::UIToFP;
2257 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2258 break;
2259 case CONVERT_USCALED:
2260 fpCast = Instruction::CastOps::UIToFP;
2261 conversionFactor = VIMMED1((float)(1.0f));
2262 break;
2263 case CONVERT_SSCALED:
2264 SWR_INVALID("Type should not be zero extended!");
2265 conversionFactor = nullptr;
2266 break;
2267 default:
2268 SWR_ASSERT(conversionType == CONVERT_NONE);
2269 conversionFactor = nullptr;
2270 break;
2271 }
2272
2273 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2274 for (uint32_t i = 0; i < 4; i++)
2275 {
2276 if (isComponentEnabled(compMask, i))
2277 {
2278 if (compCtrl[i] == ComponentControl::StoreSrc)
2279 {
2280 // select correct constMask for x/z or y/w pshufb
2281 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2282 // if x or y, use vi128XY permute result, else use vi128ZW
2283 uint32_t selectedGather = (i < 2) ? 0 : 1;
2284
2285 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2286 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2287 // 256i - 0 1 2 3 4 5 6 7
2288 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2289
2290 // denormalize if needed
2291 if (conversionType != CONVERT_NONE)
2292 {
2293 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2294 }
2295 currentVertexElement++;
2296 }
2297 else
2298 {
2299 #if USE_SIMD16_SHADERS
2300 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2301 #else
2302 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2303 #endif
2304 }
2305
2306 if (currentVertexElement > 3)
2307 {
2308 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2309 // reset to the next vVertexElement to output
2310 currentVertexElement = 0;
2311 }
2312 }
2313 }
2314 }
2315 else
2316 {
2317 SWR_INVALID("Unsupported conversion type");
2318 }
2319 }
2320
2321 //////////////////////////////////////////////////////////////////////////
2322 /// @brief Output a simdvertex worth of elements to the current outputElt
2323 /// @param pVtxOut - base address of VIN output struct
2324 /// @param outputElt - simdvertex offset in VIN to write to
2325 /// @param numEltsToStore - number of simdvertex rows to write out
2326 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2327 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2328 {
2329 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2330
2331 for(uint32_t c = 0; c < numEltsToStore; ++c)
2332 {
2333 // STORE expects FP32 x vWidth type, just bitcast if needed
2334 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2335 {
2336 #if FETCH_DUMP_VERTEX
2337 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
2338 #endif
2339 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2340 }
2341 #if FETCH_DUMP_VERTEX
2342 else
2343 {
2344 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
2345 }
2346 #endif
2347 // outputElt * 4 = offsetting by the size of a simdvertex
2348 // + c offsets to a 32bit x vWidth row within the current vertex
2349 #if USE_SIMD16_SHADERS
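// simd16 layout: each component occupies two simd8 rows (lo/hi halves), so a simdvertex spans 8 rows and component c starts at row c * 2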
2350 Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
2351 #else
2352 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2353 #endif
2354 STORE(vVertexElements[c], dest);
2355 }
2356 }
2357
2358 #if USE_SIMD16_BUILDER
2359 void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2360 {
2361 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2362
2363 for (uint32_t c = 0; c < numEltsToStore; ++c)
2364 {
2365 // STORE expects FP32 x vWidth type, just bitcast if needed
2366 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2367 {
2368 #if FETCH_DUMP_VERTEX
2369 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
2370 #endif
2371 vVertexElements[c] = BITCAST(vVertexElements[c], mSimd2FP32Ty);
2372 }
2373 #if FETCH_DUMP_VERTEX
2374 else
2375 {
2376 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
2377 }
2378 #endif
2379 // outputElt * 4 = offsetting by the size of a simdvertex
2380 // + c offsets to a 32bit x vWidth row within the current vertex
2381 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2382 STORE(vVertexElements[c], dest);
2383 }
2384 }
2385
2386 #endif
2387 //////////////////////////////////////////////////////////////////////////
2388 /// @brief Generates a constant vector of values based on the
2389 /// ComponentControl value
2390 /// @param ctrl - ComponentControl value
2391 #if USE_SIMD16_SHADERS
2392 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
2393 #else
2394 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
2395 #endif
2396 {
2397 switch(ctrl)
2398 {
2399 case NoStore: return VUNDEF_I();
2400 case Store0: return VIMMED1(0);
2401 case Store1Fp: return VIMMED1(1.0f);
2402 case Store1Int: return VIMMED1(1);
2403 case StoreVertexId:
2404 {
2405 #if USE_SIMD16_SHADERS
2406 Value* pId;
2407 if (useVertexID2)
2408 {
2409 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
2410 }
2411 else
2412 {
2413 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2414 }
2415 #else
2416 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2417 #endif
2418 return VBROADCAST(pId);
2419 }
2420 case StoreInstanceId:
2421 {
2422 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
2423 return VBROADCAST(pId);
2424 }
2425 case StoreSrc:
2426 default: SWR_INVALID("Invalid component control"); return VUNDEF_I();
2427 }
2428 }
2429
2430 #if USE_SIMD16_BUILDER
2431 Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl)
2432 {
2433 switch (ctrl)
2434 {
2435 case NoStore: return VUNDEF2_I();
2436 case Store0: return VIMMED2_1(0);
2437 case Store1Fp: return VIMMED2_1(1.0f);
2438 case Store1Int: return VIMMED2_1(1);
2439 case StoreVertexId:
2440 {
2441 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimd2FP32Ty);
2442 return VBROADCAST2(pId);
2443 }
2444 case StoreInstanceId:
2445 {
2446 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
2447 return VBROADCAST2(pId);
2448 }
2449 case StoreSrc:
2450 default: SWR_INVALID("Invalid component control"); return VUNDEF2_I();
2451 }
2452 }
2453
2454 #endif
2455 //////////////////////////////////////////////////////////////////////////
2456 /// @brief Returns the enable mask for the specified component.
2457 /// @param enableMask - enable bits
2458 /// @param component - component to check if enabled.
2459 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2460 {
2461 switch (component)
2462 {
2463 // X
2464 case 0: return (enableMask & ComponentEnable::X);
2465 // Y
2466 case 1: return (enableMask & ComponentEnable::Y);
2467 // Z
2468 case 2: return (enableMask & ComponentEnable::Z);
2469 // W
2470 case 3: return (enableMask & ComponentEnable::W);
2471
2472 default: return false;
2473 }
2474 }
2475
2476
2477 //////////////////////////////////////////////////////////////////////////
2478 /// @brief JITs from fetch shader IR
2479 /// @param hJitMgr - JitManager handle
2480 /// @param func - LLVM function IR
2481 /// @return PFN_FETCH_FUNC - pointer to fetch code
2482 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2483 {
2484 const llvm::Function* func = (const llvm::Function*)hFunc;
2485 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2486 PFN_FETCH_FUNC pfnFetch;
2487
2488 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2489 // MCJIT finalizes modules the first time you JIT code from them. Once finalized, you cannot add new IR to the module
2490 pJitMgr->mIsModuleFinalized = true;
2491
2492 #if defined(KNOB_SWRC_TRACING)
2493 char fName[1024];
2494 const char *funcName = func->getName().data();
2495 sprintf(fName, "%s.bin", funcName);
2496 FILE *fd = fopen(fName, "wb");
2497 fwrite((void *)pfnFetch, 1, 2048, fd);
2498 fclose(fd);
2499 #endif
2500
2501 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2502
2503 return pfnFetch;
2504 }
2505
2506 //////////////////////////////////////////////////////////////////////////
2507 /// @brief JIT compiles fetch shader
2508 /// @param hJitMgr - JitManager handle
2509 /// @param state - fetch state to build function from
2510 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2511 {
2512 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2513
2514 pJitMgr->SetupNewModule();
2515
2516 FetchJit theJit(pJitMgr);
2517 HANDLE hFunc = theJit.Create(state);
2518
2519 return JitFetchFunc(hJitMgr, hFunc);
2520 }