src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file fetch_jit.cpp
  24 *
  25 * @brief Implementation of the fetch jitter
  26 *
  27 * Notes:
  28 *
  29 ******************************************************************************/
  30 #include "builder.h"
  31 #include "jit_api.h"
  32 #include "fetch_jit.h"
  33 #include "gen_state_llvm.h"
  34 #include <sstream>
  35 #include <tuple>
  36
  37 //#define FETCH_DUMP_VERTEX 1
  38 using namespace llvm;
  39 using namespace SwrJit;
  40
  41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
  42
  43 enum ConversionType
  44 {
  45     CONVERT_NONE,
  46     CONVERT_NORMALIZED,
  47     CONVERT_USCALED,
  48     CONVERT_SSCALED,
  49     CONVERT_SFIXED,
  50 };
  51
  52 //////////////////////////////////////////////////////////////////////////
  53 /// Interface to Jitting a fetch shader
  54 //////////////////////////////////////////////////////////////////////////
  55 struct FetchJit : public Builder
  56 {
  57     FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
  58
  59     Function* Create(const FETCH_COMPILE_STATE& fetchState);
  60     Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
  61     Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
  62     Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
  63
  64     // package up Shuffle*bpcGatherd args into a tuple for convenience
  65     typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
  66         uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
  67         const uint32_t(&)[4]> Shuffle8bpcArgs;
  68 #if USE_SIMD16_SHADERS
  69     void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
  70 #else
  71     void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
  72 #endif
  73 #if USE_SIMD16_BUILDER
  74     void Shuffle8bpcGatherd2(Shuffle8bpcArgs &args);
  75 #endif
  76
  77     typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
  78         uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
  79 #if USE_SIMD16_SHADERS
  80     void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
  81 #else
  82     void Shuffle16bpcGather(Shuffle16bpcArgs &args);
  83 #endif
  84 #if USE_SIMD16_BUILDER
  85     void Shuffle16bpcGather2(Shuffle16bpcArgs &args);
  86 #endif
  87
  88     void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
  89 #if USE_SIMD16_BUILDER
  90     void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
  91 #endif
  92
  93 #if USE_SIMD16_SHADERS
  94     Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
  95 #else
  96     Value* GenerateCompCtrlVector(const ComponentControl ctrl);
  97 #endif
  98 #if USE_SIMD16_BUILDER
  99     Value* GenerateCompCtrlVector2(const ComponentControl ctrl);
 100 #endif
 101
 102     void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
 103 #if USE_SIMD16_SHADERS
 104 #define USE_SIMD16_GATHERS 0
 105
 106 #if USE_SIMD16_GATHERS
 107     void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
 108 #else
 109     void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
 110 #endif
 111 #else
 112     void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
 113 #endif
 114
 115     bool IsOddFormat(SWR_FORMAT format);
 116     bool IsUniformFormat(SWR_FORMAT format);
 117     void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
 118     void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
 119     void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
 120
 121     Value* mpFetchInfo;
 122 };
 123
 124 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
 125 {
 126     std::stringstream fnName("FetchShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
 127     fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
 128
 129     Function*    fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
 130     BasicBlock*    entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
 131
 132     fetch->getParent()->setModuleIdentifier(fetch->getName());
 133
 134     IRB()->SetInsertPoint(entry);
 135
 136     auto    argitr = fetch->arg_begin();
 137
 138     // Fetch shader arguments
 139     mpFetchInfo = &*argitr; ++argitr;
 140     mpFetchInfo->setName("fetchInfo");
 141     Value*    pVtxOut = &*argitr;
 142     pVtxOut->setName("vtxOutput");
 143     // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
 144     // index 0(just the pointer to the simdvertex structure
 145     // index 1(which element of the simdvertex structure to offset to(in this case 0)
 146     // so the indices being i32's doesn't matter
 147     // TODO: generated this GEP with a VECTOR structure type so this makes sense
 148     std::vector<Value*>    vtxInputIndices(2, C(0));
 149     // GEP
 150     pVtxOut = GEP(pVtxOut, C(0));
 151 #if USE_SIMD16_SHADERS
 152 #if 0// USE_SIMD16_BUILDER
 153     pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
 154 #else
 155     pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
 156 #endif
 157 #else
 158     pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
 159 #endif
 160
 161     // SWR_FETCH_CONTEXT::pStreams
 162     Value*    streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
 163     streams->setName("pStreams");
 164
 165     // SWR_FETCH_CONTEXT::pIndices
 166     Value*    indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
 167     indices->setName("pIndices");
 168
 169     // SWR_FETCH_CONTEXT::pLastIndex
 170     Value*    pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
 171     pLastIndex->setName("pLastIndex");
 172
 173
 174     Value* vIndices;
 175 #if USE_SIMD16_SHADERS
 176     Value* indices2;
 177     Value* vIndices2;
 178 #endif
 179     switch(fetchState.indexType)
 180     {
 181         case R8_UINT:
 182             indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
 183 #if USE_SIMD16_SHADERS
 184             indices2 = GEP(indices, C(8));
 185 #endif
 186             if(fetchState.bDisableIndexOOBCheck)
 187             {
 188                 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
 189                 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
 190 #if USE_SIMD16_SHADERS
 191                 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
 192                 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
 193 #endif
 194             }
 195             else
 196             {
 197                 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
 198                 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
 199 #if USE_SIMD16_SHADERS
 200                 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
 201                 vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
 202 #endif
 203             }
 204             break;
 205         case R16_UINT:
 206             indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
 207 #if USE_SIMD16_SHADERS
 208             indices2 = GEP(indices, C(8));
 209 #endif
 210             if(fetchState.bDisableIndexOOBCheck)
 211             {
 212                 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
 213                 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
 214 #if USE_SIMD16_SHADERS
 215                 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
 216                 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
 217 #endif
 218             }
 219             else
 220             {
 221                 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
 222                 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
 223 #if USE_SIMD16_SHADERS
 224                 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
 225                 vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
 226 #endif
 227             }
 228             break;
 229         case R32_UINT:
 230 #if USE_SIMD16_SHADERS
 231             indices2 = GEP(indices, C(8));
 232 #endif
 233             (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
 234                                                : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
 235 #if USE_SIMD16_SHADERS
 236             (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
 237                                                : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
 238 #endif
 239             break; // incoming type is already 32bit int
 240         default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
 241     }
 242
 243     if(fetchState.bForceSequentialAccessEnable)
 244     {
 245         Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
 246
 247         // VertexData buffers are accessed sequentially, the index is equal to the vertex number
 248         vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
 249         vIndices = ADD(vIndices, pOffsets);
 250 #if USE_SIMD16_SHADERS
 251         vIndices2 = ADD(vIndices, VIMMED1(8));
 252 #endif
 253     }
 254
 255     Value* vVertexId = vIndices;
 256 #if USE_SIMD16_SHADERS
 257     Value* vVertexId2 = vIndices2;
 258 #endif
 259     if (fetchState.bVertexIDOffsetEnable)
 260     {
 261         // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
 262         Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
 263         Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
 264         vVertexId = ADD(vIndices, vBaseVertex);
 265         vVertexId = ADD(vVertexId, vStartVertex);
 266 #if USE_SIMD16_SHADERS
 267         vVertexId2 = ADD(vIndices2, vBaseVertex);
 268         vVertexId2 = ADD(vVertexId2, vStartVertex);
 269 #endif
 270     }
 271
 272     // store out vertex IDs
 273     STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
 274 #if USE_SIMD16_SHADERS
 275     STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
 276 #endif
 277
 278     // store out cut mask if enabled
 279     if (fetchState.bEnableCutIndex)
 280     {
 281         Value* vCutIndex = VIMMED1(fetchState.cutIndex);
 282         Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
 283         STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
 284 #if USE_SIMD16_SHADERS
 285         Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
 286         STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
 287 #endif
 288     }
 289
 290     // Fetch attributes from memory and output to a simdvertex struct
 291     // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
 292 #if USE_SIMD16_SHADERS
 293     if (fetchState.bDisableVGATHER)
 294     {
 295         JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
 296         JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
 297     }
 298     else
 299     {
 300 #if USE_SIMD16_GATHERS
 301         JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false);
 302 #else
 303         JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
 304         JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
 305 #endif
 306     }
 307 #else
 308     (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
 309                                  : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
 310 #endif
 311
 312     RET_VOID();
 313
 314     JitManager::DumpToFile(fetch, "src");
 315
 316 #if defined(_DEBUG)
 317     verifyFunction(*fetch);
 318 #endif
 319
 320     ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
 321
 322     ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
 323     setupPasses.add(createBreakCriticalEdgesPass());
 324     setupPasses.add(createCFGSimplificationPass());
 325     setupPasses.add(createEarlyCSEPass());
 326     setupPasses.add(createPromoteMemoryToRegisterPass());
 327
 328     setupPasses.run(*fetch);
 329
 330     JitManager::DumpToFile(fetch, "se");
 331
 332     ::FunctionPassManager optPasses(JM()->mpCurrentModule);
 333
 334     ///@todo Haven't touched these either. Need to remove some of these and add others.
 335     optPasses.add(createCFGSimplificationPass());
 336     optPasses.add(createEarlyCSEPass());
 337     optPasses.add(createInstructionCombiningPass());
 338     optPasses.add(createInstructionSimplifierPass());
 339     optPasses.add(createConstantPropagationPass());
 340     optPasses.add(createSCCPPass());
 341     optPasses.add(createAggressiveDCEPass());
 342
 343     optPasses.run(*fetch);
 344     optPasses.run(*fetch);
 345
 346     JitManager::DumpToFile(fetch, "opt");
 347
 348     return fetch;
 349 }
 350
 351 //////////////////////////////////////////////////////////////////////////
 352 /// @brief Loads attributes from memory using LOADs, shuffling the
 353 /// components into SOA form.
 354 /// *Note* currently does not support component control,
 355 /// component packing, instancing
 356 /// @param fetchState - info about attributes to be fetched from memory
 357 /// @param streams - value pointer to the current vertex stream
 358 /// @param vIndices - vector value of indices to load
 359 /// @param pVtxOut - value pointer to output simdvertex struct
 360 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
 361 {
 362     // Zack shuffles; a variant of the Charleston.
 363
 364     std::vector<Value*> vectors(16);
 365     std::vector<Constant*>    pMask(mVWidth);
 366     for(uint32_t i = 0; i < mVWidth; ++i)
 367     {
 368         pMask[i] = (C(i < 4 ? i : 4));
 369     }
 370     Constant* promoteMask = ConstantVector::get(pMask);
 371     Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
 372
 373     Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
 374     Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
 375     Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
 376     Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
 377     curInstance->setName("curInstance");
 378
 379     for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
 380     {
 381         Value*    elements[4] = {0};
 382         const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
 383         const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
 384         SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
 385         uint32_t    numComponents = info.numComps;
 386         uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.
 387
 388         // load path doesn't support component packing
 389         SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
 390
 391         vectors.clear();
 392
 393         if (fetchState.bInstanceIDOffsetEnable)
 394         {
 395             SWR_ASSERT((0), "TODO: Fill out more once driver sends this down");
 396         }
 397
 398         Value *vCurIndices;
 399         Value *startOffset;
 400         if(ied.InstanceEnable)
 401         {
 402             Value* stepRate = C(ied.InstanceAdvancementState);
 403
 404             // prevent a div by 0 for 0 step rate
 405             Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
 406             stepRate = SELECT(isNonZeroStep, stepRate, C(1));
 407
 408             // calc the current offset into instanced data buffer
 409             Value* calcInstance = UDIV(curInstance, stepRate);
 410
 411             // if step rate is 0, every instance gets instance 0
 412             calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
 413
 414             vCurIndices = VBROADCAST(calcInstance);
 415
 416             startOffset = startInstance;
 417         }
 418         else if (ied.InstanceStrideEnable)
 419         {
 420             SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
 421         }
 422         else
 423         {
 424             // offset indices by baseVertex
 425             vCurIndices = ADD(vIndices, vBaseVertex);
 426
 427             startOffset = startVertex;
 428         }
 429
 430         // load SWR_VERTEX_BUFFER_STATE::pData
 431         Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
 432
 433         // load SWR_VERTEX_BUFFER_STATE::pitch
 434         Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
 435         stride = Z_EXT(stride, mInt64Ty);
 436
 437         // load SWR_VERTEX_BUFFER_STATE::size
 438         Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
 439         size = Z_EXT(size, mInt64Ty);
 440
 441         Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
 442
 443         Value *minVertex = NULL;
 444         Value *minVertexOffset = NULL;
 445         if (fetchState.bPartialVertexBuffer) {
 446             // fetch min index for low bounds checking
 447             minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
 448             minVertex = LOAD(minVertex);
 449             if (!fetchState.bDisableIndexOOBCheck) {
 450                 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
 451             }
 452         }
 453
 454         // Load from the stream.
 455         for(uint32_t lane = 0; lane < mVWidth; ++lane)
 456         {
 457             // Get index
 458             Value* index = VEXTRACT(vCurIndices, C(lane));
 459
 460             if (fetchState.bPartialVertexBuffer) {
 461                 // clamp below minvertex
 462                 Value *isBelowMin = ICMP_SLT(index, minVertex);
 463                 index = SELECT(isBelowMin, minVertex, index);
 464             }
 465
 466             index = Z_EXT(index, mInt64Ty);
 467
 468             Value*    offset = MUL(index, stride);
 469             offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
 470             offset = ADD(offset, startVertexOffset);
 471
 472             if (!fetchState.bDisableIndexOOBCheck) {
 473                 // check for out of bound access, including partial OOB, and replace them with minVertex
 474                 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
 475                 Value *oob = ICMP_ULE(endOffset, size);
 476                 if (fetchState.bPartialVertexBuffer) {
 477                     offset = SELECT(oob, offset, minVertexOffset);
 478                 } else {
 479                     offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
 480                 }
 481             }
 482
 483             Value*    pointer = GEP(stream, offset);
 484             // We use a full-lane, but don't actually care.
 485             Value*    vptr = 0;
 486
 487             // get a pointer to a 4 component attrib in default address space
 488             switch(bpc)
 489             {
 490                 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
 491                 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
 492                 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
 493                 default: SWR_INVALID("Unsupported underlying bpp!");
 494             }
 495
 496             // load 4 components of attribute
 497             Value*    vec = ALIGNED_LOAD(vptr, 1, false);
 498
 499             // Convert To FP32 internally
 500             switch(info.type[0])
 501             {
 502                 case SWR_TYPE_UNORM:
 503                     switch(bpc)
 504                     {
 505                         case 8:
 506                             vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 507                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
 508                             break;
 509                         case 16:
 510                             vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 511                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
 512                             break;
 513                         default:
 514                             SWR_INVALID("Unsupported underlying type!");
 515                             break;
 516                     }
 517                     break;
 518                 case SWR_TYPE_SNORM:
 519                     switch(bpc)
 520                     {
 521                         case 8:
 522                             vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 523                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
 524                             break;
 525                         case 16:
 526                             vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 527                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
 528                             break;
 529                         default:
 530                             SWR_INVALID("Unsupported underlying type!");
 531                             break;
 532                     }
 533                     break;
 534                 case SWR_TYPE_UINT:
 535                     // Zero extend uint32_t types.
 536                     switch(bpc)
 537                     {
 538                         case 8:
 539                         case 16:
 540                             vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
 541                             vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
 542                             break;
 543                         case 32:
 544                             break; // Pass through unchanged.
 545                         default:
 546                             SWR_INVALID("Unsupported underlying type!");
 547                             break;
 548                     }
 549                     break;
 550                 case SWR_TYPE_SINT:
 551                     // Sign extend SINT types.
 552                     switch(bpc)
 553                     {
 554                         case 8:
 555                         case 16:
 556                             vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
 557                             vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
 558                             break;
 559                         case 32:
 560                             break; // Pass through unchanged.
 561                         default:
 562                             SWR_INVALID("Unsupported underlying type!");
 563                             break;
 564                     }
 565                     break;
 566                 case SWR_TYPE_FLOAT:
 567                     switch(bpc)
 568                     {
 569                         case 32:
 570                             break; // Pass through unchanged.
 571                         default:
 572                             SWR_INVALID("Unsupported underlying type!");
 573                     }
 574                     break;
 575                 case SWR_TYPE_USCALED:
 576                     vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 577                     break;
 578                 case SWR_TYPE_SSCALED:
 579                     vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 580                     break;
 581                 case SWR_TYPE_SFIXED:
 582                     vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
 583                     break;
 584                 case SWR_TYPE_UNKNOWN:
 585                 case SWR_TYPE_UNUSED:
 586                     SWR_INVALID("Unsupported type %d!", info.type[0]);
 587             }
 588
 589             // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
 590             // uwvec: 4 x F32, undef value
 591             Value*    wvec = VSHUFFLE(vec, uwvec, promoteMask);
 592             vectors.push_back(wvec);
 593         }
 594
 595         std::vector<Constant*>        v01Mask(mVWidth);
 596         std::vector<Constant*>        v23Mask(mVWidth);
 597         std::vector<Constant*>        v02Mask(mVWidth);
 598         std::vector<Constant*>        v13Mask(mVWidth);
 599
 600         // Concatenate the vectors together.
 601         elements[0] = VUNDEF_F();
 602         elements[1] = VUNDEF_F();
 603         elements[2] = VUNDEF_F();
 604         elements[3] = VUNDEF_F();
 605         for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
 606         {
 607             v01Mask[4 * b + 0] = C(0 + 4 * b);
 608             v01Mask[4 * b + 1] = C(1 + 4 * b);
 609             v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
 610             v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
 611
 612             v23Mask[4 * b + 0] = C(2 + 4 * b);
 613             v23Mask[4 * b + 1] = C(3 + 4 * b);
 614             v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
 615             v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
 616
 617             v02Mask[4 * b + 0] = C(0 + 4 * b);
 618             v02Mask[4 * b + 1] = C(2 + 4 * b);
 619             v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
 620             v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
 621
 622             v13Mask[4 * b + 0] = C(1 + 4 * b);
 623             v13Mask[4 * b + 1] = C(3 + 4 * b);
 624             v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
 625             v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
 626
 627             std::vector<Constant*>    iMask(mVWidth);
 628             for(uint32_t i = 0; i < mVWidth; ++i)
 629             {
 630                 if(((4 * b) <= i) && (i < (4 * (b + 1))))
 631                 {
 632                     iMask[i] = C(i % 4 + mVWidth);
 633                 }
 634                 else
 635                 {
 636                     iMask[i] = C(i);
 637                 }
 638             }
 639             Constant* insertMask = ConstantVector::get(iMask);
 640             elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
 641             elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
 642             elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
 643             elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
 644         }
 645
 646         Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
 647         Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
 648         Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
 649         Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
 650         elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
 651         elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
 652         elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
 653         elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
 654
 655         switch(numComponents + 1)
 656         {
 657             case    1: elements[0] = VIMMED1(0.0f);
 658             case    2: elements[1] = VIMMED1(0.0f);
 659             case    3: elements[2] = VIMMED1(0.0f);
 660             case    4: elements[3] = VIMMED1(1.0f);
 661         }
 662
 663         for(uint32_t c = 0; c < 4; ++c)
 664         {
 665 #if USE_SIMD16_SHADERS
 666             Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
 667 #else
 668             Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
 669 #endif
 670             STORE(elements[c], dest);
 671         }
 672     }
 673 }
 674
 675 // returns true for odd formats that require special state.gather handling
 676 bool FetchJit::IsOddFormat(SWR_FORMAT format)
 677 {
 678     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
 679     if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
 680     {
 681         return true;
 682     }
 683     return false;
 684 }
 685
 686 // format is uniform if all components are the same size and type
 687 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
 688 {
 689     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
 690     uint32_t bpc0 = info.bpc[0];
 691     uint32_t type0 = info.type[0];
 692
 693     for (uint32_t c = 1; c < info.numComps; ++c)
 694     {
 695         if (bpc0 != info.bpc[c] || type0 != info.type[c])
 696         {
 697             return false;
 698         }
 699     }
 700     return true;
 701 }
 702
 703 // unpacks components based on format
 704 // foreach component in the pixel
 705 //   mask off everything but this component
 706 //   shift component to LSB
 707 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
 708 {
 709     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
 710
 711     uint32_t bitOffset = 0;
 712     for (uint32_t c = 0; c < info.numComps; ++c)
 713     {
 714         uint32_t swizzledIndex = info.swizzle[c];
 715         uint32_t compBits = info.bpc[c];
 716         uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
 717         Value* comp = AND(vInput, bitmask);
 718         comp = LSHR(comp, bitOffset);
 719
 720         result[swizzledIndex] = comp;
 721         bitOffset += compBits;
 722     }
 723 }
 724
 725 // gather for odd component size formats
 726 // gather SIMD full pixels per lane then shift/mask to move each component to their
 727 // own vector
 728 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
 729 {
 730     const SWR_FORMAT_INFO &info = GetFormatInfo(format);
 731
 732     // only works if pixel size is <= 32bits
 733     SWR_ASSERT(info.bpp <= 32);
 734
 735     Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
 736
 737     for (uint32_t comp = 0; comp < 4; ++comp)
 738     {
 739         pResult[comp] = VIMMED1((int)info.defaults[comp]);
 740     }
 741
 742     UnpackComponents(format, pGather, pResult);
 743
 744     // cast to fp32
 745     pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
 746     pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
 747     pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
 748     pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
 749 }
 750
 751 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
 752 {
 753     const SWR_FORMAT_INFO &info = GetFormatInfo(format);
 754
 755     for (uint32_t c = 0; c < info.numComps; ++c)
 756     {
 757         uint32_t compIndex = info.swizzle[c];
 758
 759         // skip any conversion on UNUSED components
 760         if (info.type[c] == SWR_TYPE_UNUSED)
 761         {
 762             continue;
 763         }
 764
 765         if (info.isNormalized[c])
 766         {
 767             if (info.type[c] == SWR_TYPE_SNORM)
 768             {
 769                 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
 770
 771                 /// result = c * (1.0f / (2^(n-1) - 1);
 772                 uint32_t n = info.bpc[c];
 773                 uint32_t pow2 = 1 << (n - 1);
 774                 float scale = 1.0f / (float)(pow2 - 1);
 775                 Value *vScale = VIMMED1(scale);
 776                 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
 777                 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
 778                 texels[compIndex] = FMUL(texels[compIndex], vScale);
 779             }
 780             else
 781             {
 782                 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
 783
 784                 /// result = c * (1.0f / (2^n - 1))
 785                 uint32_t n = info.bpc[c];
 786                 uint32_t pow2 = 1 << n;
 787                 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
 788                 if (n == 24)
 789                 {
 790                     float scale = (float)(pow2 - 1);
 791                     Value* vScale = VIMMED1(scale);
 792                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
 793                     texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
 794                     texels[compIndex] = FDIV(texels[compIndex], vScale);
 795                 }
 796                 else
 797                 {
 798                     float scale = 1.0f / (float)(pow2 - 1);
 799                     Value *vScale = VIMMED1(scale);
 800                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
 801                     texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
 802                     texels[compIndex] = FMUL(texels[compIndex], vScale);
 803                 }
 804             }
 805             continue;
 806         }
 807     }
 808 }
 809
 810 //////////////////////////////////////////////////////////////////////////
 811 /// @brief Loads attributes from memory using AVX2 GATHER(s)
 812 /// @param fetchState - info about attributes to be fetched from memory
 813 /// @param streams - value pointer to the current vertex stream
 814 /// @param vIndices - vector value of indices to gather
 815 /// @param pVtxOut - value pointer to output simdvertex struct
 816 #if USE_SIMD16_SHADERS
 817 #if USE_SIMD16_GATHERS
 818 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
 819     Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2)
 820 #else
 821 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
 822     Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
 823 #endif
 824 #else
 825 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
 826     Value* streams, Value* vIndices, Value* pVtxOut)
 827 #endif
 828 {
 829     uint32_t currentVertexElement = 0;
 830     uint32_t outputElt = 0;
 831     Value* vVertexElements[4];
 832 #if USE_SIMD16_GATHERS
 833     Value* vVertexElements2[4];
 834 #if USE_SIMD16_BUILDER
 835     Value *pVtxSrc2[4];
 836 #endif
 837 #endif
 838
 839     Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
 840     Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
 841     Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
 842     Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
 843     curInstance->setName("curInstance");
 844
 845     for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
 846     {
 847         const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
 848
 849         // skip element if all components are disabled
 850         if (ied.ComponentPacking == ComponentEnable::NONE)
 851         {
 852             continue;
 853         }
 854
 855         const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
 856         SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
 857         uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.
 858
 859         Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
 860
 861         // VGATHER* takes an *i8 src pointer
 862         Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
 863
 864         Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
 865         Value *vStride = VBROADCAST(stride);
 866
 867         // max vertex index that is fully in bounds
 868         Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
 869         maxVertex = LOAD(maxVertex);
 870
 871         Value *minVertex = NULL;
 872         if (fetchState.bPartialVertexBuffer)
 873         {
 874             // min vertex index for low bounds OOB checking
 875             minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
 876             minVertex = LOAD(minVertex);
 877         }
 878
 879         if (fetchState.bInstanceIDOffsetEnable)
 880         {
 881             // the InstanceID (curInstance) value is offset by StartInstanceLocation
 882             curInstance = ADD(curInstance, startInstance);
 883         }
 884
 885         Value *vCurIndices;
 886 #if USE_SIMD16_GATHERS
 887         Value *vCurIndices2;
 888 #endif
 889         Value *startOffset;
 890         Value *vInstanceStride = VIMMED1(0);
 891
 892         if (ied.InstanceEnable)
 893         {
 894             Value* stepRate = C(ied.InstanceAdvancementState);
 895
 896             // prevent a div by 0 for 0 step rate
 897             Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
 898             stepRate = SELECT(isNonZeroStep, stepRate, C(1));
 899
 900             // calc the current offset into instanced data buffer
 901             Value* calcInstance = UDIV(curInstance, stepRate);
 902
 903             // if step rate is 0, every instance gets instance 0
 904             calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
 905
 906             vCurIndices = VBROADCAST(calcInstance);
 907 #if USE_SIMD16_GATHERS
 908             vCurIndices2 = VBROADCAST(calcInstance);
 909 #endif
 910
 911             startOffset = startInstance;
 912         }
 913         else if (ied.InstanceStrideEnable)
 914         {
 915             // grab the instance advancement state, determines stride in bytes from one instance to the next
 916             Value* stepRate = C(ied.InstanceAdvancementState);
 917             vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
 918
 919             // offset indices by baseVertex
 920             vCurIndices = ADD(vIndices, vBaseVertex);
 921 #if USE_SIMD16_GATHERS
 922             vCurIndices2 = ADD(vIndices2, vBaseVertex);
 923 #endif
 924
 925             startOffset = startVertex;
 926             SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
 927         }
 928         else
 929         {
 930             // offset indices by baseVertex
 931             vCurIndices = ADD(vIndices, vBaseVertex);
 932 #if USE_SIMD16_GATHERS
 933             vCurIndices2 = ADD(vIndices2, vBaseVertex);
 934 #endif
 935
 936             startOffset = startVertex;
 937         }
 938
 939         // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
 940         // do 64bit address offset calculations.
 941
 942         // calculate byte offset to the start of the VB
 943         Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
 944         pStreamBase = GEP(pStreamBase, baseOffset);
 945
 946         // if we have a start offset, subtract from max vertex. Used for OOB check
 947         maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
 948         Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
 949         // if we have a negative value, we're already OOB. clamp at 0.
 950         maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
 951
 952         if (fetchState.bPartialVertexBuffer)
 953         {
 954             // similary for min vertex
 955             minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
 956             Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
 957             minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
 958         }
 959
 960         // Load the in bounds size of a partially valid vertex
 961         Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
 962         partialInboundsSize = LOAD(partialInboundsSize);
 963         Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
 964         Value* vBpp = VBROADCAST(C(info.Bpp));
 965         Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
 966
 967         // is the element is <= the partially valid size
 968         Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
 969
 970 #if USE_SIMD16_GATHERS
 971         // override cur indices with 0 if pitch is 0
 972         Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
 973         vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
 974         vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
 975
 976         // are vertices partially OOB?
 977         Value* vMaxVertex = VBROADCAST(maxVertex);
 978         Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
 979         Value* vPartialOOBMask2 = ICMP_EQ(vCurIndices2, vMaxVertex);
 980
 981         // are vertices fully in bounds?
 982         Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
 983         Value* vMaxGatherMask2 = ICMP_ULT(vCurIndices2, vMaxVertex);
 984
 985         Value *vGatherMask;
 986         Value *vGatherMask2;
 987         if (fetchState.bPartialVertexBuffer)
 988         {
 989             // are vertices below minVertex limit?
 990             Value *vMinVertex = VBROADCAST(minVertex);
 991             Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
 992             Value *vMinGatherMask2 = ICMP_UGE(vCurIndices2, vMinVertex);
 993
 994             // only fetch lanes that pass both tests
 995             vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
 996             vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
 997         }
 998         else
 999         {
1000             vGatherMask = vMaxGatherMask;
1001             vGatherMask2 = vMaxGatherMask2;
1002         }
1003
1004         // blend in any partially OOB indices that have valid elements
1005         vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1006         vGatherMask2 = SELECT(vPartialOOBMask2, vElementInBoundsMask, vGatherMask2);
1007
1008         // calculate the actual offsets into the VB
1009         Value* vOffsets = MUL(vCurIndices, vStride);
1010         vOffsets = ADD(vOffsets, vAlignmentOffsets);
1011
1012         Value* vOffsets2 = MUL(vCurIndices2, vStride);
1013         vOffsets2 = ADD(vOffsets2, vAlignmentOffsets);
1014
1015         // if instance stride enable is:
1016         //  true  - add product of the instanceID and advancement state to the offst into the VB
1017         //  false - value of vInstanceStride has been initialialized to zero
1018         vOffsets = ADD(vOffsets, vInstanceStride);
1019         vOffsets2 = ADD(vOffsets2, vInstanceStride);
1020
1021 #else
1022         // override cur indices with 0 if pitch is 0
1023         Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
1024         vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
1025
1026         // are vertices partially OOB?
1027         Value* vMaxVertex = VBROADCAST(maxVertex);
1028         Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
1029
1030         // are vertices fully in bounds?
1031         Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
1032
1033         Value *vGatherMask;
1034         if (fetchState.bPartialVertexBuffer)
1035         {
1036             // are vertices below minVertex limit?
1037             Value *vMinVertex = VBROADCAST(minVertex);
1038             Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
1039
1040             // only fetch lanes that pass both tests
1041             vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
1042         }
1043         else
1044         {
1045             vGatherMask = vMaxGatherMask;
1046         }
1047
1048         // blend in any partially OOB indices that have valid elements
1049         vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1050
1051         // calculate the actual offsets into the VB
1052         Value* vOffsets = MUL(vCurIndices, vStride);
1053         vOffsets = ADD(vOffsets, vAlignmentOffsets);
1054
1055         // if instance stride enable is:
1056         //  true  - add product of the instanceID and advancement state to the offst into the VB
1057         //  false - value of vInstanceStride has been initialialized to zero
1058         vOffsets = ADD(vOffsets, vInstanceStride);
1059
1060 #endif
1061         // Packing and component control
1062         ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
1063         const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
1064                                              (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
1065
1066         // Special gather/conversion for formats without equal component sizes
1067         if (IsOddFormat((SWR_FORMAT)ied.Format))
1068         {
1069 #if USE_SIMD16_GATHERS
1070             Value *pResults[4];
1071             Value *pResults2[4];
1072             CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1073             CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
1074             ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1075             ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
1076
1077             for (uint32_t c = 0; c < 4; c += 1)
1078             {
1079                 if (isComponentEnabled(compMask, c))
1080                 {
1081 #if USE_SIMD16_BUILDER
1082                     // pack adjacent pairs of SIMD8s into SIMD16s
1083                     pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1084                     pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults[c],  0);
1085                     pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults2[c], 1);
1086
1087 #else
1088                     vVertexElements[currentVertexElement]  = pResults[c];
1089                     vVertexElements2[currentVertexElement] = pResults2[c];
1090
1091 #endif
1092                     currentVertexElement += 1;
1093
1094                     if (currentVertexElement > 3)
1095                     {
1096 #if USE_SIMD16_BUILDER
1097                         // store SIMD16s
1098                         Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1099
1100                         StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1101
1102 #else
1103                         StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1104                         StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1105
1106 #endif
1107                         outputElt += 1;
1108
1109                         // reset to the next vVertexElement to output
1110                         currentVertexElement = 0;
1111                     }
1112                 }
1113             }
1114 #else
1115             Value* pResults[4];
1116             CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1117             ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1118
1119             for (uint32_t c = 0; c < 4; ++c)
1120             {
1121                 if (isComponentEnabled(compMask, c))
1122                 {
1123                     vVertexElements[currentVertexElement++] = pResults[c];
1124                     if (currentVertexElement > 3)
1125                     {
1126                         StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1127                         // reset to the next vVertexElement to output
1128                         currentVertexElement = 0;
1129                     }
1130                 }
1131             }
1132 #endif
1133         }
1134         else if(info.type[0] == SWR_TYPE_FLOAT)
1135         {
1136             ///@todo: support 64 bit vb accesses
1137             Value *gatherSrc = VIMMED1(0.0f);
1138 #if USE_SIMD16_GATHERS
1139             Value *gatherSrc2 = VIMMED1(0.0f);
1140 #if USE_SIMD16_BUILDER
1141             Value *gatherSrc16 = VIMMED2_1(0.0f);
1142 #endif
1143 #endif
1144
1145             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1146                 "Unsupported format for standard gather fetch.");
1147
1148             // Gather components from memory to store in a simdvertex structure
1149             switch (bpc)
1150             {
1151                 case 16:
1152                 {
1153 #if USE_SIMD16_GATHERS
1154                     Value *vGatherResult[2];
1155                     Value *vGatherResult2[2];
1156
1157                     // if we have at least one component out of x or y to fetch
1158                     if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1159                     {
1160                         vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1161                         vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1162                         // e.g. result of first 8x32bit integer gather for 16bit components
1163                         // 256i - 0    1    2    3    4    5    6    7
1164                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1165                         //
1166                     }
1167                     else
1168                     {
1169                         vGatherResult[0]  = VUNDEF_I();
1170                         vGatherResult2[0] = VUNDEF_I();
1171                     }
1172
1173                     // if we have at least one component out of z or w to fetch
1174                     if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1175                     {
1176                         // offset base to the next components(zw) in the vertex to gather
1177                         pStreamBase = GEP(pStreamBase, C((char)4));
1178
1179                         vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1180                         vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1181                         // e.g. result of second 8x32bit integer gather for 16bit components
1182                         // 256i - 0    1    2    3    4    5    6    7
1183                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1184                         //
1185                     }
1186                     else
1187                     {
1188                         vGatherResult[1]  = VUNDEF_I();
1189                         vGatherResult2[1] = VUNDEF_I();
1190                     }
1191
1192                     // if we have at least one component to shuffle into place
1193                     if (compMask)
1194                     {
1195 #if USE_SIMD16_BUILDER
1196                         Value *gatherResult[2];
1197
1198                         gatherResult[0] = VUNDEF2_I();
1199                         gatherResult[1] = VUNDEF2_I();
1200
1201                         gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0],  0);
1202                         gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
1203
1204                         gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1],  0);
1205                         gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
1206
1207                         Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1208
1209                         Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
1210                             currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1211
1212                         // Shuffle gathered components into place in simdvertex struct
1213                         Shuffle16bpcGather2(args);  // outputs to vVertexElements ref
1214 #else
1215                         Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1216                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1217                         Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE,
1218                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1219
1220                         // Shuffle gathered components into place in simdvertex struct
1221                         Shuffle16bpcGather(args, false);  // outputs to vVertexElements ref
1222                         Shuffle16bpcGather(args2, true);  // outputs to vVertexElements ref
1223 #endif
1224                     }
1225 #else
1226                     Value* vGatherResult[2];
1227
1228                     // if we have at least one component out of x or y to fetch
1229                     if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1230                         vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1231                         // e.g. result of first 8x32bit integer gather for 16bit components
1232                         // 256i - 0    1    2    3    4    5    6    7
1233                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1234                         //
1235                     }
1236
1237                     // if we have at least one component out of z or w to fetch
1238                     if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1239                         // offset base to the next components(zw) in the vertex to gather
1240                         pStreamBase = GEP(pStreamBase, C((char)4));
1241
1242                         vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1243                         // e.g. result of second 8x32bit integer gather for 16bit components
1244                         // 256i - 0    1    2    3    4    5    6    7
1245                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1246                         //
1247                     }
1248
1249                     // if we have at least one component to shuffle into place
1250                     if(compMask){
1251                         Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1252                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1253
1254                         // Shuffle gathered components into place in simdvertex struct
1255 #if USE_SIMD16_SHADERS
1256                         Shuffle16bpcGather(args, useVertexID2);  // outputs to vVertexElements ref
1257 #else
1258                         Shuffle16bpcGather(args);  // outputs to vVertexElements ref
1259 #endif
1260                     }
1261 #endif
1262                 }
1263                     break;
1264                 case 32:
1265                 {
1266                     for (uint32_t i = 0; i < 4; i += 1)
1267                     {
1268 #if USE_SIMD16_GATHERS
1269                         if (isComponentEnabled(compMask, i))
1270                         {
1271                             // if we need to gather the component
1272                             if (compCtrl[i] == StoreSrc)
1273                             {
1274                                 // Gather a SIMD of vertices
1275                                 // APIs allow a 4GB range for offsets
1276                                 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1277                                 // But, we know that elements must be aligned for FETCH. :)
1278                                 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
1279                                 Value *vShiftedOffsets  = VPSRLI(vOffsets,  C(1));
1280                                 Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
1281 #if USE_SIMD16_BUILDER
1282                                 Value *indices = VUNDEF2_I();
1283                                 indices = INSERT2_I(indices, vShiftedOffsets,  0);
1284                                 indices = INSERT2_I(indices, vShiftedOffsets2, 1);
1285
1286                                 Value *mask = VSHUFFLE(vGatherMask, vGatherMask2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
1287
1288                                 pVtxSrc2[currentVertexElement] = GATHERPS2(gatherSrc16, pStreamBase, indices, mask, 2);
1289 #else
1290                                 vVertexElements[currentVertexElement]  = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
1291                                 vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2);
1292
1293 #if USE_SIMD16_BUILDER
1294                                 // pack adjacent pairs of SIMD8s into SIMD16s
1295                                 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1296                                 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements[currentVertexElement],  0);
1297                                 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements2[currentVertexElement], 1);
1298
1299 #endif
1300 #endif
1301                                 currentVertexElement += 1;
1302                             }
1303                             else
1304                             {
1305 #if USE_SIMD16_BUILDER
1306                                 pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
1307 #else
1308                                 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1309                                 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1310
1311 #if USE_SIMD16_BUILDER
1312                                 // pack adjacent pairs of SIMD8s into SIMD16s
1313                                 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1314                                 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements[currentVertexElement],  0);
1315                                 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements2[currentVertexElement], 1);
1316
1317 #endif
1318 #endif
1319                                 currentVertexElement += 1;
1320                             }
1321
1322                             if (currentVertexElement > 3)
1323                             {
1324 #if USE_SIMD16_BUILDER
1325                                 // store SIMD16s
1326                                 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1327
1328                                 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1329
1330 #else
1331                                 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1332                                 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1333
1334 #endif
1335                                 outputElt += 1;
1336
1337                                 // reset to the next vVertexElement to output
1338                                 currentVertexElement = 0;
1339                             }
1340                         }
1341
1342                         // offset base to the next component in the vertex to gather
1343                         pStreamBase = GEP(pStreamBase, C((char)4));
1344 #else
1345                         if (isComponentEnabled(compMask, i))
1346                         {
1347                             // if we need to gather the component
1348                             if (compCtrl[i] == StoreSrc)
1349                             {
1350                                 // Gather a SIMD of vertices
1351                                 // APIs allow a 4GB range for offsets
1352                                 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1353                                 // But, we know that elements must be aligned for FETCH. :)
1354                                 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
1355                                 Value* vShiftedOffsets = VPSRLI(vOffsets, C(1));
1356                                 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
1357                             }
1358                             else
1359                             {
1360 #if USE_SIMD16_SHADERS
1361                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1362 #else
1363                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1364 #endif
1365                             }
1366
1367                             if (currentVertexElement > 3)
1368                             {
1369                                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1370                                 // reset to the next vVertexElement to output
1371                                 currentVertexElement = 0;
1372                             }
1373                         }
1374
1375                         // offset base to the next component in the vertex to gather
1376                         pStreamBase = GEP(pStreamBase, C((char)4));
1377 #endif
1378                     }
1379                 }
1380                     break;
1381                 case 64:
1382                 {
1383                     for (uint32_t i = 0; i < 4; i += 1)
1384                     {
1385 #if USE_SIMD16_GATHERS
1386                         if (isComponentEnabled(compMask, i))
1387                         {
1388                             // if we need to gather the component
1389                             if (compCtrl[i] == StoreSrc)
1390                             {
1391                                 Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1392                                 Value *vMaskLo2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1393                                 Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1394                                 Value *vMaskHi2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1395
1396                                 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1397                                 Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
1398                                 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1399                                 Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
1400
1401                                 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1402
1403                                 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1404                                 Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2);
1405                                 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1406                                 Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2);
1407
1408                                 pGatherLo = VCVTPD2PS(pGatherLo);
1409                                 pGatherLo2 = VCVTPD2PS(pGatherLo2);
1410                                 pGatherHi = VCVTPD2PS(pGatherHi);
1411                                 pGatherHi2 = VCVTPD2PS(pGatherHi2);
1412
1413                                 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1414                                 Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1415
1416 #if USE_SIMD16_BUILDER
1417                                 // pack adjacent pairs of SIMD8s into SIMD16s
1418                                 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1419                                 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather,  0);
1420                                 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
1421
1422 #else
1423                                 vVertexElements[currentVertexElement]  = pGather;
1424                                 vVertexElements2[currentVertexElement] = pGather2;
1425
1426 #endif
1427                                 currentVertexElement += 1;
1428                             }
1429                             else
1430                             {
1431 #if USE_SIMD16_BUILDER
1432                                 pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
1433
1434 #else
1435                                 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1436                                 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1437
1438 #endif
1439                                 currentVertexElement += 1;
1440                             }
1441
1442                             if (currentVertexElement > 3)
1443                             {
1444 #if USE_SIMD16_BUILDER
1445                                 // store SIMD16s
1446                                 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1447
1448                                 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1449
1450 #else
1451                                 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1452                                 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1453
1454 #endif
1455                                 outputElt += 1;
1456
1457                                 // reset to the next vVertexElement to output
1458                                 currentVertexElement = 0;
1459                             }
1460                         }
1461
1462                         // offset base to the next component  in the vertex to gather
1463                         pStreamBase = GEP(pStreamBase, C((char)8));
1464 #else
1465                         if (isComponentEnabled(compMask, i))
1466                         {
1467                             // if we need to gather the component
1468                             if (compCtrl[i] == StoreSrc)
1469                             {
1470                                 Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
1471                                 Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
1472
1473                                 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1474                                 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1475
1476                                 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1477
1478                                 Value* pGatherLo = GATHERPD(vZeroDouble,
1479                                                             pStreamBase, vOffsetsLo, vMaskLo);
1480                                 Value* pGatherHi = GATHERPD(vZeroDouble,
1481                                                             pStreamBase, vOffsetsHi, vMaskHi);
1482
1483                                 pGatherLo = VCVTPD2PS(pGatherLo);
1484                                 pGatherHi = VCVTPD2PS(pGatherHi);
1485
1486                                 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
1487
1488                                 vVertexElements[currentVertexElement++] = pGather;
1489                             }
1490                             else
1491                             {
1492 #if USE_SIMD16_SHADERS
1493                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1494 #else
1495                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1496 #endif
1497                             }
1498
1499                             if (currentVertexElement > 3)
1500                             {
1501                                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1502                                 // reset to the next vVertexElement to output
1503                                 currentVertexElement = 0;
1504                             }
1505                         }
1506
1507                         // offset base to the next component  in the vertex to gather
1508                         pStreamBase = GEP(pStreamBase, C((char)8));
1509 #endif
1510                     }
1511                 }
1512                     break;
1513                 default:
1514                     SWR_INVALID("Tried to fetch invalid FP format");
1515                     break;
1516             }
1517         }
1518         else
1519         {
1520             Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
1521             ConversionType conversionType = CONVERT_NONE;
1522
1523             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1524                 "Unsupported format for standard gather fetch.");
1525
1526             switch(info.type[0])
1527             {
1528                 case SWR_TYPE_UNORM:
1529                     conversionType = CONVERT_NORMALIZED;
1530                 case SWR_TYPE_UINT:
1531                     extendCastType = Instruction::CastOps::ZExt;
1532                     break;
1533                 case SWR_TYPE_SNORM:
1534                     conversionType = CONVERT_NORMALIZED;
1535                 case SWR_TYPE_SINT:
1536                     extendCastType = Instruction::CastOps::SExt;
1537                     break;
1538                 case SWR_TYPE_USCALED:
1539                     conversionType = CONVERT_USCALED;
1540                     extendCastType = Instruction::CastOps::UIToFP;
1541                     break;
1542                 case SWR_TYPE_SSCALED:
1543                     conversionType = CONVERT_SSCALED;
1544                     extendCastType = Instruction::CastOps::SIToFP;
1545                     break;
1546                 case SWR_TYPE_SFIXED:
1547                     conversionType = CONVERT_SFIXED;
1548                     extendCastType = Instruction::CastOps::SExt;
1549                     break;
1550                 default:
1551                     break;
1552             }
1553
1554             // value substituted when component of gather is masked
1555             Value* gatherSrc = VIMMED1(0);
1556 #if USE_SIMD16_GATHERS
1557             Value* gatherSrc2 = VIMMED1(0);
1558 #endif
1559
1560             // Gather components from memory to store in a simdvertex structure
1561             switch (bpc)
1562             {
1563                 case 8:
1564                 {
1565                     // if we have at least one component to fetch
1566                     if (compMask)
1567                     {
1568 #if USE_SIMD16_GATHERS
1569                         Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1570                         Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1571
1572                         // e.g. result of an 8x32bit integer gather for 8bit components
1573                         // 256i - 0    1    2    3    4    5    6    7
1574                         //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1575
1576 #if USE_SIMD16_BUILDER
1577                         Value *gatherResult = VUNDEF2_I();
1578
1579                         gatherResult = INSERT2_I(gatherResult, vGatherResult,  0);
1580                         gatherResult = INSERT2_I(gatherResult, vGatherResult2, 1);
1581
1582                         Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1583
1584                         Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1585                             currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle);
1586
1587                         // Shuffle gathered components into place in simdvertex struct
1588                         Shuffle8bpcGatherd2(args);  // outputs to vVertexElements ref
1589 #else
1590                         Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1591                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1592                         Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1593                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2, info.swizzle);
1594
1595                         // Shuffle gathered components into place in simdvertex struct
1596                         Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
1597                         Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref
1598 #endif
1599 #else
1600                         Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1601                         // e.g. result of an 8x32bit integer gather for 8bit components
1602                         // 256i - 0    1    2    3    4    5    6    7
1603                         //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1604
1605                         Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1606                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1607
1608                         // Shuffle gathered components into place in simdvertex struct
1609 #if USE_SIMD16_SHADERS
1610                         Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
1611 #else
1612                         Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
1613 #endif
1614 #endif
1615                     }
1616                 }
1617                 break;
1618                 case 16:
1619                 {
1620 #if USE_SIMD16_GATHERS
1621                     Value* vGatherResult[2];
1622                     Value* vGatherResult2[2];
1623
1624                     // if we have at least one component out of x or y to fetch
1625                     if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1626                     {
1627                         vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1628                         vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1629                         // e.g. result of first 8x32bit integer gather for 16bit components
1630                         // 256i - 0    1    2    3    4    5    6    7
1631                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1632                         //
1633                     }
1634                     else
1635                     {
1636                         vGatherResult[0]  = VUNDEF_I();
1637                         vGatherResult2[0] = VUNDEF_I();
1638                     }
1639
1640                     // if we have at least one component out of z or w to fetch
1641                     if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1642                     {
1643                         // offset base to the next components(zw) in the vertex to gather
1644                         pStreamBase = GEP(pStreamBase, C((char)4));
1645
1646                         vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1647                         vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1648                         // e.g. result of second 8x32bit integer gather for 16bit components
1649                         // 256i - 0    1    2    3    4    5    6    7
1650                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1651                         //
1652                     }
1653                     else
1654                     {
1655                         vGatherResult[1]  = VUNDEF_I();
1656                         vGatherResult2[1] = VUNDEF_I();
1657                     }
1658
1659                     // if we have at least one component to shuffle into place
1660                     if (compMask)
1661                     {
1662 #if USE_SIMD16_BUILDER
1663                         Value *gatherResult[2];
1664
1665                         gatherResult[0] = VUNDEF2_I();
1666                         gatherResult[1] = VUNDEF2_I();
1667
1668                         gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0],  0);
1669                         gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
1670
1671                         gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1],  0);
1672                         gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
1673
1674                         Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1675
1676                         Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1677                             currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1678
1679                         // Shuffle gathered components into place in simdvertex struct
1680                         Shuffle16bpcGather2(args);  // outputs to vVertexElements ref
1681 #else
1682                         Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1683                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1684                         Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1685                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1686
1687                         // Shuffle gathered components into place in simdvertex struct
1688                         Shuffle16bpcGather(args, false);  // outputs to vVertexElements ref
1689                         Shuffle16bpcGather(args2, true);  // outputs to vVertexElements ref
1690 #endif
1691                     }
1692 #else
1693                     Value* vGatherResult[2];
1694
1695                     // if we have at least one component out of x or y to fetch
1696                     if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1697                         vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1698                         // e.g. result of first 8x32bit integer gather for 16bit components
1699                         // 256i - 0    1    2    3    4    5    6    7
1700                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1701                         //
1702                     }
1703
1704                     // if we have at least one component out of z or w to fetch
1705                     if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1706                         // offset base to the next components(zw) in the vertex to gather
1707                         pStreamBase = GEP(pStreamBase, C((char)4));
1708
1709                         vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1710                         // e.g. result of second 8x32bit integer gather for 16bit components
1711                         // 256i - 0    1    2    3    4    5    6    7
1712                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1713                         //
1714                     }
1715
1716                     // if we have at least one component to shuffle into place
1717                     if(compMask){
1718                         Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1719                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1720
1721                         // Shuffle gathered components into place in simdvertex struct
1722 #if USE_SIMD16_SHADERS
1723                         Shuffle16bpcGather(args, useVertexID2);  // outputs to vVertexElements ref
1724 #else
1725                         Shuffle16bpcGather(args);  // outputs to vVertexElements ref
1726 #endif
1727                     }
1728 #endif
1729                 }
1730                 break;
1731                 case 32:
1732                 {
1733                     // Gathered components into place in simdvertex struct
1734                     for (uint32_t i = 0; i < 4; i++)
1735                     {
1736                         if (isComponentEnabled(compMask, i))
1737                         {
1738                             // if we need to gather the component
1739                             if (compCtrl[i] == StoreSrc)
1740                             {
1741 #if USE_SIMD16_GATHERS
1742                                 Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1743                                 Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1744
1745                                 if (conversionType == CONVERT_USCALED)
1746                                 {
1747                                     pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1748                                     pGather2 = UI_TO_FP(pGather2, mSimdFP32Ty);
1749                                 }
1750                                 else if (conversionType == CONVERT_SSCALED)
1751                                 {
1752                                     pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1753                                     pGather2 = SI_TO_FP(pGather2, mSimdFP32Ty);
1754                                 }
1755                                 else if (conversionType == CONVERT_SFIXED)
1756                                 {
1757                                     pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1758                                     pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1759                                 }
1760
1761 #if USE_SIMD16_BUILDER
1762                                 // pack adjacent pairs of SIMD8s into SIMD16s
1763                                 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1764                                 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather,  0);
1765                                 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
1766
1767 #else
1768                                 vVertexElements[currentVertexElement] = pGather;
1769                                 vVertexElements2[currentVertexElement] = pGather2;
1770
1771 #endif
1772
1773                                 // e.g. result of a single 8x32bit integer gather for 32bit components
1774                                 // 256i - 0    1    2    3    4    5    6    7
1775                                 //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1776
1777                                 currentVertexElement += 1;
1778 #else
1779                                 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1780
1781                                 if (conversionType == CONVERT_USCALED)
1782                                 {
1783                                     pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1784                                 }
1785                                 else if (conversionType == CONVERT_SSCALED)
1786                                 {
1787                                     pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1788                                 }
1789                                 else if (conversionType == CONVERT_SFIXED)
1790                                 {
1791                                     pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1792                                 }
1793
1794                                 vVertexElements[currentVertexElement++] = pGather;
1795                                 // e.g. result of a single 8x32bit integer gather for 32bit components
1796                                 // 256i - 0    1    2    3    4    5    6    7
1797                                 //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1798 #endif
1799                             }
1800                             else
1801                             {
1802 #if USE_SIMD16_SHADERS
1803 #if USE_SIMD16_GATHERS
1804 #if USE_SIMD16_BUILDER
1805                                 pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
1806
1807 #else
1808                                 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1809                                 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1810
1811 #endif
1812                                 currentVertexElement += 1;
1813 #else
1814                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1815 #endif
1816 #else
1817                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1818 #endif
1819                             }
1820
1821                             if (currentVertexElement > 3)
1822                             {
1823 #if USE_SIMD16_GATHERS
1824 #if USE_SIMD16_BUILDER
1825                                 // store SIMD16s
1826                                 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1827
1828                                 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1829
1830 #else
1831                                 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1832                                 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1833
1834 #endif
1835                                 outputElt += 1;
1836 #else
1837                                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1838 #endif
1839
1840                                 // reset to the next vVertexElement to output
1841                                 currentVertexElement = 0;
1842                             }
1843
1844                         }
1845
1846                         // offset base to the next component  in the vertex to gather
1847                         pStreamBase = GEP(pStreamBase, C((char)4));
1848                     }
1849                 }
1850                 break;
1851             }
1852         }
1853     }
1854
1855     // if we have a partially filled vVertexElement struct, output it
1856     if (currentVertexElement > 0)
1857     {
1858 #if USE_SIMD16_GATHERS
1859 #if USE_SIMD16_BUILDER
1860         // store SIMD16s
1861         Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1862
1863         StoreVertexElements2(pVtxOut2, outputElt, currentVertexElement, pVtxSrc2);
1864
1865 #else
1866         StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements);
1867         StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2);
1868
1869 #endif
1870         outputElt += 1;
1871 #else
1872         StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1873 #endif
1874     }
1875 }
1876
1877 //////////////////////////////////////////////////////////////////////////
1878 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1879 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1880 /// support
1881 /// @param pIndices - pointer to 8 bit indices
1882 /// @param pLastIndex - pointer to last valid index
1883 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1884 {
1885     // can fit 2 16 bit integers per vWidth lane
1886     Value* vIndices =  VUNDEF_I();
1887
1888     // store 0 index on stack to be used to conditionally load from if index address is OOB
1889     Value* pZeroIndex = ALLOCA(mInt8Ty);
1890     STORE(C((uint8_t)0), pZeroIndex);
1891
1892     // Load a SIMD of index pointers
1893     for(int64_t lane = 0; lane < mVWidth; lane++)
1894     {
1895         // Calculate the address of the requested index
1896         Value *pIndex = GEP(pIndices, C(lane));
1897
1898         // check if the address is less than the max index,
1899         Value* mask = ICMP_ULT(pIndex, pLastIndex);
1900
1901         // if valid, load the index. if not, load 0 from the stack
1902         Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1903         Value *index = LOAD(pValid, "valid index");
1904
1905         // zero extended index to 32 bits and insert into the correct simd lane
1906         index = Z_EXT(index, mInt32Ty);
1907         vIndices = VINSERT(vIndices, index, lane);
1908     }
1909     return vIndices;
1910 }
1911
1912 //////////////////////////////////////////////////////////////////////////
1913 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1914 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1915 /// support
1916 /// @param pIndices - pointer to 16 bit indices
1917 /// @param pLastIndex - pointer to last valid index
1918 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1919 {
1920     // can fit 2 16 bit integers per vWidth lane
1921     Value* vIndices =  VUNDEF_I();
1922
1923     // store 0 index on stack to be used to conditionally load from if index address is OOB
1924     Value* pZeroIndex = ALLOCA(mInt16Ty);
1925     STORE(C((uint16_t)0), pZeroIndex);
1926
1927     // Load a SIMD of index pointers
1928     for(int64_t lane = 0; lane < mVWidth; lane++)
1929     {
1930         // Calculate the address of the requested index
1931         Value *pIndex = GEP(pIndices, C(lane));
1932
1933         // check if the address is less than the max index,
1934         Value* mask = ICMP_ULT(pIndex, pLastIndex);
1935
1936         // if valid, load the index. if not, load 0 from the stack
1937         Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1938         Value *index = LOAD(pValid, "valid index");
1939
1940         // zero extended index to 32 bits and insert into the correct simd lane
1941         index = Z_EXT(index, mInt32Ty);
1942         vIndices = VINSERT(vIndices, index, lane);
1943     }
1944     return vIndices;
1945 }
1946
1947 //////////////////////////////////////////////////////////////////////////
1948 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1949 /// @param pIndices - pointer to 32 bit indices
1950 /// @param pLastIndex - pointer to last valid index
1951 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1952 {
1953     DataLayout dL(JM()->mpCurrentModule);
1954     unsigned int ptrSize = dL.getPointerSize() * 8;  // ptr size in bits
1955     Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1956     Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1957
1958     // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1959     Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1960     numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1961     numIndicesLeft = SDIV(numIndicesLeft, C(4));
1962
1963     // create a vector of index counts from the base index ptr passed into the fetch
1964     const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1965     Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1966
1967     // compare index count to the max valid index
1968     // e.g vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
1969     //     vIndexOffsets  0 1 2 3 4 5 6 7
1970     //     ------------------------------
1971     //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
1972     //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1973     Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1974     Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1975
1976     // VMASKLOAD takes an *i8 src pointer
1977     pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1978
1979     // Load the indices; OOB loads 0
1980     return MASKLOADD(pIndices,vIndexMask);
1981 }
1982
1983 //////////////////////////////////////////////////////////////////////////
1984 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1985 /// denormalizes if needed, converts to F32 if needed, and positions in
1986 //  the proper SIMD rows to be output to the simdvertex structure
1987 /// @param args: (tuple of args, listed below)
1988 ///   @param vGatherResult - 8 gathered 8bpc vertices
1989 ///   @param pVtxOut - base pointer to output simdvertex struct
1990 ///   @param extendType - sign extend or zero extend
1991 ///   @param bNormalized - do we need to denormalize?
1992 ///   @param currentVertexElement - reference to the current vVertexElement
1993 ///   @param outputElt - reference to the current offset from simdvertex we're o
1994 ///   @param compMask - component packing mask
1995 ///   @param compCtrl - component control val
1996 ///   @param vVertexElements[4] - vertex components to output
1997 ///   @param swizzle[4] - component swizzle location
1998 #if USE_SIMD16_SHADERS
1999 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
2000 #else
2001 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
2002 #endif
2003 {
2004     // Unpack tuple args
2005     Value*& vGatherResult = std::get<0>(args);
2006     Value* pVtxOut = std::get<1>(args);
2007     const Instruction::CastOps extendType = std::get<2>(args);
2008     const ConversionType conversionType = std::get<3>(args);
2009     uint32_t &currentVertexElement = std::get<4>(args);
2010     uint32_t &outputElt =  std::get<5>(args);
2011     const ComponentEnable compMask = std::get<6>(args);
2012     const ComponentControl (&compCtrl)[4] = std::get<7>(args);
2013     Value* (&vVertexElements)[4] = std::get<8>(args);
2014     const uint32_t (&swizzle)[4] = std::get<9>(args);
2015
2016     // cast types
2017     Type* vGatherTy = mSimdInt32Ty;
2018     Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
2019
2020     // have to do extra work for sign extending
2021     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
2022         Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
2023         Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2024
2025         // shuffle mask, including any swizzling
2026         const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
2027         const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
2028         Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
2029                     char(y), char(y+4), char(y+8), char(y+12),
2030                     char(z), char(z+4), char(z+8), char(z+12),
2031                     char(w), char(w+4), char(w+8), char(w+12),
2032                     char(x), char(x+4), char(x+8), char(x+12),
2033                     char(y), char(y+4), char(y+8), char(y+12),
2034                     char(z), char(z+4), char(z+8), char(z+12),
2035                     char(w), char(w+4), char(w+8), char(w+12)});
2036
2037         Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
2038         // after pshufb: group components together in each 128bit lane
2039         // 256i - 0    1    2    3    4    5    6    7
2040         //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
2041
2042         Value* vi128XY = nullptr;
2043         if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
2044             vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
2045             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
2046             // 256i - 0    1    2    3    4    5    6    7
2047             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
2048         }
2049
2050         // do the same for zw components
2051         Value* vi128ZW = nullptr;
2052         if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
2053             vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
2054         }
2055
2056         // init denormalize variables if needed
2057         Instruction::CastOps fpCast;
2058         Value* conversionFactor;
2059
2060         switch (conversionType)
2061         {
2062         case CONVERT_NORMALIZED:
2063             fpCast = Instruction::CastOps::SIToFP;
2064             conversionFactor = VIMMED1((float)(1.0 / 127.0));
2065             break;
2066         case CONVERT_SSCALED:
2067             fpCast = Instruction::CastOps::SIToFP;
2068             conversionFactor = VIMMED1((float)(1.0));
2069             break;
2070         case CONVERT_USCALED:
2071             SWR_INVALID("Type should not be sign extended!");
2072             conversionFactor = nullptr;
2073             break;
2074         default:
2075             SWR_ASSERT(conversionType == CONVERT_NONE);
2076             conversionFactor = nullptr;
2077             break;
2078         }
2079
2080         // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
2081         for (uint32_t i = 0; i < 4; i++)
2082         {
2083             if (isComponentEnabled(compMask, i))
2084             {
2085                 if (compCtrl[i] == ComponentControl::StoreSrc)
2086                 {
2087                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2088                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2089                     // if x or y, use vi128XY permute result, else use vi128ZW
2090                     Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2091
2092                     // sign extend
2093                     vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
2094
2095                     // denormalize if needed
2096                     if (conversionType != CONVERT_NONE)
2097                     {
2098                         vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2099                     }
2100                     currentVertexElement++;
2101                 }
2102                 else
2103                 {
2104 #if USE_SIMD16_SHADERS
2105                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2106 #else
2107                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2108 #endif
2109                 }
2110
2111                 if (currentVertexElement > 3)
2112                 {
2113                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2114                     // reset to the next vVertexElement to output
2115                     currentVertexElement = 0;
2116                 }
2117             }
2118         }
2119     }
2120     // else zero extend
2121     else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2122     {
2123         // init denormalize variables if needed
2124         Instruction::CastOps fpCast;
2125         Value* conversionFactor;
2126
2127         switch (conversionType)
2128         {
2129         case CONVERT_NORMALIZED:
2130             fpCast = Instruction::CastOps::UIToFP;
2131             conversionFactor = VIMMED1((float)(1.0 / 255.0));
2132             break;
2133         case CONVERT_USCALED:
2134             fpCast = Instruction::CastOps::UIToFP;
2135             conversionFactor = VIMMED1((float)(1.0));
2136             break;
2137         case CONVERT_SSCALED:
2138             SWR_INVALID("Type should not be zero extended!");
2139             conversionFactor = nullptr;
2140             break;
2141         default:
2142             SWR_ASSERT(conversionType == CONVERT_NONE);
2143             conversionFactor = nullptr;
2144             break;
2145         }
2146
2147         // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
2148         for (uint32_t i = 0; i < 4; i++)
2149         {
2150             if (isComponentEnabled(compMask, i))
2151             {
2152                 if (compCtrl[i] == ComponentControl::StoreSrc)
2153                 {
2154                     // pshufb masks for each component
2155                     Value* vConstMask;
2156                     switch (swizzle[i])
2157                     {
2158                     case 0:
2159                         // x shuffle mask
2160                         vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2161                                                0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2162                         break;
2163                     case 1:
2164                         // y shuffle mask
2165                         vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2166                                                1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2167                         break;
2168                     case 2:
2169                         // z shuffle mask
2170                         vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2171                                                2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2172                         break;
2173                     case 3:
2174                         // w shuffle mask
2175                         vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2176                                                3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2177                         break;
2178                     default:
2179                         vConstMask = nullptr;
2180                         break;
2181                     }
2182
2183                     vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
2184                     // after pshufb for x channel
2185                     // 256i - 0    1    2    3    4    5    6    7
2186                     //        x000 x000 x000 x000 x000 x000 x000 x000
2187
2188                     // denormalize if needed
2189                     if (conversionType != CONVERT_NONE)
2190                     {
2191                         vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2192                     }
2193                     currentVertexElement++;
2194                 }
2195                 else
2196                 {
2197 #if USE_SIMD16_SHADERS
2198                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2199 #else
2200                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2201 #endif
2202                 }
2203
2204                 if (currentVertexElement > 3)
2205                 {
2206                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2207                     // reset to the next vVertexElement to output
2208                     currentVertexElement = 0;
2209                 }
2210             }
2211         }
2212     }
2213     else
2214     {
2215         SWR_INVALID("Unsupported conversion type");
2216     }
2217 }
2218
2219 #if USE_SIMD16_BUILDER
2220 void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args)
2221 {
2222     // Unpack tuple args
2223     Value*& vGatherResult = std::get<0>(args);
2224     Value* pVtxOut = std::get<1>(args);
2225     const Instruction::CastOps extendType = std::get<2>(args);
2226     const ConversionType conversionType = std::get<3>(args);
2227     uint32_t &currentVertexElement = std::get<4>(args);
2228     uint32_t &outputElt = std::get<5>(args);
2229     const ComponentEnable compMask = std::get<6>(args);
2230     const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2231     Value* (&vVertexElements)[4] = std::get<8>(args);
2232     const uint32_t(&swizzle)[4] = std::get<9>(args);
2233
2234     // cast types
2235     Type *vGatherTy = mSimdInt32Ty;
2236     Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2237
2238     // have to do extra work for sign extending
2239     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
2240     {
2241         Type *v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
2242         Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2243
2244         // shuffle mask, including any swizzling
2245         const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
2246         const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
2247         Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
2248             char(y), char(y + 4), char(y + 8), char(y + 12),
2249             char(z), char(z + 4), char(z + 8), char(z + 12),
2250             char(w), char(w + 4), char(w + 8), char(w + 12),
2251             char(x), char(x + 4), char(x + 8), char(x + 12),
2252             char(y), char(y + 4), char(y + 8), char(y + 12),
2253             char(z), char(z + 4), char(z + 8), char(z + 12),
2254             char(w), char(w + 4), char(w + 8), char(w + 12) });
2255
2256         // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
2257
2258         Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
2259         Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);
2260
2261         Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2262         Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2263
2264         // after pshufb: group components together in each 128bit lane
2265         // 256i - 0    1    2    3    4    5    6    7
2266         //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
2267
2268         Value *vi128XY_lo = nullptr;
2269         Value *vi128XY_hi = nullptr;
2270         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
2271         {
2272             vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
2273             vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
2274
2275             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
2276             // 256i - 0    1    2    3    4    5    6    7
2277             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
2278         }
2279
2280         // do the same for zw components
2281         Value *vi128ZW_lo = nullptr;
2282         Value *vi128ZW_hi = nullptr;
2283         if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
2284         {
2285             vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
2286             vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
2287         }
2288
2289         // init denormalize variables if needed
2290         Instruction::CastOps fpCast;
2291         Value *conversionFactor;
2292
2293         switch (conversionType)
2294         {
2295         case CONVERT_NORMALIZED:
2296             fpCast = Instruction::CastOps::SIToFP;
2297             conversionFactor = VIMMED1((float)(1.0 / 127.0));
2298             break;
2299         case CONVERT_SSCALED:
2300             fpCast = Instruction::CastOps::SIToFP;
2301             conversionFactor = VIMMED1((float)(1.0));
2302             break;
2303         case CONVERT_USCALED:
2304             SWR_INVALID("Type should not be sign extended!");
2305             conversionFactor = nullptr;
2306             break;
2307         default:
2308             SWR_ASSERT(conversionType == CONVERT_NONE);
2309             conversionFactor = nullptr;
2310             break;
2311         }
2312
2313         // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
2314         for (uint32_t i = 0; i < 4; i++)
2315         {
2316             if (isComponentEnabled(compMask, i))
2317             {
2318                 if (compCtrl[i] == ComponentControl::StoreSrc)
2319                 {
2320                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2321                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2322                     // if x or y, use vi128XY permute result, else use vi128ZW
2323                     Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2324                     Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2325
2326                     // sign extend
2327                     Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
2328                     Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
2329
2330                     // denormalize if needed
2331                     if (conversionType != CONVERT_NONE)
2332                     {
2333                         temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2334                         temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2335                     }
2336
2337                     vVertexElements[currentVertexElement] = VUNDEF2_F();
2338                     vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2339                     vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2340
2341                     currentVertexElement += 1;
2342                 }
2343                 else
2344                 {
2345                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2346                 }
2347
2348                 if (currentVertexElement > 3)
2349                 {
2350                     StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2351                     // reset to the next vVertexElement to output
2352                     currentVertexElement = 0;
2353                 }
2354             }
2355         }
2356     }
2357     // else zero extend
2358     else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2359     {
2360         // init denormalize variables if needed
2361         Instruction::CastOps fpCast;
2362         Value *conversionFactor;
2363
2364         switch (conversionType)
2365         {
2366         case CONVERT_NORMALIZED:
2367             fpCast = Instruction::CastOps::UIToFP;
2368             conversionFactor = VIMMED1((float)(1.0 / 255.0));
2369             break;
2370         case CONVERT_USCALED:
2371             fpCast = Instruction::CastOps::UIToFP;
2372             conversionFactor = VIMMED1((float)(1.0));
2373             break;
2374         case CONVERT_SSCALED:
2375             SWR_INVALID("Type should not be zero extended!");
2376             conversionFactor = nullptr;
2377             break;
2378         default:
2379             SWR_ASSERT(conversionType == CONVERT_NONE);
2380             conversionFactor = nullptr;
2381             break;
2382         }
2383
2384         // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
2385         for (uint32_t i = 0; i < 4; i++)
2386         {
2387             if (isComponentEnabled(compMask, i))
2388             {
2389                 if (compCtrl[i] == ComponentControl::StoreSrc)
2390                 {
2391                     // pshufb masks for each component
2392                     Value *vConstMask;
2393                     switch (swizzle[i])
2394                     {
2395                     case 0:
2396                         // x shuffle mask
2397                         vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2398                             0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2399                         break;
2400                     case 1:
2401                         // y shuffle mask
2402                         vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2403                             1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2404                         break;
2405                     case 2:
2406                         // z shuffle mask
2407                         vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2408                             2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2409                         break;
2410                     case 3:
2411                         // w shuffle mask
2412                         vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2413                             3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2414                         break;
2415                     default:
2416                         vConstMask = nullptr;
2417                         break;
2418                     }
2419
2420                     Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
2421                     Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);
2422
2423                     Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2424                     Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2425
2426                     // after pshufb for x channel
2427                     // 256i - 0    1    2    3    4    5    6    7
2428                     //        x000 x000 x000 x000 x000 x000 x000 x000
2429
2430                     // denormalize if needed
2431                     if (conversionType != CONVERT_NONE)
2432                     {
2433                         temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2434                         temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2435                     }
2436
2437                     vVertexElements[currentVertexElement] = VUNDEF2_F();
2438                     vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2439                     vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2440
2441                     currentVertexElement += 1;
2442                 }
2443                 else
2444                 {
2445                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2446                 }
2447
2448                 if (currentVertexElement > 3)
2449                 {
2450                     StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2451                     // reset to the next vVertexElement to output
2452                     currentVertexElement = 0;
2453                 }
2454             }
2455         }
2456     }
2457     else
2458     {
2459         SWR_INVALID("Unsupported conversion type");
2460     }
2461 }
2462
2463 #endif
2464 //////////////////////////////////////////////////////////////////////////
2465 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
2466 /// denormalizes if needed, converts to F32 if needed, and positions in
2467 //  the proper SIMD rows to be output to the simdvertex structure
2468 /// @param args: (tuple of args, listed below)
2469 ///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
2470 ///   @param pVtxOut - base pointer to output simdvertex struct
2471 ///   @param extendType - sign extend or zero extend
2472 ///   @param bNormalized - do we need to denormalize?
2473 ///   @param currentVertexElement - reference to the current vVertexElement
2474 ///   @param outputElt - reference to the current offset from simdvertex we're o
2475 ///   @param compMask - component packing mask
2476 ///   @param compCtrl - component control val
2477 ///   @param vVertexElements[4] - vertex components to output
2478 #if USE_SIMD16_SHADERS
2479 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
2480 #else
2481 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
2482 #endif
2483 {
2484     // Unpack tuple args
2485     Value* (&vGatherResult)[2] = std::get<0>(args);
2486     Value* pVtxOut = std::get<1>(args);
2487     const Instruction::CastOps extendType = std::get<2>(args);
2488     const ConversionType conversionType = std::get<3>(args);
2489     uint32_t &currentVertexElement = std::get<4>(args);
2490     uint32_t &outputElt = std::get<5>(args);
2491     const ComponentEnable compMask = std::get<6>(args);
2492     const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2493     Value* (&vVertexElements)[4] = std::get<8>(args);
2494
2495     // cast types
2496     Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2497     Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2498
2499     // have to do extra work for sign extending
2500     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
2501         (extendType == Instruction::CastOps::FPExt))
2502     {
2503         // is this PP float?
2504         bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2505
2506         Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2507         Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2508
2509         // shuffle mask
2510         Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2511                                      0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
2512         Value* vi128XY = nullptr;
2513         if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
2514             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
2515             // after pshufb: group components together in each 128bit lane
2516             // 256i - 0    1    2    3    4    5    6    7
2517             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2518
2519             vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2520             // after PERMD: move and pack xy components into each 128bit lane
2521             // 256i - 0    1    2    3    4    5    6    7
2522             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2523         }
2524
2525         // do the same for zw components
2526         Value* vi128ZW = nullptr;
2527         if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
2528             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
2529             vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2530         }
2531
2532         // init denormalize variables if needed
2533         Instruction::CastOps IntToFpCast;
2534         Value* conversionFactor;
2535
2536         switch (conversionType)
2537         {
2538         case CONVERT_NORMALIZED:
2539             IntToFpCast = Instruction::CastOps::SIToFP;
2540             conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2541             break;
2542         case CONVERT_SSCALED:
2543             IntToFpCast = Instruction::CastOps::SIToFP;
2544             conversionFactor = VIMMED1((float)(1.0));
2545             break;
2546         case CONVERT_USCALED:
2547             SWR_INVALID("Type should not be sign extended!");
2548             conversionFactor = nullptr;
2549             break;
2550         default:
2551             SWR_ASSERT(conversionType == CONVERT_NONE);
2552             conversionFactor = nullptr;
2553             break;
2554         }
2555
2556         // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
2557         for (uint32_t i = 0; i < 4; i++)
2558         {
2559             if (isComponentEnabled(compMask, i))
2560             {
2561                 if (compCtrl[i] == ComponentControl::StoreSrc)
2562                 {
2563                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2564                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2565                     // if x or y, use vi128XY permute result, else use vi128ZW
2566                     Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2567
2568                     if (bFP) {
2569                         // extract 128 bit lanes to sign extend each component
2570                         vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2571                     }
2572                     else {
2573                         // extract 128 bit lanes to sign extend each component
2574                         vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2575
2576                         // denormalize if needed
2577                         if (conversionType != CONVERT_NONE) {
2578                             vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2579                         }
2580                     }
2581                     currentVertexElement++;
2582                 }
2583                 else
2584                 {
2585 #if USE_SIMD16_SHADERS
2586                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2587 #else
2588                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2589 #endif
2590                 }
2591
2592                 if (currentVertexElement > 3)
2593                 {
2594                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2595                     // reset to the next vVertexElement to output
2596                     currentVertexElement = 0;
2597                 }
2598             }
2599         }
2600     }
2601     // else zero extend
2602     else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2603     {
2604         // pshufb masks for each component
2605         Value* vConstMask[2];
2606         if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
2607             // x/z shuffle mask
2608             vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2609                                      0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2610         }
2611
2612         if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
2613             // y/w shuffle mask
2614             vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2615                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
2616         }
2617
2618         // init denormalize variables if needed
2619         Instruction::CastOps fpCast;
2620         Value* conversionFactor;
2621
2622         switch (conversionType)
2623         {
2624         case CONVERT_NORMALIZED:
2625             fpCast = Instruction::CastOps::UIToFP;
2626             conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2627             break;
2628         case CONVERT_USCALED:
2629             fpCast = Instruction::CastOps::UIToFP;
2630             conversionFactor = VIMMED1((float)(1.0f));
2631             break;
2632         case CONVERT_SSCALED:
2633             SWR_INVALID("Type should not be zero extended!");
2634             conversionFactor = nullptr;
2635             break;
2636         default:
2637             SWR_ASSERT(conversionType == CONVERT_NONE);
2638             conversionFactor = nullptr;
2639             break;
2640         }
2641
2642         // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2643         for (uint32_t i = 0; i < 4; i++)
2644         {
2645             if (isComponentEnabled(compMask, i))
2646             {
2647                 if (compCtrl[i] == ComponentControl::StoreSrc)
2648                 {
2649                     // select correct constMask for x/z or y/w pshufb
2650                     uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2651                     // if x or y, use vi128XY permute result, else use vi128ZW
2652                     uint32_t selectedGather = (i < 2) ? 0 : 1;
2653
2654                     vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2655                     // after pshufb mask for x channel; z uses the same shuffle from the second gather
2656                     // 256i - 0    1    2    3    4    5    6    7
2657                     //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2658
2659                     // denormalize if needed
2660                     if (conversionType != CONVERT_NONE)
2661                     {
2662                         vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2663                     }
2664                     currentVertexElement++;
2665                 }
2666                 else
2667                 {
2668 #if USE_SIMD16_SHADERS
2669                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2670 #else
2671                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2672 #endif
2673                 }
2674
2675                 if (currentVertexElement > 3)
2676                 {
2677                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2678                     // reset to the next vVertexElement to output
2679                     currentVertexElement = 0;
2680                 }
2681             }
2682         }
2683     }
2684     else
2685     {
2686         SWR_INVALID("Unsupported conversion type");
2687     }
2688 }
2689
2690 #if USE_SIMD16_BUILDER
2691 void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args)
2692 {
2693     // Unpack tuple args
2694     Value* (&vGatherResult)[2] = std::get<0>(args);
2695     Value* pVtxOut = std::get<1>(args);
2696     const Instruction::CastOps extendType = std::get<2>(args);
2697     const ConversionType conversionType = std::get<3>(args);
2698     uint32_t &currentVertexElement = std::get<4>(args);
2699     uint32_t &outputElt = std::get<5>(args);
2700     const ComponentEnable compMask = std::get<6>(args);
2701     const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2702     Value* (&vVertexElements)[4] = std::get<8>(args);
2703
2704     // cast types
2705     Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2706     Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2707
2708     // have to do extra work for sign extending
2709     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
2710     {
2711         // is this PP float?
2712         bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2713
2714         Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2715         Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2716
2717         // shuffle mask
2718         Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2719                                       0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
2720         Value *vi128XY = nullptr;
2721         Value *vi128XY_lo = nullptr;
2722         Value *vi128XY_hi = nullptr;
2723         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
2724         {
2725             // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
2726
2727             Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[0], 0);
2728             Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[0], 1);
2729
2730             Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2731             Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2732
2733             // after pshufb: group components together in each 128bit lane
2734             // 256i - 0    1    2    3    4    5    6    7
2735             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2736
2737             vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2738             vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2739
2740             // after PERMD: move and pack xy components into each 128bit lane
2741             // 256i - 0    1    2    3    4    5    6    7
2742             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2743 #if 0
2744
2745             vi128XY = VUNDEF2_I();
2746             vi128XY = INSERT2_I(vi128XY, vi128XY_lo, 0);
2747             vi128XY = INSERT2_I(vi128XY, vi128XY_hi, 1);
2748 #endif
2749         }
2750
2751         // do the same for zw components
2752         Value *vi128ZW = nullptr;
2753         Value *vi128ZW_lo = nullptr;
2754         Value *vi128ZW_hi = nullptr;
2755         if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
2756         {
2757             Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[1], 0);
2758             Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[1], 1);
2759
2760             Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2761             Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2762
2763             vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2764             vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2765 #if 0
2766
2767             vi128ZW = VUNDEF2_I();
2768             vi128ZW = INSERT2_I(vi128ZW, vi128ZW_lo, 0);
2769             vi128ZW = INSERT2_I(vi128ZW, vi128ZW_hi, 1);
2770 #endif
2771         }
2772
2773         // init denormalize variables if needed
2774         Instruction::CastOps IntToFpCast;
2775         Value *conversionFactor;
2776
2777         switch (conversionType)
2778         {
2779         case CONVERT_NORMALIZED:
2780             IntToFpCast = Instruction::CastOps::SIToFP;
2781             conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2782             break;
2783         case CONVERT_SSCALED:
2784             IntToFpCast = Instruction::CastOps::SIToFP;
2785             conversionFactor = VIMMED1((float)(1.0));
2786             break;
2787         case CONVERT_USCALED:
2788             SWR_INVALID("Type should not be sign extended!");
2789             conversionFactor = nullptr;
2790             break;
2791         default:
2792             SWR_ASSERT(conversionType == CONVERT_NONE);
2793             conversionFactor = nullptr;
2794             break;
2795         }
2796
2797         // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
2798         for (uint32_t i = 0; i < 4; i++)
2799         {
2800             if (isComponentEnabled(compMask, i))
2801             {
2802                 if (compCtrl[i] == ComponentControl::StoreSrc)
2803                 {
2804                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2805                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2806                     // if x or y, use vi128XY permute result, else use vi128ZW
2807                     Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2808                     Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2809
2810                     if (bFP)
2811                     {
2812                         // extract 128 bit lanes to sign extend each component
2813                         Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2814                         Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2815
2816                         vVertexElements[currentVertexElement] = VUNDEF2_F();
2817                         vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2818                         vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2819                     }
2820                     else
2821                     {
2822                         // extract 128 bit lanes to sign extend each component
2823                         Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2824                         Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2825
2826                         // denormalize if needed
2827                         if (conversionType != CONVERT_NONE)
2828                         {
2829                             temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2830                             temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2831                         }
2832
2833                         vVertexElements[currentVertexElement] = VUNDEF2_F();
2834                         vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2835                         vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2836                     }
2837
2838                     currentVertexElement += 1;
2839                 }
2840                 else
2841                 {
2842                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2843                 }
2844
2845                 if (currentVertexElement > 3)
2846                 {
2847                     StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2848                     // reset to the next vVertexElement to output
2849                     currentVertexElement = 0;
2850                 }
2851             }
2852         }
2853     }
2854     // else zero extend
2855     else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2856     {
2857         // pshufb masks for each component
2858         Value *vConstMask[2];
2859
2860         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
2861         {
2862             // x/z shuffle mask
2863             vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2864                 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2865         }
2866
2867         if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
2868         {
2869             // y/w shuffle mask
2870             vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2871                 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
2872         }
2873
2874         // init denormalize variables if needed
2875         Instruction::CastOps fpCast;
2876         Value* conversionFactor;
2877
2878         switch (conversionType)
2879         {
2880         case CONVERT_NORMALIZED:
2881             fpCast = Instruction::CastOps::UIToFP;
2882             conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2883             break;
2884         case CONVERT_USCALED:
2885             fpCast = Instruction::CastOps::UIToFP;
2886             conversionFactor = VIMMED1((float)(1.0f));
2887             break;
2888         case CONVERT_SSCALED:
2889             SWR_INVALID("Type should not be zero extended!");
2890             conversionFactor = nullptr;
2891             break;
2892         default:
2893             SWR_ASSERT(conversionType == CONVERT_NONE);
2894             conversionFactor = nullptr;
2895             break;
2896         }
2897
2898         // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2899         for (uint32_t i = 0; i < 4; i++)
2900         {
2901             if (isComponentEnabled(compMask, i))
2902             {
2903                 if (compCtrl[i] == ComponentControl::StoreSrc)
2904                 {
2905                     // select correct constMask for x/z or y/w pshufb
2906                     uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2907                     // if x or y, use vi128XY permute result, else use vi128ZW
2908                     uint32_t selectedGather = (i < 2) ? 0 : 1;
2909
2910                     // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
2911
2912                     Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[selectedGather], 0);
2913                     Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[selectedGather], 1);
2914
2915                     Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2916                     Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2917
2918                     // after pshufb mask for x channel; z uses the same shuffle from the second gather
2919                     // 256i - 0    1    2    3    4    5    6    7
2920                     //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2921
2922                     // denormalize if needed
2923                     if (conversionType != CONVERT_NONE)
2924                     {
2925                         temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2926                         temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2927                     }
2928
2929                     vVertexElements[currentVertexElement] = VUNDEF2_F();
2930                     vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2931                     vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2932
2933                     currentVertexElement += 1;
2934                 }
2935                 else
2936                 {
2937                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2938                 }
2939
2940                 if (currentVertexElement > 3)
2941                 {
2942                     StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2943                     // reset to the next vVertexElement to output
2944                     currentVertexElement = 0;
2945                 }
2946             }
2947         }
2948     }
2949     else
2950     {
2951         SWR_INVALID("Unsupported conversion type");
2952     }
2953 }
2954
2955 #endif
2956 //////////////////////////////////////////////////////////////////////////
2957 /// @brief Output a simdvertex worth of elements to the current outputElt
2958 /// @param pVtxOut - base address of VIN output struct
2959 /// @param outputElt - simdvertex offset in VIN to write to
2960 /// @param numEltsToStore - number of simdvertex rows to write out
2961 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2962 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2963 {
2964     SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2965
2966     for(uint32_t c = 0; c < numEltsToStore; ++c)
2967     {
2968         // STORE expects FP32 x vWidth type, just bitcast if needed
2969         if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2970         {
2971 #if FETCH_DUMP_VERTEX
2972             PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
2973 #endif
2974             vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2975         }
2976 #if FETCH_DUMP_VERTEX
2977         else
2978         {
2979             PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
2980         }
2981 #endif
2982         // outputElt * 4 = offsetting by the size of a simdvertex
2983         // + c offsets to a 32bit x vWidth row within the current vertex
2984 #if USE_SIMD16_SHADERS
2985         Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
2986 #else
2987         Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2988 #endif
2989         STORE(vVertexElements[c], dest);
2990     }
2991 }
2992
2993 #if USE_SIMD16_BUILDER
2994 void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2995 {
2996     SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2997
2998     for (uint32_t c = 0; c < numEltsToStore; ++c)
2999     {
3000         // STORE expects FP32 x vWidth type, just bitcast if needed
3001         if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
3002         {
3003 #if FETCH_DUMP_VERTEX
3004             PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
3005 #endif
3006             vVertexElements[c] = BITCAST(vVertexElements[c], mSimd2FP32Ty);
3007         }
3008 #if FETCH_DUMP_VERTEX
3009         else
3010         {
3011             PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
3012         }
3013 #endif
3014         // outputElt * 4 = offsetting by the size of a simdvertex
3015         // + c offsets to a 32bit x vWidth row within the current vertex
3016         Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
3017         STORE(vVertexElements[c], dest);
3018     }
3019 }
3020
3021 #endif
3022 //////////////////////////////////////////////////////////////////////////
3023 /// @brief Generates a constant vector of values based on the
3024 /// ComponentControl value
3025 /// @param ctrl - ComponentControl value
3026 #if USE_SIMD16_SHADERS
3027 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
3028 #else
3029 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
3030 #endif
3031 {
3032     switch(ctrl)
3033     {
3034         case NoStore:   return VUNDEF_I();
3035         case Store0:    return VIMMED1(0);
3036         case Store1Fp:  return VIMMED1(1.0f);
3037         case Store1Int: return VIMMED1(1);
3038         case StoreVertexId:
3039         {
3040 #if USE_SIMD16_SHADERS
3041             Value* pId;
3042             if (useVertexID2)
3043             {
3044                 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
3045             }
3046             else
3047             {
3048                 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
3049             }
3050 #else
3051             Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
3052 #endif
3053             return VBROADCAST(pId);
3054         }
3055         case StoreInstanceId:
3056         {
3057             Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
3058             return VBROADCAST(pId);
3059         }
3060         case StoreSrc:
3061         default:        SWR_INVALID("Invalid component control"); return VUNDEF_I();
3062     }
3063 }
3064
3065 #if USE_SIMD16_BUILDER
3066 Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl)
3067 {
3068     switch (ctrl)
3069     {
3070         case NoStore:   return VUNDEF2_I();
3071         case Store0:    return VIMMED2_1(0);
3072         case Store1Fp:  return VIMMED2_1(1.0f);
3073         case Store1Int: return VIMMED2_1(1);
3074         case StoreVertexId:
3075         {
3076             Value* pId = VUNDEF2_F();
3077
3078             Value* pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID  })), mSimdFP32Ty);
3079             Value* pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
3080
3081             pId = INSERT2_F(pId, pId_lo, 0);
3082             pId = INSERT2_F(pId, pId_hi, 1);
3083
3084             return VBROADCAST2(pId);
3085         }
3086         case StoreInstanceId:
3087         {
3088             Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
3089             return VBROADCAST2(pId);
3090         }
3091         case StoreSrc:
3092         default:        SWR_INVALID("Invalid component control"); return VUNDEF2_I();
3093     }
3094 }
3095
3096 #endif
3097 //////////////////////////////////////////////////////////////////////////
3098 /// @brief Returns the enable mask for the specified component.
3099 /// @param enableMask - enable bits
3100 /// @param component - component to check if enabled.
3101 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
3102 {
3103     switch (component)
3104     {
3105         // X
3106     case 0: return (enableMask & ComponentEnable::X);
3107         // Y
3108     case 1: return (enableMask & ComponentEnable::Y);
3109         // Z
3110     case 2: return (enableMask & ComponentEnable::Z);
3111         // W
3112     case 3: return (enableMask & ComponentEnable::W);
3113
3114     default: return false;
3115     }
3116 }
3117
3118
3119 //////////////////////////////////////////////////////////////////////////
3120 /// @brief JITs from fetch shader IR
3121 /// @param hJitMgr - JitManager handle
3122 /// @param func   - LLVM function IR
3123 /// @return PFN_FETCH_FUNC - pointer to fetch code
3124 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
3125 {
3126     const llvm::Function* func = (const llvm::Function*)hFunc;
3127     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
3128     PFN_FETCH_FUNC pfnFetch;
3129
3130     pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
3131     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
3132     pJitMgr->mIsModuleFinalized = true;
3133
3134 #if defined(KNOB_SWRC_TRACING)
3135     char fName[1024];
3136     const char *funcName = func->getName().data();
3137     sprintf(fName, "%s.bin", funcName);
3138     FILE *fd = fopen(fName, "wb");
3139     fwrite((void *)pfnFetch, 1, 2048, fd);
3140     fclose(fd);
3141 #endif
3142
3143     pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
3144
3145     return pfnFetch;
3146 }
3147
3148 //////////////////////////////////////////////////////////////////////////
3149 /// @brief JIT compiles fetch shader
3150 /// @param hJitMgr - JitManager handle
3151 /// @param state   - fetch state to build function from
3152 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
3153 {
3154     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
3155
3156     pJitMgr->SetupNewModule();
3157
3158     FetchJit theJit(pJitMgr);
3159     HANDLE hFunc = theJit.Create(state);
3160
3161     return JitFetchFunc(hJitMgr, hFunc);
3162 }