1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "jit_api.h"
32 #include "fetch_jit.h"
33 #include "gen_state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public Builder
56 {
57 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
58
59 Function* Create(const FETCH_COMPILE_STATE& fetchState);
60 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
61 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
62 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
63
64 // package up Shuffle*bpcGatherd args into a tuple for convenience
65 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
66 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
67 const uint32_t(&)[4]> Shuffle8bpcArgs;
68 #if USE_SIMD16_SHADERS
69 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
70 #else
71 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
72 #endif
73 #if USE_SIMD16_BUILDER
74 void Shuffle8bpcGatherd2(Shuffle8bpcArgs &args);
75 #endif
76
77 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
78 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
79 #if USE_SIMD16_SHADERS
80 void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
81 #else
82 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
83 #endif
84 #if USE_SIMD16_BUILDER
85 void Shuffle16bpcGather2(Shuffle16bpcArgs &args);
86 #endif
87
88 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
89 #if USE_SIMD16_BUILDER
90 void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
91 #endif
92
93 #if USE_SIMD16_SHADERS
94 Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
95 #else
96 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
97 #endif
98 #if USE_SIMD16_BUILDER
99 Value* GenerateCompCtrlVector2(const ComponentControl ctrl);
100 #endif
101
102 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
103 #if USE_SIMD16_SHADERS
104 #define USE_SIMD16_GATHERS 0
105
106 #if USE_SIMD16_GATHERS
107 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
108 #else
109 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
110 #endif
111 #else
112 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
113 #endif
114
115 bool IsOddFormat(SWR_FORMAT format);
116 bool IsUniformFormat(SWR_FORMAT format);
117 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
118 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
119 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
120
121 Value* mpFetchInfo;
122 };
123
124 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
125 {
126 std::stringstream fnName("FetchShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
127 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
128
129 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
130 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
131
132 fetch->getParent()->setModuleIdentifier(fetch->getName());
133
134 IRB()->SetInsertPoint(entry);
135
136 auto argitr = fetch->arg_begin();
137
138 // Fetch shader arguments
139 mpFetchInfo = &*argitr; ++argitr;
140 mpFetchInfo->setName("fetchInfo");
141 Value* pVtxOut = &*argitr;
142 pVtxOut->setName("vtxOutput");
143 // this is just shorthand to tell LLVM to get a pointer to the base address of the simdvertex
144 // index 0 (just the pointer to the simdvertex structure)
145 // index 1 (which element of the simdvertex structure to offset to, in this case 0)
146 // so the indices being i32's doesn't matter
147 // TODO: generate this GEP with a VECTOR structure type so this makes sense
148 std::vector<Value*> vtxInputIndices(2, C(0));
149 // GEP
150 pVtxOut = GEP(pVtxOut, C(0));
151 #if USE_SIMD16_SHADERS
152 #if 0// USE_SIMD16_BUILDER
153 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
154 #else
155 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
156 #endif
157 #else
158 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
159 #endif
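// at this point pVtxOut is viewed as a pointer to SIMD-width float vectors
// (e.g. <8 x float> when mVWidth == 8), so subsequent GEPs on pVtxOut step one
// full SIMD register at a time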
160
161 // SWR_FETCH_CONTEXT::pStreams
162 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
163 streams->setName("pStreams");
164
165 // SWR_FETCH_CONTEXT::pIndices
166 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
167 indices->setName("pIndices");
168
169 // SWR_FETCH_CONTEXT::pLastIndex
170 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
171 pLastIndex->setName("pLastIndex");
172
173
174 Value* vIndices;
175 #if USE_SIMD16_SHADERS
176 Value* indices2;
177 Value* vIndices2;
178 #endif
179 switch(fetchState.indexType)
180 {
181 case R8_UINT:
182 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
183 #if USE_SIMD16_SHADERS
184 indices2 = GEP(indices, C(8));
185 #endif
186 if(fetchState.bDisableIndexOOBCheck)
187 {
188 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
189 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
190 #if USE_SIMD16_SHADERS
191 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
192 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
193 #endif
194 }
195 else
196 {
197 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
198 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
199 #if USE_SIMD16_SHADERS
200 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
201 vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
202 #endif
203 }
204 break;
205 case R16_UINT:
206 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
207 #if USE_SIMD16_SHADERS
208 indices2 = GEP(indices, C(8));
209 #endif
210 if(fetchState.bDisableIndexOOBCheck)
211 {
212 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
213 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
214 #if USE_SIMD16_SHADERS
215 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
216 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
217 #endif
218 }
219 else
220 {
221 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
222 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
223 #if USE_SIMD16_SHADERS
224 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
225 vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
226 #endif
227 }
228 break;
229 case R32_UINT:
230 #if USE_SIMD16_SHADERS
231 indices2 = GEP(indices, C(8));
232 #endif
233 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
234 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
235 #if USE_SIMD16_SHADERS
236 (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
237 : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
238 #endif
239 break; // incoming type is already 32bit int
240 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
241 }
242
243 if(fetchState.bForceSequentialAccessEnable)
244 {
245 Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
246
247 // VertexData buffers are accessed sequentially; the index is equal to the vertex number
248 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
249 vIndices = ADD(vIndices, pOffsets);
250 #if USE_SIMD16_SHADERS
251 vIndices2 = ADD(vIndices, VIMMED1(8));
252 #endif
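// e.g. with StartVertex = 100, the SIMD8 lanes receive indices 100..107
// (and 108..115 for the second SIMD8 half when USE_SIMD16_SHADERS is enabled)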
253 }
254
255 Value* vVertexId = vIndices;
256 #if USE_SIMD16_SHADERS
257 Value* vVertexId2 = vIndices2;
258 #endif
259 if (fetchState.bVertexIDOffsetEnable)
260 {
261 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
262 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
263 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
264 vVertexId = ADD(vIndices, vBaseVertex);
265 vVertexId = ADD(vVertexId, vStartVertex);
266 #if USE_SIMD16_SHADERS
267 vVertexId2 = ADD(vIndices2, vBaseVertex);
268 vVertexId2 = ADD(vVertexId2, vStartVertex);
269 #endif
270 }
271
272 // store out vertex IDs
273 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
274 #if USE_SIMD16_SHADERS
275 STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
276 #endif
277
278 // store out cut mask if enabled
279 if (fetchState.bEnableCutIndex)
280 {
281 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
282 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
283 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
284 #if USE_SIMD16_SHADERS
285 Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
286 STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
287 #endif
288 }
289
290 // Fetch attributes from memory and output to a simdvertex struct
291 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
292 #if USE_SIMD16_SHADERS
293 if (fetchState.bDisableVGATHER)
294 {
295 JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
296 JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
297 }
298 else
299 {
300 #if USE_SIMD16_GATHERS
301 JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false);
302 #else
303 JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
304 JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
305 #endif
306 }
307 #else
308 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
309 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
310 #endif
311
312 RET_VOID();
313
314 JitManager::DumpToFile(fetch, "src");
315
316 #if defined(_DEBUG)
317 verifyFunction(*fetch);
318 #endif
319
320 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
321
322 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
323 setupPasses.add(createBreakCriticalEdgesPass());
324 setupPasses.add(createCFGSimplificationPass());
325 setupPasses.add(createEarlyCSEPass());
326 setupPasses.add(createPromoteMemoryToRegisterPass());
327
328 setupPasses.run(*fetch);
329
330 JitManager::DumpToFile(fetch, "se");
331
332 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
333
334 ///@todo Haven't touched these either. Need to remove some of these and add others.
335 optPasses.add(createCFGSimplificationPass());
336 optPasses.add(createEarlyCSEPass());
337 optPasses.add(createInstructionCombiningPass());
338 optPasses.add(createInstructionSimplifierPass());
339 optPasses.add(createConstantPropagationPass());
340 optPasses.add(createSCCPPass());
341 optPasses.add(createAggressiveDCEPass());
342
343 optPasses.run(*fetch);
344 optPasses.run(*fetch);
345
346 JitManager::DumpToFile(fetch, "opt");
347
348 return fetch;
349 }
350
351 //////////////////////////////////////////////////////////////////////////
352 /// @brief Loads attributes from memory using LOADs, shuffling the
353 /// components into SOA form.
354 /// *Note* currently does not support component control,
355 /// component packing, instancing
356 /// @param fetchState - info about attributes to be fetched from memory
357 /// @param streams - value pointer to the current vertex stream
358 /// @param vIndices - vector value of indices to load
359 /// @param pVtxOut - value pointer to output simdvertex struct
360 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
361 {
362 // Zack shuffles; a variant of the Charleston.
363
364 std::vector<Value*> vectors(16);
365 std::vector<Constant*> pMask(mVWidth);
366 for(uint32_t i = 0; i < mVWidth; ++i)
367 {
368 pMask[i] = (C(i < 4 ? i : 4));
369 }
370 Constant* promoteMask = ConstantVector::get(pMask);
371 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
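// with mVWidth == 8 the promote mask is <0,1,2,3,4,4,4,4>: it widens each 4-wide
// attribute load to SIMD width, filling the upper lanes from the undef vector uwvec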
372
373 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
374 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
375 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
376 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
377 curInstance->setName("curInstance");
378
379 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
380 {
381 Value* elements[4] = {0};
382 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
383 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
384 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
385 uint32_t numComponents = info.numComps;
386 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
387
388 // load path doesn't support component packing
389 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
390
391 vectors.clear();
392
393 if (fetchState.bInstanceIDOffsetEnable)
394 {
395 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down");
396 }
397
398 Value *vCurIndices;
399 Value *startOffset;
400 if(ied.InstanceEnable)
401 {
402 Value* stepRate = C(ied.InstanceAdvancementState);
403
404 // prevent a div by 0 for 0 step rate
405 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
406 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
407
408 // calc the current offset into instanced data buffer
409 Value* calcInstance = UDIV(curInstance, stepRate);
410
411 // if step rate is 0, every instance gets instance 0
412 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
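// e.g. with a step rate of 2, instances 0,1,2,3 fetch instanced elements 0,0,1,1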
413
414 vCurIndices = VBROADCAST(calcInstance);
415
416 startOffset = startInstance;
417 }
418 else if (ied.InstanceStrideEnable)
419 {
420 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
421 }
422 else
423 {
424 // offset indices by baseVertex
425 vCurIndices = ADD(vIndices, vBaseVertex);
426
427 startOffset = startVertex;
428 }
429
430 // load SWR_VERTEX_BUFFER_STATE::pData
431 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
432
433 // load SWR_VERTEX_BUFFER_STATE::pitch
434 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
435 stride = Z_EXT(stride, mInt64Ty);
436
437 // load SWR_VERTEX_BUFFER_STATE::size
438 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
439 size = Z_EXT(size, mInt64Ty);
440
441 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
442
443 Value *minVertex = NULL;
444 Value *minVertexOffset = NULL;
445 if (fetchState.bPartialVertexBuffer) {
446 // fetch min index for low bounds checking
447 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
448 minVertex = LOAD(minVertex);
449 if (!fetchState.bDisableIndexOOBCheck) {
450 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
451 }
452 }
453
454 // Load from the stream.
455 for(uint32_t lane = 0; lane < mVWidth; ++lane)
456 {
457 // Get index
458 Value* index = VEXTRACT(vCurIndices, C(lane));
459
460 if (fetchState.bPartialVertexBuffer) {
461 // clamp below minvertex
462 Value *isBelowMin = ICMP_SLT(index, minVertex);
463 index = SELECT(isBelowMin, minVertex, index);
464 }
465
466 index = Z_EXT(index, mInt64Ty);
467
468 Value* offset = MUL(index, stride);
469 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
470 offset = ADD(offset, startVertexOffset);
471
472 if (!fetchState.bDisableIndexOOBCheck) {
472 // check for out-of-bounds access, including partial OOB; OOB offsets are replaced with minVertexOffset (or 0 if there is no partial vertex buffer)
474 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
475 Value *oob = ICMP_ULE(endOffset, size);
476 if (fetchState.bPartialVertexBuffer) {
477 offset = SELECT(oob, offset, minVertexOffset);
478 } else {
479 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
480 }
481 }
482
483 Value* pointer = GEP(stream, offset);
484 // We use a full-lane, but don't actually care.
485 Value* vptr = 0;
486
487 // get a pointer to a 4 component attrib in default address space
488 switch(bpc)
489 {
490 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
491 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
492 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
493 default: SWR_INVALID("Unsupported underlying bpp!");
494 }
495
496 // load 4 components of attribute
497 Value* vec = ALIGNED_LOAD(vptr, 1, false);
498
499 // Convert To FP32 internally
500 switch(info.type[0])
501 {
502 case SWR_TYPE_UNORM:
503 switch(bpc)
504 {
505 case 8:
506 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
507 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
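// e.g. a raw 8-bit value of 255 maps to 1.0f, 128 maps to ~0.502f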
508 break;
509 case 16:
510 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
511 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
512 break;
513 default:
514 SWR_INVALID("Unsupported underlying type!");
515 break;
516 }
517 break;
518 case SWR_TYPE_SNORM:
519 switch(bpc)
520 {
521 case 8:
522 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
523 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
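// note: this path scales by 1/128, so -128 maps to -1.0f and 127 maps to ~0.992f
// (the gather path in ConvertFormat uses 1/(2^(n-1) - 1) instead)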
524 break;
525 case 16:
526 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
527 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
528 break;
529 default:
530 SWR_INVALID("Unsupported underlying type!");
531 break;
532 }
533 break;
534 case SWR_TYPE_UINT:
535 // Zero extend 8- and 16-bit UINT types to 32 bits.
536 switch(bpc)
537 {
538 case 8:
539 case 16:
540 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
541 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
542 break;
543 case 32:
544 break; // Pass through unchanged.
545 default:
546 SWR_INVALID("Unsupported underlying type!");
547 break;
548 }
549 break;
550 case SWR_TYPE_SINT:
551 // Sign extend SINT types.
552 switch(bpc)
553 {
554 case 8:
555 case 16:
556 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
557 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
558 break;
559 case 32:
560 break; // Pass through unchanged.
561 default:
562 SWR_INVALID("Unsupported underlying type!");
563 break;
564 }
565 break;
566 case SWR_TYPE_FLOAT:
567 switch(bpc)
568 {
569 case 32:
570 break; // Pass through unchanged.
571 default:
572 SWR_INVALID("Unsupported underlying type!");
573 }
574 break;
575 case SWR_TYPE_USCALED:
576 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
577 break;
578 case SWR_TYPE_SSCALED:
579 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
580 break;
581 case SWR_TYPE_SFIXED:
582 vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
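// SFIXED is 16.16 fixed point; e.g. a raw value of 98304 (0x00018000) becomes 1.5f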
583 break;
584 case SWR_TYPE_UNKNOWN:
585 case SWR_TYPE_UNUSED:
586 SWR_INVALID("Unsupported type %d!", info.type[0]);
587 }
588
589 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
590 // uwvec: 4 x F32, undef value
591 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
592 vectors.push_back(wvec);
593 }
594
595 std::vector<Constant*> v01Mask(mVWidth);
596 std::vector<Constant*> v23Mask(mVWidth);
597 std::vector<Constant*> v02Mask(mVWidth);
598 std::vector<Constant*> v13Mask(mVWidth);
599
600 // Concatenate the vectors together.
601 elements[0] = VUNDEF_F();
602 elements[1] = VUNDEF_F();
603 elements[2] = VUNDEF_F();
604 elements[3] = VUNDEF_F();
605 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
606 {
607 v01Mask[4 * b + 0] = C(0 + 4 * b);
608 v01Mask[4 * b + 1] = C(1 + 4 * b);
609 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
610 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
611
612 v23Mask[4 * b + 0] = C(2 + 4 * b);
613 v23Mask[4 * b + 1] = C(3 + 4 * b);
614 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
615 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
616
617 v02Mask[4 * b + 0] = C(0 + 4 * b);
618 v02Mask[4 * b + 1] = C(2 + 4 * b);
619 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
620 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
621
622 v13Mask[4 * b + 0] = C(1 + 4 * b);
623 v13Mask[4 * b + 1] = C(3 + 4 * b);
624 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
625 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
626
627 std::vector<Constant*> iMask(mVWidth);
628 for(uint32_t i = 0; i < mVWidth; ++i)
629 {
630 if(((4 * b) <= i) && (i < (4 * (b + 1))))
631 {
632 iMask[i] = C(i % 4 + mVWidth);
633 }
634 else
635 {
636 iMask[i] = C(i);
637 }
638 }
639 Constant* insertMask = ConstantVector::get(iMask);
640 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
641 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
642 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
643 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
644 }
645
646 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
647 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
648 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
649 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
650 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
651 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
652 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
653 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
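// after the shuffle network above, elements[0..3] hold the x, y, z and w components
// (respectively) of all mVWidth vertices, i.e. the attribute is now in SOA form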
654
655 switch(numComponents + 1)
656 {
657 case 1: elements[0] = VIMMED1(0.0f);
658 case 2: elements[1] = VIMMED1(0.0f);
659 case 3: elements[2] = VIMMED1(0.0f);
660 case 4: elements[3] = VIMMED1(1.0f);
661 }
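// the fall-through above fills components the format does not supply with defaults,
// e.g. a 2-component format gets z = 0.0f and w = 1.0f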
662
663 for(uint32_t c = 0; c < 4; ++c)
664 {
665 #if USE_SIMD16_SHADERS
666 Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
667 #else
668 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
669 #endif
670 STORE(elements[c], dest);
671 }
672 }
673 }
674
675 // returns true for odd formats that require special gather handling
676 bool FetchJit::IsOddFormat(SWR_FORMAT format)
677 {
678 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
679 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
680 {
681 return true;
682 }
683 return false;
684 }
685
686 // format is uniform if all components are the same size and type
687 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
688 {
689 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
690 uint32_t bpc0 = info.bpc[0];
691 uint32_t type0 = info.type[0];
692
693 for (uint32_t c = 1; c < info.numComps; ++c)
694 {
695 if (bpc0 != info.bpc[c] || type0 != info.type[c])
696 {
697 return false;
698 }
699 }
700 return true;
701 }
702
703 // unpacks components based on format
704 // foreach component in the pixel
705 // mask off everything but this component
706 // shift component to LSB
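// e.g. for a 10-10-10-2 packed layout, component 1 is extracted as
//   comp = (vInput & 0x000FFC00) >> 10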
707 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
708 {
709 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
710
711 uint32_t bitOffset = 0;
712 for (uint32_t c = 0; c < info.numComps; ++c)
713 {
714 uint32_t swizzledIndex = info.swizzle[c];
715 uint32_t compBits = info.bpc[c];
716 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
717 Value* comp = AND(vInput, bitmask);
718 comp = LSHR(comp, bitOffset);
719
720 result[swizzledIndex] = comp;
721 bitOffset += compBits;
722 }
723 }
724
725 // gather for odd component size formats
726 // gather full SIMD pixels per lane, then shift/mask to move each component into
727 // its own vector
728 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
729 {
730 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
731
732 // only works if pixel size is <= 32bits
733 SWR_ASSERT(info.bpp <= 32);
734
735 Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
736
737 for (uint32_t comp = 0; comp < 4; ++comp)
738 {
739 pResult[comp] = VIMMED1((int)info.defaults[comp]);
740 }
741
742 UnpackComponents(format, pGather, pResult);
743
744 // cast to fp32
745 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
746 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
747 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
748 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
749 }
750
751 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
752 {
753 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
754
755 for (uint32_t c = 0; c < info.numComps; ++c)
756 {
757 uint32_t compIndex = info.swizzle[c];
758
759 // skip any conversion on UNUSED components
760 if (info.type[c] == SWR_TYPE_UNUSED)
761 {
762 continue;
763 }
764
765 if (info.isNormalized[c])
766 {
767 if (info.type[c] == SWR_TYPE_SNORM)
768 {
769 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
770
771 /// result = c * (1.0f / (2^(n-1) - 1))
772 uint32_t n = info.bpc[c];
773 uint32_t pow2 = 1 << (n - 1);
774 float scale = 1.0f / (float)(pow2 - 1);
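// e.g. for an 8-bit SNORM component: n = 8, scale = 1.0f / 127.0f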
775 Value *vScale = VIMMED1(scale);
776 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
777 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
778 texels[compIndex] = FMUL(texels[compIndex], vScale);
779 }
780 else
781 {
782 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
783
784 /// result = c * (1.0f / (2^n - 1))
785 uint32_t n = info.bpc[c];
786 uint32_t pow2 = 1 << n;
787 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
788 if (n == 24)
789 {
790 float scale = (float)(pow2 - 1);
791 Value* vScale = VIMMED1(scale);
792 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
793 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
794 texels[compIndex] = FDIV(texels[compIndex], vScale);
795 }
796 else
797 {
798 float scale = 1.0f / (float)(pow2 - 1);
799 Value *vScale = VIMMED1(scale);
800 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
801 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
802 texels[compIndex] = FMUL(texels[compIndex], vScale);
803 }
804 }
805 continue;
806 }
807 }
808 }
809
810 //////////////////////////////////////////////////////////////////////////
811 /// @brief Loads attributes from memory using AVX2 GATHER(s)
812 /// @param fetchState - info about attributes to be fetched from memory
813 /// @param streams - value pointer to the current vertex stream
814 /// @param vIndices - vector value of indices to gather
815 /// @param pVtxOut - value pointer to output simdvertex struct
816 #if USE_SIMD16_SHADERS
817 #if USE_SIMD16_GATHERS
818 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
819 Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2)
820 #else
821 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
822 Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
823 #endif
824 #else
825 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
826 Value* streams, Value* vIndices, Value* pVtxOut)
827 #endif
828 {
829 uint32_t currentVertexElement = 0;
830 uint32_t outputElt = 0;
831 Value* vVertexElements[4];
832 #if USE_SIMD16_GATHERS
833 Value* vVertexElements2[4];
834 #if USE_SIMD16_BUILDER
835 Value *pVtxSrc2[4];
836 #endif
837 #endif
838
839 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
840 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
841 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
842 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
843 curInstance->setName("curInstance");
844
845 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
846 {
847 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
848
849 // skip element if all components are disabled
850 if (ied.ComponentPacking == ComponentEnable::NONE)
851 {
852 continue;
853 }
854
855 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
856 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
857 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
858
859 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
860
861 // VGATHER* takes an *i8 src pointer
862 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
863
864 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
865 Value *vStride = VBROADCAST(stride);
866
867 // max vertex index that is fully in bounds
868 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
869 maxVertex = LOAD(maxVertex);
870
871 Value *minVertex = NULL;
872 if (fetchState.bPartialVertexBuffer)
873 {
874 // min vertex index for low bounds OOB checking
875 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
876 minVertex = LOAD(minVertex);
877 }
878
879 if (fetchState.bInstanceIDOffsetEnable)
880 {
881 // the InstanceID (curInstance) value is offset by StartInstanceLocation
882 curInstance = ADD(curInstance, startInstance);
883 }
884
885 Value *vCurIndices;
886 #if USE_SIMD16_GATHERS
887 Value *vCurIndices2;
888 #endif
889 Value *startOffset;
890 Value *vInstanceStride = VIMMED1(0);
891
892 if (ied.InstanceEnable)
893 {
894 Value* stepRate = C(ied.InstanceAdvancementState);
895
896 // prevent a div by 0 for 0 step rate
897 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
898 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
899
900 // calc the current offset into instanced data buffer
901 Value* calcInstance = UDIV(curInstance, stepRate);
902
903 // if step rate is 0, every instance gets instance 0
904 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
905
906 vCurIndices = VBROADCAST(calcInstance);
907 #if USE_SIMD16_GATHERS
908 vCurIndices2 = VBROADCAST(calcInstance);
909 #endif
910
911 startOffset = startInstance;
912 }
913 else if (ied.InstanceStrideEnable)
914 {
915 // grab the instance advancement state, determines stride in bytes from one instance to the next
916 Value* stepRate = C(ied.InstanceAdvancementState);
917 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
918
919 // offset indices by baseVertex
920 vCurIndices = ADD(vIndices, vBaseVertex);
921 #if USE_SIMD16_GATHERS
922 vCurIndices2 = ADD(vIndices2, vBaseVertex);
923 #endif
924
925 startOffset = startVertex;
926 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
927 }
928 else
929 {
930 // offset indices by baseVertex
931 vCurIndices = ADD(vIndices, vBaseVertex);
932 #if USE_SIMD16_GATHERS
933 vCurIndices2 = ADD(vIndices2, vBaseVertex);
934 #endif
935
936 startOffset = startVertex;
937 }
938
939 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
940 // do 64bit address offset calculations.
941
942 // calculate byte offset to the start of the VB
943 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
944 pStreamBase = GEP(pStreamBase, baseOffset);
945
946 // if we have a start offset, subtract from max vertex. Used for OOB check
947 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
948 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
949 // if we have a negative value, we're already OOB. clamp at 0.
950 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
951
952 if (fetchState.bPartialVertexBuffer)
953 {
954 // similarly for min vertex
955 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
956 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
957 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
958 }
959
960 // Load the in bounds size of a partially valid vertex
961 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
962 partialInboundsSize = LOAD(partialInboundsSize);
963 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
964 Value* vBpp = VBROADCAST(C(info.Bpp));
965 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
966
967 // is the element <= the partially valid size
968 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
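// e.g. with partialInboundsSize = 12, a 4-byte element at AlignedByteOffset 8 is still
// in bounds (4 <= 12 - 8), while an 8-byte element at the same offset is not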
969
970 #if USE_SIMD16_GATHERS
971 // override cur indices with 0 if pitch is 0
972 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
973 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
974 vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
975
976 // are vertices partially OOB?
977 Value* vMaxVertex = VBROADCAST(maxVertex);
978 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
979 Value* vPartialOOBMask2 = ICMP_EQ(vCurIndices2, vMaxVertex);
980
981 // are vertices fully in bounds?
982 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
983 Value* vMaxGatherMask2 = ICMP_ULT(vCurIndices2, vMaxVertex);
984
985 Value *vGatherMask;
986 Value *vGatherMask2;
987 if (fetchState.bPartialVertexBuffer)
988 {
989 // are vertices below minVertex limit?
990 Value *vMinVertex = VBROADCAST(minVertex);
991 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
992 Value *vMinGatherMask2 = ICMP_UGE(vCurIndices2, vMinVertex);
993
994 // only fetch lanes that pass both tests
995 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
996 vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
997 }
998 else
999 {
1000 vGatherMask = vMaxGatherMask;
1001 vGatherMask2 = vMaxGatherMask2;
1002 }
1003
1004 // blend in any partially OOB indices that have valid elements
1005 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1006 vGatherMask2 = SELECT(vPartialOOBMask2, vElementInBoundsMask, vGatherMask2);
1007 Value *pMask = vGatherMask;
1008 Value *pMask2 = vGatherMask2;
1009 vGatherMask = VMASK(vGatherMask);
1010 vGatherMask2 = VMASK(vGatherMask2);
1011
1012 // calculate the actual offsets into the VB
1013 Value* vOffsets = MUL(vCurIndices, vStride);
1014 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1015
1016 Value* vOffsets2 = MUL(vCurIndices2, vStride);
1017 vOffsets2 = ADD(vOffsets2, vAlignmentOffsets);
1018
1019 // if instance stride enable is:
1020 //  true  - add product of the instanceID and advancement state to the offset into the VB
1021 //  false - value of vInstanceStride has been initialized to zero
1022 vOffsets = ADD(vOffsets, vInstanceStride);
1023 vOffsets2 = ADD(vOffsets2, vInstanceStride);
1024
1025 #else
1026 // override cur indices with 0 if pitch is 0
1027 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
1028 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
1029
1030 // are vertices partially OOB?
1031 Value* vMaxVertex = VBROADCAST(maxVertex);
1032 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
1033
1034 // are vertices fully in bounds?
1035 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
1036
1037 Value *vGatherMask;
1038 if (fetchState.bPartialVertexBuffer)
1039 {
1040 // are vertices below minVertex limit?
1041 Value *vMinVertex = VBROADCAST(minVertex);
1042 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
1043
1044 // only fetch lanes that pass both tests
1045 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
1046 }
1047 else
1048 {
1049 vGatherMask = vMaxGatherMask;
1050 }
1051
1052 // blend in any partially OOB indices that have valid elements
1053 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1054 Value* pMask = vGatherMask;
1055 vGatherMask = VMASK(vGatherMask);
1056
1057 // calculate the actual offsets into the VB
1058 Value* vOffsets = MUL(vCurIndices, vStride);
1059 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1060
1061 // if instance stride enable is:
1062 //  true  - add product of the instanceID and advancement state to the offset into the VB
1063 //  false - value of vInstanceStride has been initialized to zero
1064 vOffsets = ADD(vOffsets, vInstanceStride);
1065
1066 #endif
1067 // Packing and component control
1068 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
1069 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
1070 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
1071
1072 // Special gather/conversion for formats without equal component sizes
1073 if (IsOddFormat((SWR_FORMAT)ied.Format))
1074 {
1075 #if USE_SIMD16_GATHERS
1076 Value *pResults[4];
1077 Value *pResults2[4];
1078 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1079 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
1080 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1081 ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
1082
1083 for (uint32_t c = 0; c < 4; c += 1)
1084 {
1085 if (isComponentEnabled(compMask, c))
1086 {
1087 #if USE_SIMD16_BUILDER
1088 // pack adjacent pairs of SIMD8s into SIMD16s
1089 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1090 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults[c], 0);
1091 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults2[c], 1);
1092
1093 #else
1094 vVertexElements[currentVertexElement] = pResults[c];
1095 vVertexElements2[currentVertexElement] = pResults2[c];
1096
1097 #endif
1098 currentVertexElement += 1;
1099
1100 if (currentVertexElement > 3)
1101 {
1102 #if USE_SIMD16_BUILDER
1103 // store SIMD16s
1104 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1105
1106 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1107
1108 #else
1109 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1110 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1111
1112 #endif
1113 outputElt += 1;
1114
1115 // reset to the next vVertexElement to output
1116 currentVertexElement = 0;
1117 }
1118 }
1119 }
1120 #else
1121 Value* pResults[4];
1122 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1123 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1124
1125 for (uint32_t c = 0; c < 4; ++c)
1126 {
1127 if (isComponentEnabled(compMask, c))
1128 {
1129 vVertexElements[currentVertexElement++] = pResults[c];
1130 if (currentVertexElement > 3)
1131 {
1132 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1133 // reset to the next vVertexElement to output
1134 currentVertexElement = 0;
1135 }
1136 }
1137 }
1138 #endif
1139 }
1140 else if(info.type[0] == SWR_TYPE_FLOAT)
1141 {
1142 ///@todo: support 64 bit vb accesses
1143 Value *gatherSrc = VIMMED1(0.0f);
1144 #if USE_SIMD16_GATHERS
1145 Value *gatherSrc2 = VIMMED1(0.0f);
1146 #if USE_SIMD16_BUILDER
1147 Value *gatherSrc16 = VIMMED2_1(0.0f);
1148 #endif
1149 #endif
1150
1151 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1152 "Unsupported format for standard gather fetch.");
1153
1154 // Gather components from memory to store in a simdvertex structure
1155 switch (bpc)
1156 {
1157 case 16:
1158 {
1159 #if USE_SIMD16_GATHERS
1160 Value *vGatherResult[2];
1161 Value *vGatherResult2[2];
1162
1163 // if we have at least one component out of x or y to fetch
1164 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1165 {
1166 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1167 vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1168 // e.g. result of first 8x32bit integer gather for 16bit components
1169 // 256i - 0 1 2 3 4 5 6 7
1170 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1171 //
1172 }
1173 else
1174 {
1175 vGatherResult[0] = VUNDEF_I();
1176 vGatherResult2[0] = VUNDEF_I();
1177 }
1178
1179 // if we have at least one component out of z or w to fetch
1180 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1181 {
1182 // offset base to the next components(zw) in the vertex to gather
1183 pStreamBase = GEP(pStreamBase, C((char)4));
1184
1185 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1186 vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1187 // e.g. result of second 8x32bit integer gather for 16bit components
1188 // 256i - 0 1 2 3 4 5 6 7
1189 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1190 //
1191 }
1192 else
1193 {
1194 vGatherResult[1] = VUNDEF_I();
1195 vGatherResult2[1] = VUNDEF_I();
1196 }
1197
1198 // if we have at least one component to shuffle into place
1199 if (compMask)
1200 {
1201 #if USE_SIMD16_BUILDER
1202 Value *gatherResult[2];
1203
1204 gatherResult[0] = VUNDEF2_I();
1205 gatherResult[1] = VUNDEF2_I();
1206
1207 gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0], 0);
1208 gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
1209
1210 gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1], 0);
1211 gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
1212
1213 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1214
1215 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
1216 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1217
1218 // Shuffle gathered components into place in simdvertex struct
1219 Shuffle16bpcGather2(args); // outputs to vVertexElements ref
1220 #else
1221 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1222 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1223 Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE,
1224 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1225
1226 // Shuffle gathered components into place in simdvertex struct
1227 Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
1228 Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
1229 #endif
1230 }
1231 #else
1232 Value* vGatherResult[2];
1233
1234 // if we have at least one component out of x or y to fetch
1235 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1236 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1237 // e.g. result of first 8x32bit integer gather for 16bit components
1238 // 256i - 0 1 2 3 4 5 6 7
1239 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1240 //
1241 }
1242
1243 // if we have at least one component out of z or w to fetch
1244 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1245 // offset base to the next components(zw) in the vertex to gather
1246 pStreamBase = GEP(pStreamBase, C((char)4));
1247
1248 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1249 // e.g. result of second 8x32bit integer gather for 16bit components
1250 // 256i - 0 1 2 3 4 5 6 7
1251 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1252 //
1253 }
1254
1255 // if we have at least one component to shuffle into place
1256 if(compMask){
1257 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1258 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1259
1260 // Shuffle gathered components into place in simdvertex struct
1261 #if USE_SIMD16_SHADERS
1262 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1263 #else
1264 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1265 #endif
1266 }
1267 #endif
1268 }
1269 break;
1270 case 32:
1271 {
1272 for (uint32_t i = 0; i < 4; i += 1)
1273 {
1274 #if USE_SIMD16_GATHERS
1275 if (isComponentEnabled(compMask, i))
1276 {
1277 // if we need to gather the component
1278 if (compCtrl[i] == StoreSrc)
1279 {
1280 // Gather a SIMD of vertices
1281 // APIs allow a 4GB range for offsets
1282 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1283 // But, we know that elements must be aligned for FETCH. :)
1284 // Right shift the offset by one bit and gather with a scale of 2, keeping the offset within the positive signed range.
1285 Value *vShiftedOffsets = VPSRLI(vOffsets, C(1));
1286 Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
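// e.g. a byte offset of 0x80000004 would read as negative in a signed i32, but
// 0x80000004 >> 1 = 0x40000002 and the gather scale of 2 restores the original address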
1287 #if USE_SIMD16_BUILDER
1288 Value *indices = VUNDEF2_I();
1289 indices = INSERT2_I(indices, vShiftedOffsets, 0);
1290 indices = INSERT2_I(indices, vShiftedOffsets2, 1);
1291
1292 Value *mask = VUNDEF2_I();
1293 mask = INSERT2_I(mask, vGatherMask, 0);
1294 mask = INSERT2_I(mask, vGatherMask2, 1);
1295
1296 pVtxSrc2[currentVertexElement] = GATHERPS2(gatherSrc16, pStreamBase, indices, mask, 2);
1297 #else
1298 vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
1299 vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2);
1300
1301 #if USE_SIMD16_BUILDER
1302 // pack adjacent pairs of SIMD8s into SIMD16s
1303 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1304 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements[currentVertexElement], 0);
1305 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements2[currentVertexElement], 1);
1306
1307 #endif
1308 #endif
1309 currentVertexElement += 1;
1310 }
1311 else
1312 {
1313 #if USE_SIMD16_BUILDER
1314 pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
1315 #else
1316 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1317 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1318
1319 #if USE_SIMD16_BUILDER
1320 // pack adjacent pairs of SIMD8s into SIMD16s
1321 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1322 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements[currentVertexElement], 0);
1323 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements2[currentVertexElement], 1);
1324
1325 #endif
1326 #endif
1327 currentVertexElement += 1;
1328 }
1329
1330 if (currentVertexElement > 3)
1331 {
1332 #if USE_SIMD16_BUILDER
1333 // store SIMD16s
1334 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1335
1336 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1337
1338 #else
1339 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1340 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1341
1342 #endif
1343 outputElt += 1;
1344
1345 // reset to the next vVertexElement to output
1346 currentVertexElement = 0;
1347 }
1348 }
1349
1350 // offset base to the next component in the vertex to gather
1351 pStreamBase = GEP(pStreamBase, C((char)4));
1352 #else
1353 if (isComponentEnabled(compMask, i))
1354 {
1355 // if we need to gather the component
1356 if (compCtrl[i] == StoreSrc)
1357 {
1358 // Gather a SIMD of vertices
1359 // APIs allow a 4GB range for offsets
1360 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1361 // But, we know that elements must be aligned for FETCH. :)
1362 // Right shift the offset by one bit and gather with a scale of 2, keeping the offset within the positive signed range.
1363 Value* vShiftedOffsets = VPSRLI(vOffsets, C(1));
1364 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
1365 }
1366 else
1367 {
1368 #if USE_SIMD16_SHADERS
1369 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1370 #else
1371 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1372 #endif
1373 }
1374
1375 if (currentVertexElement > 3)
1376 {
1377 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1378 // reset to the next vVertexElement to output
1379 currentVertexElement = 0;
1380 }
1381 }
1382
1383 // offset base to the next component in the vertex to gather
1384 pStreamBase = GEP(pStreamBase, C((char)4));
1385 #endif
1386 }
1387 }
1388 break;
1389 case 64:
1390 {
1391 for (uint32_t i = 0; i < 4; i += 1)
1392 {
1393 #if USE_SIMD16_GATHERS
1394 if (isComponentEnabled(compMask, i))
1395 {
1396 // if we need to gather the component
1397 if (compCtrl[i] == StoreSrc)
1398 {
1399 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1400 Value *vMaskLo2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1401 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1402 Value *vMaskHi2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1403 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
1404 vMaskLo2 = S_EXT(vMaskLo2, VectorType::get(mInt64Ty, 4));
1405 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
1406 vMaskHi2 = S_EXT(vMaskHi2, VectorType::get(mInt64Ty, 4));
1407 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
1408 vMaskLo2 = BITCAST(vMaskLo2, VectorType::get(mDoubleTy, 4));
1409 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
1410 vMaskHi2 = BITCAST(vMaskHi2, VectorType::get(mDoubleTy, 4));
1411
1412 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1413 Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
1414 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1415 Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
1416
1417 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1418
1419 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1420 Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2);
1421 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1422 Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2);
1423
1424 pGatherLo = VCVTPD2PS(pGatherLo);
1425 pGatherLo2 = VCVTPD2PS(pGatherLo2);
1426 pGatherHi = VCVTPD2PS(pGatherHi);
1427 pGatherHi2 = VCVTPD2PS(pGatherHi2);
1428
1429 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1430 Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1431
1432 #if USE_SIMD16_BUILDER
1433 // pack adjacent pairs of SIMD8s into SIMD16s
1434 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1435 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather, 0);
1436 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
1437
1438 #else
1439 vVertexElements[currentVertexElement] = pGather;
1440 vVertexElements2[currentVertexElement] = pGather2;
1441
1442 #endif
1443 currentVertexElement += 1;
1444 }
1445 else
1446 {
1447 #if USE_SIMD16_BUILDER
1448 pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
1449
1450 #else
1451 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1452 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1453
1454 #endif
1455 currentVertexElement += 1;
1456 }
1457
1458 if (currentVertexElement > 3)
1459 {
1460 #if USE_SIMD16_BUILDER
1461 // store SIMD16s
1462 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1463
1464 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1465
1466 #else
1467 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1468 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1469
1470 #endif
1471 outputElt += 1;
1472
1473 // reset to the next vVertexElement to output
1474 currentVertexElement = 0;
1475 }
1476 }
1477
1478 // offset base to the next component in the vertex to gather
1479 pStreamBase = GEP(pStreamBase, C((char)8));
1480 #else
1481 if (isComponentEnabled(compMask, i))
1482 {
1483 // if we need to gather the component
1484 if (compCtrl[i] == StoreSrc)
1485 {
1486 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
1487 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
1488 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
1489 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
1490 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
1491 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
1492
1493 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1494 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1495
1496 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1497
1498 Value* pGatherLo = GATHERPD(vZeroDouble,
1499 pStreamBase, vOffsetsLo, vMaskLo);
1500 Value* pGatherHi = GATHERPD(vZeroDouble,
1501 pStreamBase, vOffsetsHi, vMaskHi);
1502
1503 pGatherLo = VCVTPD2PS(pGatherLo);
1504 pGatherHi = VCVTPD2PS(pGatherHi);
1505
1506 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
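// pGather now holds the 8 double-precision components for this channel, fetched as
// two 4-wide gathers (lanes 0..3 and 4..7) and converted down to single precision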
1507
1508 vVertexElements[currentVertexElement++] = pGather;
1509 }
1510 else
1511 {
1512 #if USE_SIMD16_SHADERS
1513 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1514 #else
1515 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1516 #endif
1517 }
1518
1519 if (currentVertexElement > 3)
1520 {
1521 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1522 // reset to the next vVertexElement to output
1523 currentVertexElement = 0;
1524 }
1525 }
1526
1527 // offset base to the next component in the vertex to gather
1528 pStreamBase = GEP(pStreamBase, C((char)8));
1529 #endif
1530 }
1531 }
1532 break;
1533 default:
1534 SWR_INVALID("Tried to fetch invalid FP format");
1535 break;
1536 }
1537 }
1538 else
1539 {
1540 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
1541 ConversionType conversionType = CONVERT_NONE;
1542
1543 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1544 "Unsupported format for standard gather fetch.");
1545
1546 switch(info.type[0])
1547 {
1548 case SWR_TYPE_UNORM:
1549 conversionType = CONVERT_NORMALIZED;
1550 case SWR_TYPE_UINT:
1551 extendCastType = Instruction::CastOps::ZExt;
1552 break;
1553 case SWR_TYPE_SNORM:
1554 conversionType = CONVERT_NORMALIZED;
1555 case SWR_TYPE_SINT:
1556 extendCastType = Instruction::CastOps::SExt;
1557 break;
1558 case SWR_TYPE_USCALED:
1559 conversionType = CONVERT_USCALED;
1560 extendCastType = Instruction::CastOps::UIToFP;
1561 break;
1562 case SWR_TYPE_SSCALED:
1563 conversionType = CONVERT_SSCALED;
1564 extendCastType = Instruction::CastOps::SIToFP;
1565 break;
1566 case SWR_TYPE_SFIXED:
1567 conversionType = CONVERT_SFIXED;
1568 extendCastType = Instruction::CastOps::SExt;
1569 break;
1570 default:
1571 break;
1572 }
1573
1574 // value substituted when component of gather is masked
1575 Value* gatherSrc = VIMMED1(0);
1576 #if USE_SIMD16_GATHERS
1577 Value* gatherSrc2 = VIMMED1(0);
1578 #endif
1579
1580 // Gather components from memory to store in a simdvertex structure
1581 switch (bpc)
1582 {
1583 case 8:
1584 {
1585 // if we have at least one component to fetch
1586 if (compMask)
1587 {
1588 #if USE_SIMD16_GATHERS
1589 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1590 Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1591
1592 // e.g. result of an 8x32bit integer gather for 8bit components
1593 // 256i - 0 1 2 3 4 5 6 7
1594 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1595
1596 #if USE_SIMD16_BUILDER
1597 Value *gatherResult = VUNDEF2_I();
1598
1599 gatherResult = INSERT2_I(gatherResult, vGatherResult, 0);
1600 gatherResult = INSERT2_I(gatherResult, vGatherResult2, 1);
1601
1602 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1603
1604 Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1605 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle);
1606
1607 // Shuffle gathered components into place in simdvertex struct
1608 Shuffle8bpcGatherd2(args); // outputs to vVertexElements ref
1609 #else
1610 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1611 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1612 Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1613 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2, info.swizzle);
1614
1615 // Shuffle gathered components into place in simdvertex struct
1616 Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
1617 Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref
1618 #endif
1619 #else
1620 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1621 // e.g. result of an 8x32bit integer gather for 8bit components
1622 // 256i - 0 1 2 3 4 5 6 7
1623 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1624
1625 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1626 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1627
1628 // Shuffle gathered components into place in simdvertex struct
1629 #if USE_SIMD16_SHADERS
1630 Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
1631 #else
1632 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
1633 #endif
1634 #endif
1635 }
1636 }
1637 break;
1638 case 16:
1639 {
1640 #if USE_SIMD16_GATHERS
1641 Value* vGatherResult[2];
1642 Value* vGatherResult2[2];
1643
1644 // if we have at least one component out of x or y to fetch
1645 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1646 {
1647 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1648 vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1649 // e.g. result of first 8x32bit integer gather for 16bit components
1650 // 256i - 0 1 2 3 4 5 6 7
1651 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1652 //
1653 }
1654 else
1655 {
1656 vGatherResult[0] = VUNDEF_I();
1657 vGatherResult2[0] = VUNDEF_I();
1658 }
1659
1660 // if we have at least one component out of z or w to fetch
1661 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1662 {
1663 // offset base to the next components(zw) in the vertex to gather
1664 pStreamBase = GEP(pStreamBase, C((char)4));
1665
1666 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1667 vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1668 // e.g. result of second 8x32bit integer gather for 16bit components
1669 // 256i - 0 1 2 3 4 5 6 7
1670 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1671 //
1672 }
1673 else
1674 {
1675 vGatherResult[1] = VUNDEF_I();
1676 vGatherResult2[1] = VUNDEF_I();
1677 }
1678
1679 // if we have at least one component to shuffle into place
1680 if (compMask)
1681 {
1682 #if USE_SIMD16_BUILDER
1683 Value *gatherResult[2];
1684
1685 gatherResult[0] = VUNDEF2_I();
1686 gatherResult[1] = VUNDEF2_I();
1687
1688 gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0], 0);
1689 gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
1690
1691 gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1], 0);
1692 gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
1693
1694 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1695
1696 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1697 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1698
1699 // Shuffle gathered components into place in simdvertex struct
1700 Shuffle16bpcGather2(args); // outputs to vVertexElements ref
1701 #else
1702 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1703 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1704 Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1705 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1706
1707 // Shuffle gathered components into place in simdvertex struct
1708 Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
1709 Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
1710 #endif
1711 }
1712 #else
1713 Value* vGatherResult[2];
1714
1715 // if we have at least one component out of x or y to fetch
1716 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1717 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1718 // e.g. result of first 8x32bit integer gather for 16bit components
1719 // 256i - 0 1 2 3 4 5 6 7
1720 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1721 //
1722 }
1723
1724 // if we have at least one component out of z or w to fetch
1725 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1726 // offset base to the next components(zw) in the vertex to gather
1727 pStreamBase = GEP(pStreamBase, C((char)4));
1728
1729 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1730 // e.g. result of second 8x32bit integer gather for 16bit components
1731 // 256i - 0 1 2 3 4 5 6 7
1732 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1733 //
1734 }
1735
1736 // if we have at least one component to shuffle into place
1737 if(compMask){
1738 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1739 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1740
1741 // Shuffle gathered components into place in simdvertex struct
1742 #if USE_SIMD16_SHADERS
1743 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1744 #else
1745 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1746 #endif
1747 }
1748 #endif
1749 }
1750 break;
1751 case 32:
1752 {
1753                 // Gather each enabled 32bit component into place in the simdvertex struct
1754 for (uint32_t i = 0; i < 4; i++)
1755 {
1756 if (isComponentEnabled(compMask, i))
1757 {
1758 // if we need to gather the component
1759 if (compCtrl[i] == StoreSrc)
1760 {
1761 #if USE_SIMD16_GATHERS
1762 Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1763 Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1764
1765 if (conversionType == CONVERT_USCALED)
1766 {
1767 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1768 pGather2 = UI_TO_FP(pGather2, mSimdFP32Ty);
1769 }
1770 else if (conversionType == CONVERT_SSCALED)
1771 {
1772 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1773 pGather2 = SI_TO_FP(pGather2, mSimdFP32Ty);
1774 }
1775 else if (conversionType == CONVERT_SFIXED)
1776 {
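                                    // SFIXED is 16.16 fixed point; multiplying by 1/65536 (2^-16)
                                    // after the int->float convert recovers the intended value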
1777 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1778 pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1779 }
1780
1781 #if USE_SIMD16_BUILDER
1782 // pack adjacent pairs of SIMD8s into SIMD16s
1783 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1784 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather, 0);
1785 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
1786
1787 #else
1788 vVertexElements[currentVertexElement] = pGather;
1789 vVertexElements2[currentVertexElement] = pGather2;
1790
1791 #endif
1792
1793 // e.g. result of a single 8x32bit integer gather for 32bit components
1794 // 256i - 0 1 2 3 4 5 6 7
1795 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1796
1797 currentVertexElement += 1;
1798 #else
1799 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1800
1801 if (conversionType == CONVERT_USCALED)
1802 {
1803 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1804 }
1805 else if (conversionType == CONVERT_SSCALED)
1806 {
1807 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1808 }
1809 else if (conversionType == CONVERT_SFIXED)
1810 {
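                            // SFIXED: 16.16 fixed point, so scale by 1/65536 (2^-16) after converting to float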
1811 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1812 }
1813
1814 vVertexElements[currentVertexElement++] = pGather;
1815 // e.g. result of a single 8x32bit integer gather for 32bit components
1816 // 256i - 0 1 2 3 4 5 6 7
1817 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1818 #endif
1819 }
1820 else
1821 {
1822 #if USE_SIMD16_SHADERS
1823 #if USE_SIMD16_GATHERS
1824 #if USE_SIMD16_BUILDER
1825 pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
1826
1827 #else
1828 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1829 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1830
1831 #endif
1832 currentVertexElement += 1;
1833 #else
1834 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1835 #endif
1836 #else
1837 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1838 #endif
1839 }
1840
1841 if (currentVertexElement > 3)
1842 {
1843 #if USE_SIMD16_GATHERS
1844 #if USE_SIMD16_BUILDER
1845 // store SIMD16s
1846 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1847
1848 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1849
1850 #else
1851 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1852 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1853
1854 #endif
1855 outputElt += 1;
1856 #else
1857 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1858 #endif
1859
1860 // reset to the next vVertexElement to output
1861 currentVertexElement = 0;
1862 }
1863
1864 }
1865
1866 // offset base to the next component in the vertex to gather
1867 pStreamBase = GEP(pStreamBase, C((char)4));
1868 }
1869 }
1870 break;
1871 }
1872 }
1873 }
1874
1875 // if we have a partially filled vVertexElement struct, output it
1876 if (currentVertexElement > 0)
1877 {
1878 #if USE_SIMD16_GATHERS
1879 #if USE_SIMD16_BUILDER
1880 // store SIMD16s
1881 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1882
1883 StoreVertexElements2(pVtxOut2, outputElt, currentVertexElement, pVtxSrc2);
1884
1885 #else
1886 StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements);
1887 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2);
1888
1889 #endif
1890 outputElt += 1;
1891 #else
1892 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1893 #endif
1894 }
1895 }
1896
1897 //////////////////////////////////////////////////////////////////////////
1898 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1899 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1900 /// support
1901 /// @param pIndices - pointer to 8 bit indices
1902 /// @param pLastIndex - pointer to last valid index
1903 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1904 {
1905     // can fit 4 8 bit integers per vWidth lane
1906 Value* vIndices = VUNDEF_I();
1907
1908 // store 0 index on stack to be used to conditionally load from if index address is OOB
1909 Value* pZeroIndex = ALLOCA(mInt8Ty);
1910 STORE(C((uint8_t)0), pZeroIndex);
1911
1912 // Load a SIMD of index pointers
1913 for(int64_t lane = 0; lane < mVWidth; lane++)
1914 {
1915 // Calculate the address of the requested index
1916 Value *pIndex = GEP(pIndices, C(lane));
1917
1918 // check if the address is less than the max index,
1919 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1920
1921 // if valid, load the index. if not, load 0 from the stack
1922 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1923 Value *index = LOAD(pValid, "valid index");
1924
1925         // zero extend the index to 32 bits and insert into the correct simd lane
1926 index = Z_EXT(index, mInt32Ty);
1927 vIndices = VINSERT(vIndices, index, lane);
1928 }
1929 return vIndices;
1930 }
1931
1932 //////////////////////////////////////////////////////////////////////////
1933 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1934 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1935 /// support
1936 /// @param pIndices - pointer to 16 bit indices
1937 /// @param pLastIndex - pointer to last valid index
1938 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1939 {
1940 // can fit 2 16 bit integers per vWidth lane
1941 Value* vIndices = VUNDEF_I();
1942
1943 // store 0 index on stack to be used to conditionally load from if index address is OOB
1944 Value* pZeroIndex = ALLOCA(mInt16Ty);
1945 STORE(C((uint16_t)0), pZeroIndex);
1946
1947 // Load a SIMD of index pointers
1948 for(int64_t lane = 0; lane < mVWidth; lane++)
1949 {
1950 // Calculate the address of the requested index
1951 Value *pIndex = GEP(pIndices, C(lane));
1952
1953 // check if the address is less than the max index,
1954 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1955
1956 // if valid, load the index. if not, load 0 from the stack
1957 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1958 Value *index = LOAD(pValid, "valid index");
1959
1960         // zero extend the index to 32 bits and insert into the correct simd lane
1961 index = Z_EXT(index, mInt32Ty);
1962 vIndices = VINSERT(vIndices, index, lane);
1963 }
1964 return vIndices;
1965 }
1966
1967 //////////////////////////////////////////////////////////////////////////
1968 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1969 /// @param pIndices - pointer to 32 bit indices
1970 /// @param pLastIndex - pointer to last valid index
1971 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1972 {
1973 DataLayout dL(JM()->mpCurrentModule);
1974 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1975 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1976 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1977
1978 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1979 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1980 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1981 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1982
1983 // create a vector of index counts from the base index ptr passed into the fetch
1984 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1985 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1986
1987 // compare index count to the max valid index
1988 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1989 // vIndexOffsets 0 1 2 3 4 5 6 7
1990 // ------------------------------
1991 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1992 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1993 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1994 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1995
1996 // VMASKLOAD takes an *i8 src pointer
1997 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1998
1999 // Load the indices; OOB loads 0
2000 return MASKLOADD(pIndices,vIndexMask);
2001 }
2002
2003 //////////////////////////////////////////////////////////////////////////
2004 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
2005 /// denormalizes if needed, converts to F32 if needed, and positions in
2006 /// the proper SIMD rows to be output to the simdvertex structure
2007 /// @param args: (tuple of args, listed below)
2008 /// @param vGatherResult - 8 gathered 8bpc vertices
2009 /// @param pVtxOut - base pointer to output simdvertex struct
2010 /// @param extendType - sign extend or zero extend
2011 /// @param conversionType - conversion (normalize/scale) to apply, if any
2012 /// @param currentVertexElement - reference to the current vVertexElement
2013 /// @param outputElt - reference to the current simdvertex offset we're outputting to
2014 /// @param compMask - component packing mask
2015 /// @param compCtrl - component control val
2016 /// @param vVertexElements[4] - vertex components to output
2017 /// @param swizzle[4] - component swizzle location
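///
/// For example, an RGBA8 attribute arrives as one packed xyzw byte quad per gathered
/// 32bit lane; the shuffles below regroup the bytes per component, extend them to 32
/// bits, and (for normalized formats) scale by 1/255 (unsigned) or 1/127 (signed).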
2018 #if USE_SIMD16_SHADERS
2019 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
2020 #else
2021 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
2022 #endif
2023 {
2024 // Unpack tuple args
2025 Value*& vGatherResult = std::get<0>(args);
2026 Value* pVtxOut = std::get<1>(args);
2027 const Instruction::CastOps extendType = std::get<2>(args);
2028 const ConversionType conversionType = std::get<3>(args);
2029 uint32_t &currentVertexElement = std::get<4>(args);
2030 uint32_t &outputElt = std::get<5>(args);
2031 const ComponentEnable compMask = std::get<6>(args);
2032 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
2033 Value* (&vVertexElements)[4] = std::get<8>(args);
2034 const uint32_t (&swizzle)[4] = std::get<9>(args);
2035
2036 // cast types
2037 Type* vGatherTy = mSimdInt32Ty;
2038 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
2039
2040 // have to do extra work for sign extending
2041 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
2042         Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 16x8bit ints in a 128bit lane
2043 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2044
2045 // shuffle mask, including any swizzling
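        // the gather leaves each 32bit lane holding packed xyzw bytes; selecting bytes
        // n, n+4, n+8 and n+12 pulls the same (swizzled) component from four adjacent
        // vertices so it lands contiguously within each 128bit half of the register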
2046 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
2047 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
2048 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
2049 char(y), char(y+4), char(y+8), char(y+12),
2050 char(z), char(z+4), char(z+8), char(z+12),
2051 char(w), char(w+4), char(w+8), char(w+12),
2052 char(x), char(x+4), char(x+8), char(x+12),
2053 char(y), char(y+4), char(y+8), char(y+12),
2054 char(z), char(z+4), char(z+8), char(z+12),
2055 char(w), char(w+4), char(w+8), char(w+12)});
2056
2057 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
2058 // after pshufb: group components together in each 128bit lane
2059 // 256i - 0 1 2 3 4 5 6 7
2060 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
2061
2062 Value* vi128XY = nullptr;
2063 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
2064 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
2065 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
2066 // 256i - 0 1 2 3 4 5 6 7
2067 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
2068 }
2069
2070 // do the same for zw components
2071 Value* vi128ZW = nullptr;
2072 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
2073 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
2074 }
2075
2076 // init denormalize variables if needed
2077 Instruction::CastOps fpCast;
2078 Value* conversionFactor;
2079
2080 switch (conversionType)
2081 {
2082 case CONVERT_NORMALIZED:
2083 fpCast = Instruction::CastOps::SIToFP;
2084 conversionFactor = VIMMED1((float)(1.0 / 127.0));
2085 break;
2086 case CONVERT_SSCALED:
2087 fpCast = Instruction::CastOps::SIToFP;
2088 conversionFactor = VIMMED1((float)(1.0));
2089 break;
2090 case CONVERT_USCALED:
2091 SWR_INVALID("Type should not be sign extended!");
2092 conversionFactor = nullptr;
2093 break;
2094 default:
2095 SWR_ASSERT(conversionType == CONVERT_NONE);
2096 conversionFactor = nullptr;
2097 break;
2098 }
2099
2100         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2101 for (uint32_t i = 0; i < 4; i++)
2102 {
2103 if (isComponentEnabled(compMask, i))
2104 {
2105 if (compCtrl[i] == ComponentControl::StoreSrc)
2106 {
2107 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2108 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2109 // if x or y, use vi128XY permute result, else use vi128ZW
2110 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2111
2112 // sign extend
2113 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
2114
2115 // denormalize if needed
2116 if (conversionType != CONVERT_NONE)
2117 {
2118 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2119 }
2120 currentVertexElement++;
2121 }
2122 else
2123 {
2124 #if USE_SIMD16_SHADERS
2125 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2126 #else
2127 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2128 #endif
2129 }
2130
2131 if (currentVertexElement > 3)
2132 {
2133 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2134 // reset to the next vVertexElement to output
2135 currentVertexElement = 0;
2136 }
2137 }
2138 }
2139 }
2140 // else zero extend
2141 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2142 {
2143 // init denormalize variables if needed
2144 Instruction::CastOps fpCast;
2145 Value* conversionFactor;
2146
2147 switch (conversionType)
2148 {
2149 case CONVERT_NORMALIZED:
2150 fpCast = Instruction::CastOps::UIToFP;
2151 conversionFactor = VIMMED1((float)(1.0 / 255.0));
2152 break;
2153 case CONVERT_USCALED:
2154 fpCast = Instruction::CastOps::UIToFP;
2155 conversionFactor = VIMMED1((float)(1.0));
2156 break;
2157 case CONVERT_SSCALED:
2158 SWR_INVALID("Type should not be zero extended!");
2159 conversionFactor = nullptr;
2160 break;
2161 default:
2162 SWR_ASSERT(conversionType == CONVERT_NONE);
2163 conversionFactor = nullptr;
2164 break;
2165 }
2166
2167 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
2168 for (uint32_t i = 0; i < 4; i++)
2169 {
2170 if (isComponentEnabled(compMask, i))
2171 {
2172 if (compCtrl[i] == ComponentControl::StoreSrc)
2173 {
2174 // pshufb masks for each component
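                    // each mask keeps only byte swizzle[i] of every packed 32bit lane and
                    // zeroes the rest, which extracts the component and zero extends it to
                    // 32 bits in a single pshufb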
2175 Value* vConstMask;
2176 switch (swizzle[i])
2177 {
2178 case 0:
2179 // x shuffle mask
2180 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2181 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2182 break;
2183 case 1:
2184 // y shuffle mask
2185 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2186 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2187 break;
2188 case 2:
2189 // z shuffle mask
2190 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2191 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2192 break;
2193 case 3:
2194 // w shuffle mask
2195 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2196 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2197 break;
2198 default:
2199 vConstMask = nullptr;
2200 break;
2201 }
2202
2203 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
2204 // after pshufb for x channel
2205 // 256i - 0 1 2 3 4 5 6 7
2206 // x000 x000 x000 x000 x000 x000 x000 x000
2207
2208 // denormalize if needed
2209 if (conversionType != CONVERT_NONE)
2210 {
2211 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2212 }
2213 currentVertexElement++;
2214 }
2215 else
2216 {
2217 #if USE_SIMD16_SHADERS
2218 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2219 #else
2220 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2221 #endif
2222 }
2223
2224 if (currentVertexElement > 3)
2225 {
2226 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2227 // reset to the next vVertexElement to output
2228 currentVertexElement = 0;
2229 }
2230 }
2231 }
2232 }
2233 else
2234 {
2235 SWR_INVALID("Unsupported conversion type");
2236 }
2237 }
2238
2239 #if USE_SIMD16_BUILDER
2240 void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args)
2241 {
2242 // Unpack tuple args
2243 Value*& vGatherResult = std::get<0>(args);
2244 Value* pVtxOut = std::get<1>(args);
2245 const Instruction::CastOps extendType = std::get<2>(args);
2246 const ConversionType conversionType = std::get<3>(args);
2247 uint32_t &currentVertexElement = std::get<4>(args);
2248 uint32_t &outputElt = std::get<5>(args);
2249 const ComponentEnable compMask = std::get<6>(args);
2250 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2251 Value* (&vVertexElements)[4] = std::get<8>(args);
2252 const uint32_t(&swizzle)[4] = std::get<9>(args);
2253
2254 // cast types
2255 Type *vGatherTy = mSimdInt32Ty;
2256 Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2257
2258 // have to do extra work for sign extending
2259 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
2260 {
2261         Type *v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 16x8bit ints in a 128bit lane
2262 Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2263
2264 // shuffle mask, including any swizzling
2265 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
2266 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
2267 Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
2268 char(y), char(y + 4), char(y + 8), char(y + 12),
2269 char(z), char(z + 4), char(z + 8), char(z + 12),
2270 char(w), char(w + 4), char(w + 8), char(w + 12),
2271 char(x), char(x + 4), char(x + 8), char(x + 12),
2272 char(y), char(y + 4), char(y + 8), char(y + 12),
2273 char(z), char(z + 4), char(z + 8), char(z + 12),
2274 char(w), char(w + 4), char(w + 8), char(w + 12) });
2275
2276         // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
2277
2278 Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
2279 Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);
2280
2281 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2282 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2283
2284 // after pshufb: group components together in each 128bit lane
2285 // 256i - 0 1 2 3 4 5 6 7
2286 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
2287
2288 Value *vi128XY_lo = nullptr;
2289 Value *vi128XY_hi = nullptr;
2290 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
2291 {
2292 vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
2293 vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
2294
2295 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
2296 // 256i - 0 1 2 3 4 5 6 7
2297 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
2298 }
2299
2300 // do the same for zw components
2301 Value *vi128ZW_lo = nullptr;
2302 Value *vi128ZW_hi = nullptr;
2303 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
2304 {
2305 vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
2306 vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
2307 }
2308
2309 // init denormalize variables if needed
2310 Instruction::CastOps fpCast;
2311 Value *conversionFactor;
2312
2313 switch (conversionType)
2314 {
2315 case CONVERT_NORMALIZED:
2316 fpCast = Instruction::CastOps::SIToFP;
2317 conversionFactor = VIMMED1((float)(1.0 / 127.0));
2318 break;
2319 case CONVERT_SSCALED:
2320 fpCast = Instruction::CastOps::SIToFP;
2321 conversionFactor = VIMMED1((float)(1.0));
2322 break;
2323 case CONVERT_USCALED:
2324 SWR_INVALID("Type should not be sign extended!");
2325 conversionFactor = nullptr;
2326 break;
2327 default:
2328 SWR_ASSERT(conversionType == CONVERT_NONE);
2329 conversionFactor = nullptr;
2330 break;
2331 }
2332
2333         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2334 for (uint32_t i = 0; i < 4; i++)
2335 {
2336 if (isComponentEnabled(compMask, i))
2337 {
2338 if (compCtrl[i] == ComponentControl::StoreSrc)
2339 {
2340 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2341 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2342 // if x or y, use vi128XY permute result, else use vi128ZW
2343 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2344 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2345
2346 // sign extend
2347 Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
2348 Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
2349
2350 // denormalize if needed
2351 if (conversionType != CONVERT_NONE)
2352 {
2353 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2354 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2355 }
2356
2357 vVertexElements[currentVertexElement] = VUNDEF2_F();
2358 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2359 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2360
2361 currentVertexElement += 1;
2362 }
2363 else
2364 {
2365 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2366 }
2367
2368 if (currentVertexElement > 3)
2369 {
2370 StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2371 // reset to the next vVertexElement to output
2372 currentVertexElement = 0;
2373 }
2374 }
2375 }
2376 }
2377 // else zero extend
2378 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2379 {
2380 // init denormalize variables if needed
2381 Instruction::CastOps fpCast;
2382 Value *conversionFactor;
2383
2384 switch (conversionType)
2385 {
2386 case CONVERT_NORMALIZED:
2387 fpCast = Instruction::CastOps::UIToFP;
2388 conversionFactor = VIMMED1((float)(1.0 / 255.0));
2389 break;
2390 case CONVERT_USCALED:
2391 fpCast = Instruction::CastOps::UIToFP;
2392 conversionFactor = VIMMED1((float)(1.0));
2393 break;
2394 case CONVERT_SSCALED:
2395 SWR_INVALID("Type should not be zero extended!");
2396 conversionFactor = nullptr;
2397 break;
2398 default:
2399 SWR_ASSERT(conversionType == CONVERT_NONE);
2400 conversionFactor = nullptr;
2401 break;
2402 }
2403
2404 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
2405 for (uint32_t i = 0; i < 4; i++)
2406 {
2407 if (isComponentEnabled(compMask, i))
2408 {
2409 if (compCtrl[i] == ComponentControl::StoreSrc)
2410 {
2411 // pshufb masks for each component
2412 Value *vConstMask;
2413 switch (swizzle[i])
2414 {
2415 case 0:
2416 // x shuffle mask
2417 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2418 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2419 break;
2420 case 1:
2421 // y shuffle mask
2422 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2423 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2424 break;
2425 case 2:
2426 // z shuffle mask
2427 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2428 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2429 break;
2430 case 3:
2431 // w shuffle mask
2432 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2433 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2434 break;
2435 default:
2436 vConstMask = nullptr;
2437 break;
2438 }
2439
2440 Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
2441 Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);
2442
2443 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2444 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2445
2446 // after pshufb for x channel
2447 // 256i - 0 1 2 3 4 5 6 7
2448 // x000 x000 x000 x000 x000 x000 x000 x000
2449
2450 // denormalize if needed
2451 if (conversionType != CONVERT_NONE)
2452 {
2453 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2454 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2455 }
2456
2457 vVertexElements[currentVertexElement] = VUNDEF2_F();
2458 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2459 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2460
2461 currentVertexElement += 1;
2462 }
2463 else
2464 {
2465 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2466 }
2467
2468 if (currentVertexElement > 3)
2469 {
2470 StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2471 // reset to the next vVertexElement to output
2472 currentVertexElement = 0;
2473 }
2474 }
2475 }
2476 }
2477 else
2478 {
2479 SWR_INVALID("Unsupported conversion type");
2480 }
2481 }
2482
2483 #endif
2484 //////////////////////////////////////////////////////////////////////////
2485 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
2486 /// denormalizes if needed, converts to F32 if needed, and positions in
2487 /// the proper SIMD rows to be output to the simdvertex structure
2488 /// @param args: (tuple of args, listed below)
2489 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
2490 /// @param pVtxOut - base pointer to output simdvertex struct
2491 /// @param extendType - sign extend or zero extend
2492 /// @param conversionType - conversion (normalize/scale) to apply, if any
2493 /// @param currentVertexElement - reference to the current vVertexElement
2494 /// @param outputElt - reference to the current simdvertex offset we're outputting to
2495 /// @param compMask - component packing mask
2496 /// @param compCtrl - component control val
2497 /// @param vVertexElements[4] - vertex components to output
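///
/// Note: this path covers both 16bit integer formats (sign/zero extended and optionally
/// normalized by 1/32767 or 1/65535) and FP16 data, which is expanded to FP32 via CVTPH2PS.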
2498 #if USE_SIMD16_SHADERS
2499 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
2500 #else
2501 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
2502 #endif
2503 {
2504 // Unpack tuple args
2505 Value* (&vGatherResult)[2] = std::get<0>(args);
2506 Value* pVtxOut = std::get<1>(args);
2507 const Instruction::CastOps extendType = std::get<2>(args);
2508 const ConversionType conversionType = std::get<3>(args);
2509 uint32_t &currentVertexElement = std::get<4>(args);
2510 uint32_t &outputElt = std::get<5>(args);
2511 const ComponentEnable compMask = std::get<6>(args);
2512 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2513 Value* (&vVertexElements)[4] = std::get<8>(args);
2514
2515 // cast types
2516 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2517 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2518
2519 // have to do extra work for sign extending
2520 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
2521 (extendType == Instruction::CastOps::FPExt))
2522 {
2523         // is this a half-precision (FP16) float?
2524 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2525
2526 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2527 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2528
2529 // shuffle mask
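        // bytes {0,1, 4,5, 8,9, 12,13} collect the low 16bit value (x or z) of each packed
        // lane and bytes {2,3, 6,7, 10,11, 14,15} the high one (y or w), producing the
        // xxxx/yyyy grouping shown below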
2530 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2531 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
2532 Value* vi128XY = nullptr;
2533 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
2534 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
2535 // after pshufb: group components together in each 128bit lane
2536 // 256i - 0 1 2 3 4 5 6 7
2537 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2538
2539 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2540 // after PERMD: move and pack xy components into each 128bit lane
2541 // 256i - 0 1 2 3 4 5 6 7
2542 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2543 }
2544
2545 // do the same for zw components
2546 Value* vi128ZW = nullptr;
2547 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
2548 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
2549 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2550 }
2551
2552 // init denormalize variables if needed
2553 Instruction::CastOps IntToFpCast;
2554 Value* conversionFactor;
2555
2556 switch (conversionType)
2557 {
2558 case CONVERT_NORMALIZED:
2559 IntToFpCast = Instruction::CastOps::SIToFP;
2560 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2561 break;
2562 case CONVERT_SSCALED:
2563 IntToFpCast = Instruction::CastOps::SIToFP;
2564 conversionFactor = VIMMED1((float)(1.0));
2565 break;
2566 case CONVERT_USCALED:
2567 SWR_INVALID("Type should not be sign extended!");
2568 conversionFactor = nullptr;
2569 break;
2570 default:
2571 SWR_ASSERT(conversionType == CONVERT_NONE);
2572 conversionFactor = nullptr;
2573 break;
2574 }
2575
2576         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2577 for (uint32_t i = 0; i < 4; i++)
2578 {
2579 if (isComponentEnabled(compMask, i))
2580 {
2581 if (compCtrl[i] == ComponentControl::StoreSrc)
2582 {
2583 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2584 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2585 // if x or y, use vi128XY permute result, else use vi128ZW
2586 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2587
2588 if (bFP) {
2589                     // extract 128 bit lanes and convert each half-precision component to full float
2590 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2591 }
2592 else {
2593 // extract 128 bit lanes to sign extend each component
2594 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2595
2596 // denormalize if needed
2597 if (conversionType != CONVERT_NONE) {
2598 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2599 }
2600 }
2601 currentVertexElement++;
2602 }
2603 else
2604 {
2605 #if USE_SIMD16_SHADERS
2606 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2607 #else
2608 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2609 #endif
2610 }
2611
2612 if (currentVertexElement > 3)
2613 {
2614 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2615 // reset to the next vVertexElement to output
2616 currentVertexElement = 0;
2617 }
2618 }
2619 }
2620 }
2621 // else zero extend
2622 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2623 {
2624 // pshufb masks for each component
2625 Value* vConstMask[2];
2626 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
2627 // x/z shuffle mask
2628 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2629 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2630 }
2631
2632 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
2633 // y/w shuffle mask
2634 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2635 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
2636 }
2637
2638 // init denormalize variables if needed
2639 Instruction::CastOps fpCast;
2640 Value* conversionFactor;
2641
2642 switch (conversionType)
2643 {
2644 case CONVERT_NORMALIZED:
2645 fpCast = Instruction::CastOps::UIToFP;
2646 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2647 break;
2648 case CONVERT_USCALED:
2649 fpCast = Instruction::CastOps::UIToFP;
2650 conversionFactor = VIMMED1((float)(1.0f));
2651 break;
2652 case CONVERT_SSCALED:
2653 SWR_INVALID("Type should not be zero extended!");
2654 conversionFactor = nullptr;
2655 break;
2656 default:
2657 SWR_ASSERT(conversionType == CONVERT_NONE);
2658 conversionFactor = nullptr;
2659 break;
2660 }
2661
2662 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2663 for (uint32_t i = 0; i < 4; i++)
2664 {
2665 if (isComponentEnabled(compMask, i))
2666 {
2667 if (compCtrl[i] == ComponentControl::StoreSrc)
2668 {
2669 // select correct constMask for x/z or y/w pshufb
2670 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2671 // if x or y, use vi128XY permute result, else use vi128ZW
2672 uint32_t selectedGather = (i < 2) ? 0 : 1;
2673
2674 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2675 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2676 // 256i - 0 1 2 3 4 5 6 7
2677 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2678
2679 // denormalize if needed
2680 if (conversionType != CONVERT_NONE)
2681 {
2682 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2683 }
2684 currentVertexElement++;
2685 }
2686 else
2687 {
2688 #if USE_SIMD16_SHADERS
2689 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2690 #else
2691 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2692 #endif
2693 }
2694
2695 if (currentVertexElement > 3)
2696 {
2697 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2698 // reset to the next vVertexElement to output
2699 currentVertexElement = 0;
2700 }
2701 }
2702 }
2703 }
2704 else
2705 {
2706 SWR_INVALID("Unsupported conversion type");
2707 }
2708 }
2709
2710 #if USE_SIMD16_BUILDER
2711 void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args)
2712 {
2713 // Unpack tuple args
2714 Value* (&vGatherResult)[2] = std::get<0>(args);
2715 Value* pVtxOut = std::get<1>(args);
2716 const Instruction::CastOps extendType = std::get<2>(args);
2717 const ConversionType conversionType = std::get<3>(args);
2718 uint32_t &currentVertexElement = std::get<4>(args);
2719 uint32_t &outputElt = std::get<5>(args);
2720 const ComponentEnable compMask = std::get<6>(args);
2721 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2722 Value* (&vVertexElements)[4] = std::get<8>(args);
2723
2724 // cast types
2725 Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2726 Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2727
2728 // have to do extra work for sign extending
2729 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
2730 {
2731         // is this a half-precision (FP16) float?
2732 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2733
2734 Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2735 Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2736
2737 // shuffle mask
2738 Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2739 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
2740 Value *vi128XY = nullptr;
2741 Value *vi128XY_lo = nullptr;
2742 Value *vi128XY_hi = nullptr;
2743 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
2744 {
2745             // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
2746
2747 Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[0], 0);
2748 Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[0], 1);
2749
2750 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2751 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2752
2753 // after pshufb: group components together in each 128bit lane
2754 // 256i - 0 1 2 3 4 5 6 7
2755 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2756
2757 vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2758 vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2759
2760 // after PERMD: move and pack xy components into each 128bit lane
2761 // 256i - 0 1 2 3 4 5 6 7
2762 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2763 #if 0
2764
2765 vi128XY = VUNDEF2_I();
2766 vi128XY = INSERT2_I(vi128XY, vi128XY_lo, 0);
2767 vi128XY = INSERT2_I(vi128XY, vi128XY_hi, 1);
2768 #endif
2769 }
2770
2771 // do the same for zw components
2772 Value *vi128ZW = nullptr;
2773 Value *vi128ZW_lo = nullptr;
2774 Value *vi128ZW_hi = nullptr;
2775 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
2776 {
2777 Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[1], 0);
2778 Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[1], 1);
2779
2780 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2781 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2782
2783 vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2784 vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2785 #if 0
2786
2787 vi128ZW = VUNDEF2_I();
2788 vi128ZW = INSERT2_I(vi128ZW, vi128ZW_lo, 0);
2789 vi128ZW = INSERT2_I(vi128ZW, vi128ZW_hi, 1);
2790 #endif
2791 }
2792
2793 // init denormalize variables if needed
2794 Instruction::CastOps IntToFpCast;
2795 Value *conversionFactor;
2796
2797 switch (conversionType)
2798 {
2799 case CONVERT_NORMALIZED:
2800 IntToFpCast = Instruction::CastOps::SIToFP;
2801 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2802 break;
2803 case CONVERT_SSCALED:
2804 IntToFpCast = Instruction::CastOps::SIToFP;
2805 conversionFactor = VIMMED1((float)(1.0));
2806 break;
2807 case CONVERT_USCALED:
2808 SWR_INVALID("Type should not be sign extended!");
2809 conversionFactor = nullptr;
2810 break;
2811 default:
2812 SWR_ASSERT(conversionType == CONVERT_NONE);
2813 conversionFactor = nullptr;
2814 break;
2815 }
2816
2817         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2818 for (uint32_t i = 0; i < 4; i++)
2819 {
2820 if (isComponentEnabled(compMask, i))
2821 {
2822 if (compCtrl[i] == ComponentControl::StoreSrc)
2823 {
2824 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2825 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2826 // if x or y, use vi128XY permute result, else use vi128ZW
2827 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2828 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2829
2830 if (bFP)
2831 {
2832                         // extract 128 bit lanes and convert each half-precision component to full float
2833 Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2834 Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2835
2836 vVertexElements[currentVertexElement] = VUNDEF2_F();
2837 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2838 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2839 }
2840 else
2841 {
2842 // extract 128 bit lanes to sign extend each component
2843 Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2844 Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2845
2846 // denormalize if needed
2847 if (conversionType != CONVERT_NONE)
2848 {
2849 temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2850 temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2851 }
2852
2853 vVertexElements[currentVertexElement] = VUNDEF2_F();
2854 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2855 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2856 }
2857
2858 currentVertexElement += 1;
2859 }
2860 else
2861 {
2862 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2863 }
2864
2865 if (currentVertexElement > 3)
2866 {
2867 StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2868 // reset to the next vVertexElement to output
2869 currentVertexElement = 0;
2870 }
2871 }
2872 }
2873 }
2874 // else zero extend
2875 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2876 {
2877 // pshufb masks for each component
2878 Value *vConstMask[2];
2879
2880 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
2881 {
2882 // x/z shuffle mask
2883 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2884 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2885 }
2886
2887 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
2888 {
2889 // y/w shuffle mask
2890 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2891 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
2892 }
2893
2894 // init denormalize variables if needed
2895 Instruction::CastOps fpCast;
2896 Value* conversionFactor;
2897
2898 switch (conversionType)
2899 {
2900 case CONVERT_NORMALIZED:
2901 fpCast = Instruction::CastOps::UIToFP;
2902 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2903 break;
2904 case CONVERT_USCALED:
2905 fpCast = Instruction::CastOps::UIToFP;
2906 conversionFactor = VIMMED1((float)(1.0f));
2907 break;
2908 case CONVERT_SSCALED:
2909 SWR_INVALID("Type should not be zero extended!");
2910 conversionFactor = nullptr;
2911 break;
2912 default:
2913 SWR_ASSERT(conversionType == CONVERT_NONE);
2914 conversionFactor = nullptr;
2915 break;
2916 }
2917
2918 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2919 for (uint32_t i = 0; i < 4; i++)
2920 {
2921 if (isComponentEnabled(compMask, i))
2922 {
2923 if (compCtrl[i] == ComponentControl::StoreSrc)
2924 {
2925 // select correct constMask for x/z or y/w pshufb
2926 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2927 // if x or y, use vi128XY permute result, else use vi128ZW
2928 uint32_t selectedGather = (i < 2) ? 0 : 1;
2929
2930                     // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
2931
2932 Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[selectedGather], 0);
2933 Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[selectedGather], 1);
2934
2935 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2936 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2937
2938 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2939 // 256i - 0 1 2 3 4 5 6 7
2940 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2941
2942 // denormalize if needed
2943 if (conversionType != CONVERT_NONE)
2944 {
2945 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2946 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2947 }
2948
2949 vVertexElements[currentVertexElement] = VUNDEF2_F();
2950 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2951 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2952
2953 currentVertexElement += 1;
2954 }
2955 else
2956 {
2957 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2958 }
2959
2960 if (currentVertexElement > 3)
2961 {
2962 StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2963 // reset to the next vVertexElement to output
2964 currentVertexElement = 0;
2965 }
2966 }
2967 }
2968 }
2969 else
2970 {
2971 SWR_INVALID("Unsupported conversion type");
2972 }
2973 }
2974
2975 #endif
2976 //////////////////////////////////////////////////////////////////////////
2977 /// @brief Output a simdvertex worth of elements to the current outputElt
2978 /// @param pVtxOut - base address of VIN output struct
2979 /// @param outputElt - simdvertex offset in VIN to write to
2980 /// @param numEltsToStore - number of simdvertex rows to write out
2981 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2982 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2983 {
2984 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2985
2986 for(uint32_t c = 0; c < numEltsToStore; ++c)
2987 {
2988 // STORE expects FP32 x vWidth type, just bitcast if needed
2989 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2990 {
2991 #if FETCH_DUMP_VERTEX
2992 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
2993 #endif
2994 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2995 }
2996 #if FETCH_DUMP_VERTEX
2997 else
2998 {
2999 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
3000 }
3001 #endif
3002 // outputElt * 4 = offsetting by the size of a simdvertex
3003 // + c offsets to a 32bit x vWidth row within the current vertex
3004 #if USE_SIMD16_SHADERS
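    // with SIMD16 shaders a simdvertex spans two SIMD8 halves, so the vertex stride is 8
    // rows and each component occupies 2 rows; the caller writes the second half through
    // GEP(pVtxOut, C(1))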
3005 Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
3006 #else
3007 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
3008 #endif
3009 STORE(vVertexElements[c], dest);
3010 }
3011 }
3012
3013 #if USE_SIMD16_BUILDER
3014 void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
3015 {
3016 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
3017
3018 for (uint32_t c = 0; c < numEltsToStore; ++c)
3019 {
3020 // STORE expects FP32 x vWidth type, just bitcast if needed
3021 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
3022 {
3023 #if FETCH_DUMP_VERTEX
3024 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
3025 #endif
3026 vVertexElements[c] = BITCAST(vVertexElements[c], mSimd2FP32Ty);
3027 }
3028 #if FETCH_DUMP_VERTEX
3029 else
3030 {
3031 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
3032 }
3033 #endif
3034 // outputElt * 4 = offsetting by the size of a simdvertex
3035 // + c offsets to a 32bit x vWidth row within the current vertex
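        // (rows here are full simd16 vectors, matching the mSimd2FP32Ty elements stored below)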
3036 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
3037 STORE(vVertexElements[c], dest);
3038 }
3039 }
3040
3041 #endif
3042 //////////////////////////////////////////////////////////////////////////
3043 /// @brief Generates a constant vector of values based on the
3044 /// ComponentControl value
3045 /// @param ctrl - ComponentControl value
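/// (typically used to synthesize components that are not fetched from memory,
/// e.g. defaulting a missing alpha component to 1.0f via Store1Fp)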
3046 #if USE_SIMD16_SHADERS
3047 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
3048 #else
3049 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
3050 #endif
3051 {
3052 switch(ctrl)
3053 {
3054 case NoStore: return VUNDEF_I();
3055 case Store0: return VIMMED1(0);
3056 case Store1Fp: return VIMMED1(1.0f);
3057 case Store1Int: return VIMMED1(1);
3058 case StoreVertexId:
3059 {
3060 #if USE_SIMD16_SHADERS
3061 Value* pId;
3062 if (useVertexID2)
3063 {
3064 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
3065 }
3066 else
3067 {
3068 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
3069 }
3070 #else
3071 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
3072 #endif
3073 return VBROADCAST(pId);
3074 }
3075 case StoreInstanceId:
3076 {
3077 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
3078 return VBROADCAST(pId);
3079 }
3080 case StoreSrc:
3081 default: SWR_INVALID("Invalid component control"); return VUNDEF_I();
3082 }
3083 }
3084
3085 #if USE_SIMD16_BUILDER
3086 Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl)
3087 {
3088 switch (ctrl)
3089 {
3090 case NoStore: return VUNDEF2_I();
3091 case Store0: return VIMMED2_1(0);
3092 case Store1Fp: return VIMMED2_1(1.0f);
3093 case Store1Int: return VIMMED2_1(1);
3094 case StoreVertexId:
3095 {
3096 Value* pId = VUNDEF2_F();
3097
3098 Value* pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
3099 Value* pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
3100
3101 pId = INSERT2_F(pId, pId_lo, 0);
3102 pId = INSERT2_F(pId, pId_hi, 1);
3103
3104 return VBROADCAST2(pId);
3105 }
3106 case StoreInstanceId:
3107 {
3108 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
3109 return VBROADCAST2(pId);
3110 }
3111 case StoreSrc:
3112 default: SWR_INVALID("Invalid component control"); return VUNDEF2_I();
3113 }
3114 }
3115
3116 #endif
3117 //////////////////////////////////////////////////////////////////////////
3118 /// @brief Returns the enable mask for the specified component.
3119 /// @param enableMask - enable bits
3120 /// @param component - component to check if enabled.
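/// e.g. an enableMask with only X and Y set reports components 0 and 1 as enabled
/// and components 2 and 3 as disabled.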
3121 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
3122 {
3123 switch (component)
3124 {
3125 // X
3126 case 0: return (enableMask & ComponentEnable::X);
3127 // Y
3128 case 1: return (enableMask & ComponentEnable::Y);
3129 // Z
3130 case 2: return (enableMask & ComponentEnable::Z);
3131 // W
3132 case 3: return (enableMask & ComponentEnable::W);
3133
3134 default: return false;
3135 }
3136 }
3137
3138
3139 //////////////////////////////////////////////////////////////////////////
3140 /// @brief JITs from fetch shader IR
3141 /// @param hJitMgr - JitManager handle
3142 /// @param hFunc - handle to the LLVM function IR
3143 /// @return PFN_FETCH_FUNC - pointer to fetch code
3144 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
3145 {
3146     const llvm::Function* func = reinterpret_cast<const llvm::Function*>(hFunc);
3147 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
3148 PFN_FETCH_FUNC pfnFetch;
3149
3150 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
3151     // MCJIT finalizes modules the first time code is JITed from them; once finalized, no new IR can be added to the module
3152 pJitMgr->mIsModuleFinalized = true;
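    // (hence JitCompileFetch below sets up a fresh module via SetupNewModule() for every compile)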
3153
3154 #if defined(KNOB_SWRC_TRACING)
3155 char fName[1024];
3156 const char *funcName = func->getName().data();
3157     snprintf(fName, sizeof(fName), "%s.bin", funcName);
3158 FILE *fd = fopen(fName, "wb");
3159 fwrite((void *)pfnFetch, 1, 2048, fd);
3160 fclose(fd);
3161 #endif
3162
3163 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
3164
3165 return pfnFetch;
3166 }
3167
3168 //////////////////////////////////////////////////////////////////////////
3169 /// @brief JIT compiles fetch shader
3170 /// @param hJitMgr - JitManager handle
3171 /// @param state - fetch state to build function from
3172 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
3173 {
3174 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
3175
3176 pJitMgr->SetupNewModule();
3177
3178 FetchJit theJit(pJitMgr);
3179 HANDLE hFunc = theJit.Create(state);
3180
3181 return JitFetchFunc(hJitMgr, hFunc);
3182 }
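
// Usage sketch (illustrative only, not part of the original source): a driver would
// typically fill out a FETCH_COMPILE_STATE once per unique vertex layout, compile it,
// and cache the result, e.g.
//
//     FETCH_COMPILE_STATE fetchState = {};   // populated from the pipeline state (hypothetical setup)
//     PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, fetchState);
//
// The returned pfnFetch is then invoked per draw with a SWR_FETCH_CONTEXT and the
// vertex output buffer described above.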