1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "jit_api.h"
33 #include "fetch_jit.h"
34 #include "gen_state_llvm.h"
35
36 //#define FETCH_DUMP_VERTEX 1
37 using namespace llvm;
38 using namespace SwrJit;
39
40 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
41
42 enum ConversionType
43 {
44 CONVERT_NONE,
45 CONVERT_NORMALIZED,
46 CONVERT_USCALED,
47 CONVERT_SSCALED,
48 CONVERT_SFIXED,
49 };
50
51 #if USE_SIMD16_SHADERS
52 #define USE_SIMD16_GATHERS 0
53 #endif
54
55 //////////////////////////////////////////////////////////////////////////
56 /// Interface to Jitting a fetch shader
57 //////////////////////////////////////////////////////////////////////////
58 struct FetchJit :
59 public Builder
60 {
61 FetchJit(JitManager* pJitMgr) :
62 Builder(pJitMgr)
63 {}
64
65 Function* Create(const FETCH_COMPILE_STATE& fetchState);
66
67 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
68 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
69 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
70
71 // package up Shuffle*bpcGatherd args into a tuple for convenience
72 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
73 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
74 const uint32_t(&)[4]> Shuffle8bpcArgs;
75
76 #if USE_SIMD16_SHADERS
77 #if USE_SIMD16_GATHERS
78 void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args);
79 #else
80 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
81 #endif
82 #else
83 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
84 #endif
85
86 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
87 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
88
89 #if USE_SIMD16_SHADERS
90 #if USE_SIMD16_GATHERS
91 void Shuffle16bpcGather16(Shuffle16bpcArgs &args);
92 #else
93 void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
94 #endif
95 #else
96 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
97 #endif
98
99 #if USE_SIMD16_GATHERS
100 void StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
101 #else
102 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
103 #endif
104
105 #if USE_SIMD16_SHADERS
106 #if USE_SIMD16_GATHERS
107 Value *GenerateCompCtrlVector16(const ComponentControl ctrl);
108 #else
109 Value *GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
110 #endif
111 #else
112 Value *GenerateCompCtrlVector(const ComponentControl ctrl);
113 #endif
114
115 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
116
117 #if USE_SIMD16_SHADERS
118 #if USE_SIMD16_GATHERS
119 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
120 #else
121 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
122 #endif
123 #else
124 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
125 #endif
126
127 bool IsOddFormat(SWR_FORMAT format);
128 bool IsUniformFormat(SWR_FORMAT format);
129 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
130 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
131 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
132
133 Value* mpFetchInfo;
134 };
135
136 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
137 {
138 std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
139 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
140
141 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
142 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
143
144 fetch->getParent()->setModuleIdentifier(fetch->getName());
145
146 IRB()->SetInsertPoint(entry);
147
148 auto argitr = fetch->arg_begin();
149
150 // Fetch shader arguments
151 mpPrivateContext = &*argitr; ++argitr;
152 mpPrivateContext->setName("privateContext");
153
154 mpFetchInfo = &*argitr; ++argitr;
155 mpFetchInfo->setName("fetchInfo");
156 Value* pVtxOut = &*argitr;
157 pVtxOut->setName("vtxOutput");
158 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
159 // index 0 (just the pointer to the simdvertex structure)
160 // index 1 (which element of the simdvertex structure to offset to; in this case 0)
161 // so it doesn't matter that the indices are i32's
162 // TODO: generate this GEP with a VECTOR structure type so this makes sense
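// A minimal sketch of what the GEP below amounts to in LLVM IR, assuming vtxOutput is typed
// as a pointer to the simdvertex struct (the exact type name here is illustrative only):
//
//   %base = getelementptr %simdvertex, %simdvertex* %vtxOutput, i32 0
//
// i.e. "element 0 of the pointed-to object" -- just the base address -- which is why the
// integer width chosen for the index constants is irrelevant.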
163 std::vector<Value*> vtxInputIndices(2, C(0));
164 // GEP
165 pVtxOut = GEP(pVtxOut, C(0));
166 #if USE_SIMD16_SHADERS
167 #if 0// USE_SIMD16_BUILDER
168 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
169 #else
170 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
171 #endif
172 #else
173 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
174 #endif
175
176 // SWR_FETCH_CONTEXT::pStreams
177 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
178 streams->setName("pStreams");
179
180 // SWR_FETCH_CONTEXT::pIndices
181 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
182 indices->setName("pIndices");
183
184 // SWR_FETCH_CONTEXT::pLastIndex
185 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
186 pLastIndex->setName("pLastIndex");
187
188
189 Value* vIndices;
190 #if USE_SIMD16_SHADERS
191 Value* indices2;
192 Value* vIndices2;
193 #endif
194 switch(fetchState.indexType)
195 {
196 case R8_UINT:
197 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
198 #if USE_SIMD16_SHADERS
199 indices2 = GEP(indices, C(8));
200 #endif
201 if(fetchState.bDisableIndexOOBCheck)
202 {
203 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
204 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
205 #if USE_SIMD16_SHADERS
206 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
207 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
208 #endif
209 }
210 else
211 {
212 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
213 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
214 #if USE_SIMD16_SHADERS
215 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
216 vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
217 #endif
218 }
219 break;
220 case R16_UINT:
221 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
222 #if USE_SIMD16_SHADERS
223 indices2 = GEP(indices, C(8));
224 #endif
225 if(fetchState.bDisableIndexOOBCheck)
226 {
227 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
228 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
229 #if USE_SIMD16_SHADERS
230 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
231 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
232 #endif
233 }
234 else
235 {
236 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
237 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
238 #if USE_SIMD16_SHADERS
239 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
240 vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
241 #endif
242 }
243 break;
244 case R32_UINT:
245 #if USE_SIMD16_SHADERS
246 indices2 = GEP(indices, C(8));
247 #endif
248 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
249 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
250 #if USE_SIMD16_SHADERS
251 (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
252 : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
253 #endif
254 break; // incoming type is already 32bit int
255 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
256 }
257
258 if(fetchState.bForceSequentialAccessEnable)
259 {
260 Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
261
262 // VertexData buffers are accessed sequentially, the index is equal to the vertex number
263 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
264 vIndices = ADD(vIndices, pOffsets);
265 #if USE_SIMD16_SHADERS
266 vIndices2 = ADD(vIndices, VIMMED1(8));
267 #endif
268 }
269
270 Value* vVertexId = vIndices;
271 #if USE_SIMD16_SHADERS
272 Value* vVertexId2 = vIndices2;
273 #endif
274 if (fetchState.bVertexIDOffsetEnable)
275 {
276 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
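// Worked example (illustrative values): for an indexed draw with BaseVertex = 3 and
// StartVertex = 0, lane i produces VertexID[i] = index[i] + 3 + 0; the assumption above
// is that whichever of the two offsets is not in use is 0, so the sum reduces to the
// single offset that actually applies.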
277 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
278 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
279 vVertexId = ADD(vIndices, vBaseVertex);
280 vVertexId = ADD(vVertexId, vStartVertex);
281 #if USE_SIMD16_SHADERS
282 vVertexId2 = ADD(vIndices2, vBaseVertex);
283 vVertexId2 = ADD(vVertexId2, vStartVertex);
284 #endif
285 }
286
287 // store out vertex IDs
288 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
289 #if USE_SIMD16_SHADERS
290 STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
291 #endif
292
293 // store out cut mask if enabled
294 if (fetchState.bEnableCutIndex)
295 {
296 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
297 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
298 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
299 #if USE_SIMD16_SHADERS
300 Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
301 STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
302 #endif
303 }
304
305 // Fetch attributes from memory and output to a simdvertex struct
306 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
307 #if USE_SIMD16_SHADERS
308 if (fetchState.bDisableVGATHER)
309 {
310 JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
311 JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
312 }
313 else
314 {
315 #if USE_SIMD16_GATHERS
316 JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false);
317 #else
318 JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
319 JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
320 #endif
321 }
322 #else
323 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
324 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
325 #endif
326
327 RET_VOID();
328
329 JitManager::DumpToFile(fetch, "src");
330
331 #if defined(_DEBUG)
332 verifyFunction(*fetch);
333 #endif
334
335 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
336
337 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
338 setupPasses.add(createBreakCriticalEdgesPass());
339 setupPasses.add(createCFGSimplificationPass());
340 setupPasses.add(createEarlyCSEPass());
341 setupPasses.add(createPromoteMemoryToRegisterPass());
342
343 setupPasses.run(*fetch);
344
345 JitManager::DumpToFile(fetch, "se");
346
347 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
348
349 ///@todo Haven't touched these either. Need to remove some of these and add others.
350 optPasses.add(createCFGSimplificationPass());
351 optPasses.add(createEarlyCSEPass());
352 optPasses.add(createInstructionCombiningPass());
353 optPasses.add(createInstructionSimplifierPass());
354 optPasses.add(createConstantPropagationPass());
355 optPasses.add(createSCCPPass());
356 optPasses.add(createAggressiveDCEPass());
357
358 optPasses.run(*fetch);
359 optPasses.run(*fetch);
360
361 JitManager::DumpToFile(fetch, "opt");
362
363
364 return fetch;
365 }
366
367 //////////////////////////////////////////////////////////////////////////
368 /// @brief Loads attributes from memory using LOADs, shuffling the
369 /// components into SOA form.
370 /// *Note* currently does not support component control,
371 /// component packing, instancing
372 /// @param fetchState - info about attributes to be fetched from memory
373 /// @param streams - value pointer to the current vertex stream
374 /// @param vIndices - vector value of indices to load
375 /// @param pVtxOut - value pointer to output simdvertex struct
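/// A sketch of the AOS -> SOA reshaping performed below (assuming an 8-wide simd):
///   input,  per vertex:  v0 = {x0 y0 z0 w0}, v1 = {x1 y1 z1 w1}, ..., v7 = {x7 y7 z7 w7}
///   output, per attrib:  X = {x0..x7}, Y = {y0..y7}, Z = {z0..z7}, W = {w0..w7}
/// i.e. each component becomes contiguous across lanes in its simdvertex slot.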
376 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
377 {
378 // Zack shuffles; a variant of the Charleston.
379
380 std::vector<Value*> vectors(16);
381 std::vector<Constant*> pMask(mVWidth);
382 for(uint32_t i = 0; i < mVWidth; ++i)
383 {
384 pMask[i] = (C(i < 4 ? i : 4));
385 }
386 Constant* promoteMask = ConstantVector::get(pMask);
387 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
388
389 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
390 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
391 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
392 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
393 curInstance->setName("curInstance");
394
395 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
396 {
397 Value* elements[4] = {0};
398 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
399 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
400 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
401 uint32_t numComponents = info.numComps;
402 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
403
404 // load path doesn't support component packing
405 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
406
407 vectors.clear();
408
409 if (fetchState.bInstanceIDOffsetEnable)
410 {
411 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down");
412 }
413
414 Value *vCurIndices;
415 Value *startOffset;
416 if(ied.InstanceEnable)
417 {
418 Value* stepRate = C(ied.InstanceAdvancementState);
419
420 // prevent a div by 0 for 0 step rate
421 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
422 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
423
424 // calc the current offset into instanced data buffer
425 Value* calcInstance = UDIV(curInstance, stepRate);
426
427 // if step rate is 0, every instance gets instance 0
428 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
429
430 vCurIndices = VBROADCAST(calcInstance);
431
432 startOffset = startInstance;
433 }
434 else if (ied.InstanceStrideEnable)
435 {
436 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
437 }
438 else
439 {
440 // offset indices by baseVertex
441 vCurIndices = ADD(vIndices, vBaseVertex);
442
443 startOffset = startVertex;
444 }
445
446 // load SWR_VERTEX_BUFFER_STATE::pData
447 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
448
449 // load SWR_VERTEX_BUFFER_STATE::pitch
450 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
451 stride = Z_EXT(stride, mInt64Ty);
452
453 // load SWR_VERTEX_BUFFER_STATE::size
454 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
455 size = Z_EXT(size, mInt64Ty);
456
457 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
458
459 Value *minVertex = NULL;
460 Value *minVertexOffset = NULL;
461 if (fetchState.bPartialVertexBuffer) {
462 // fetch min index for low bounds checking
463 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
464 minVertex = LOAD(minVertex);
465 if (!fetchState.bDisableIndexOOBCheck) {
466 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
467 }
468 }
469
470 // Load from the stream.
471 for(uint32_t lane = 0; lane < mVWidth; ++lane)
472 {
473 // Get index
474 Value* index = VEXTRACT(vCurIndices, C(lane));
475
476 if (fetchState.bPartialVertexBuffer) {
477 // clamp below minvertex
478 Value *isBelowMin = ICMP_SLT(index, minVertex);
479 index = SELECT(isBelowMin, minVertex, index);
480 }
481
482 index = Z_EXT(index, mInt64Ty);
483
484 Value* offset = MUL(index, stride);
485 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
486 offset = ADD(offset, startVertexOffset);
487
488 if (!fetchState.bDisableIndexOOBCheck) {
489 // check for out-of-bounds access, including partial OOB, and replace such offsets with minVertex
490 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
491 Value *oob = ICMP_ULE(endOffset, size);
492 if (fetchState.bPartialVertexBuffer) {
493 offset = SELECT(oob, offset, minVertexOffset);
494 } else {
495 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
496 }
497 }
498
499 Value* pointer = GEP(stream, offset);
500 // We load a full 4-component lane here even though we may not need every component.
501 Value* vptr = 0;
502
503 // get a pointer to a 4 component attrib in default address space
504 switch(bpc)
505 {
506 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
507 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
508 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
509 default: SWR_INVALID("Unsupported underlying bpp!");
510 }
511
512 // load 4 components of attribute
513 Value* vec = ALIGNED_LOAD(vptr, 1, false);
514
515 // Convert To FP32 internally
516 switch(info.type[0])
517 {
518 case SWR_TYPE_UNORM:
519 switch(bpc)
520 {
521 case 8:
522 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
523 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
524 break;
525 case 16:
526 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
527 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
528 break;
529 default:
530 SWR_INVALID("Unsupported underlying type!");
531 break;
532 }
533 break;
534 case SWR_TYPE_SNORM:
535 switch(bpc)
536 {
537 case 8:
538 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
539 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
540 break;
541 case 16:
542 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
543 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
544 break;
545 default:
546 SWR_INVALID("Unsupported underlying type!");
547 break;
548 }
549 break;
550 case SWR_TYPE_UINT:
551 // Zero extend uint32_t types.
552 switch(bpc)
553 {
554 case 8:
555 case 16:
556 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
557 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
558 break;
559 case 32:
560 break; // Pass through unchanged.
561 default:
562 SWR_INVALID("Unsupported underlying type!");
563 break;
564 }
565 break;
566 case SWR_TYPE_SINT:
567 // Sign extend SINT types.
568 switch(bpc)
569 {
570 case 8:
571 case 16:
572 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
573 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
574 break;
575 case 32:
576 break; // Pass through unchanged.
577 default:
578 SWR_INVALID("Unsupported underlying type!");
579 break;
580 }
581 break;
582 case SWR_TYPE_FLOAT:
583 switch(bpc)
584 {
585 case 32:
586 break; // Pass through unchanged.
587 default:
588 SWR_INVALID("Unsupported underlying type!");
589 }
590 break;
591 case SWR_TYPE_USCALED:
592 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
593 break;
594 case SWR_TYPE_SSCALED:
595 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
596 break;
597 case SWR_TYPE_SFIXED:
598 vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
599 break;
600 case SWR_TYPE_UNKNOWN:
601 case SWR_TYPE_UNUSED:
602 SWR_INVALID("Unsupported type %d!", info.type[0]);
603 }
604
605 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
606 // uwvec: 4 x F32, undef value
607 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
608 vectors.push_back(wvec);
609 }
610
611 std::vector<Constant*> v01Mask(mVWidth);
612 std::vector<Constant*> v23Mask(mVWidth);
613 std::vector<Constant*> v02Mask(mVWidth);
614 std::vector<Constant*> v13Mask(mVWidth);
615
616 // Concatenate the vectors together.
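// For reference (a sketch, not extra code): with mVWidth == 8 the shuffle masks built
// below evaluate to
//   v01Mask = {0,1,8,9, 4,5,12,13}    v23Mask = {2,3,10,11, 6,7,14,15}
//   v02Mask = {0,2,8,10, 4,6,12,14}   v13Mask = {1,3,9,11, 5,7,13,15}
// which together perform the 4x4 transpose that turns the per-vertex xyzw vectors into
// per-component (SOA) vectors.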
617 elements[0] = VUNDEF_F();
618 elements[1] = VUNDEF_F();
619 elements[2] = VUNDEF_F();
620 elements[3] = VUNDEF_F();
621 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
622 {
623 v01Mask[4 * b + 0] = C(0 + 4 * b);
624 v01Mask[4 * b + 1] = C(1 + 4 * b);
625 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
626 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
627
628 v23Mask[4 * b + 0] = C(2 + 4 * b);
629 v23Mask[4 * b + 1] = C(3 + 4 * b);
630 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
631 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
632
633 v02Mask[4 * b + 0] = C(0 + 4 * b);
634 v02Mask[4 * b + 1] = C(2 + 4 * b);
635 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
636 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
637
638 v13Mask[4 * b + 0] = C(1 + 4 * b);
639 v13Mask[4 * b + 1] = C(3 + 4 * b);
640 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
641 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
642
643 std::vector<Constant*> iMask(mVWidth);
644 for(uint32_t i = 0; i < mVWidth; ++i)
645 {
646 if(((4 * b) <= i) && (i < (4 * (b + 1))))
647 {
648 iMask[i] = C(i % 4 + mVWidth);
649 }
650 else
651 {
652 iMask[i] = C(i);
653 }
654 }
655 Constant* insertMask = ConstantVector::get(iMask);
656 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
657 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
658 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
659 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
660 }
661
662 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
663 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
664 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
665 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
666 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
667 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
668 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
669 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
670
671 switch(numComponents + 1)
672 {
673 case 1: elements[0] = VIMMED1(0.0f);
674 case 2: elements[1] = VIMMED1(0.0f);
675 case 3: elements[2] = VIMMED1(0.0f);
676 case 4: elements[3] = VIMMED1(1.0f);
677 }
678
679 for(uint32_t c = 0; c < 4; ++c)
680 {
681 #if USE_SIMD16_SHADERS
682 Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
683 #else
684 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
685 #endif
686 STORE(elements[c], dest);
687 }
688 }
689 }
690
691 // returns true for odd formats that require special gather handling
692 bool FetchJit::IsOddFormat(SWR_FORMAT format)
693 {
694 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
695 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
696 {
697 return true;
698 }
699 return false;
700 }
701
702 // format is uniform if all components are the same size and type
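// e.g. a 16:16:16:16 UNORM format is uniform (every component is 16-bit UNORM), while
// 5:6:5 or 10:10:10:2 layouts are not, since the component sizes differ.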
703 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
704 {
705 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
706 uint32_t bpc0 = info.bpc[0];
707 uint32_t type0 = info.type[0];
708
709 for (uint32_t c = 1; c < info.numComps; ++c)
710 {
711 if (bpc0 != info.bpc[c] || type0 != info.type[c])
712 {
713 return false;
714 }
715 }
716 return true;
717 }
718
719 // unpacks components based on format
720 // foreach component in the pixel
721 // mask off everything but this component
722 // shift component to LSB
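// Worked example (illustrative 5:6:5 layout): the loop below computes
//   c = 0: compBits = 5, bitmask = 0x001F, shift  0
//   c = 1: compBits = 6, bitmask = 0x07E0, shift  5
//   c = 2: compBits = 5, bitmask = 0xF800, shift 11
// and each isolated component is written to result[info.swizzle[c]].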
723 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
724 {
725 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
726
727 uint32_t bitOffset = 0;
728 for (uint32_t c = 0; c < info.numComps; ++c)
729 {
730 uint32_t swizzledIndex = info.swizzle[c];
731 uint32_t compBits = info.bpc[c];
732 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
733 Value* comp = AND(vInput, bitmask);
734 comp = LSHR(comp, bitOffset);
735
736 result[swizzledIndex] = comp;
737 bitOffset += compBits;
738 }
739 }
740
741 // gather for odd component size formats
742 // gather a full SIMD of whole pixels (one pixel per lane), then shift/mask to move each
743 // component into its own vector
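// e.g. (illustrative) R10G10B10A2_UNORM is "odd": 32 bpp overall but 10/10/10/2-bit
// components, so whole pixels are fetched per lane and split apart via UnpackComponents.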
744 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
745 {
746 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
747
748 // only works if pixel size is <= 32bits
749 SWR_ASSERT(info.bpp <= 32);
750
751 Value *pGather;
752 if (info.bpp == 32)
753 {
754 pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
755 }
756 else
757 {
758 // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
759 Value *pMem = ALLOCA(mSimdInt32Ty);
760 STORE(VIMMED1(0u), pMem);
761
762 pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
763 Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
764
765 for (uint32_t lane = 0; lane < mVWidth; ++lane)
766 {
767 // Get index
768 Value* index = VEXTRACT(pOffsets, C(lane));
769 Value* mask = VEXTRACT(pMask, C(lane));
770 switch (info.bpp)
771 {
772 case 8:
773 {
774 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
775 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
776 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
777 break;
778 }
779
780 case 16:
781 {
782 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
783 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
784 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
785 break;
786 }
787 break;
788
789 case 24:
790 {
791 // First 16-bits of data
792 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
793 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
794 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
795
796 // Last 8-bits of data
797 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
798 pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
799 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
800 break;
801 }
802
803 default:
804 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
805 break;
806 }
807 }
808
809 pGather = LOAD(pMem);
810 }
811
812 for (uint32_t comp = 0; comp < 4; ++comp)
813 {
814 pResult[comp] = VIMMED1((int)info.defaults[comp]);
815 }
816
817 UnpackComponents(format, pGather, pResult);
818
819 // cast to fp32
820 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
821 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
822 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
823 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
824 }
825
826 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
827 {
828 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
829
830 for (uint32_t c = 0; c < info.numComps; ++c)
831 {
832 uint32_t compIndex = info.swizzle[c];
833
834 // skip any conversion on UNUSED components
835 if (info.type[c] == SWR_TYPE_UNUSED)
836 {
837 continue;
838 }
839
840 if (info.isNormalized[c])
841 {
842 if (info.type[c] == SWR_TYPE_SNORM)
843 {
844 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
845
846 /// result = c * (1.0f / (2^(n-1) - 1))
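/// e.g. for an 8-bit SNORM component: n = 8, scale = 1/127, so a raw value of 127
/// converts to 1.0f and -127 converts to -1.0f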
847 uint32_t n = info.bpc[c];
848 uint32_t pow2 = 1 << (n - 1);
849 float scale = 1.0f / (float)(pow2 - 1);
850 Value *vScale = VIMMED1(scale);
851 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
852 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
853 texels[compIndex] = FMUL(texels[compIndex], vScale);
854 }
855 else
856 {
857 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
858
859 /// result = c * (1.0f / (2^n - 1))
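/// e.g. for an 8-bit UNORM component: n = 8, scale = 1/255, so a raw value of 255
/// converts to 1.0f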
860 uint32_t n = info.bpc[c];
861 uint32_t pow2 = 1 << n;
862 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
863 if (n == 24)
864 {
865 float scale = (float)(pow2 - 1);
866 Value* vScale = VIMMED1(scale);
867 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
868 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
869 texels[compIndex] = FDIV(texels[compIndex], vScale);
870 }
871 else
872 {
873 float scale = 1.0f / (float)(pow2 - 1);
874 Value *vScale = VIMMED1(scale);
875 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
876 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
877 texels[compIndex] = FMUL(texels[compIndex], vScale);
878 }
879 }
880 continue;
881 }
882 }
883 }
884
885 //////////////////////////////////////////////////////////////////////////
886 /// @brief Loads attributes from memory using AVX2 GATHER(s)
887 /// @param fetchState - info about attributes to be fetched from memory
888 /// @param streams - value pointer to the current vertex stream
889 /// @param vIndices - vector value of indices to gather
890 /// @param pVtxOut - value pointer to output simdvertex struct
891 #if USE_SIMD16_SHADERS
892 #if USE_SIMD16_GATHERS
893 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
894 Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2)
895 #else
896 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
897 Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
898 #endif
899 #else
900 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
901 Value* streams, Value* vIndices, Value* pVtxOut)
902 #endif
903 {
904 uint32_t currentVertexElement = 0;
905 uint32_t outputElt = 0;
906 Value* vVertexElements[4];
907 #if USE_SIMD16_GATHERS
908 Value *pVtxSrc2[4];
909 #endif
910
911 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
912 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
913 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
914 #if USE_SIMD16_GATHERS
915 Value* vBaseVertex16 = VBROADCAST_16(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
916 #else
917 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
918 #endif
919 curInstance->setName("curInstance");
920
921 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
922 {
923 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
924
925 // skip element if all components are disabled
926 if (ied.ComponentPacking == ComponentEnable::NONE)
927 {
928 continue;
929 }
930
931 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
932 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
933 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
934
935 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
936
937 // VGATHER* takes an *i8 src pointer
938 Value *pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
939
940 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
941 #if USE_SIMD16_GATHERS
942 Value *vStride16 = VBROADCAST_16(stride);
943 #else
944 Value *vStride = VBROADCAST(stride);
945 #endif
946
947 // max vertex index that is fully in bounds
948 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
949 maxVertex = LOAD(maxVertex);
950
951 Value *minVertex = NULL;
952 if (fetchState.bPartialVertexBuffer)
953 {
954 // min vertex index for low bounds OOB checking
955 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
956 minVertex = LOAD(minVertex);
957 }
958
959 if (fetchState.bInstanceIDOffsetEnable)
960 {
961 // the InstanceID (curInstance) value is offset by StartInstanceLocation
962 curInstance = ADD(curInstance, startInstance);
963 }
964
965 #if USE_SIMD16_GATHERS
966 Value *vCurIndices16;
967 #else
968 Value *vCurIndices;
969 #endif
970 Value *startOffset;
971 #if USE_SIMD16_GATHERS
972 Value *vInstanceStride16 = VIMMED1_16(0);
973 #else
974 Value *vInstanceStride = VIMMED1(0);
975 #endif
976
977 if (ied.InstanceEnable)
978 {
979 Value* stepRate = C(ied.InstanceAdvancementState);
980
981 // prevent a div by 0 for 0 step rate
982 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
983 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
984
985 // calc the current offset into instanced data buffer
986 Value* calcInstance = UDIV(curInstance, stepRate);
987
988 // if step rate is 0, every instance gets instance 0
989 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
990
991 #if USE_SIMD16_GATHERS
992 vCurIndices16 = VBROADCAST_16(calcInstance);
993 #else
994 vCurIndices = VBROADCAST(calcInstance);
995 #endif
996
997 startOffset = startInstance;
998 }
999 else if (ied.InstanceStrideEnable)
1000 {
1001 // grab the instance advancement state, which determines the stride in bytes from one instance to the next
1002 Value* stepRate = C(ied.InstanceAdvancementState);
1003 #if USE_SIMD16_GATHERS
1004 vInstanceStride16 = VBROADCAST_16(MUL(curInstance, stepRate));
1005 #else
1006 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
1007 #endif
1008
1009 // offset indices by baseVertex
1010 #if USE_SIMD16_GATHERS
1011 Value *vIndices16 = JOIN_16(vIndices, vIndices2);
1012
1013 vCurIndices16 = ADD(vIndices16, vBaseVertex16);
1014 #else
1015 vCurIndices = ADD(vIndices, vBaseVertex);
1016 #endif
1017
1018 startOffset = startVertex;
1019 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
1020 }
1021 else
1022 {
1023 // offset indices by baseVertex
1024 #if USE_SIMD16_GATHERS
1025 Value *vIndices16 = JOIN_16(vIndices, vIndices2);
1026
1027 vCurIndices16 = ADD(vIndices16, vBaseVertex16);
1028 #else
1029 vCurIndices = ADD(vIndices, vBaseVertex);
1030 #endif
1031
1032 startOffset = startVertex;
1033 }
1034
1035 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
1036 // do 64bit address offset calculations.
1037
1038 // calculate byte offset to the start of the VB
1039 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
1040 pStreamBase = GEP(pStreamBase, baseOffset);
1041
1042 // if we have a start offset, subtract from max vertex. Used for OOB check
1043 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
1044 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
1045 // if we have a negative value, we're already OOB. clamp at 0.
1046 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
1047
1048 if (fetchState.bPartialVertexBuffer)
1049 {
1050 // similarly for min vertex
1051 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
1052 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
1053 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
1054 }
1055
1056 // Load the in bounds size of a partially valid vertex
1057 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
1058 partialInboundsSize = LOAD(partialInboundsSize);
1059 #if USE_SIMD16_GATHERS
1060 Value *vPartialVertexSize = VBROADCAST_16(partialInboundsSize);
1061 Value *vBpp = VBROADCAST_16(C(info.Bpp));
1062 Value *vAlignmentOffsets = VBROADCAST_16(C(ied.AlignedByteOffset));
1063 #else
1064 Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
1065 Value *vBpp = VBROADCAST(C(info.Bpp));
1066 Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
1067 #endif
1068
1069 // is the element <= the partially valid size
1070 Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
1071
1072 #if USE_SIMD16_GATHERS
1073 // override cur indices with 0 if pitch is 0
1074 Value *pZeroPitchMask16 = ICMP_EQ(vStride16, VIMMED1_16(0));
1075 vCurIndices16 = SELECT(pZeroPitchMask16, VIMMED1_16(0), vCurIndices16);
1076
1077 // are vertices partially OOB?
1078 Value *vMaxVertex16 = VBROADCAST_16(maxVertex);
1079 Value *vPartialOOBMask = ICMP_EQ(vCurIndices16, vMaxVertex16);
1080
1081 // are vertices fully in bounds?
1082 Value *vMaxGatherMask16 = ICMP_ULT(vCurIndices16, vMaxVertex16);
1083
1084 Value *vGatherMask16;
1085
1086 if (fetchState.bPartialVertexBuffer)
1087 {
1088 // are vertices below minVertex limit?
1089 Value *vMinVertex16 = VBROADCAST_16(minVertex);
1090 Value *vMinGatherMask16 = ICMP_UGE(vCurIndices16, vMinVertex16);
1091
1092 // only fetch lanes that pass both tests
1093 vGatherMask16 = AND(vMaxGatherMask16, vMinGatherMask16);
1094 }
1095 else
1096 {
1097 vGatherMask16 = vMaxGatherMask16;
1098 }
1099
1100 // blend in any partially OOB indices that have valid elements
1101 vGatherMask16 = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask16);
1102
1103 // calculate the actual offsets into the VB
1104 Value *vOffsets16 = MUL(vCurIndices16, vStride16);
1105 vOffsets16 = ADD(vOffsets16, vAlignmentOffsets);
1106
1107 // if instance stride enable is:
1108 // true - add the product of the instanceID and advancement state to the offset into the VB
1109 // false - the value of vInstanceStride has been initialized to zero
1110 vOffsets16 = ADD(vOffsets16, vInstanceStride16);
1111
1112 // TODO: remove the following simd8 interop stuff once all code paths are fully widened to SIMD16..
1113
1114 Value *vGatherMask = EXTRACT_16(vGatherMask16, 0);
1115 Value *vGatherMask2 = EXTRACT_16(vGatherMask16, 1);
1116
1117 Value *vOffsets = EXTRACT_16(vOffsets16, 0);
1118 Value *vOffsets2 = EXTRACT_16(vOffsets16, 1);
1119 #else
1120 // override cur indices with 0 if pitch is 0
1121 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
1122 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
1123
1124 // are vertices partially OOB?
1125 Value* vMaxVertex = VBROADCAST(maxVertex);
1126 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
1127
1128 // are vertices fully in bounds?
1129 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
1130
1131 Value *vGatherMask;
1132 if (fetchState.bPartialVertexBuffer)
1133 {
1134 // are vertices below minVertex limit?
1135 Value *vMinVertex = VBROADCAST(minVertex);
1136 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
1137
1138 // only fetch lanes that pass both tests
1139 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
1140 }
1141 else
1142 {
1143 vGatherMask = vMaxGatherMask;
1144 }
1145
1146 // blend in any partially OOB indices that have valid elements
1147 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1148
1149 // calculate the actual offsets into the VB
1150 Value* vOffsets = MUL(vCurIndices, vStride);
1151 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1152
1153 // if instance stride enable is:
1154 // true - add the product of the instanceID and advancement state to the offset into the VB
1155 // false - the value of vInstanceStride has been initialized to zero
1156 vOffsets = ADD(vOffsets, vInstanceStride);
1157
1158 #endif
1159 // Packing and component control
1160 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
1161 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
1162 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
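// Illustrative example (names per the ComponentControl enum): an element packing only X
// and Y with compCtrl = { StoreSrc, StoreSrc, Store0, Store1Fp } lands in the simdvertex
// slot as { x, y, 0.0f, 1.0f } -- gathered where StoreSrc is set, constants generated by
// GenerateCompCtrlVector elsewhere.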
1163
1164 // Special gather/conversion for formats without equal component sizes
1165 if (IsOddFormat((SWR_FORMAT)ied.Format))
1166 {
1167 #if USE_SIMD16_GATHERS
1168 Value *pResults[4];
1169 Value *pResults2[4];
1170 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1171 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
1172 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1173 ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
1174
1175 for (uint32_t c = 0; c < 4; c += 1)
1176 {
1177 if (isComponentEnabled(compMask, c))
1178 {
1179 // pack adjacent pairs of SIMD8s into SIMD16s
1180 pVtxSrc2[currentVertexElement++] = JOIN_16(pResults[c], pResults2[c]);
1181
1182 if (currentVertexElement > 3)
1183 {
1184 // store SIMD16s
1185 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1186
1187 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1188 // reset to the next vVertexElement to output
1189 currentVertexElement = 0;
1190 }
1191 }
1192 }
1193 #else
1194 Value *pResults[4];
1195 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1196 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1197
1198 for (uint32_t c = 0; c < 4; c += 1)
1199 {
1200 if (isComponentEnabled(compMask, c))
1201 {
1202 vVertexElements[currentVertexElement++] = pResults[c];
1203 if (currentVertexElement > 3)
1204 {
1205 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1206 // reset to the next vVertexElement to output
1207 currentVertexElement = 0;
1208 }
1209 }
1210 }
1211 #endif
1212 }
1213 else if(info.type[0] == SWR_TYPE_FLOAT)
1214 {
1215 ///@todo: support 64 bit vb accesses
1216 Value *gatherSrc = VIMMED1(0.0f);
1217 #if USE_SIMD16_GATHERS
1218 Value *gatherSrc16 = VIMMED1_16(0.0f);
1219 #endif
1220
1221 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1222 "Unsupported format for standard gather fetch.");
1223
1224 // Gather components from memory to store in a simdvertex structure
1225 switch (bpc)
1226 {
1227 case 16:
1228 {
1229 #if USE_SIMD16_GATHERS
1230 Value *gatherResult[2];
1231
1232 // if we have at least one component out of x or y to fetch
1233 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1234 {
1235 gatherResult[0] = GATHERPS_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1236
1237 // e.g. result of first 8x32bit integer gather for 16bit components
1238 // 256i - 0 1 2 3 4 5 6 7
1239 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1240 //
1241 }
1242 else
1243 {
1244 gatherResult[0] = VUNDEF_I_16();
1245 }
1246
1247 // if we have at least one component out of z or w to fetch
1248 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1249 {
1250 // offset base to the next components(zw) in the vertex to gather
1251 pStreamBase = GEP(pStreamBase, C((char)4));
1252
1253 gatherResult[1] = GATHERPS_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1254
1255 // e.g. result of second 8x32bit integer gather for 16bit components
1256 // 256i - 0 1 2 3 4 5 6 7
1257 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1258 //
1259 }
1260 else
1261 {
1262 gatherResult[1] = VUNDEF_I_16();
1263 }
1264
1265 // if we have at least one component to shuffle into place
1266 if (compMask)
1267 {
1268 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1269
1270 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
1271 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1272
1273 // Shuffle gathered components into place in simdvertex struct
1274 Shuffle16bpcGather16(args); // outputs to vVertexElements ref
1275 }
1276 #else
1277 Value *vGatherResult[2];
1278
1279 // if we have at least one component out of x or y to fetch
1280 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1281 {
1282 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1283 // e.g. result of first 8x32bit integer gather for 16bit components
1284 // 256i - 0 1 2 3 4 5 6 7
1285 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1286 //
1287 }
1288
1289 // if we have at least one component out of z or w to fetch
1290 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1291 {
1292 // offset base to the next components(zw) in the vertex to gather
1293 pStreamBase = GEP(pStreamBase, C((char)4));
1294
1295 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1296 // e.g. result of second 8x32bit integer gather for 16bit components
1297 // 256i - 0 1 2 3 4 5 6 7
1298 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1299 //
1300 }
1301
1302 // if we have at least one component to shuffle into place
1303 if (compMask)
1304 {
1305 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1306 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1307
1308 // Shuffle gathered components into place in simdvertex struct
1309 #if USE_SIMD16_SHADERS
1310 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1311 #else
1312 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1313 #endif
1314 }
1315 #endif
1316 }
1317 break;
1318 case 32:
1319 {
1320 for (uint32_t i = 0; i < 4; i += 1)
1321 {
1322 #if USE_SIMD16_GATHERS
1323 if (isComponentEnabled(compMask, i))
1324 {
1325 // if we need to gather the component
1326 if (compCtrl[i] == StoreSrc)
1327 {
1328 // Gather a SIMD of vertices
1329 // APIs allow a 4GB range for offsets
1330 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1331 // But, we know that elements must be aligned for FETCH. :)
1332 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
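// Worked example: a byte offset of 0x80000004 (> 2GB) would be negative as a signed
// 32-bit index; stored pre-shifted as 0x40000002 and gathered with scale = 2, it
// reconstitutes 0x40000002 * 2 = 0x80000004 with no sign-extension problem (valid
// because aligned element offsets are always even).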
1333 Value *shiftedOffsets16 = LSHR(vOffsets16, 1);
1334 pVtxSrc2[currentVertexElement++] = GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets16, vGatherMask16, 2);
1335 }
1336 else
1337 {
1338 pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
1339 }
1340
1341 if (currentVertexElement > 3)
1342 {
1343 // store SIMD16s
1344 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1345
1346 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1347 // reset to the next vVertexElement to output
1348 currentVertexElement = 0;
1349 }
1350 }
1351
1352 // offset base to the next component in the vertex to gather
1353 pStreamBase = GEP(pStreamBase, C((char)4));
1354 #else
1355 if (isComponentEnabled(compMask, i))
1356 {
1357 // if we need to gather the component
1358 if (compCtrl[i] == StoreSrc)
1359 {
1360 // Gather a SIMD of vertices
1361 // APIs allow a 4GB range for offsets
1362 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1363 // But, we know that elements must be aligned for FETCH. :)
1364 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
1365 Value *vShiftedOffsets = LSHR(vOffsets, 1);
1366 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
1367 }
1368 else
1369 {
1370 #if USE_SIMD16_SHADERS
1371 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1372 #else
1373 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1374 #endif
1375 }
1376
1377 if (currentVertexElement > 3)
1378 {
1379 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1380 // reset to the next vVertexElement to output
1381 currentVertexElement = 0;
1382 }
1383 }
1384
1385 // offset base to the next component in the vertex to gather
1386 pStreamBase = GEP(pStreamBase, C((char)4));
1387 #endif
1388 }
1389 }
1390 break;
1391 case 64:
1392 {
1393 for (uint32_t i = 0; i < 4; i += 1)
1394 {
1395 #if USE_SIMD16_GATHERS
1396 if (isComponentEnabled(compMask, i))
1397 {
1398 // if we need to gather the component
1399 if (compCtrl[i] == StoreSrc)
1400 {
1401 Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1402 Value *vMaskLo2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1403 Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1404 Value *vMaskHi2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1405
1406 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1407 Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
1408 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1409 Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
1410
1411 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1412
1413 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1414 Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2);
1415 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1416 Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2);
1417
1418 pGatherLo = VCVTPD2PS(pGatherLo);
1419 pGatherLo2 = VCVTPD2PS(pGatherLo2);
1420 pGatherHi = VCVTPD2PS(pGatherHi);
1421 pGatherHi2 = VCVTPD2PS(pGatherHi2);
1422
1423 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1424 Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1425
1426 // pack adjacent pairs of SIMD8s into SIMD16s
1427 pVtxSrc2[currentVertexElement++] = JOIN_16(pGather, pGather2);
1428 }
1429 else
1430 {
1431 pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
1432 }
1433
1434 if (currentVertexElement > 3)
1435 {
1436 // store SIMD16s
1437 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1438
1439 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1440 // reset to the next vVertexElement to output
1441 currentVertexElement = 0;
1442 }
1443 }
1444
1445 // offset base to the next component in the vertex to gather
1446 pStreamBase = GEP(pStreamBase, C((char)8));
1447 #else
1448 if (isComponentEnabled(compMask, i))
1449 {
1450 // if we need to gather the component
1451 if (compCtrl[i] == StoreSrc)
1452 {
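// A sketch of the 64-bit path below: each 256-bit GATHERPD returns only 4 doubles, so an
// 8-wide float result takes two gathers (lanes 0-3 via the "Lo" mask/offsets, lanes 4-7
// via "Hi"); each half is converted to 4 floats with VCVTPD2PS and the halves are then
// recombined into one 8-wide float vector by VSHUFFLE.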
1453 Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
1454 Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
1455
1456 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1457 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1458
1459 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1460
1461 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1462 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1463
1464 pGatherLo = VCVTPD2PS(pGatherLo);
1465 pGatherHi = VCVTPD2PS(pGatherHi);
1466
1467 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
1468
1469 vVertexElements[currentVertexElement++] = pGather;
1470 }
1471 else
1472 {
1473 #if USE_SIMD16_SHADERS
1474 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1475 #else
1476 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1477 #endif
1478 }
1479
1480 if (currentVertexElement > 3)
1481 {
1482 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1483 // reset to the next vVertexElement to output
1484 currentVertexElement = 0;
1485 }
1486 }
1487
1488 // offset base to the next component in the vertex to gather
1489 pStreamBase = GEP(pStreamBase, C((char)8));
1490 #endif
1491 }
1492 }
1493 break;
1494 default:
1495 SWR_INVALID("Tried to fetch invalid FP format");
1496 break;
1497 }
1498 }
1499 else
1500 {
1501 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
1502 ConversionType conversionType = CONVERT_NONE;
1503
1504 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1505 "Unsupported format for standard gather fetch.");
1506
1507 switch(info.type[0])
1508 {
1509 case SWR_TYPE_UNORM:
1510 conversionType = CONVERT_NORMALIZED;
1511 case SWR_TYPE_UINT:
1512 extendCastType = Instruction::CastOps::ZExt;
1513 break;
1514 case SWR_TYPE_SNORM:
1515 conversionType = CONVERT_NORMALIZED;
1516 case SWR_TYPE_SINT:
1517 extendCastType = Instruction::CastOps::SExt;
1518 break;
1519 case SWR_TYPE_USCALED:
1520 conversionType = CONVERT_USCALED;
1521 extendCastType = Instruction::CastOps::UIToFP;
1522 break;
1523 case SWR_TYPE_SSCALED:
1524 conversionType = CONVERT_SSCALED;
1525 extendCastType = Instruction::CastOps::SIToFP;
1526 break;
1527 case SWR_TYPE_SFIXED:
1528 conversionType = CONVERT_SFIXED;
1529 extendCastType = Instruction::CastOps::SExt;
1530 break;
1531 default:
1532 break;
1533 }
1534
1535 // value substituted when component of gather is masked
1536 Value* gatherSrc = VIMMED1(0);
1537 #if USE_SIMD16_GATHERS
1538 Value *gatherSrc16 = VIMMED1_16(0);
1539 #endif
1540
1541 // Gather components from memory to store in a simdvertex structure
1542 switch (bpc)
1543 {
1544 case 8:
1545 {
1546 // if we have at least one component to fetch
1547 if (compMask)
1548 {
1549 #if USE_SIMD16_GATHERS
1550 Value *gatherResult = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1551
1552 // e.g. result of an 8x32bit integer gather for 8bit components
1553 // 256i - 0 1 2 3 4 5 6 7
1554 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1555
1556 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1557
1558 Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1559 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle);
1560
1561 // Shuffle gathered components into place in simdvertex struct
1562 Shuffle8bpcGatherd16(args); // outputs to vVertexElements ref
1563 #else
1564 Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1565 // e.g. result of an 8x32bit integer gather for 8bit components
1566 // 256i - 0 1 2 3 4 5 6 7
1567 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1568
1569 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1570 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1571
1572 // Shuffle gathered components into place in simdvertex struct
1573 #if USE_SIMD16_SHADERS
1574 Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
1575 #else
1576 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
1577 #endif
1578 #endif
1579 }
1580 }
1581 break;
1582 case 16:
1583 {
1584 #if USE_SIMD16_GATHERS
1585 Value *gatherResult[2];
1586
1587 // if we have at least one component out of x or y to fetch
1588 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1589 {
1590 gatherResult[0] = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1591
1592 // e.g. result of first 8x32bit integer gather for 16bit components
1593 // 256i - 0 1 2 3 4 5 6 7
1594 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1595 //
1596 }
1597 else
1598 {
1599 gatherResult[0] = VUNDEF_I_16();
1600 }
1601
1602 // if we have at least one component out of z or w to fetch
1603 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1604 {
1605 // offset base to the next components (zw) in the vertex to gather
1606 pStreamBase = GEP(pStreamBase, C((char)4));
1607
1608 gatherResult[1] = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1609
1610 // e.g. result of second 8x32bit integer gather for 16bit components
1611 // 256i - 0 1 2 3 4 5 6 7
1612 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1613 //
1614 }
1615 else
1616 {
1617 gatherResult[1] = VUNDEF_I_16();
1618 }
1619
1620 // if we have at least one component to shuffle into place
1621 if (compMask)
1622 {
1623 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1624
1625 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1626 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1627
1628 // Shuffle gathered components into place in simdvertex struct
1629 Shuffle16bpcGather16(args); // outputs to vVertexElements ref
1630 }
1631 #else
1632 Value *vGatherResult[2];
1633
1634 // if we have at least one component out of x or y to fetch
1635 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1636 {
1637 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1638 // e.g. result of first 8x32bit integer gather for 16bit components
1639 // 256i - 0 1 2 3 4 5 6 7
1640 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1641 //
1642 }
1643
1644 // if we have at least one component out of z or w to fetch
1645 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1646 {
1647 // offset base to the next components (zw) in the vertex to gather
1648 pStreamBase = GEP(pStreamBase, C((char)4));
1649
1650 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1651 // e.g. result of second 8x32bit integer gather for 16bit components
1652 // 256i - 0 1 2 3 4 5 6 7
1653 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1654 //
1655 }
1656
1657 // if we have at least one component to shuffle into place
1658 if (compMask)
1659 {
1660 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1661 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1662
1663 // Shuffle gathered components into place in simdvertex struct
1664 #if USE_SIMD16_SHADERS
1665 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1666 #else
1667 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1668 #endif
1669 }
1670 #endif
1671 }
1672 break;
1673 case 32:
1674 {
1675 // Gather components into place in the simdvertex struct
1676 for (uint32_t i = 0; i < 4; i++)
1677 {
1678 if (isComponentEnabled(compMask, i))
1679 {
1680 // if we need to gather the component
1681 if (compCtrl[i] == StoreSrc)
1682 {
1683 #if USE_SIMD16_GATHERS
1684 Value *pGather = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1685
1686 if (conversionType == CONVERT_USCALED)
1687 {
1688 pGather = UI_TO_FP(pGather, mSimd16FP32Ty);
1689 }
1690 else if (conversionType == CONVERT_SSCALED)
1691 {
1692 pGather = SI_TO_FP(pGather, mSimd16FP32Ty);
1693 }
1694 else if (conversionType == CONVERT_SFIXED)
1695 {
1696 pGather = FMUL(SI_TO_FP(pGather, mSimd16FP32Ty), VBROADCAST_16(C(1 / 65536.0f)));
1697 }
1698
1699 pVtxSrc2[currentVertexElement++] = pGather;
1700
1701 // e.g. result of a single 8x32bit integer gather for 32bit components
1702 // 256i - 0 1 2 3 4 5 6 7
1703 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1704 #else
1705 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1706
1707 if (conversionType == CONVERT_USCALED)
1708 {
1709 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1710 }
1711 else if (conversionType == CONVERT_SSCALED)
1712 {
1713 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1714 }
1715 else if (conversionType == CONVERT_SFIXED)
1716 {
1717 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1718 }
1719
1720 vVertexElements[currentVertexElement++] = pGather;
1721
1722 // e.g. result of a single 8x32bit integer gather for 32bit components
1723 // 256i - 0 1 2 3 4 5 6 7
1724 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1725 #endif
1726 }
1727 else
1728 {
1729 #if USE_SIMD16_GATHERS
1730 pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
1731 #else
1732 #if USE_SIMD16_SHADERS
1733 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1734 #else
1735 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1736 #endif
1737 #endif
1738 }
1739
1740 if (currentVertexElement > 3)
1741 {
1742 #if USE_SIMD16_GATHERS
1743 // store SIMD16s
1744 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1745
1746 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1747 #else
1748 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1749 #endif
1750
1751 // reset to the next vVertexElement to output
1752 currentVertexElement = 0;
1753 }
1754
1755 }
1756
1757 // offset base to the next component in the vertex to gather
1758 pStreamBase = GEP(pStreamBase, C((char)4));
1759 }
1760 }
1761 break;
1762 }
1763 }
1764 }
1765
1766 // if we have a partially filled vVertexElement struct, output it
1767 if (currentVertexElement > 0)
1768 {
1769 #if USE_SIMD16_GATHERS
1770 // store SIMD16s
1771 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1772
1773 StoreVertexElements16(pVtxOut2, outputElt++, currentVertexElement, pVtxSrc2);
1774 #else
1775 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1776 #endif
1777 }
1778 }
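// Illustrative sketch for this listing, not part of the jitter: a scalar
// equivalent of the 32-bit component conversions emitted above. SFIXED is
// treated as 16.16 fixed point, hence the multiply by 1/65536; USCALED and
// SSCALED simply convert the gathered integer to float. The helper name is a
// hypothetical addition for illustration only.
static inline float ConvertScaled32(int32_t gathered, ConversionType conversion)
{
    switch (conversion)
    {
    case CONVERT_USCALED: return static_cast<float>(static_cast<uint32_t>(gathered)); // unsigned integer -> float
    case CONVERT_SSCALED: return static_cast<float>(gathered);                        // signed integer -> float
    case CONVERT_SFIXED:  return static_cast<float>(gathered) * (1.0f / 65536.0f);    // 16.16 fixed -> float, e.g. 0x00018000 -> 1.5f
    default:              return 0.0f; // placeholder: CONVERT_NONE stores the raw gathered bits unmodified
    }
}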
1779
1780 //////////////////////////////////////////////////////////////////////////
1781 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1782 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1783 /// support
1784 /// @param pIndices - pointer to 8 bit indices
1785 /// @param pLastIndex - pointer to last valid index
1786 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1787 {
1788 // can fit 4 8 bit integers per vWidth lane
1789 Value* vIndices = VUNDEF_I();
1790
1791 // store 0 index on stack to be used to conditionally load from if index address is OOB
1792 Value* pZeroIndex = ALLOCA(mInt8Ty);
1793 STORE(C((uint8_t)0), pZeroIndex);
1794
1795 // Load a SIMD of index pointers
1796 for(int64_t lane = 0; lane < mVWidth; lane++)
1797 {
1798 // Calculate the address of the requested index
1799 Value *pIndex = GEP(pIndices, C(lane));
1800
1801 // check if the address is less than the max index,
1802 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1803
1804 // if valid, load the index. if not, load 0 from the stack
1805 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1806 Value *index = LOAD(pValid, "valid index");
1807
1808 // zero extend the index to 32 bits and insert it into the correct simd lane
1809 index = Z_EXT(index, mInt32Ty);
1810 vIndices = VINSERT(vIndices, index, lane);
1811 }
1812 return vIndices;
1813 }
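// Illustrative sketch for this listing, not part of the jitter: the scalar
// logic each lane of the loop above implements. Out-of-bounds lanes read a
// zero stored on the stack instead of reading past the end of the index
// buffer, then the value is zero extended to 32 bits. The helper name is a
// hypothetical addition for illustration only.
static inline uint32_t LoadValidIndex8(const uint8_t* pIndex, const uint8_t* pLastIndex)
{
    static const uint8_t zeroIndex = 0;
    // if the requested address reaches the last valid index, load 0 instead
    const uint8_t* pValid = (pIndex < pLastIndex) ? pIndex : &zeroIndex;
    // zero extend to 32 bits, matching the Z_EXT in the jitted code
    return static_cast<uint32_t>(*pValid);
}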
1814
1815 //////////////////////////////////////////////////////////////////////////
1816 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1817 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1818 /// support
1819 /// @param pIndices - pointer to 16 bit indices
1820 /// @param pLastIndex - pointer to last valid index
1821 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1822 {
1823 // can fit 2 16 bit integers per vWidth lane
1824 Value* vIndices = VUNDEF_I();
1825
1826 // store 0 index on stack to be used to conditionally load from if index address is OOB
1827 Value* pZeroIndex = ALLOCA(mInt16Ty);
1828 STORE(C((uint16_t)0), pZeroIndex);
1829
1830 // Load a SIMD of index pointers
1831 for(int64_t lane = 0; lane < mVWidth; lane++)
1832 {
1833 // Calculate the address of the requested index
1834 Value *pIndex = GEP(pIndices, C(lane));
1835
1836 // check if the address is less than the max index,
1837 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1838
1839 // if valid, load the index. if not, load 0 from the stack
1840 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1841 Value *index = LOAD(pValid, "valid index");
1842
1843 // zero extend the index to 32 bits and insert it into the correct simd lane
1844 index = Z_EXT(index, mInt32Ty);
1845 vIndices = VINSERT(vIndices, index, lane);
1846 }
1847 return vIndices;
1848 }
1849
1850 //////////////////////////////////////////////////////////////////////////
1851 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1852 /// @param pIndices - pointer to 32 bit indices
1853 /// @param pLastIndex - pointer to last valid index
1854 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1855 {
1856 DataLayout dL(JM()->mpCurrentModule);
1857 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1858 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1859 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1860
1861 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1862 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1863 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1864 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1865
1866 // create a vector of index counts from the base index ptr passed into the fetch
1867 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1868 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1869
1870 // compare index count to the max valid index
1871 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1872 // vIndexOffsets 0 1 2 3 4 5 6 7
1873 // ------------------------------
1874 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1875 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1876 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1877 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1878
1879 // VMASKLOAD takes an *i8 src pointer
1880 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1881
1882 // Load the indices; OOB loads 0
1883 return MASKLOADD(pIndices,vIndexMask);
1884 }
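// Illustrative sketch for this listing, not part of the jitter: what the
// masked load above computes per lane of 32-bit indices. The number of valid
// indices is (pLastIndex - pIndices); lanes whose offset reaches past the end
// are masked off and yield 0, matching the VMASKLOAD semantics. The helper
// name is a hypothetical addition for illustration only.
static inline uint32_t LoadValidIndex32(const uint32_t* pIndices, const uint32_t* pLastIndex, uint32_t lane)
{
    // in-bounds lanes load their index, out-of-bounds lanes load 0
    return (pIndices + lane < pLastIndex) ? pIndices[lane] : 0u;
}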
1885
1886 //////////////////////////////////////////////////////////////////////////
1887 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1888 /// denormalizes if needed, converts to F32 if needed, and positions in
1889 /// the proper SIMD rows to be output to the simdvertex structure
1890 /// @param args: (tuple of args, listed below)
1891 /// @param vGatherResult - 8 gathered 8bpc vertices
1892 /// @param pVtxOut - base pointer to output simdvertex struct
1893 /// @param extendType - sign extend or zero extend
1894 /// @param conversionType - conversion to apply (none, normalized, or scaled)
1895 /// @param currentVertexElement - reference to the current vVertexElement
1896 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1897 /// @param compMask - component packing mask
1898 /// @param compCtrl - component control val
1899 /// @param vVertexElements[4] - vertex components to output
1900 /// @param swizzle[4] - component swizzle location
1901 #if USE_SIMD16_GATHERS
1902 void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
1903 {
1904 // Unpack tuple args
1905 Value*& vGatherResult = std::get<0>(args);
1906 Value* pVtxOut = std::get<1>(args);
1907 const Instruction::CastOps extendType = std::get<2>(args);
1908 const ConversionType conversionType = std::get<3>(args);
1909 uint32_t &currentVertexElement = std::get<4>(args);
1910 uint32_t &outputElt = std::get<5>(args);
1911 const ComponentEnable compMask = std::get<6>(args);
1912 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1913 Value* (&vVertexElements)[4] = std::get<8>(args);
1914 const uint32_t(&swizzle)[4] = std::get<9>(args);
1915
1916 // cast types
1917 Type *vGatherTy = mSimdInt32Ty;
1918 Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1919
1920 // have to do extra work for sign extending
1921 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1922 {
1923 Type *v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1924 Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1925
1926 // shuffle mask, including any swizzling
1927 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1928 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1929 Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
1930 char(y), char(y + 4), char(y + 8), char(y + 12),
1931 char(z), char(z + 4), char(z + 8), char(z + 12),
1932 char(w), char(w + 4), char(w + 8), char(w + 12),
1933 char(x), char(x + 4), char(x + 8), char(x + 12),
1934 char(y), char(y + 4), char(y + 8), char(y + 12),
1935 char(z), char(z + 4), char(z + 8), char(z + 12),
1936 char(w), char(w + 4), char(w + 8), char(w + 12) });
1937
1938 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now...
1939
1940 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1941 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1942
1943 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1944 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1945
1946 // after pshufb: group components together in each 128bit lane
1947 // 256i - 0 1 2 3 4 5 6 7
1948 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1949
1950 Value *vi128XY_lo = nullptr;
1951 Value *vi128XY_hi = nullptr;
1952 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1953 {
1954 vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1955 vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1956
1957 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1958 // 256i - 0 1 2 3 4 5 6 7
1959 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1960 }
1961
1962 // do the same for zw components
1963 Value *vi128ZW_lo = nullptr;
1964 Value *vi128ZW_hi = nullptr;
1965 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1966 {
1967 vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1968 vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1969 }
1970
1971 // init denormalize variables if needed
1972 Instruction::CastOps fpCast;
1973 Value *conversionFactor;
1974
1975 switch (conversionType)
1976 {
1977 case CONVERT_NORMALIZED:
1978 fpCast = Instruction::CastOps::SIToFP;
1979 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1980 break;
1981 case CONVERT_SSCALED:
1982 fpCast = Instruction::CastOps::SIToFP;
1983 conversionFactor = VIMMED1((float)(1.0));
1984 break;
1985 case CONVERT_USCALED:
1986 SWR_INVALID("Type should not be sign extended!");
1987 conversionFactor = nullptr;
1988 break;
1989 default:
1990 SWR_ASSERT(conversionType == CONVERT_NONE);
1991 conversionFactor = nullptr;
1992 break;
1993 }
1994
1995 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1996 for (uint32_t i = 0; i < 4; i++)
1997 {
1998 if (isComponentEnabled(compMask, i))
1999 {
2000 if (compCtrl[i] == ComponentControl::StoreSrc)
2001 {
2002 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2003 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2004 // if x or y, use vi128XY permute result, else use vi128ZW
2005 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2006 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2007
2008 // sign extend
2009 Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
2010 Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
2011
2012 // denormalize if needed
2013 if (conversionType != CONVERT_NONE)
2014 {
2015 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2016 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2017 }
2018
2019 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2020
2021 currentVertexElement += 1;
2022 }
2023 else
2024 {
2025 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2026 }
2027
2028 if (currentVertexElement > 3)
2029 {
2030 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2031 // reset to the next vVertexElement to output
2032 currentVertexElement = 0;
2033 }
2034 }
2035 }
2036 }
2037 // else zero extend
2038 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2039 {
2040 // init denormalize variables if needed
2041 Instruction::CastOps fpCast;
2042 Value *conversionFactor;
2043
2044 switch (conversionType)
2045 {
2046 case CONVERT_NORMALIZED:
2047 fpCast = Instruction::CastOps::UIToFP;
2048 conversionFactor = VIMMED1((float)(1.0 / 255.0));
2049 break;
2050 case CONVERT_USCALED:
2051 fpCast = Instruction::CastOps::UIToFP;
2052 conversionFactor = VIMMED1((float)(1.0));
2053 break;
2054 case CONVERT_SSCALED:
2055 SWR_INVALID("Type should not be zero extended!");
2056 conversionFactor = nullptr;
2057 break;
2058 default:
2059 SWR_ASSERT(conversionType == CONVERT_NONE);
2060 conversionFactor = nullptr;
2061 break;
2062 }
2063
2064 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
2065 for (uint32_t i = 0; i < 4; i++)
2066 {
2067 if (isComponentEnabled(compMask, i))
2068 {
2069 if (compCtrl[i] == ComponentControl::StoreSrc)
2070 {
2071 // pshufb masks for each component
2072 Value *vConstMask;
2073 switch (swizzle[i])
2074 {
2075 case 0:
2076 // x shuffle mask
2077 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2078 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2079 break;
2080 case 1:
2081 // y shuffle mask
2082 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2083 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2084 break;
2085 case 2:
2086 // z shuffle mask
2087 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2088 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2089 break;
2090 case 3:
2091 // w shuffle mask
2092 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2093 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2094 break;
2095 default:
2096 vConstMask = nullptr;
2097 break;
2098 }
2099
2100 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
2101 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
2102
2103 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2104 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2105
2106 // after pshufb for x channel
2107 // 256i - 0 1 2 3 4 5 6 7
2108 // x000 x000 x000 x000 x000 x000 x000 x000
2109
2110 // denormalize if needed
2111 if (conversionType != CONVERT_NONE)
2112 {
2113 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2114 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2115 }
2116
2117 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2118
2119 currentVertexElement += 1;
2120 }
2121 else
2122 {
2123 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2124 }
2125
2126 if (currentVertexElement > 3)
2127 {
2128 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2129 // reset to the next vVertexElement to output
2130 currentVertexElement = 0;
2131 }
2132 }
2133 }
2134 }
2135 else
2136 {
2137 SWR_INVALID("Unsupported conversion type");
2138 }
2139 }
2140
2141 #else
2142 #if USE_SIMD16_SHADERS
2143 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
2144 #else
2145 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
2146 #endif
2147 {
2148 // Unpack tuple args
2149 Value*& vGatherResult = std::get<0>(args);
2150 Value* pVtxOut = std::get<1>(args);
2151 const Instruction::CastOps extendType = std::get<2>(args);
2152 const ConversionType conversionType = std::get<3>(args);
2153 uint32_t &currentVertexElement = std::get<4>(args);
2154 uint32_t &outputElt = std::get<5>(args);
2155 const ComponentEnable compMask = std::get<6>(args);
2156 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2157 Value* (&vVertexElements)[4] = std::get<8>(args);
2158 const uint32_t(&swizzle)[4] = std::get<9>(args);
2159
2160 // cast types
2161 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2162
2163 for (uint32_t i = 0; i < 4; i++)
2164 {
2165 if (!isComponentEnabled(compMask, i))
2166 continue;
2167
2168 if (compCtrl[i] == ComponentControl::StoreSrc)
2169 {
2170 std::vector<uint32_t> vShuffleMasks[4] = {
2171 { 0, 4, 8, 12, 16, 20, 24, 28 }, // x
2172 { 1, 5, 9, 13, 17, 21, 25, 29 }, // y
2173 { 2, 6, 10, 14, 18, 22, 26, 30 }, // z
2174 { 3, 7, 11, 15, 19, 23, 27, 31 }, // w
2175 };
2176
2177 Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
2178 UndefValue::get(v32x8Ty),
2179 vShuffleMasks[swizzle[i]]);
2180
2181 if ((extendType == Instruction::CastOps::SExt) ||
2182 (extendType == Instruction::CastOps::SIToFP)) {
2183 switch (conversionType)
2184 {
2185 case CONVERT_NORMALIZED:
2186 val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
2187 break;
2188 case CONVERT_SSCALED:
2189 val = SI_TO_FP(val, mSimdFP32Ty);
2190 break;
2191 case CONVERT_USCALED:
2192 SWR_INVALID("Type should not be sign extended!");
2193 break;
2194 default:
2195 SWR_ASSERT(conversionType == CONVERT_NONE);
2196 val = S_EXT(val, mSimdInt32Ty);
2197 break;
2198 }
2199 }
2200 else if ((extendType == Instruction::CastOps::ZExt) ||
2201 (extendType == Instruction::CastOps::UIToFP)) {
2202 switch (conversionType)
2203 {
2204 case CONVERT_NORMALIZED:
2205 val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
2206 break;
2207 case CONVERT_SSCALED:
2208 SWR_INVALID("Type should not be zero extended!");
2209 break;
2210 case CONVERT_USCALED:
2211 val = UI_TO_FP(val, mSimdFP32Ty);
2212 break;
2213 default:
2214 SWR_ASSERT(conversionType == CONVERT_NONE);
2215 val = Z_EXT(val, mSimdInt32Ty);
2216 break;
2217 }
2218 }
2219 else
2220 {
2221 SWR_INVALID("Unsupported conversion type");
2222 }
2223
2224 vVertexElements[currentVertexElement++] = val;
2225 }
2226 else
2227 {
2228 #if USE_SIMD16_SHADERS
2229 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2230 #else
2231 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2232 #endif
2233 }
2234
2235 if (currentVertexElement > 3)
2236 {
2237 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2238 // reset to the next vVertexElement to output
2239 currentVertexElement = 0;
2240 }
2241 }
2242 }
2243
2244 #endif
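// Illustrative sketch for this listing, not part of the jitter: the scalar
// equivalent of one lane of the 8bpc shuffles above. A 32-bit gather element
// holds the packed x/y/z/w bytes; the requested byte is selected via the
// swizzle, sign or zero extended, and for the normalized formats scaled by
// 1/127 (SNORM) or 1/255 (UNORM). The helper name is a hypothetical addition
// for illustration only.
static inline float Shuffle8bpcScalar(uint32_t packedXYZW, uint32_t swizzle, bool isSigned, bool normalized)
{
    // select the requested component byte within the 32-bit element
    uint8_t byteVal = static_cast<uint8_t>((packedXYZW >> (swizzle * 8)) & 0xFF);
    // sign or zero extend to 32 bits
    int32_t extended = isSigned ? static_cast<int32_t>(static_cast<int8_t>(byteVal))
                                : static_cast<int32_t>(byteVal);
    if (!normalized)
    {
        return static_cast<float>(extended); // USCALED / SSCALED just convert to float
    }
    // UNORM / SNORM divide by the maximum representable magnitude
    return isSigned ? static_cast<float>(extended) * (1.0f / 127.0f)
                    : static_cast<float>(extended) * (1.0f / 255.0f);
}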
2245 //////////////////////////////////////////////////////////////////////////
2246 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
2247 /// denormalizes if needed, converts to F32 if needed, and positions in
2248 /// the proper SIMD rows to be output to the simdvertex structure
2249 /// @param args: (tuple of args, listed below)
2250 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
2251 /// @param pVtxOut - base pointer to output simdvertex struct
2252 /// @param extendType - sign extend or zero extend
2253 /// @param conversionType - conversion to apply (none, normalized, or scaled)
2254 /// @param currentVertexElement - reference to the current vVertexElement
2255 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
2256 /// @param compMask - component packing mask
2257 /// @param compCtrl - component control val
2258 /// @param vVertexElements[4] - vertex components to output
2259 #if USE_SIMD16_GATHERS
2260 void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
2261 {
2262 // Unpack tuple args
2263 Value* (&vGatherResult)[2] = std::get<0>(args);
2264 Value* pVtxOut = std::get<1>(args);
2265 const Instruction::CastOps extendType = std::get<2>(args);
2266 const ConversionType conversionType = std::get<3>(args);
2267 uint32_t &currentVertexElement = std::get<4>(args);
2268 uint32_t &outputElt = std::get<5>(args);
2269 const ComponentEnable compMask = std::get<6>(args);
2270 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2271 Value* (&vVertexElements)[4] = std::get<8>(args);
2272
2273 // cast types
2274 Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2275 Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2276
2277 // have to do extra work for sign extending
2278 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
2279 {
2280 // is this half-precision (FP16) float data?
2281 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2282
2283 Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2284 Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2285
2286 // shuffle mask
2287 Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2288 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
2289 Value *vi128XY_lo = nullptr;
2290 Value *vi128XY_hi = nullptr;
2291 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
2292 {
2293 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now...
2294
2295 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[0], 0);
2296 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[0], 1);
2297
2298 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2299 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2300
2301 // after pshufb: group components together in each 128bit lane
2302 // 256i - 0 1 2 3 4 5 6 7
2303 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2304
2305 vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2306 vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2307
2308 // after PERMD: move and pack xy components into each 128bit lane
2309 // 256i - 0 1 2 3 4 5 6 7
2310 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2311 }
2312
2313 // do the same for zw components
2314 Value *vi128ZW_lo = nullptr;
2315 Value *vi128ZW_hi = nullptr;
2316 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
2317 {
2318 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[1], 0);
2319 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[1], 1);
2320
2321 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2322 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2323
2324 vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2325 vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2326 }
2327
2328 // init denormalize variables if needed
2329 Instruction::CastOps IntToFpCast;
2330 Value *conversionFactor;
2331
2332 switch (conversionType)
2333 {
2334 case CONVERT_NORMALIZED:
2335 IntToFpCast = Instruction::CastOps::SIToFP;
2336 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2337 break;
2338 case CONVERT_SSCALED:
2339 IntToFpCast = Instruction::CastOps::SIToFP;
2340 conversionFactor = VIMMED1((float)(1.0));
2341 break;
2342 case CONVERT_USCALED:
2343 SWR_INVALID("Type should not be sign extended!");
2344 conversionFactor = nullptr;
2345 break;
2346 default:
2347 SWR_ASSERT(conversionType == CONVERT_NONE);
2348 conversionFactor = nullptr;
2349 break;
2350 }
2351
2352 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2353 for (uint32_t i = 0; i < 4; i++)
2354 {
2355 if (isComponentEnabled(compMask, i))
2356 {
2357 if (compCtrl[i] == ComponentControl::StoreSrc)
2358 {
2359 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2360 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2361 // if x or y, use vi128XY permute result, else use vi128ZW
2362 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2363 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2364
2365 if (bFP)
2366 {
2367 // extract 128 bit lanes and convert each half-float component to 32-bit float
2368 Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2369 Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2370
2371 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2372 }
2373 else
2374 {
2375 // extract 128 bit lanes to sign extend each component
2376 Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2377 Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2378
2379 // denormalize if needed
2380 if (conversionType != CONVERT_NONE)
2381 {
2382 temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2383 temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2384 }
2385
2386 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2387 }
2388
2389 currentVertexElement += 1;
2390 }
2391 else
2392 {
2393 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2394 }
2395
2396 if (currentVertexElement > 3)
2397 {
2398 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2399 // reset to the next vVertexElement to output
2400 currentVertexElement = 0;
2401 }
2402 }
2403 }
2404 }
2405 // else zero extend
2406 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2407 {
2408 // pshufb masks for each component
2409 Value *vConstMask[2];
2410
2411 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
2412 {
2413 // x/z shuffle mask
2414 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2415 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2416 }
2417
2418 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
2419 {
2420 // y/w shuffle mask
2421 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2422 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
2423 }
2424
2425 // init denormalize variables if needed
2426 Instruction::CastOps fpCast;
2427 Value* conversionFactor;
2428
2429 switch (conversionType)
2430 {
2431 case CONVERT_NORMALIZED:
2432 fpCast = Instruction::CastOps::UIToFP;
2433 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2434 break;
2435 case CONVERT_USCALED:
2436 fpCast = Instruction::CastOps::UIToFP;
2437 conversionFactor = VIMMED1((float)(1.0f));
2438 break;
2439 case CONVERT_SSCALED:
2440 SWR_INVALID("Type should not be zero extended!");
2441 conversionFactor = nullptr;
2442 break;
2443 default:
2444 SWR_ASSERT(conversionType == CONVERT_NONE);
2445 conversionFactor = nullptr;
2446 break;
2447 }
2448
2449 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2450 for (uint32_t i = 0; i < 4; i++)
2451 {
2452 if (isComponentEnabled(compMask, i))
2453 {
2454 if (compCtrl[i] == ComponentControl::StoreSrc)
2455 {
2456 // select correct constMask for x/z or y/w pshufb
2457 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2458 // if x or y, use vi128XY permute result, else use vi128ZW
2459 uint32_t selectedGather = (i < 2) ? 0 : 1;
2460
2461 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now...
2462
2463 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
2464 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
2465
2466 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2467 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2468
2469 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2470 // 256i - 0 1 2 3 4 5 6 7
2471 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2472
2473 // denormalize if needed
2474 if (conversionType != CONVERT_NONE)
2475 {
2476 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2477 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2478 }
2479
2480 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2481
2482 currentVertexElement += 1;
2483 }
2484 else
2485 {
2486 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2487 }
2488
2489 if (currentVertexElement > 3)
2490 {
2491 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2492 // reset to the next vVertexElement to output
2493 currentVertexElement = 0;
2494 }
2495 }
2496 }
2497 }
2498 else
2499 {
2500 SWR_INVALID("Unsupported conversion type");
2501 }
2502 }
2503
2504 #else
2505 #if USE_SIMD16_SHADERS
2506 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
2507 #else
2508 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
2509 #endif
2510 {
2511 // Unpack tuple args
2512 Value* (&vGatherResult)[2] = std::get<0>(args);
2513 Value* pVtxOut = std::get<1>(args);
2514 const Instruction::CastOps extendType = std::get<2>(args);
2515 const ConversionType conversionType = std::get<3>(args);
2516 uint32_t &currentVertexElement = std::get<4>(args);
2517 uint32_t &outputElt = std::get<5>(args);
2518 const ComponentEnable compMask = std::get<6>(args);
2519 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2520 Value* (&vVertexElements)[4] = std::get<8>(args);
2521
2522 // cast types
2523 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2524 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2525
2526 // have to do extra work for sign extending
2527 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
2528 (extendType == Instruction::CastOps::FPExt))
2529 {
2530 // is this half-precision (FP16) float data?
2531 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2532
2533 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2534 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2535
2536 // shuffle mask
2537 Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2538 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
2539 Value* vi128XY = nullptr;
2540 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) {
2541 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
2542 // after pshufb: group components together in each 128bit lane
2543 // 256i - 0 1 2 3 4 5 6 7
2544 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2545
2546 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2547 // after PERMD: move and pack xy components into each 128bit lane
2548 // 256i - 0 1 2 3 4 5 6 7
2549 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2550 }
2551
2552 // do the same for zw components
2553 Value* vi128ZW = nullptr;
2554 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
2555 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
2556 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2557 }
2558
2559 // init denormalize variables if needed
2560 Instruction::CastOps IntToFpCast;
2561 Value* conversionFactor;
2562
2563 switch (conversionType)
2564 {
2565 case CONVERT_NORMALIZED:
2566 IntToFpCast = Instruction::CastOps::SIToFP;
2567 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2568 break;
2569 case CONVERT_SSCALED:
2570 IntToFpCast = Instruction::CastOps::SIToFP;
2571 conversionFactor = VIMMED1((float)(1.0));
2572 break;
2573 case CONVERT_USCALED:
2574 SWR_INVALID("Type should not be sign extended!");
2575 conversionFactor = nullptr;
2576 break;
2577 default:
2578 SWR_ASSERT(conversionType == CONVERT_NONE);
2579 conversionFactor = nullptr;
2580 break;
2581 }
2582
2583 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2584 for (uint32_t i = 0; i < 4; i++)
2585 {
2586 if (isComponentEnabled(compMask, i))
2587 {
2588 if (compCtrl[i] == ComponentControl::StoreSrc)
2589 {
2590 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2591 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2592 // if x or y, use vi128XY permute result, else use vi128ZW
2593 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2594
2595 if (bFP) {
2596 // extract 128 bit lanes and convert each half-float component to 32-bit float
2597 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2598 }
2599 else {
2600 // extract 128 bit lanes to sign extend each component
2601 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2602
2603 // denormalize if needed
2604 if (conversionType != CONVERT_NONE) {
2605 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2606 }
2607 }
2608 currentVertexElement++;
2609 }
2610 else
2611 {
2612 #if USE_SIMD16_SHADERS
2613 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2614 #else
2615 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2616 #endif
2617 }
2618
2619 if (currentVertexElement > 3)
2620 {
2621 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2622 // reset to the next vVertexElement to output
2623 currentVertexElement = 0;
2624 }
2625 }
2626 }
2627 }
2628 // else zero extend
2629 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2630 {
2631 // pshufb masks for each component
2632 Value* vConstMask[2];
2633 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) {
2634 // x/z shuffle mask
2635 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2636 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2637 }
2638
2639 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) {
2640 // y/w shuffle mask
2641 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2642 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
2643 }
2644
2645 // init denormalize variables if needed
2646 Instruction::CastOps fpCast;
2647 Value* conversionFactor;
2648
2649 switch (conversionType)
2650 {
2651 case CONVERT_NORMALIZED:
2652 fpCast = Instruction::CastOps::UIToFP;
2653 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2654 break;
2655 case CONVERT_USCALED:
2656 fpCast = Instruction::CastOps::UIToFP;
2657 conversionFactor = VIMMED1((float)(1.0f));
2658 break;
2659 case CONVERT_SSCALED:
2660 SWR_INVALID("Type should not be zero extended!");
2661 conversionFactor = nullptr;
2662 break;
2663 default:
2664 SWR_ASSERT(conversionType == CONVERT_NONE);
2665 conversionFactor = nullptr;
2666 break;
2667 }
2668
2669 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2670 for (uint32_t i = 0; i < 4; i++)
2671 {
2672 if (isComponentEnabled(compMask, i))
2673 {
2674 if (compCtrl[i] == ComponentControl::StoreSrc)
2675 {
2676 // select correct constMask for x/z or y/w pshufb
2677 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2678 // if x or y, use vi128XY permute result, else use vi128ZW
2679 uint32_t selectedGather = (i < 2) ? 0 : 1;
2680
2681 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2682 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2683 // 256i - 0 1 2 3 4 5 6 7
2684 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2685
2686 // denormalize if needed
2687 if (conversionType != CONVERT_NONE)
2688 {
2689 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2690 }
2691 currentVertexElement++;
2692 }
2693 else
2694 {
2695 #if USE_SIMD16_SHADERS
2696 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2697 #else
2698 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2699 #endif
2700 }
2701
2702 if (currentVertexElement > 3)
2703 {
2704 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2705 // reset to the next vVertexElement to output
2706 currentVertexElement = 0;
2707 }
2708 }
2709 }
2710 }
2711 else
2712 {
2713 SWR_INVALID("Unsupported conversion type");
2714 }
2715 }
2716
2717 #endif
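// Illustrative sketch for this listing, not part of the jitter: a scalar view
// of the 16bpc conversions above. The first gather supplies packed xy pairs
// and the second supplies zw pairs; each 16-bit value is extended and, for
// the normalized formats, scaled by 1/32767 (SNORM) or 1/65535 (UNORM). FP16
// data instead goes through a half-to-float conversion (CVTPH2PS in the
// vector code), which is not reproduced here. The helper name is a
// hypothetical addition for illustration only.
static inline float Shuffle16bpcScalar(uint32_t packedPair, bool highHalf, bool isSigned, bool normalized)
{
    // select the low (x/z) or high (y/w) 16-bit component of the pair
    uint16_t wordVal = static_cast<uint16_t>(highHalf ? (packedPair >> 16) : (packedPair & 0xFFFF));
    // sign or zero extend to 32 bits
    int32_t extended = isSigned ? static_cast<int32_t>(static_cast<int16_t>(wordVal))
                                : static_cast<int32_t>(wordVal);
    if (!normalized)
    {
        return static_cast<float>(extended); // USCALED / SSCALED just convert to float
    }
    // UNORM / SNORM divide by the maximum representable magnitude
    return isSigned ? static_cast<float>(extended) * (1.0f / 32767.0f)
                    : static_cast<float>(extended) * (1.0f / 65535.0f);
}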
2718 //////////////////////////////////////////////////////////////////////////
2719 /// @brief Output a simdvertex worth of elements to the current outputElt
2720 /// @param pVtxOut - base address of VIN output struct
2721 /// @param outputElt - simdvertex offset in VIN to write to
2722 /// @param numEltsToStore - number of simdvertex rows to write out
2723 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2724 #if USE_SIMD16_GATHERS
2725 void FetchJit::StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2726 {
2727 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2728
2729 for (uint32_t c = 0; c < numEltsToStore; ++c)
2730 {
2731 // STORE expects FP32 x vWidth type, just bitcast if needed
2732 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2733 {
2734 #if FETCH_DUMP_VERTEX
2735 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
2736 #endif
2737 vVertexElements[c] = BITCAST(vVertexElements[c], mSimd16FP32Ty);
2738 }
2739 #if FETCH_DUMP_VERTEX
2740 else
2741 {
2742 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
2743 }
2744 #endif
2745 // outputElt * 4 = offsetting by the size of a simdvertex
2746 // + c offsets to a 32bit x vWidth row within the current vertex
2747 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2748 STORE(vVertexElements[c], dest);
2749 }
2750 }
2751
2752 #else
2753 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2754 {
2755 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2756
2757 for (uint32_t c = 0; c < numEltsToStore; ++c)
2758 {
2759 // STORE expects FP32 x vWidth type, just bitcast if needed
2760 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2761 {
2762 #if FETCH_DUMP_VERTEX
2763 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
2764 #endif
2765 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2766 }
2767 #if FETCH_DUMP_VERTEX
2768 else
2769 {
2770 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
2771 }
2772 #endif
2773 // outputElt * 4 = offsetting by the size of a simdvertex
2774 // + c offsets to a 32bit x vWidth row within the current vertex
2775 #if USE_SIMD16_SHADERS
2776 Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
2777 #else
2778 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2779 #endif
2780 STORE(vVertexElements[c], dest);
2781 }
2782 }
2783
2784 #endif
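// Illustrative sketch for this listing, not part of the jitter: the row
// addressing used by the stores above. A simdvertex is 4 rows of vWidth
// floats, so component c of output slot outputElt lands at row
// (outputElt * 4 + c); when SIMD16 shaders are fed by SIMD8 gathers each
// logical row spans two SIMD8 rows, giving (outputElt * 8 + c * 2). The
// helper name is a hypothetical addition for illustration only.
static inline uint32_t VertexElementRow(uint32_t outputElt, uint32_t c, bool simd16ShaderOnSimd8Gather)
{
    return simd16ShaderOnSimd8Gather ? (outputElt * 8 + c * 2)  // two SIMD8 rows per SIMD16 row
                                     : (outputElt * 4 + c);     // one row per component
}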
2785 //////////////////////////////////////////////////////////////////////////
2786 /// @brief Generates a constant vector of values based on the
2787 /// ComponentControl value
2788 /// @param ctrl - ComponentControl value
2789 #if USE_SIMD16_GATHERS
2790 Value *FetchJit::GenerateCompCtrlVector16(const ComponentControl ctrl)
2791 {
2792 switch (ctrl)
2793 {
2794 case NoStore:
2795 return VUNDEF_I_16();
2796 case Store0:
2797 return VIMMED1_16(0);
2798 case Store1Fp:
2799 return VIMMED1_16(1.0f);
2800 case Store1Int:
2801 return VIMMED1_16(1);
2802 case StoreVertexId:
2803 {
2804 Value *pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2805 Value *pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
2806
2807 Value *pId = JOIN_16(pId_lo, pId_hi);
2808
2809 return pId;
2810 }
2811 case StoreInstanceId:
2812 {
2813 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
2814 return VBROADCAST_16(pId);
2815 }
2816
2817
2818 case StoreSrc:
2819 default:
2820 SWR_INVALID("Invalid component control");
2821 return VUNDEF_I_16();
2822 }
2823 }
2824
2825 #else
2826 #if USE_SIMD16_SHADERS
2827 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
2828 #else
2829 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
2830 #endif
2831 {
2832 switch (ctrl)
2833 {
2834 case NoStore:
2835 return VUNDEF_I();
2836 case Store0:
2837 return VIMMED1(0);
2838 case Store1Fp:
2839 return VIMMED1(1.0f);
2840 case Store1Int:
2841 return VIMMED1(1);
2842 case StoreVertexId:
2843 {
2844 #if USE_SIMD16_SHADERS
2845 Value *pId;
2846 if (useVertexID2)
2847 {
2848 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
2849 }
2850 else
2851 {
2852 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2853 }
2854 #else
2855 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2856 #endif
2857 return pId;
2858 }
2859 case StoreInstanceId:
2860 {
2861 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
2862 return VBROADCAST(pId);
2863 }
2864
2865
2866 case StoreSrc:
2867 default:
2868 SWR_INVALID("Invalid component control");
2869 return VUNDEF_I();
2870 }
2871 }
2872
2873 #endif
2874 //////////////////////////////////////////////////////////////////////////
2875 /// @brief Returns the enable mask for the specified component.
2876 /// @param enableMask - enable bits
2877 /// @param component - component to check if enabled.
2878 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2879 {
2880 switch (component)
2881 {
2882 // X
2883 case 0: return (enableMask & ComponentEnable::X);
2884 // Y
2885 case 1: return (enableMask & ComponentEnable::Y);
2886 // Z
2887 case 2: return (enableMask & ComponentEnable::Z);
2888 // W
2889 case 3: return (enableMask & ComponentEnable::W);
2890
2891 default: return false;
2892 }
2893 }
2894
2895 // Don't allow two threads to compile the same fetch shader simultaneously;
2896 // concurrent compiles have problems in the JIT cache implementation.
2897 // This is only a problem for fetch right now.
2898 static std::mutex gFetchCodegenMutex;
2899
2900 //////////////////////////////////////////////////////////////////////////
2901 /// @brief JITs from fetch shader IR
2902 /// @param hJitMgr - JitManager handle
2903 /// @param func - LLVM function IR
2904 /// @return PFN_FETCH_FUNC - pointer to fetch code
2905 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2906 {
2907 const llvm::Function* func = (const llvm::Function*)hFunc;
2908 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2909 PFN_FETCH_FUNC pfnFetch;
2910
2911 gFetchCodegenMutex.lock();
2912 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2913 // MCJIT finalizes modules the first time you JIT code from them. Once a module is finalized, no new IR can be added to it
2914 pJitMgr->mIsModuleFinalized = true;
2915
2916 #if defined(KNOB_SWRC_TRACING)
2917 char fName[1024];
2918 const char *funcName = func->getName().data();
2919 sprintf(fName, "%s.bin", funcName);
2920 FILE *fd = fopen(fName, "wb");
2921 fwrite((void *)pfnFetch, 1, 2048, fd);
2922 fclose(fd);
2923 #endif
2924
2925 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2926 gFetchCodegenMutex.unlock();
2927
2928
2929
2930 return pfnFetch;
2931 }
2932
2933 //////////////////////////////////////////////////////////////////////////
2934 /// @brief JIT compiles fetch shader
2935 /// @param hJitMgr - JitManager handle
2936 /// @param state - fetch state to build function from
2937 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2938 {
2939 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2940
2941 pJitMgr->SetupNewModule();
2942
2943 FetchJit theJit(pJitMgr);
2944 HANDLE hFunc = theJit.Create(state);
2945
2946 return JitFetchFunc(hJitMgr, hFunc);
2947 }
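// Usage sketch (hedged, not part of the driver source): a caller would
// typically invoke JitCompileFetch once per unique FETCH_COMPILE_STATE and
// cache the returned PFN_FETCH_FUNC for reuse across draws; the exact
// function pointer signature is declared in fetch_jit.h.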