1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "jit_api.h"
33 #include "fetch_jit.h"
34 #include "gen_state_llvm.h"
35
36 //#define FETCH_DUMP_VERTEX 1
37 using namespace llvm;
38 using namespace SwrJit;
39
40 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
41
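// How gathered integer data is converted to float later in this file (a brief note on the
// enumerators below):
//   CONVERT_NORMALIZED - UNORM/SNORM data scaled to [0,1] / [-1,1]
//   CONVERT_USCALED    - unsigned integer converted directly to float (e.g. 200 -> 200.0f)
//   CONVERT_SSCALED    - signed integer converted directly to float
//   CONVERT_SFIXED     - 16.16 fixed point, scaled by 1/65536 (e.g. 0x00018000 -> 1.5f)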
42 enum ConversionType
43 {
44 CONVERT_NONE,
45 CONVERT_NORMALIZED,
46 CONVERT_USCALED,
47 CONVERT_SSCALED,
48 CONVERT_SFIXED,
49 };
50
51 #if USE_SIMD16_SHADERS
52 #define USE_SIMD16_GATHERS 0
53 #endif
54
55 //////////////////////////////////////////////////////////////////////////
56 /// Interface to Jitting a fetch shader
57 //////////////////////////////////////////////////////////////////////////
58 struct FetchJit :
59 public Builder
60 {
61 FetchJit(JitManager* pJitMgr) :
62 Builder(pJitMgr)
63 {}
64
65 Function* Create(const FETCH_COMPILE_STATE& fetchState);
66
67 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
68 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
69 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
70
71 // package up Shuffle*bpcGatherd args into a tuple for convenience
72 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
73 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
74 const uint32_t(&)[4]> Shuffle8bpcArgs;
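// Shuffle8bpcArgs fields, in order: gathered dword result, pVtxOut, extend cast op, conversion
// type, currentVertexElement (in/out), outputElt (in/out), component enable mask, component
// controls, output vertex element array, and the format's component swizzle.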
75
76 #if USE_SIMD16_SHADERS
77 #if USE_SIMD16_GATHERS
78 void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args);
79 #else
80 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
81 #endif
82 #else
83 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
84 #endif
85
86 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
87 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
88
89 #if USE_SIMD16_SHADERS
90 #if USE_SIMD16_GATHERS
91 void Shuffle16bpcGather16(Shuffle16bpcArgs &args);
92 #else
93 void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
94 #endif
95 #else
96 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
97 #endif
98
99 #if USE_SIMD16_GATHERS
100 void StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
101 #else
102 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
103 #endif
104
105 #if USE_SIMD16_SHADERS
106 #if USE_SIMD16_GATHERS
107 Value *GenerateCompCtrlVector16(const ComponentControl ctrl);
108 #else
109 Value *GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
110 #endif
111 #else
112 Value *GenerateCompCtrlVector(const ComponentControl ctrl);
113 #endif
114
115 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
116
117 #if USE_SIMD16_SHADERS
118 #if USE_SIMD16_GATHERS
119 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
120 #else
121 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
122 #endif
123 #else
124 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
125 #endif
126
127 bool IsOddFormat(SWR_FORMAT format);
128 bool IsUniformFormat(SWR_FORMAT format);
129 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
130 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
131 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
132
133 Value* mpFetchInfo;
134 };
135
136 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
137 {
138 std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
139 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
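// The function name is "FCH_" followed by a CRC of the fetch compile state, so identical
// fetch states produce the same symbol name (e.g. "FCH_305419896" -- value hypothetical).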
140
141 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
142 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
143
144 fetch->getParent()->setModuleIdentifier(fetch->getName());
145
146 IRB()->SetInsertPoint(entry);
147
148 auto argitr = fetch->arg_begin();
149
150 // Fetch shader arguments
151 Value* privateContext = &*argitr; ++argitr;
152 privateContext->setName("privateContext");
153 SetPrivateContext(privateContext);
154
155 mpFetchInfo = &*argitr; ++argitr;
156 mpFetchInfo->setName("fetchInfo");
157 Value* pVtxOut = &*argitr;
158 pVtxOut->setName("vtxOutput");
159 // This is just shorthand to tell LLVM to get a pointer to the base address of the simdvertex:
160 // index 0 is just the pointer to the simdvertex structure,
161 // index 1 is which element of the simdvertex structure to offset to (in this case 0),
162 // so the indices being i32's doesn't matter.
163 // TODO: generate this GEP with a VECTOR structure type so this makes sense.
164 std::vector<Value*> vtxInputIndices(2, C(0));
165 // GEP
166 pVtxOut = GEP(pVtxOut, C(0));
167 #if USE_SIMD16_SHADERS
168 #if 0// USE_SIMD16_BUILDER
169 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
170 #else
171 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
172 #endif
173 #else
174 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
175 #endif
176
177 // SWR_FETCH_CONTEXT::pStreams
178 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
179 streams->setName("pStreams");
180
181 // SWR_FETCH_CONTEXT::pIndices
182 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
183 indices->setName("pIndices");
184
185 // SWR_FETCH_CONTEXT::pLastIndex
186 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
187 pLastIndex->setName("pLastIndex");
188
189
190 Value* vIndices;
191 #if USE_SIMD16_SHADERS
192 Value* indices2;
193 Value* vIndices2;
194 #endif
195 switch(fetchState.indexType)
196 {
197 case R8_UINT:
198 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
199 #if USE_SIMD16_SHADERS
200 indices2 = GEP(indices, C(8));
201 #endif
202 if(fetchState.bDisableIndexOOBCheck)
203 {
204 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
205 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
206 #if USE_SIMD16_SHADERS
207 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
208 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
209 #endif
210 }
211 else
212 {
213 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
214 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
215 #if USE_SIMD16_SHADERS
216 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
217 vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
218 #endif
219 }
220 break;
221 case R16_UINT:
222 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
223 #if USE_SIMD16_SHADERS
224 indices2 = GEP(indices, C(8));
225 #endif
226 if(fetchState.bDisableIndexOOBCheck)
227 {
228 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
229 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
230 #if USE_SIMD16_SHADERS
231 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
232 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
233 #endif
234 }
235 else
236 {
237 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
238 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
239 #if USE_SIMD16_SHADERS
240 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
241 vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
242 #endif
243 }
244 break;
245 case R32_UINT:
246 #if USE_SIMD16_SHADERS
247 indices2 = GEP(indices, C(8));
248 #endif
249 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
250 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
251 #if USE_SIMD16_SHADERS
252 (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
253 : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
254 #endif
255 break; // incoming type is already 32bit int
256 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
257 }
258
259 if(fetchState.bForceSequentialAccessEnable)
260 {
261 Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
262
263 // VertexData buffers are accessed sequentially, the index is equal to the vertex number
264 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
265 vIndices = ADD(vIndices, pOffsets);
266 #if USE_SIMD16_SHADERS
267 vIndices2 = ADD(vIndices, VIMMED1(8));
268 #endif
269 }
270
271 Value* vVertexId = vIndices;
272 #if USE_SIMD16_SHADERS
273 Value* vVertexId2 = vIndices2;
274 #endif
275 if (fetchState.bVertexIDOffsetEnable)
276 {
277 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
278 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
279 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
280 vVertexId = ADD(vIndices, vBaseVertex);
281 vVertexId = ADD(vVertexId, vStartVertex);
282 #if USE_SIMD16_SHADERS
283 vVertexId2 = ADD(vIndices2, vBaseVertex);
284 vVertexId2 = ADD(vVertexId2, vStartVertex);
285 #endif
286 }
287
288 // store out vertex IDs
289 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
290 #if USE_SIMD16_SHADERS
291 STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
292 #endif
293
294 // store out cut mask if enabled
295 if (fetchState.bEnableCutIndex)
296 {
297 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
298 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
299 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
300 #if USE_SIMD16_SHADERS
301 Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
302 STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
303 #endif
304 }
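// Example: with cutIndex == 0xFFFF and vIndices == { 0, 1, 0xFFFF, 3, ... }, the compare above
// is true only in lane 2, so only that lane of the stored cut mask is set.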
305
306 // Fetch attributes from memory and output to a simdvertex struct
307 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
308 #if USE_SIMD16_SHADERS
309 if (fetchState.bDisableVGATHER)
310 {
311 JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
312 JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
313 }
314 else
315 {
316 #if USE_SIMD16_GATHERS
317 JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false);
318 #else
319 JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
320 JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
321 #endif
322 }
323 #else
324 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
325 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
326 #endif
327
328 RET_VOID();
329
330 JitManager::DumpToFile(fetch, "src");
331
332 #if defined(_DEBUG)
333 verifyFunction(*fetch);
334 #endif
335
336 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
337
338 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
339 setupPasses.add(createBreakCriticalEdgesPass());
340 setupPasses.add(createCFGSimplificationPass());
341 setupPasses.add(createEarlyCSEPass());
342 setupPasses.add(createPromoteMemoryToRegisterPass());
343
344 setupPasses.run(*fetch);
345
346 JitManager::DumpToFile(fetch, "se");
347
348 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
349
350 ///@todo Haven't touched these either. Need to remove some of these and add others.
351 optPasses.add(createCFGSimplificationPass());
352 optPasses.add(createEarlyCSEPass());
353 optPasses.add(createInstructionCombiningPass());
354 optPasses.add(createInstructionSimplifierPass());
355 optPasses.add(createConstantPropagationPass());
356 optPasses.add(createSCCPPass());
357 optPasses.add(createAggressiveDCEPass());
358
359 optPasses.run(*fetch);
360 optPasses.run(*fetch);
361
362 JitManager::DumpToFile(fetch, "opt");
363
364
365 return fetch;
366 }
367
368 //////////////////////////////////////////////////////////////////////////
369 /// @brief Loads attributes from memory using LOADs, shuffling the
370 /// components into SOA form.
371 /// *Note* currently does not support component control,
372 /// component packing, instancing
373 /// @param fetchState - info about attributes to be fetched from memory
374 /// @param streams - value pointer to the current vertex stream
375 /// @param vIndices - vector value of indices to load
376 /// @param pVtxOut - value pointer to output simdvertex struct
377 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
378 {
379 // Zack shuffles; a variant of the Charleston.
380
381 std::vector<Value*> vectors(16);
382 std::vector<Constant*> pMask(mVWidth);
383 for(uint32_t i = 0; i < mVWidth; ++i)
384 {
385 pMask[i] = (C(i < 4 ? i : 4));
386 }
387 Constant* promoteMask = ConstantVector::get(pMask);
388 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
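// For mVWidth == 8 (AVX), promoteMask is <0,1,2,3,4,4,4,4>: the shuffle later widens each
// 4-component attribute load to the full SIMD width, with the upper lanes taken from the
// undef vector.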
389
390 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
391 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
392 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
393 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
394 curInstance->setName("curInstance");
395
396 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
397 {
398 Value* elements[4] = {0};
399 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
400 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
401 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
402 uint32_t numComponents = info.numComps;
403 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
404
405 // load path doesn't support component packing
406 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
407
408 vectors.clear();
409
410 if (fetchState.bInstanceIDOffsetEnable)
411 {
412 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down");
413 }
414
415 Value *vCurIndices;
416 Value *startOffset;
417 if(ied.InstanceEnable)
418 {
419 Value* stepRate = C(ied.InstanceAdvancementState);
420
421 // prevent a div by 0 for 0 step rate
422 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
423 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
424
425 // calc the current offset into instanced data buffer
426 Value* calcInstance = UDIV(curInstance, stepRate);
427
428 // if step rate is 0, every instance gets instance 0
429 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
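// e.g. with a step rate of 3, instances 0..2 read instanced element 0, instances 3..5 read
// element 1, and so on.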
430
431 vCurIndices = VBROADCAST(calcInstance);
432
433 startOffset = startInstance;
434 }
435 else if (ied.InstanceStrideEnable)
436 {
437 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
438 }
439 else
440 {
441 // offset indices by baseVertex
442 vCurIndices = ADD(vIndices, vBaseVertex);
443
444 startOffset = startVertex;
445 }
446
447 // load SWR_VERTEX_BUFFER_STATE::pData
448 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
449
450 // load SWR_VERTEX_BUFFER_STATE::pitch
451 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
452 stride = Z_EXT(stride, mInt64Ty);
453
454 // load SWR_VERTEX_BUFFER_STATE::size
455 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
456 size = Z_EXT(size, mInt64Ty);
457
458 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
459
460 Value *minVertex = NULL;
461 Value *minVertexOffset = NULL;
462 if (fetchState.bPartialVertexBuffer) {
463 // fetch min index for low bounds checking
464 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
465 minVertex = LOAD(minVertex);
466 if (!fetchState.bDisableIndexOOBCheck) {
467 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
468 }
469 }
470
471 // Load from the stream.
472 for(uint32_t lane = 0; lane < mVWidth; ++lane)
473 {
474 // Get index
475 Value* index = VEXTRACT(vCurIndices, C(lane));
476
477 if (fetchState.bPartialVertexBuffer) {
478 // clamp below minvertex
479 Value *isBelowMin = ICMP_SLT(index, minVertex);
480 index = SELECT(isBelowMin, minVertex, index);
481 }
482
483 index = Z_EXT(index, mInt64Ty);
484
485 Value* offset = MUL(index, stride);
486 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
487 offset = ADD(offset, startVertexOffset);
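// e.g. index 5, stride 32, AlignedByteOffset 12 -> 5 * 32 + 12 = 172 bytes, plus
// startVertexOffset, from the stream base.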
488
489 if (!fetchState.bDisableIndexOOBCheck) {
490 // check for out of bound access, including partial OOB, and replace them with minVertex
491 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
492 Value *oob = ICMP_ULE(endOffset, size);
493 if (fetchState.bPartialVertexBuffer) {
494 offset = SELECT(oob, offset, minVertexOffset);
495 } else {
496 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
497 }
498 }
499
500 Value* pointer = GEP(stream, offset);
501 // We load a full 4-component vector per lane even if fewer components are needed; the extras are ignored.
502 Value* vptr = 0;
503
504 // get a pointer to a 4 component attrib in default address space
505 switch(bpc)
506 {
507 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
508 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
509 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
510 default: SWR_INVALID("Unsupported underlying bpp!");
511 }
512
513 // load 4 components of attribute
514 Value* vec = ALIGNED_LOAD(vptr, 1, false);
515
516 // Convert To FP32 internally
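// e.g. 8-bit UNORM 255 -> 255 * (1/255) = 1.0f; 8-bit SNORM uses a 1/128 scale here
// (so -128 -> -1.0f); UINT/SINT are zero/sign extended to 32 bits and bitcast, keeping
// their integer bit patterns.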
517 switch(info.type[0])
518 {
519 case SWR_TYPE_UNORM:
520 switch(bpc)
521 {
522 case 8:
523 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
524 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
525 break;
526 case 16:
527 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
528 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
529 break;
530 default:
531 SWR_INVALID("Unsupported underlying type!");
532 break;
533 }
534 break;
535 case SWR_TYPE_SNORM:
536 switch(bpc)
537 {
538 case 8:
539 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
540 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
541 break;
542 case 16:
543 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
544 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
545 break;
546 default:
547 SWR_INVALID("Unsupported underlying type!");
548 break;
549 }
550 break;
551 case SWR_TYPE_UINT:
552 // Zero extend uint32_t types.
553 switch(bpc)
554 {
555 case 8:
556 case 16:
557 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
558 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
559 break;
560 case 32:
561 break; // Pass through unchanged.
562 default:
563 SWR_INVALID("Unsupported underlying type!");
564 break;
565 }
566 break;
567 case SWR_TYPE_SINT:
568 // Sign extend SINT types.
569 switch(bpc)
570 {
571 case 8:
572 case 16:
573 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
574 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
575 break;
576 case 32:
577 break; // Pass through unchanged.
578 default:
579 SWR_INVALID("Unsupported underlying type!");
580 break;
581 }
582 break;
583 case SWR_TYPE_FLOAT:
584 switch(bpc)
585 {
586 case 32:
587 break; // Pass through unchanged.
588 default:
589 SWR_INVALID("Unsupported underlying type!");
590 }
591 break;
592 case SWR_TYPE_USCALED:
593 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
594 break;
595 case SWR_TYPE_SSCALED:
596 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
597 break;
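// SFIXED is 16.16 fixed point: e.g. a raw value of 0x00018000 (98304) scales to
// 98304 / 65536 = 1.5f.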
598 case SWR_TYPE_SFIXED:
599 vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
600 break;
601 case SWR_TYPE_UNKNOWN:
602 case SWR_TYPE_UNUSED:
603 SWR_INVALID("Unsupported type %d!", info.type[0]);
604 }
605
606 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
607 // uwvec: 4 x F32, undef value
608 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
609 vectors.push_back(wvec);
610 }
611
612 std::vector<Constant*> v01Mask(mVWidth);
613 std::vector<Constant*> v23Mask(mVWidth);
614 std::vector<Constant*> v02Mask(mVWidth);
615 std::vector<Constant*> v13Mask(mVWidth);
616
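// The masks below implement a 4x4 transpose per group of four lanes, converting the per-lane
// xyzw (AOS) vectors loaded above into per-component (SOA) simd registers.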
617 // Concatenate the vectors together.
618 elements[0] = VUNDEF_F();
619 elements[1] = VUNDEF_F();
620 elements[2] = VUNDEF_F();
621 elements[3] = VUNDEF_F();
622 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
623 {
624 v01Mask[4 * b + 0] = C(0 + 4 * b);
625 v01Mask[4 * b + 1] = C(1 + 4 * b);
626 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
627 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
628
629 v23Mask[4 * b + 0] = C(2 + 4 * b);
630 v23Mask[4 * b + 1] = C(3 + 4 * b);
631 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
632 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
633
634 v02Mask[4 * b + 0] = C(0 + 4 * b);
635 v02Mask[4 * b + 1] = C(2 + 4 * b);
636 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
637 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
638
639 v13Mask[4 * b + 0] = C(1 + 4 * b);
640 v13Mask[4 * b + 1] = C(3 + 4 * b);
641 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
642 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
643
644 std::vector<Constant*> iMask(mVWidth);
645 for(uint32_t i = 0; i < mVWidth; ++i)
646 {
647 if(((4 * b) <= i) && (i < (4 * (b + 1))))
648 {
649 iMask[i] = C(i % 4 + mVWidth);
650 }
651 else
652 {
653 iMask[i] = C(i);
654 }
655 }
656 Constant* insertMask = ConstantVector::get(iMask);
657 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
658 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
659 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
660 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
661 }
662
663 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
664 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
665 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
666 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
667 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
668 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
669 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
670 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
671
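// Default any components the format doesn't supply; the switch below relies on fallthrough,
// e.g. a 2-component format enters at case 3, zeroing z and then setting w to 1.0f.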
672 switch(numComponents + 1)
673 {
674 case 1: elements[0] = VIMMED1(0.0f);
675 case 2: elements[1] = VIMMED1(0.0f);
676 case 3: elements[2] = VIMMED1(0.0f);
677 case 4: elements[3] = VIMMED1(1.0f);
678 }
679
680 for(uint32_t c = 0; c < 4; ++c)
681 {
682 #if USE_SIMD16_SHADERS
683 Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
684 #else
685 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
686 #endif
687 STORE(elements[c], dest);
688 }
689 }
690 }
691
692 // returns true for odd formats that require special gather handling
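// e.g. R10G10B10A2_UNORM (10-bit components) is odd; R8G8B8A8_UNORM is not.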
693 bool FetchJit::IsOddFormat(SWR_FORMAT format)
694 {
695 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
696 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
697 {
698 return true;
699 }
700 return false;
701 }
702
703 // format is uniform if all components are the same size and type
704 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
705 {
706 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
707 uint32_t bpc0 = info.bpc[0];
708 uint32_t type0 = info.type[0];
709
710 for (uint32_t c = 1; c < info.numComps; ++c)
711 {
712 if (bpc0 != info.bpc[c] || type0 != info.type[c])
713 {
714 return false;
715 }
716 }
717 return true;
718 }
719
720 // unpacks components based on format
721 // foreach component in the pixel
722 // mask off everything but this component
723 // shift component to LSB
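// e.g. for a packed 10-10-10-2 format the loop below extracts (in bit order, before swizzle
// placement): v & 0x3FF, (v & 0xFFC00) >> 10, (v & 0x3FF00000) >> 20, (v & 0xC0000000) >> 30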
724 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
725 {
726 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
727
728 uint32_t bitOffset = 0;
729 for (uint32_t c = 0; c < info.numComps; ++c)
730 {
731 uint32_t swizzledIndex = info.swizzle[c];
732 uint32_t compBits = info.bpc[c];
733 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
734 Value* comp = AND(vInput, bitmask);
735 comp = LSHR(comp, bitOffset);
736
737 result[swizzledIndex] = comp;
738 bitOffset += compBits;
739 }
740 }
741
742 // gather for odd component size formats
743 // gather SIMD full pixels per lane, then shift/mask to move each component into its
744 // own vector
745 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
746 {
747 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
748
749 // only works if pixel size is <= 32bits
750 SWR_ASSERT(info.bpp <= 32);
751
752 Value *pGather;
753 if (info.bpp == 32)
754 {
755 pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
756 }
757 else
758 {
759 // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
760 Value *pMem = ALLOCA(mSimdInt32Ty);
761 STORE(VIMMED1(0u), pMem);
762
763 pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
764 Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
765
766 for (uint32_t lane = 0; lane < mVWidth; ++lane)
767 {
768 // Get index
769 Value* index = VEXTRACT(pOffsets, C(lane));
770 Value* mask = VEXTRACT(pMask, C(lane));
771 switch (info.bpp)
772 {
773 case 8:
774 {
775 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
776 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
777 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
778 break;
779 }
780
781 case 16:
782 {
783 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
784 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
785 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
786 break;
787 }
788 break;
789
790 case 24:
791 {
792 // First 16-bits of data
793 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
794 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
795 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
796
797 // Last 8-bits of data
798 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
799 pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
800 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
801 break;
802 }
803
804 default:
805 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
806 break;
807 }
808 }
809
810 pGather = LOAD(pMem);
811 }
812
813 for (uint32_t comp = 0; comp < 4; ++comp)
814 {
815 pResult[comp] = VIMMED1((int)info.defaults[comp]);
816 }
817
818 UnpackComponents(format, pGather, pResult);
819
820 // cast to fp32
821 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
822 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
823 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
824 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
825 }
826
827 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
828 {
829 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
830
831 for (uint32_t c = 0; c < info.numComps; ++c)
832 {
833 uint32_t compIndex = info.swizzle[c];
834
835 // skip any conversion on UNUSED components
836 if (info.type[c] == SWR_TYPE_UNUSED)
837 {
838 continue;
839 }
840
841 if (info.isNormalized[c])
842 {
843 if (info.type[c] == SWR_TYPE_SNORM)
844 {
845 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
846
847 /// result = c * (1.0f / (2^(n-1) - 1);
848 uint32_t n = info.bpc[c];
849 uint32_t pow2 = 1 << (n - 1);
850 float scale = 1.0f / (float)(pow2 - 1);
851 Value *vScale = VIMMED1(scale);
852 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
853 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
854 texels[compIndex] = FMUL(texels[compIndex], vScale);
855 }
856 else
857 {
858 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
859
860 /// result = c * (1.0f / (2^n - 1))
861 uint32_t n = info.bpc[c];
862 uint32_t pow2 = 1 << n;
863 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
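// (scale = 2^24 - 1 = 16777215; multiplying by its fp32 reciprocal is not precise enough,
// hence the FDIV below)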
864 if (n == 24)
865 {
866 float scale = (float)(pow2 - 1);
867 Value* vScale = VIMMED1(scale);
868 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
869 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
870 texels[compIndex] = FDIV(texels[compIndex], vScale);
871 }
872 else
873 {
874 float scale = 1.0f / (float)(pow2 - 1);
875 Value *vScale = VIMMED1(scale);
876 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
877 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
878 texels[compIndex] = FMUL(texels[compIndex], vScale);
879 }
880 }
881 continue;
882 }
883 }
884 }
885
886 //////////////////////////////////////////////////////////////////////////
887 /// @brief Loads attributes from memory using AVX2 GATHER(s)
888 /// @param fetchState - info about attributes to be fetched from memory
889 /// @param streams - value pointer to the current vertex stream
890 /// @param vIndices - vector value of indices to gather
891 /// @param pVtxOut - value pointer to output simdvertex struct
892 #if USE_SIMD16_SHADERS
893 #if USE_SIMD16_GATHERS
894 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
895 Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2)
896 #else
897 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
898 Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
899 #endif
900 #else
901 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
902 Value* streams, Value* vIndices, Value* pVtxOut)
903 #endif
904 {
905 uint32_t currentVertexElement = 0;
906 uint32_t outputElt = 0;
907 Value* vVertexElements[4];
908 #if USE_SIMD16_GATHERS
909 Value *pVtxSrc2[4];
910 #endif
911
912 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
913 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
914 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
915 #if USE_SIMD16_GATHERS
916 Value* vBaseVertex16 = VBROADCAST_16(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
917 #else
918 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
919 #endif
920 curInstance->setName("curInstance");
921
922 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
923 {
924 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
925
926 // skip element if all components are disabled
927 if (ied.ComponentPacking == ComponentEnable::NONE)
928 {
929 continue;
930 }
931
932 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
933 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
934 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
935
936 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
937
938 // VGATHER* takes an *i8 src pointer
939 Value *pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));
940
941 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
942 #if USE_SIMD16_GATHERS
943 Value *vStride16 = VBROADCAST_16(stride);
944 #else
945 Value *vStride = VBROADCAST(stride);
946 #endif
947
948 // max vertex index that is fully in bounds
949 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
950 maxVertex = LOAD(maxVertex);
951
952 Value *minVertex = NULL;
953 if (fetchState.bPartialVertexBuffer)
954 {
955 // min vertex index for low bounds OOB checking
956 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
957 minVertex = LOAD(minVertex);
958 }
959
960 if (fetchState.bInstanceIDOffsetEnable)
961 {
962 // the InstanceID (curInstance) value is offset by StartInstanceLocation
963 curInstance = ADD(curInstance, startInstance);
964 }
965
966 #if USE_SIMD16_GATHERS
967 Value *vCurIndices16;
968 #else
969 Value *vCurIndices;
970 #endif
971 Value *startOffset;
972 #if USE_SIMD16_GATHERS
973 Value *vInstanceStride16 = VIMMED1_16(0);
974 #else
975 Value *vInstanceStride = VIMMED1(0);
976 #endif
977
978 if (ied.InstanceEnable)
979 {
980 Value* stepRate = C(ied.InstanceAdvancementState);
981
982 // prevent a div by 0 for 0 step rate
983 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
984 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
985
986 // calc the current offset into instanced data buffer
987 Value* calcInstance = UDIV(curInstance, stepRate);
988
989 // if step rate is 0, every instance gets instance 0
990 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
991
992 #if USE_SIMD16_GATHERS
993 vCurIndices16 = VBROADCAST_16(calcInstance);
994 #else
995 vCurIndices = VBROADCAST(calcInstance);
996 #endif
997
998 startOffset = startInstance;
999 }
1000 else if (ied.InstanceStrideEnable)
1001 {
1002 // grab the instance advancement state, determines stride in bytes from one instance to the next
1003 Value* stepRate = C(ied.InstanceAdvancementState);
1004 #if USE_SIMD16_GATHERS
1005 vInstanceStride16 = VBROADCAST_16(MUL(curInstance, stepRate));
1006 #else
1007 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
1008 #endif
1009
1010 // offset indices by baseVertex
1011 #if USE_SIMD16_GATHERS
1012 Value *vIndices16 = JOIN_16(vIndices, vIndices2);
1013
1014 vCurIndices16 = ADD(vIndices16, vBaseVertex16);
1015 #else
1016 vCurIndices = ADD(vIndices, vBaseVertex);
1017 #endif
1018
1019 startOffset = startVertex;
1020 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
1021 }
1022 else
1023 {
1024 // offset indices by baseVertex
1025 #if USE_SIMD16_GATHERS
1026 Value *vIndices16 = JOIN_16(vIndices, vIndices2);
1027
1028 vCurIndices16 = ADD(vIndices16, vBaseVertex16);
1029 #else
1030 vCurIndices = ADD(vIndices, vBaseVertex);
1031 #endif
1032
1033 startOffset = startVertex;
1034 }
1035
1036 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
1037 // do 64bit address offset calculations.
1038
1039 // calculate byte offset to the start of the VB
1040 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
1041 pStreamBase = GEP(pStreamBase, baseOffset);
1042 Value* pStreamBaseGFX = ADD(stream, baseOffset);
1043
1044 // if we have a start offset, subtract from max vertex. Used for OOB check
1045 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
1046 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
1047 // if we have a negative value, we're already OOB. clamp at 0.
1048 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
1049
1050 if (fetchState.bPartialVertexBuffer)
1051 {
1052 // similarly for min vertex
1053 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
1054 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
1055 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
1056 }
1057
1058 // Load the in bounds size of a partially valid vertex
1059 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
1060 partialInboundsSize = LOAD(partialInboundsSize);
1061 #if USE_SIMD16_GATHERS
1062 Value *vPartialVertexSize = VBROADCAST_16(partialInboundsSize);
1063 Value *vBpp = VBROADCAST_16(C(info.Bpp));
1064 Value *vAlignmentOffsets = VBROADCAST_16(C(ied.AlignedByteOffset));
1065 #else
1066 Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
1067 Value *vBpp = VBROADCAST(C(info.Bpp));
1068 Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
1069 #endif
1070
1071 // is the element <= the partially valid size?
1072 Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
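// e.g. an element with Bpp = 4 at AlignedByteOffset 8 is usable when partialInboundsSize
// is 12 (4 <= 12 - 8), but not when it is 10 (4 > 10 - 8).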
1073
1074 #if USE_SIMD16_GATHERS
1075 // override cur indices with 0 if pitch is 0
1076 Value *pZeroPitchMask16 = ICMP_EQ(vStride16, VIMMED1_16(0));
1077 vCurIndices16 = SELECT(pZeroPitchMask16, VIMMED1_16(0), vCurIndices16);
1078
1079 // are vertices partially OOB?
1080 Value *vMaxVertex16 = VBROADCAST_16(maxVertex);
1081 Value *vPartialOOBMask = ICMP_EQ(vCurIndices16, vMaxVertex16);
1082
1083 // are vertices fully in bounds?
1084 Value *vMaxGatherMask16 = ICMP_ULT(vCurIndices16, vMaxVertex16);
1085
1086 Value *vGatherMask16;
1087
1088 if (fetchState.bPartialVertexBuffer)
1089 {
1090 // are vertices below minVertex limit?
1091 Value *vMinVertex16 = VBROADCAST_16(minVertex);
1092 Value *vMinGatherMask16 = ICMP_UGE(vCurIndices16, vMinVertex16);
1093
1094 // only fetch lanes that pass both tests
1095 vGatherMask16 = AND(vMaxGatherMask16, vMinGatherMask16);
1096 }
1097 else
1098 {
1099 vGatherMask16 = vMaxGatherMask16;
1100 }
1101
1102 // blend in any partially OOB indices that have valid elements
1103 vGatherMask16 = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask16);
1104
1105 // calculate the actual offsets into the VB
1106 Value *vOffsets16 = MUL(vCurIndices16, vStride16);
1107 vOffsets16 = ADD(vOffsets16, vAlignmentOffsets);
1108
1109 // if instance stride enable is:
1110 // true - add the product of the instanceID and advancement state to the offset into the VB
1111 // false - the value of vInstanceStride has been initialized to zero
1112 vOffsets16 = ADD(vOffsets16, vInstanceStride16);
1113
1114 // TODO: remove the following simd8 interop stuff once all code paths are fully widened to SIMD16..
1115
1116 Value *vGatherMask = EXTRACT_16(vGatherMask16, 0);
1117 Value *vGatherMask2 = EXTRACT_16(vGatherMask16, 1);
1118
1119 Value *vOffsets = EXTRACT_16(vOffsets16, 0);
1120 Value *vOffsets2 = EXTRACT_16(vOffsets16, 1);
1121 #else
1122 // override cur indices with 0 if pitch is 0
1123 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
1124 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
1125
1126 // are vertices partially OOB?
1127 Value* vMaxVertex = VBROADCAST(maxVertex);
1128 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
1129
1130 // are vertices fully in bounds?
1131 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
1132
1133 Value *vGatherMask;
1134 if (fetchState.bPartialVertexBuffer)
1135 {
1136 // are vertices below minVertex limit?
1137 Value *vMinVertex = VBROADCAST(minVertex);
1138 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
1139
1140 // only fetch lanes that pass both tests
1141 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
1142 }
1143 else
1144 {
1145 vGatherMask = vMaxGatherMask;
1146 }
1147
1148 // blend in any partially OOB indices that have valid elements
1149 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1150
1151 // calculate the actual offsets into the VB
1152 Value* vOffsets = MUL(vCurIndices, vStride);
1153 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1154
1155 // if instance stride enable is:
1156 // true - add the product of the instanceID and advancement state to the offset into the VB
1157 // false - the value of vInstanceStride has been initialized to zero
1158 vOffsets = ADD(vOffsets, vInstanceStride);
1159
1160 #endif
1161 // Packing and component control
1162 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
1163 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
1164 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
1165
1166 // Special gather/conversion for formats without equal component sizes
1167 if (IsOddFormat((SWR_FORMAT)ied.Format))
1168 {
1169 #if USE_SIMD16_GATHERS
1170 Value *pResults[4];
1171 Value *pResults2[4];
1172 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1173 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
1174 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1175 ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
1176
1177 for (uint32_t c = 0; c < 4; c += 1)
1178 {
1179 if (isComponentEnabled(compMask, c))
1180 {
1181 // pack adjacent pairs of SIMD8s into SIMD16s
1182 pVtxSrc2[currentVertexElement++] = JOIN_16(pResults[c], pResults2[c]);
1183
1184 if (currentVertexElement > 3)
1185 {
1186 // store SIMD16s
1187 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1188
1189 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1190 // reset to the next vVertexElement to output
1191 currentVertexElement = 0;
1192 }
1193 }
1194 }
1195 #else
1196 Value *pResults[4];
1197 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1198 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1199
1200 for (uint32_t c = 0; c < 4; c += 1)
1201 {
1202 if (isComponentEnabled(compMask, c))
1203 {
1204 vVertexElements[currentVertexElement++] = pResults[c];
1205 if (currentVertexElement > 3)
1206 {
1207 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1208 // reset to the next vVertexElement to output
1209 currentVertexElement = 0;
1210 }
1211 }
1212 }
1213 #endif
1214 }
1215 else if(info.type[0] == SWR_TYPE_FLOAT)
1216 {
1217 ///@todo: support 64 bit vb accesses
1218 Value *gatherSrc = VIMMED1(0.0f);
1219 #if USE_SIMD16_GATHERS
1220 Value *gatherSrc16 = VIMMED1_16(0.0f);
1221 #endif
1222
1223 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1224 "Unsupported format for standard gather fetch.");
1225
1226 // Gather components from memory to store in a simdvertex structure
1227 switch (bpc)
1228 {
1229 case 16:
1230 {
1231 #if USE_SIMD16_GATHERS
1232 Value *gatherResult[2];
1233
1234 // if we have at least one component out of x or y to fetch
1235 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1236 {
1237 gatherResult[0] = GATHERPS_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1238
1239 // e.g. result of first 8x32bit integer gather for 16bit components
1240 // 256i - 0 1 2 3 4 5 6 7
1241 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1242 //
1243 }
1244 else
1245 {
1246 gatherResult[0] = VUNDEF_I_16();
1247 }
1248
1249 // if we have at least one component out of z or w to fetch
1250 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1251 {
1252 // offset base to the next components(zw) in the vertex to gather
1253 pStreamBase = GEP(pStreamBase, C((char)4));
1254
1255 gatherResult[1] = GATHERPS_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1256
1257 // e.g. result of second 8x32bit integer gather for 16bit components
1258 // 256i - 0 1 2 3 4 5 6 7
1259 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1260 //
1261 }
1262 else
1263 {
1264 gatherResult[1] = VUNDEF_I_16();
1265 }
1266
1267 // if we have at least one component to shuffle into place
1268 if (compMask)
1269 {
1270 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1271
1272 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
1273 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1274
1275 // Shuffle gathered components into place in simdvertex struct
1276 Shuffle16bpcGather16(args); // outputs to vVertexElements ref
1277 }
1278 #else
1279 Value *vGatherResult[2];
1280
1281 // if we have at least one component out of x or y to fetch
1282 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1283 {
1284 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1285 // e.g. result of first 8x32bit integer gather for 16bit components
1286 // 256i - 0 1 2 3 4 5 6 7
1287 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1288 //
1289 }
1290
1291 // if we have at least one component out of z or w to fetch
1292 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1293 {
1294 // offset base to the next components(zw) in the vertex to gather
1295 pStreamBase = GEP(pStreamBase, C((char)4));
1296
1297 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1298 // e.g. result of second 8x32bit integer gather for 16bit components
1299 // 256i - 0 1 2 3 4 5 6 7
1300 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1301 //
1302 }
1303
1304 // if we have at least one component to shuffle into place
1305 if (compMask)
1306 {
1307 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1308 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1309
1310 // Shuffle gathered components into place in simdvertex struct
1311 #if USE_SIMD16_SHADERS
1312 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1313 #else
1314 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1315 #endif
1316 }
1317 #endif
1318 }
1319 break;
1320 case 32:
1321 {
1322 for (uint32_t i = 0; i < 4; i += 1)
1323 {
1324 #if USE_SIMD16_GATHERS
1325 if (isComponentEnabled(compMask, i))
1326 {
1327 // if we need to gather the component
1328 if (compCtrl[i] == StoreSrc)
1329 {
1330 // Gather a SIMD of vertices
1331 // APIs allow a 4GB range for offsets
1332 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1333 // But, we know that elements must be aligned for FETCH. :)
1334 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
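// e.g. a byte offset of 0x80000004 (> 2GB) becomes 0x40000002 after the shift; the gather
// with scale 2 then addresses the original byte. The low bit can be dropped because 32-bit
// elements are 4-byte aligned.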
1335 Value *shiftedOffsets16 = LSHR(vOffsets16, 1);
1336 pVtxSrc2[currentVertexElement++] = GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets16, vGatherMask16, 2);
1337 }
1338 else
1339 {
1340 pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
1341 }
1342
1343 if (currentVertexElement > 3)
1344 {
1345 // store SIMD16s
1346 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1347
1348 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1349 // reset to the next vVertexElement to output
1350 currentVertexElement = 0;
1351 }
1352 }
1353
1354 // offset base to the next component in the vertex to gather
1355 pStreamBase = GEP(pStreamBase, C((char)4));
1356 #else
1357 if (isComponentEnabled(compMask, i))
1358 {
1359 // if we need to gather the component
1360 if (compCtrl[i] == StoreSrc)
1361 {
1362 // Gather a SIMD of vertices
1363 // APIs allow a 4GB range for offsets
1364 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1365 // But, we know that elements must be aligned for FETCH. :)
1366 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
1367 Value *vShiftedOffsets = LSHR(vOffsets, 1);
1368 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBaseGFX, vShiftedOffsets, vGatherMask, 2);
1369 }
1370 else
1371 {
1372 #if USE_SIMD16_SHADERS
1373 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1374 #else
1375 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1376 #endif
1377 }
1378
1379 if (currentVertexElement > 3)
1380 {
1381 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1382 // reset to the next vVertexElement to output
1383 currentVertexElement = 0;
1384 }
1385 }
1386
1387 // offset base to the next component in the vertex to gather
1388 pStreamBase = GEP(pStreamBase, C((char)4));
1389 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
1390 #endif
1391 }
1392 }
1393 break;
1394 case 64:
1395 {
1396 for (uint32_t i = 0; i < 4; i += 1)
1397 {
1398 #if USE_SIMD16_GATHERS
1399 if (isComponentEnabled(compMask, i))
1400 {
1401 // if we need to gather the component
1402 if (compCtrl[i] == StoreSrc)
1403 {
1404 Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1405 Value *vMaskLo2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1406 Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1407 Value *vMaskHi2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1408
1409 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1410 Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
1411 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1412 Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
1413
1414 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1415
1416 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1417 Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2);
1418 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1419 Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2);
1420
1421 pGatherLo = VCVTPD2PS(pGatherLo);
1422 pGatherLo2 = VCVTPD2PS(pGatherLo2);
1423 pGatherHi = VCVTPD2PS(pGatherHi);
1424 pGatherHi2 = VCVTPD2PS(pGatherHi2);
1425
1426 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1427 Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1428
1429 // pack adjacent pairs of SIMD8s into SIMD16s
1430 pVtxSrc2[currentVertexElement++] = JOIN_16(pGather, pGather2);
1431 }
1432 else
1433 {
1434 pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
1435 }
1436
1437 if (currentVertexElement > 3)
1438 {
1439 // store SIMD16s
1440 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1441
1442 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1443 // reset to the next vVertexElement to output
1444 currentVertexElement = 0;
1445 }
1446 }
1447
1448 // offset base to the next component in the vertex to gather
1449 pStreamBase = GEP(pStreamBase, C((char)8));
1450 #else
1451 if (isComponentEnabled(compMask, i))
1452 {
1453 // if we need to gather the component
1454 if (compCtrl[i] == StoreSrc)
1455 {
1456 Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
1457 Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
1458
1459 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1460 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1461
1462 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1463
1464 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1465 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1466
1467 pGatherLo = VCVTPD2PS(pGatherLo);
1468 pGatherHi = VCVTPD2PS(pGatherHi);
1469
1470 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
1471
1472 vVertexElements[currentVertexElement++] = pGather;
1473 }
1474 else
1475 {
1476 #if USE_SIMD16_SHADERS
1477 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1478 #else
1479 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1480 #endif
1481 }
1482
1483 if (currentVertexElement > 3)
1484 {
1485 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1486 // reset to the next vVertexElement to output
1487 currentVertexElement = 0;
1488 }
1489 }
1490
1491 // offset base to the next component in the vertex to gather
1492 pStreamBase = GEP(pStreamBase, C((char)8));
1493 #endif
1494 }
1495 }
1496 break;
1497 default:
1498 SWR_INVALID("Tried to fetch invalid FP format");
1499 break;
1500 }
1501 }
1502 else
1503 {
1504 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
1505 ConversionType conversionType = CONVERT_NONE;
1506
1507 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1508 "Unsupported format for standard gather fetch.");
1509
1510 switch(info.type[0])
1511 {
1512 case SWR_TYPE_UNORM:
1513 conversionType = CONVERT_NORMALIZED;
1514 case SWR_TYPE_UINT:
1515 extendCastType = Instruction::CastOps::ZExt;
1516 break;
1517 case SWR_TYPE_SNORM:
1518 conversionType = CONVERT_NORMALIZED;
1519 case SWR_TYPE_SINT:
1520 extendCastType = Instruction::CastOps::SExt;
1521 break;
1522 case SWR_TYPE_USCALED:
1523 conversionType = CONVERT_USCALED;
1524 extendCastType = Instruction::CastOps::UIToFP;
1525 break;
1526 case SWR_TYPE_SSCALED:
1527 conversionType = CONVERT_SSCALED;
1528 extendCastType = Instruction::CastOps::SIToFP;
1529 break;
1530 case SWR_TYPE_SFIXED:
1531 conversionType = CONVERT_SFIXED;
1532 extendCastType = Instruction::CastOps::SExt;
1533 break;
1534 default:
1535 break;
1536 }
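// Note: the fallthroughs above are intentional: UNORM shares UINT's zero extension and
// SNORM shares SINT's sign extension; the actual normalization/scaling is applied later,
// driven by conversionType.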
1537
1538 // value substituted when component of gather is masked
1539 Value* gatherSrc = VIMMED1(0);
1540 #if USE_SIMD16_GATHERS
1541 Value *gatherSrc16 = VIMMED1_16(0);
1542 #endif
1543
1544 // Gather components from memory to store in a simdvertex structure
1545 switch (bpc)
1546 {
1547 case 8:
1548 {
1549 // if we have at least one component to fetch
1550 if (compMask)
1551 {
1552 #if USE_SIMD16_GATHERS
1553 Value *gatherResult = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1554
1555 // e.g. result of an 8x32bit integer gather for 8bit components
1556 // 256i - 0 1 2 3 4 5 6 7
1557 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1558
1559 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1560
1561 Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1562 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle);
1563
1564 // Shuffle gathered components into place in simdvertex struct
1565 Shuffle8bpcGatherd16(args); // outputs to vVertexElements ref
1566 #else
1567 Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1568 // e.g. result of an 8x32bit integer gather for 8bit components
1569 // 256i - 0 1 2 3 4 5 6 7
1570 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1571
1572 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1573 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1574
1575 // Shuffle gathered components into place in simdvertex struct
1576 #if USE_SIMD16_SHADERS
1577 Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
1578 #else
1579 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
1580 #endif
1581 #endif
1582 }
1583 }
1584 break;
1585 case 16:
1586 {
1587 #if USE_SIMD16_GATHERS
1588 Value *gatherResult[2];
1589
1590 // if we have at least one component out of x or y to fetch
1591 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1592 {
1593 gatherResult[0] = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1594
1595 // e.g. result of first 8x32bit integer gather for 16bit components
1596 // 256i - 0 1 2 3 4 5 6 7
1597 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1598 //
1599 }
1600 else
1601 {
1602 gatherResult[0] = VUNDEF_I_16();
1603 }
1604
1605 // if we have at least one component out of z or w to fetch
1606 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1607 {
1608 // offset base to the next components(zw) in the vertex to gather
1609 pStreamBase = GEP(pStreamBase, C((char)4));
1610
1611 gatherResult[1] = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1612
1613 // e.g. result of second 8x32bit integer gather for 16bit components
1614 // 256i - 0 1 2 3 4 5 6 7
1615 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1616 //
1617 }
1618 else
1619 {
1620 gatherResult[1] = VUNDEF_I_16();
1621 }
1622
1623 // if we have at least one component to shuffle into place
1624 if (compMask)
1625 {
1626 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1627
1628 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1629 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1630
1631 // Shuffle gathered components into place in simdvertex struct
1632 Shuffle16bpcGather16(args); // outputs to vVertexElements ref
1633 }
1634 #else
1635 Value *vGatherResult[2];
1636
1637 // if we have at least one component out of x or y to fetch
1638 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1639 {
1640 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1641 // e.g. result of first 8x32bit integer gather for 16bit components
1642 // 256i - 0 1 2 3 4 5 6 7
1643 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1644 //
1645 }
1646
1647 // if we have at least one component out of z or w to fetch
1648 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1649 {
1650 // offset base to the next components(zw) in the vertex to gather
1651 pStreamBase = GEP(pStreamBase, C((char)4));
1652
1653 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1654 // e.g. result of second 8x32bit integer gather for 16bit components
1655 // 256i - 0 1 2 3 4 5 6 7
1656 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1657 //
1658 }
1659
1660 // if we have at least one component to shuffle into place
1661 if (compMask)
1662 {
1663 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1664 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1665
1666 // Shuffle gathered components into place in simdvertex struct
1667 #if USE_SIMD16_SHADERS
1668 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1669 #else
1670 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1671 #endif
1672 }
1673 #endif
1674 }
1675 break;
1676 case 32:
1677 {
1678 // Gather components from memory and place them in the simdvertex struct
1679 for (uint32_t i = 0; i < 4; i++)
1680 {
1681 if (isComponentEnabled(compMask, i))
1682 {
1683 // if we need to gather the component
1684 if (compCtrl[i] == StoreSrc)
1685 {
1686 #if USE_SIMD16_GATHERS
1687 Value *pGather = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1688
1689 if (conversionType == CONVERT_USCALED)
1690 {
1691 pGather = UI_TO_FP(pGather, mSimd16FP32Ty);
1692 }
1693 else if (conversionType == CONVERT_SSCALED)
1694 {
1695 pGather = SI_TO_FP(pGather, mSimd16FP32Ty);
1696 }
1697 else if (conversionType == CONVERT_SFIXED)
1698 {
1699 pGather = FMUL(SI_TO_FP(pGather, mSimd16FP32Ty), VBROADCAST_16(C(1 / 65536.0f)));
1700 }
1701
1702 pVtxSrc2[currentVertexElement++] = pGather;
1703
1704 // e.g. result of a single 8x32bit integer gather for 32bit components
1705 // 256i - 0 1 2 3 4 5 6 7
1706 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1707 #else
1708 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1709
1710 if (conversionType == CONVERT_USCALED)
1711 {
1712 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1713 }
1714 else if (conversionType == CONVERT_SSCALED)
1715 {
1716 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1717 }
1718 else if (conversionType == CONVERT_SFIXED)
1719 {
1720 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1721 }
1722
1723 vVertexElements[currentVertexElement++] = pGather;
1724
1725 // e.g. result of a single 8x32bit integer gather for 32bit components
1726 // 256i - 0 1 2 3 4 5 6 7
1727 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1728 #endif
1729 }
1730 else
1731 {
1732 #if USE_SIMD16_GATHERS
1733 pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
1734 #else
1735 #if USE_SIMD16_SHADERS
1736 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1737 #else
1738 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1739 #endif
1740 #endif
1741 }
1742
1743 if (currentVertexElement > 3)
1744 {
1745 #if USE_SIMD16_GATHERS
1746 // store SIMD16s
1747 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1748
1749 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1750 #else
1751 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1752 #endif
1753
1754 // reset to the next vVertexElement to output
1755 currentVertexElement = 0;
1756 }
1757
1758 }
1759
1760 // offset base to the next component in the vertex to gather
1761 pStreamBase = GEP(pStreamBase, C((char)4));
1762 }
1763 }
1764 break;
1765 }
1766 }
1767 }
1768
1769 // if we have a partially filled vVertexElement struct, output it
1770 if (currentVertexElement > 0)
1771 {
1772 #if USE_SIMD16_GATHERS
1773 // store SIMD16s
1774 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1775
1776 StoreVertexElements16(pVtxOut2, outputElt++, currentVertexElement, pVtxSrc2);
1777 #else
1778 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1779 #endif
1780 }
1781 }
1782
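// Editor's note: a minimal scalar sketch of the 32-bit component conversions
// performed in the case-32 gather path above (illustration only, not built;
// assumes <cstdint> for the fixed-width types).
#if 0
static float ConvertScaled32(int32_t raw, ConversionType conversion)
{
    // CONVERT_NONE components are stored with their raw integer bits and
    // never take this path.
    switch (conversion)
    {
    case CONVERT_USCALED: return float(uint32_t(raw));            // UI_TO_FP
    case CONVERT_SSCALED: return float(raw);                      // SI_TO_FP
    case CONVERT_SFIXED:  return float(raw) * (1.0f / 65536.0f);  // 16.16 fixed point
    default:              return 0.0f;
    }
}
#endif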
1783 //////////////////////////////////////////////////////////////////////////
1784 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1785 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1786 /// support
1787 /// @param pIndices - pointer to 8 bit indices
1788 /// @param pLastIndex - pointer to last valid index
1789 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1790 {
1791 // can fit 4 8-bit integers per vWidth lane
1792 Value* vIndices = VUNDEF_I();
1793
1794 // store 0 index on stack to be used to conditionally load from if index address is OOB
1795 Value* pZeroIndex = ALLOCA(mInt8Ty);
1796 STORE(C((uint8_t)0), pZeroIndex);
1797
1798 // Load a SIMD of index pointers
1799 for(int64_t lane = 0; lane < mVWidth; lane++)
1800 {
1801 // Calculate the address of the requested index
1802 Value *pIndex = GEP(pIndices, C(lane));
1803
1804 // check if the address is less than the max index,
1805 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1806
1807 // if valid, load the index. if not, load 0 from the stack
1808 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1809 Value *index = LOAD(pValid, "valid index");
1810
1811 // zero extend the index to 32 bits and insert it into the correct simd lane
1812 index = Z_EXT(index, mInt32Ty);
1813 vIndices = VINSERT(vIndices, index, lane);
1814 }
1815 return vIndices;
1816 }
1817
1818 //////////////////////////////////////////////////////////////////////////
1819 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1820 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1821 /// support
1822 /// @param pIndices - pointer to 16 bit indices
1823 /// @param pLastIndex - pointer to last valid index
1824 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1825 {
1826 // can fit 2 16 bit integers per vWidth lane
1827 Value* vIndices = VUNDEF_I();
1828
1829 // store 0 index on stack to be used to conditionally load from if index address is OOB
1830 Value* pZeroIndex = ALLOCA(mInt16Ty);
1831 STORE(C((uint16_t)0), pZeroIndex);
1832
1833 // Load a SIMD of index pointers
1834 for(int64_t lane = 0; lane < mVWidth; lane++)
1835 {
1836 // Calculate the address of the requested index
1837 Value *pIndex = GEP(pIndices, C(lane));
1838
1839 // check if the address is less than the max index,
1840 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1841
1842 // if valid, load the index. if not, load 0 from the stack
1843 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1844 Value *index = LOAD(pValid, "valid index");
1845
1846 // zero extend the index to 32 bits and insert it into the correct simd lane
1847 index = Z_EXT(index, mInt32Ty);
1848 vIndices = VINSERT(vIndices, index, lane);
1849 }
1850 return vIndices;
1851 }
1852
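// Editor's note: the two loops above are the SIMD analogue of this scalar
// clamp (illustration only, not built; IndexT stands in for the uint8_t /
// uint16_t index types and <cstdint> is assumed).
#if 0
template <typename IndexT>
static uint32_t LoadIndexOrZero(const IndexT* pIndex, const IndexT* pLastIndex)
{
    // Out-of-bounds index addresses read 0 instead of touching memory past
    // the end of the buffer, mirroring SELECT(mask, pIndex, pZeroIndex) above.
    const IndexT index = (pIndex < pLastIndex) ? *pIndex : IndexT(0);

    // Zero-extend to 32 bits, as Z_EXT does for each SIMD lane.
    return static_cast<uint32_t>(index);
}
#endif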
1853 //////////////////////////////////////////////////////////////////////////
1854 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1855 /// @param pIndices - pointer to 32 bit indices
1856 /// @param pLastIndex - pointer to last valid index
1857 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1858 {
1859 DataLayout dL(JM()->mpCurrentModule);
1860 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1861 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1862 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1863
1864 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1865 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1866 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1867 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1868
1869 // create a vector of index counts from the base index ptr passed into the fetch
1870 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1871 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1872
1873 // compare index count to the max valid index
1874 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1875 // vIndexOffsets 0 1 2 3 4 5 6 7
1876 // ------------------------------
1877 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1878 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1879 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1880 Value* vIndexMask = VPCMPGTD(vMaxIndex, vIndexOffsets);
1881
1882 // VMASKLOAD takes an *i8 src pointer
1883 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1884
1885 // Load the indices; OOB loads 0
1886 return MASKLOADD(pIndices,vIndexMask);
1887 }
1888
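// Editor's note: a scalar model of the masked-load guard built above
// (illustration only, not built; assumes <cstdint>/<cstddef> and the 8-wide
// SIMD this path targets).
#if 0
static void BuildIndexLoadMask(const uint32_t* pIndices,
                               const uint32_t* pLastIndex,
                               int32_t (&laneMask)[8])
{
    // (endPtr - curPtr) / sizeof(index) == indices still left in the buffer
    const std::ptrdiff_t numIndicesLeft = pLastIndex - pIndices;

    for (int lane = 0; lane < 8; ++lane)
    {
        // lanes whose offset runs past the end are masked off, so the
        // VPCMPGTD + MASKLOADD pair above loads 0 for them
        laneMask[lane] = (lane < numIndicesLeft) ? -1 : 0;
    }
}
#endif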
1889 //////////////////////////////////////////////////////////////////////////
1890 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1891 /// denormalizes if needed, converts to F32 if needed, and positions in
1892 /// the proper SIMD rows to be output to the simdvertex structure
1893 /// @param args: (tuple of args, listed below)
1894 /// @param vGatherResult - 8 gathered 8bpc vertices
1895 /// @param pVtxOut - base pointer to output simdvertex struct
1896 /// @param extendType - sign extend or zero extend
1897 /// @param conversionType - conversion to apply to the raw components (none, normalized, scaled)
1898 /// @param currentVertexElement - reference to the current vVertexElement
1899 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1900 /// @param compMask - component packing mask
1901 /// @param compCtrl - component control val
1902 /// @param vVertexElements[4] - vertex components to output
1903 /// @param swizzle[4] - component swizzle location
1904 #if USE_SIMD16_GATHERS
1905 void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
1906 {
1907 // Unpack tuple args
1908 Value*& vGatherResult = std::get<0>(args);
1909 Value* pVtxOut = std::get<1>(args);
1910 const Instruction::CastOps extendType = std::get<2>(args);
1911 const ConversionType conversionType = std::get<3>(args);
1912 uint32_t &currentVertexElement = std::get<4>(args);
1913 uint32_t &outputElt = std::get<5>(args);
1914 const ComponentEnable compMask = std::get<6>(args);
1915 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1916 Value* (&vVertexElements)[4] = std::get<8>(args);
1917 const uint32_t(&swizzle)[4] = std::get<9>(args);
1918
1919 // cast types
1920 Type *vGatherTy = mSimdInt32Ty;
1921 Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1922
1923 // have to do extra work for sign extending
1924 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1925 {
1926 Type *v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1927 Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1928
1929 // shuffle mask, including any swizzling
1930 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1931 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1932 Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
1933 char(y), char(y + 4), char(y + 8), char(y + 12),
1934 char(z), char(z + 4), char(z + 8), char(z + 12),
1935 char(w), char(w + 4), char(w + 8), char(w + 12),
1936 char(x), char(x + 4), char(x + 8), char(x + 12),
1937 char(y), char(y + 4), char(y + 8), char(y + 12),
1938 char(z), char(z + 4), char(z + 8), char(z + 12),
1939 char(w), char(w + 4), char(w + 8), char(w + 12) });
1940
1941 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1942
1943 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1944 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1945
1946 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1947 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1948
1949 // after pshufb: group components together in each 128bit lane
1950 // 256i - 0 1 2 3 4 5 6 7
1951 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1952
1953 Value *vi128XY_lo = nullptr;
1954 Value *vi128XY_hi = nullptr;
1955 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1956 {
1957 vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1958 vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1959
1960 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1961 // 256i - 0 1 2 3 4 5 6 7
1962 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1963 }
1964
1965 // do the same for zw components
1966 Value *vi128ZW_lo = nullptr;
1967 Value *vi128ZW_hi = nullptr;
1968 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1969 {
1970 vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1971 vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1972 }
1973
1974 // init denormalize variables if needed
1975 Instruction::CastOps fpCast;
1976 Value *conversionFactor;
1977
1978 switch (conversionType)
1979 {
1980 case CONVERT_NORMALIZED:
1981 fpCast = Instruction::CastOps::SIToFP;
1982 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1983 break;
1984 case CONVERT_SSCALED:
1985 fpCast = Instruction::CastOps::SIToFP;
1986 conversionFactor = VIMMED1((float)(1.0));
1987 break;
1988 case CONVERT_USCALED:
1989 SWR_INVALID("Type should not be sign extended!");
1990 conversionFactor = nullptr;
1991 break;
1992 default:
1993 SWR_ASSERT(conversionType == CONVERT_NONE);
1994 conversionFactor = nullptr;
1995 break;
1996 }
1997
1998 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1999 for (uint32_t i = 0; i < 4; i++)
2000 {
2001 if (isComponentEnabled(compMask, i))
2002 {
2003 if (compCtrl[i] == ComponentControl::StoreSrc)
2004 {
2005 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2006 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2007 // if x or y, use vi128XY permute result, else use vi128ZW
2008 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2009 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2010
2011 // sign extend
2012 Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
2013 Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
2014
2015 // denormalize if needed
2016 if (conversionType != CONVERT_NONE)
2017 {
2018 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2019 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2020 }
2021
2022 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2023
2024 currentVertexElement += 1;
2025 }
2026 else
2027 {
2028 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2029 }
2030
2031 if (currentVertexElement > 3)
2032 {
2033 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2034 // reset to the next vVertexElement to output
2035 currentVertexElement = 0;
2036 }
2037 }
2038 }
2039 }
2040 // else zero extend
2041 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2042 {
2043 // init denormalize variables if needed
2044 Instruction::CastOps fpCast;
2045 Value *conversionFactor;
2046
2047 switch (conversionType)
2048 {
2049 case CONVERT_NORMALIZED:
2050 fpCast = Instruction::CastOps::UIToFP;
2051 conversionFactor = VIMMED1((float)(1.0 / 255.0));
2052 break;
2053 case CONVERT_USCALED:
2054 fpCast = Instruction::CastOps::UIToFP;
2055 conversionFactor = VIMMED1((float)(1.0));
2056 break;
2057 case CONVERT_SSCALED:
2058 SWR_INVALID("Type should not be zero extended!");
2059 conversionFactor = nullptr;
2060 break;
2061 default:
2062 SWR_ASSERT(conversionType == CONVERT_NONE);
2063 conversionFactor = nullptr;
2064 break;
2065 }
2066
2067 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
2068 for (uint32_t i = 0; i < 4; i++)
2069 {
2070 if (isComponentEnabled(compMask, i))
2071 {
2072 if (compCtrl[i] == ComponentControl::StoreSrc)
2073 {
2074 // pshufb masks for each component
2075 Value *vConstMask;
2076 switch (swizzle[i])
2077 {
2078 case 0:
2079 // x shuffle mask
2080 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2081 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2082 break;
2083 case 1:
2084 // y shuffle mask
2085 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2086 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2087 break;
2088 case 2:
2089 // z shuffle mask
2090 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2091 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2092 break;
2093 case 3:
2094 // w shuffle mask
2095 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2096 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2097 break;
2098 default:
2099 vConstMask = nullptr;
2100 break;
2101 }
2102
2103 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
2104 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
2105
2106 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2107 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2108
2109 // after pshufb for x channel
2110 // 256i - 0 1 2 3 4 5 6 7
2111 // x000 x000 x000 x000 x000 x000 x000 x000
2112
2113 // denormalize if needed
2114 if (conversionType != CONVERT_NONE)
2115 {
2116 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2117 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2118 }
2119
2120 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2121
2122 currentVertexElement += 1;
2123 }
2124 else
2125 {
2126 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2127 }
2128
2129 if (currentVertexElement > 3)
2130 {
2131 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2132 // reset to the next vVertexElement to output
2133 currentVertexElement = 0;
2134 }
2135 }
2136 }
2137 }
2138 else
2139 {
2140 SWR_INVALID("Unsupported conversion type");
2141 }
2142 }
2143
2144 #else
2145 #if USE_SIMD16_SHADERS
2146 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
2147 #else
2148 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
2149 #endif
2150 {
2151 // Unpack tuple args
2152 Value*& vGatherResult = std::get<0>(args);
2153 Value* pVtxOut = std::get<1>(args);
2154 const Instruction::CastOps extendType = std::get<2>(args);
2155 const ConversionType conversionType = std::get<3>(args);
2156 uint32_t &currentVertexElement = std::get<4>(args);
2157 uint32_t &outputElt = std::get<5>(args);
2158 const ComponentEnable compMask = std::get<6>(args);
2159 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2160 Value* (&vVertexElements)[4] = std::get<8>(args);
2161 const uint32_t(&swizzle)[4] = std::get<9>(args);
2162
2163 // cast types
2164 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2165
2166 for (uint32_t i = 0; i < 4; i++)
2167 {
2168 if (!isComponentEnabled(compMask, i))
2169 continue;
2170
2171 if (compCtrl[i] == ComponentControl::StoreSrc)
2172 {
2173 std::vector<uint32_t> vShuffleMasks[4] = {
2174 { 0, 4, 8, 12, 16, 20, 24, 28 }, // x
2175 { 1, 5, 9, 13, 17, 21, 25, 29 }, // y
2176 { 2, 6, 10, 14, 18, 22, 26, 30 }, // z
2177 { 3, 7, 11, 15, 19, 23, 27, 31 }, // w
2178 };
2179
2180 Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
2181 UndefValue::get(v32x8Ty),
2182 vShuffleMasks[swizzle[i]]);
2183
2184 if ((extendType == Instruction::CastOps::SExt) ||
2185 (extendType == Instruction::CastOps::SIToFP)) {
2186 switch (conversionType)
2187 {
2188 case CONVERT_NORMALIZED:
2189 val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
2190 break;
2191 case CONVERT_SSCALED:
2192 val = SI_TO_FP(val, mSimdFP32Ty);
2193 break;
2194 case CONVERT_USCALED:
2195 SWR_INVALID("Type should not be sign extended!");
2196 break;
2197 default:
2198 SWR_ASSERT(conversionType == CONVERT_NONE);
2199 val = S_EXT(val, mSimdInt32Ty);
2200 break;
2201 }
2202 }
2203 else if ((extendType == Instruction::CastOps::ZExt) ||
2204 (extendType == Instruction::CastOps::UIToFP)) {
2205 switch (conversionType)
2206 {
2207 case CONVERT_NORMALIZED:
2208 val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
2209 break;
2210 case CONVERT_SSCALED:
2211 SWR_INVALID("Type should not be zero extended!");
2212 break;
2213 case CONVERT_USCALED:
2214 val = UI_TO_FP(val, mSimdFP32Ty);
2215 break;
2216 default:
2217 SWR_ASSERT(conversionType == CONVERT_NONE);
2218 val = Z_EXT(val, mSimdInt32Ty);
2219 break;
2220 }
2221 }
2222 else
2223 {
2224 SWR_INVALID("Unsupported conversion type");
2225 }
2226
2227 vVertexElements[currentVertexElement++] = val;
2228 }
2229 else
2230 {
2231 #if USE_SIMD16_SHADERS
2232 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2233 #else
2234 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2235 #endif
2236 }
2237
2238 if (currentVertexElement > 3)
2239 {
2240 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2241 // reset to the next vVertexElement to output
2242 currentVertexElement = 0;
2243 }
2244 }
2245 }
2246
2247 #endif
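// Editor's note: per-component scalar model of the 8bpc normalization that
// both shuffle variants above apply after the PSHUFB/VSHUFFLE swizzle
// (illustration only, not built; assumes <cstdint>).
#if 0
static float Normalize8bpc(uint8_t raw, bool isSigned)
{
    // SNORM: sign extend (SExt / PMOVSXBD) and scale by 1/127.
    // UNORM: zero extend (ZExt) and scale by 1/255.
    // These match the conversionFactor constants above; SCALED formats skip
    // the scale and CONVERT_NONE keeps the extended integer bits.
    return isSigned ? float(int8_t(raw)) * (1.0f / 127.0f)
                    : float(raw)         * (1.0f / 255.0f);
}
#endif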
2248 //////////////////////////////////////////////////////////////////////////
2249 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
2250 /// denormalizes if needed, converts to F32 if needed, and positions in
2251 /// the proper SIMD rows to be output to the simdvertex structure
2252 /// @param args: (tuple of args, listed below)
2253 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
2254 /// @param pVtxOut - base pointer to output simdvertex struct
2255 /// @param extendType - sign extend or zero extend
2256 /// @param conversionType - conversion to apply to the raw components (none, normalized, scaled)
2257 /// @param currentVertexElement - reference to the current vVertexElement
2258 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
2259 /// @param compMask - component packing mask
2260 /// @param compCtrl - component control val
2261 /// @param vVertexElements[4] - vertex components to output
2262 #if USE_SIMD16_GATHERS
2263 void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
2264 {
2265 // Unpack tuple args
2266 Value* (&vGatherResult)[2] = std::get<0>(args);
2267 Value* pVtxOut = std::get<1>(args);
2268 const Instruction::CastOps extendType = std::get<2>(args);
2269 const ConversionType conversionType = std::get<3>(args);
2270 uint32_t &currentVertexElement = std::get<4>(args);
2271 uint32_t &outputElt = std::get<5>(args);
2272 const ComponentEnable compMask = std::get<6>(args);
2273 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2274 Value* (&vVertexElements)[4] = std::get<8>(args);
2275
2276 // cast types
2277 Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2278 Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2279
2280 // have to do extra work for sign extending
2281 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
2282 {
2283 // is this half-precision (FP16) float data?
2284 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2285
2286 Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2287 Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2288
2289 // shuffle mask
2290 Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2291 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
2292 Value *vi128XY_lo = nullptr;
2293 Value *vi128XY_hi = nullptr;
2294 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
2295 {
2296 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
2297
2298 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[0], 0);
2299 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[0], 1);
2300
2301 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2302 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2303
2304 // after pshufb: group components together in each 128bit lane
2305 // 256i - 0 1 2 3 4 5 6 7
2306 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2307
2308 vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2309 vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2310
2311 // after PERMD: move and pack xy components into each 128bit lane
2312 // 256i - 0 1 2 3 4 5 6 7
2313 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2314 }
2315
2316 // do the same for zw components
2317 Value *vi128ZW_lo = nullptr;
2318 Value *vi128ZW_hi = nullptr;
2319 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
2320 {
2321 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[1], 0);
2322 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[1], 1);
2323
2324 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2325 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2326
2327 vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2328 vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2329 }
2330
2331 // init denormalize variables if needed
2332 Instruction::CastOps IntToFpCast;
2333 Value *conversionFactor;
2334
2335 switch (conversionType)
2336 {
2337 case CONVERT_NORMALIZED:
2338 IntToFpCast = Instruction::CastOps::SIToFP;
2339 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2340 break;
2341 case CONVERT_SSCALED:
2342 IntToFpCast = Instruction::CastOps::SIToFP;
2343 conversionFactor = VIMMED1((float)(1.0));
2344 break;
2345 case CONVERT_USCALED:
2346 SWR_INVALID("Type should not be sign extended!");
2347 conversionFactor = nullptr;
2348 break;
2349 default:
2350 SWR_ASSERT(conversionType == CONVERT_NONE);
2351 conversionFactor = nullptr;
2352 break;
2353 }
2354
2355 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2356 for (uint32_t i = 0; i < 4; i++)
2357 {
2358 if (isComponentEnabled(compMask, i))
2359 {
2360 if (compCtrl[i] == ComponentControl::StoreSrc)
2361 {
2362 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2363 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2364 // if x or y, use vi128XY permute result, else use vi128ZW
2365 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2366 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2367
2368 if (bFP)
2369 {
2370 // extract 128 bit lanes and widen each half-float component to full float
2371 Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2372 Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2373
2374 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2375 }
2376 else
2377 {
2378 // extract 128 bit lanes to sign extend each component
2379 Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2380 Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2381
2382 // denormalize if needed
2383 if (conversionType != CONVERT_NONE)
2384 {
2385 temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2386 temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2387 }
2388
2389 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2390 }
2391
2392 currentVertexElement += 1;
2393 }
2394 else
2395 {
2396 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2397 }
2398
2399 if (currentVertexElement > 3)
2400 {
2401 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2402 // reset to the next vVertexElement to output
2403 currentVertexElement = 0;
2404 }
2405 }
2406 }
2407 }
2408 // else zero extend
2409 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2410 {
2411 // pshufb masks for each component
2412 Value *vConstMask[2];
2413
2414 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
2415 {
2416 // x/z shuffle mask
2417 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2418 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2419 }
2420
2421 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
2422 {
2423 // y/w shuffle mask
2424 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2425 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
2426 }
2427
2428 // init denormalize variables if needed
2429 Instruction::CastOps fpCast;
2430 Value* conversionFactor;
2431
2432 switch (conversionType)
2433 {
2434 case CONVERT_NORMALIZED:
2435 fpCast = Instruction::CastOps::UIToFP;
2436 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2437 break;
2438 case CONVERT_USCALED:
2439 fpCast = Instruction::CastOps::UIToFP;
2440 conversionFactor = VIMMED1((float)(1.0f));
2441 break;
2442 case CONVERT_SSCALED:
2443 SWR_INVALID("Type should not be zero extended!");
2444 conversionFactor = nullptr;
2445 break;
2446 default:
2447 SWR_ASSERT(conversionType == CONVERT_NONE);
2448 conversionFactor = nullptr;
2449 break;
2450 }
2451
2452 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2453 for (uint32_t i = 0; i < 4; i++)
2454 {
2455 if (isComponentEnabled(compMask, i))
2456 {
2457 if (compCtrl[i] == ComponentControl::StoreSrc)
2458 {
2459 // select correct constMask for x/z or y/w pshufb
2460 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2461 // if x or y, use the first (xy) gather result, else the second (zw) gather result
2462 uint32_t selectedGather = (i < 2) ? 0 : 1;
2463
2464 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
2465
2466 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
2467 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
2468
2469 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2470 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2471
2472 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2473 // 256i - 0 1 2 3 4 5 6 7
2474 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2475
2476 // denormalize if needed
2477 if (conversionType != CONVERT_NONE)
2478 {
2479 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2480 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2481 }
2482
2483 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2484
2485 currentVertexElement += 1;
2486 }
2487 else
2488 {
2489 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2490 }
2491
2492 if (currentVertexElement > 3)
2493 {
2494 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2495 // reset to the next vVertexElement to output
2496 currentVertexElement = 0;
2497 }
2498 }
2499 }
2500 }
2501 else
2502 {
2503 SWR_INVALID("Unsupported conversion type");
2504 }
2505 }
2506
2507 #else
2508 #if USE_SIMD16_SHADERS
2509 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
2510 #else
2511 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
2512 #endif
2513 {
2514 // Unpack tuple args
2515 Value* (&vGatherResult)[2] = std::get<0>(args);
2516 Value* pVtxOut = std::get<1>(args);
2517 const Instruction::CastOps extendType = std::get<2>(args);
2518 const ConversionType conversionType = std::get<3>(args);
2519 uint32_t &currentVertexElement = std::get<4>(args);
2520 uint32_t &outputElt = std::get<5>(args);
2521 const ComponentEnable compMask = std::get<6>(args);
2522 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2523 Value* (&vVertexElements)[4] = std::get<8>(args);
2524
2525 // cast types
2526 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2527 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2528
2529 // have to do extra work for sign extending
2530 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
2531 (extendType == Instruction::CastOps::FPExt))
2532 {
2533 // is this half-precision (FP16) float data?
2534 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2535
2536 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2537 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2538
2539 // shuffle mask
2540 Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2541 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
2542 Value* vi128XY = nullptr;
2543 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) {
2544 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
2545 // after pshufb: group components together in each 128bit lane
2546 // 256i - 0 1 2 3 4 5 6 7
2547 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2548
2549 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2550 // after PERMD: move and pack xy components into each 128bit lane
2551 // 256i - 0 1 2 3 4 5 6 7
2552 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2553 }
2554
2555 // do the same for zw components
2556 Value* vi128ZW = nullptr;
2557 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
2558 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
2559 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2560 }
2561
2562 // init denormalize variables if needed
2563 Instruction::CastOps IntToFpCast;
2564 Value* conversionFactor;
2565
2566 switch (conversionType)
2567 {
2568 case CONVERT_NORMALIZED:
2569 IntToFpCast = Instruction::CastOps::SIToFP;
2570 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2571 break;
2572 case CONVERT_SSCALED:
2573 IntToFpCast = Instruction::CastOps::SIToFP;
2574 conversionFactor = VIMMED1((float)(1.0));
2575 break;
2576 case CONVERT_USCALED:
2577 SWR_INVALID("Type should not be sign extended!");
2578 conversionFactor = nullptr;
2579 break;
2580 default:
2581 SWR_ASSERT(conversionType == CONVERT_NONE);
2582 conversionFactor = nullptr;
2583 break;
2584 }
2585
2586 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2587 for (uint32_t i = 0; i < 4; i++)
2588 {
2589 if (isComponentEnabled(compMask, i))
2590 {
2591 if (compCtrl[i] == ComponentControl::StoreSrc)
2592 {
2593 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2594 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2595 // if x or y, use vi128XY permute result, else use vi128ZW
2596 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2597
2598 if (bFP) {
2599 // extract 128 bit lanes and widen each half-float component to full float
2600 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2601 }
2602 else {
2603 // extract 128 bit lanes to sign extend each component
2604 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2605
2606 // denormalize if needed
2607 if (conversionType != CONVERT_NONE) {
2608 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2609 }
2610 }
2611 currentVertexElement++;
2612 }
2613 else
2614 {
2615 #if USE_SIMD16_SHADERS
2616 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2617 #else
2618 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2619 #endif
2620 }
2621
2622 if (currentVertexElement > 3)
2623 {
2624 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2625 // reset to the next vVertexElement to output
2626 currentVertexElement = 0;
2627 }
2628 }
2629 }
2630 }
2631 // else zero extend
2632 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2633 {
2634 // pshufb masks for each component
2635 Value* vConstMask[2];
2636 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) {
2637 // x/z shuffle mask
2638 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2639 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2640 }
2641
2642 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) {
2643 // y/w shuffle mask
2644 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2645 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
2646 }
2647
2648 // init denormalize variables if needed
2649 Instruction::CastOps fpCast;
2650 Value* conversionFactor;
2651
2652 switch (conversionType)
2653 {
2654 case CONVERT_NORMALIZED:
2655 fpCast = Instruction::CastOps::UIToFP;
2656 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2657 break;
2658 case CONVERT_USCALED:
2659 fpCast = Instruction::CastOps::UIToFP;
2660 conversionFactor = VIMMED1((float)(1.0f));
2661 break;
2662 case CONVERT_SSCALED:
2663 SWR_INVALID("Type should not be zero extended!");
2664 conversionFactor = nullptr;
2665 break;
2666 default:
2667 SWR_ASSERT(conversionType == CONVERT_NONE);
2668 conversionFactor = nullptr;
2669 break;
2670 }
2671
2672 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2673 for (uint32_t i = 0; i < 4; i++)
2674 {
2675 if (isComponentEnabled(compMask, i))
2676 {
2677 if (compCtrl[i] == ComponentControl::StoreSrc)
2678 {
2679 // select correct constMask for x/z or y/w pshufb
2680 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2681 // if x or y, use the first (xy) gather result, else the second (zw) gather result
2682 uint32_t selectedGather = (i < 2) ? 0 : 1;
2683
2684 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2685 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2686 // 256i - 0 1 2 3 4 5 6 7
2687 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2688
2689 // denormalize if needed
2690 if (conversionType != CONVERT_NONE)
2691 {
2692 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2693 }
2694 currentVertexElement++;
2695 }
2696 else
2697 {
2698 #if USE_SIMD16_SHADERS
2699 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2700 #else
2701 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2702 #endif
2703 }
2704
2705 if (currentVertexElement > 3)
2706 {
2707 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2708 // reset to the next vVertexElement to output
2709 currentVertexElement = 0;
2710 }
2711 }
2712 }
2713 }
2714 else
2715 {
2716 SWR_INVALID("Unsupported conversion type");
2717 }
2718 }
2719
2720 #endif
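// Editor's note: the 16bpc analogue of the 8bpc sketch further up, matching
// the 1/32767 and 1/65535 factors used by the shuffle variants above; the
// FPExt (half-float) path just widens each 16-bit float via CVTPH2PS and
// needs no scale factor. (Illustration only, not built; assumes <cstdint>.)
#if 0
static float Normalize16bpc(uint16_t raw, bool isSigned)
{
    return isSigned ? float(int16_t(raw)) * (1.0f / 32767.0f)   // SNORM
                    : float(raw)          * (1.0f / 65535.0f);  // UNORM
}
#endif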
2721 //////////////////////////////////////////////////////////////////////////
2722 /// @brief Output a simdvertex worth of elements to the current outputElt
2723 /// @param pVtxOut - base address of VIN output struct
2724 /// @param outputElt - simdvertex offset in VIN to write to
2725 /// @param numEltsToStore - number of simdvertex rows to write out
2726 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2727 #if USE_SIMD16_GATHERS
2728 void FetchJit::StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2729 {
2730 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2731
2732 for (uint32_t c = 0; c < numEltsToStore; ++c)
2733 {
2734 // STORE expects FP32 x vWidth type, just bitcast if needed
2735 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2736 {
2737 #if FETCH_DUMP_VERTEX
2738 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
2739 #endif
2740 vVertexElements[c] = BITCAST(vVertexElements[c], mSimd16FP32Ty);
2741 }
2742 #if FETCH_DUMP_VERTEX
2743 else
2744 {
2745 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
2746 }
2747 #endif
2748 // outputElt * 4 offsets by the size of a simdvertex;
2749 // + c selects a 32bit x vWidth row within the current vertex
2750 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2751 STORE(vVertexElements[c], dest);
2752 }
2753 }
2754
2755 #else
2756 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2757 {
2758 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2759
2760 for (uint32_t c = 0; c < numEltsToStore; ++c)
2761 {
2762 // STORE expects FP32 x vWidth type, just bitcast if needed
2763 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2764 {
2765 #if FETCH_DUMP_VERTEX
2766 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
2767 #endif
2768 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2769 }
2770 #if FETCH_DUMP_VERTEX
2771 else
2772 {
2773 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
2774 }
2775 #endif
2776 // outputElt * 4 offsets by the size of a simdvertex (doubled stride for SIMD16 shaders below);
2777 // + c selects a 32bit x vWidth row within the current vertex
2778 #if USE_SIMD16_SHADERS
2779 Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
2780 #else
2781 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2782 #endif
2783 STORE(vVertexElements[c], dest);
2784 }
2785 }
2786
2787 #endif
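// Editor's note: a scalar sketch of the destination-row addressing used by the
// two StoreVertexElements variants above (illustration only, not built).
#if 0
static uint32_t SimdVertexRow(uint32_t outputElt, uint32_t component, bool useSimd16Shaders)
{
    // A simdvertex element is 4 float rows of vWidth lanes each; with SIMD16
    // shaders every logical row spans two SIMD8 rows, hence the doubled stride.
    return useSimd16Shaders ? outputElt * 8 + component * 2
                            : outputElt * 4 + component;
}
#endif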
2788 //////////////////////////////////////////////////////////////////////////
2789 /// @brief Generates a constant vector of values based on the
2790 /// ComponentControl value
2791 /// @param ctrl - ComponentControl value
2792 #if USE_SIMD16_GATHERS
2793 Value *FetchJit::GenerateCompCtrlVector16(const ComponentControl ctrl)
2794 {
2795 switch (ctrl)
2796 {
2797 case NoStore:
2798 return VUNDEF_I_16();
2799 case Store0:
2800 return VIMMED1_16(0);
2801 case Store1Fp:
2802 return VIMMED1_16(1.0f);
2803 case Store1Int:
2804 return VIMMED1_16(1);
2805 case StoreVertexId:
2806 {
2807 Value *pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2808 Value *pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
2809
2810 Value *pId = JOIN_16(pId_lo, pId_hi);
2811
2812 return pId;
2813 }
2814 case StoreInstanceId:
2815 {
2816 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
2817 return VBROADCAST_16(pId);
2818 }
2819
2820
2821 case StoreSrc:
2822 default:
2823 SWR_INVALID("Invalid component control");
2824 return VUNDEF_I_16();
2825 }
2826 }
2827
2828 #else
2829 #if USE_SIMD16_SHADERS
2830 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
2831 #else
2832 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
2833 #endif
2834 {
2835 switch (ctrl)
2836 {
2837 case NoStore:
2838 return VUNDEF_I();
2839 case Store0:
2840 return VIMMED1(0);
2841 case Store1Fp:
2842 return VIMMED1(1.0f);
2843 case Store1Int:
2844 return VIMMED1(1);
2845 case StoreVertexId:
2846 {
2847 #if USE_SIMD16_SHADERS
2848 Value *pId;
2849 if (useVertexID2)
2850 {
2851 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
2852 }
2853 else
2854 {
2855 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2856 }
2857 #else
2858 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2859 #endif
2860 return pId;
2861 }
2862 case StoreInstanceId:
2863 {
2864 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
2865 return VBROADCAST(pId);
2866 }
2867
2868
2869 case StoreSrc:
2870 default:
2871 SWR_INVALID("Invalid component control");
2872 return VUNDEF_I();
2873 }
2874 }
2875
2876 #endif
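// Editor's note: a scalar model of the ComponentControl defaults generated by
// the two variants above (illustration only, not built; assumes <cstdint> and
// <cstring>; NoStore lanes are simply left undefined).
#if 0
static float ComponentControlValue(ComponentControl ctrl, float srcValue,
                                   float vertexId, float instanceId)
{
    switch (ctrl)
    {
    case StoreSrc:        return srcValue;       // gathered component
    case Store0:          return 0.0f;
    case Store1Fp:        return 1.0f;
    case Store1Int:
    {
        // integer 1 stored as raw bits (VIMMED1(1)); StoreVertexElements
        // later bitcasts the row to float without changing the bits
        const int32_t one = 1;
        float f;
        std::memcpy(&f, &one, sizeof(f));
        return f;
    }
    case StoreVertexId:   return vertexId;       // VertexID bits, bitcast to float above
    case StoreInstanceId: return instanceId;     // broadcast CurInstance
    default:              return 0.0f;           // NoStore / invalid
    }
}
#endif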
2877 //////////////////////////////////////////////////////////////////////////
2878 /// @brief Returns the enable mask for the specified component.
2879 /// @param enableMask - enable bits
2880 /// @param component - component to check if enabled.
2881 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2882 {
2883 switch (component)
2884 {
2885 // X
2886 case 0: return (enableMask & ComponentEnable::X);
2887 // Y
2888 case 1: return (enableMask & ComponentEnable::Y);
2889 // Z
2890 case 2: return (enableMask & ComponentEnable::Z);
2891 // W
2892 case 3: return (enableMask & ComponentEnable::W);
2893
2894 default: return false;
2895 }
2896 }
2897
2898 // We don't want two threads compiling the same fetch shader simultaneously;
2899 // it exposes problems in the JIT cache implementation.
2900 // Right now this is only a problem for fetch.
2901 static std::mutex gFetchCodegenMutex;
2902
2903 //////////////////////////////////////////////////////////////////////////
2904 /// @brief JITs from fetch shader IR
2905 /// @param hJitMgr - JitManager handle
2906 /// @param func - LLVM function IR
2907 /// @return PFN_FETCH_FUNC - pointer to fetch code
2908 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2909 {
2910 const llvm::Function* func = (const llvm::Function*)hFunc;
2911 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2912 PFN_FETCH_FUNC pfnFetch;
2913
2914 gFetchCodegenMutex.lock();
2915 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2916 // MCJIT finalizes modules the first time you JIT code from them. Once finalized, you cannot add new IR to the module
2917 pJitMgr->mIsModuleFinalized = true;
2918
2919 #if defined(KNOB_SWRC_TRACING)
2920 char fName[1024];
2921 const char *funcName = func->getName().data();
2922 sprintf(fName, "%s.bin", funcName);
2923 FILE *fd = fopen(fName, "wb");
2924 fwrite((void *)pfnFetch, 1, 2048, fd);
2925 fclose(fd);
2926 #endif
2927
2928 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2929 gFetchCodegenMutex.unlock();
2930
2931
2932
2933 return pfnFetch;
2934 }
2935
2936 //////////////////////////////////////////////////////////////////////////
2937 /// @brief JIT compiles fetch shader
2938 /// @param hJitMgr - JitManager handle
2939 /// @param state - fetch state to build function from
2940 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2941 {
2942 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2943
2944 pJitMgr->SetupNewModule();
2945
2946 FetchJit theJit(pJitMgr);
2947 HANDLE hFunc = theJit.Create(state);
2948
2949 return JitFetchFunc(hJitMgr, hFunc);
2950 }