1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "jit_api.h"
32 #include "fetch_jit.h"
33 #include "gen_state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public Builder
56 {
57 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
58
59 Function* Create(const FETCH_COMPILE_STATE& fetchState);
60 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
61 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
62 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
63
64 // package up Shuffle*bpcGatherd args into a tuple for convenience
65 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
66 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
67 const uint32_t(&)[4]> Shuffle8bpcArgs;
68 #if USE_SIMD16_SHADERS
69 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
70 #else
71 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
72 #endif
73
74 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
75 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
76 #if USE_SIMD16_SHADERS
77 void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
78 #else
79 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
80 #endif
81
82 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
83 #if USE_SIMD16_BUILDER
84 void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
85 #endif
86
87 #if USE_SIMD16_SHADERS
88 Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
89 #else
90 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
91 #endif
92 #if USE_SIMD16_BUILDER
93 Value* GenerateCompCtrlVector2(const ComponentControl ctrl);
94 #endif
95
96 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
97 #if USE_SIMD16_SHADERS
98 #define USE_SIMD16_GATHERS 0
99
100 #if USE_SIMD16_GATHERS
101 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
102 #else
103 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
104 #endif
105 #else
106 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
107 #endif
108
109 bool IsOddFormat(SWR_FORMAT format);
110 bool IsUniformFormat(SWR_FORMAT format);
111 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
112 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
113 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
114
115 Value* mpFetchInfo;
116 };
117
118 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
119 {
120 std::stringstream fnName("FetchShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
121 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
122
123 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
124 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
125
126 fetch->getParent()->setModuleIdentifier(fetch->getName());
127
128 IRB()->SetInsertPoint(entry);
129
130 auto argitr = fetch->arg_begin();
131
132 // Fetch shader arguments
133 mpFetchInfo = &*argitr; ++argitr;
134 mpFetchInfo->setName("fetchInfo");
135 Value* pVtxOut = &*argitr;
136 pVtxOut->setName("vtxOutput");
137 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
138 // index 0 (just the pointer to the simdvertex structure)
139 // index 1 (which element of the simdvertex structure to offset to, in this case 0)
140 // so the indices being i32's doesn't matter
141 // TODO: generate this GEP with a VECTOR structure type so this makes sense
142 std::vector<Value*> vtxInputIndices(2, C(0));
143 // GEP
144 pVtxOut = GEP(pVtxOut, C(0));
145 #if USE_SIMD16_SHADERS
146 #if 0// USE_SIMD16_BUILDER
147 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
148 #else
149 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
150 #endif
151 #else
152 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
153 #endif
154
155 // SWR_FETCH_CONTEXT::pStreams
156 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
157 streams->setName("pStreams");
158
159 // SWR_FETCH_CONTEXT::pIndices
160 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
161 indices->setName("pIndices");
162
163 // SWR_FETCH_CONTEXT::pLastIndex
164 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
165 pLastIndex->setName("pLastIndex");
166
167
168 Value* vIndices;
169 #if USE_SIMD16_SHADERS
170 Value* indices2;
171 Value* vIndices2;
172 #endif
173 switch(fetchState.indexType)
174 {
175 case R8_UINT:
176 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
177 #if USE_SIMD16_SHADERS
178 indices2 = GEP(indices, C(8));
179 #endif
180 if(fetchState.bDisableIndexOOBCheck)
181 {
182 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
183 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
184 #if USE_SIMD16_SHADERS
185 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
186 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
187 #endif
188 }
189 else
190 {
191 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
192 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
193 #if USE_SIMD16_SHADERS
194 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
195 vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
196 #endif
197 }
198 break;
199 case R16_UINT:
200 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
201 #if USE_SIMD16_SHADERS
202 indices2 = GEP(indices, C(8));
203 #endif
204 if(fetchState.bDisableIndexOOBCheck)
205 {
206 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
207 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
208 #if USE_SIMD16_SHADERS
209 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
210 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
211 #endif
212 }
213 else
214 {
215 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
216 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
217 #if USE_SIMD16_SHADERS
218 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
219 vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
220 #endif
221 }
222 break;
223 case R32_UINT:
224 #if USE_SIMD16_SHADERS
225 indices2 = GEP(indices, C(8));
226 #endif
227 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
228 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
229 #if USE_SIMD16_SHADERS
230 (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
231 : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
232 #endif
233 break; // incoming type is already 32bit int
234 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
235 }
236
237 if(fetchState.bForceSequentialAccessEnable)
238 {
239 Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
240
241 // VertexData buffers are accessed sequentially, the index is equal to the vertex number
242 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
243 vIndices = ADD(vIndices, pOffsets);
244 #if USE_SIMD16_SHADERS
245 vIndices2 = ADD(vIndices, VIMMED1(8));
246 #endif
247 }
248
249 Value* vVertexId = vIndices;
250 #if USE_SIMD16_SHADERS
251 Value* vVertexId2 = vIndices2;
252 #endif
253 if (fetchState.bVertexIDOffsetEnable)
254 {
255 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
256 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
257 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
258 vVertexId = ADD(vIndices, vBaseVertex);
259 vVertexId = ADD(vVertexId, vStartVertex);
260 #if USE_SIMD16_SHADERS
261 vVertexId2 = ADD(vIndices2, vBaseVertex);
262 vVertexId2 = ADD(vVertexId2, vStartVertex);
263 #endif
264 }
265
266 // store out vertex IDs
267 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
268 #if USE_SIMD16_SHADERS
269 STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
270 #endif
271
272 // store out cut mask if enabled
273 if (fetchState.bEnableCutIndex)
274 {
275 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
276 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
277 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
278 #if USE_SIMD16_SHADERS
279 Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
280 STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
281 #endif
282 }
283
284 // Fetch attributes from memory and output to a simdvertex struct
285 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
286 #if USE_SIMD16_SHADERS
287 if (fetchState.bDisableVGATHER)
288 {
289 JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
290 JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
291 }
292 else
293 {
294 #if USE_SIMD16_GATHERS
295 JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false);
296 #else
297 JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
298 JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
299 #endif
300 }
301 #else
302 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
303 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
304 #endif
305
306 RET_VOID();
307
308 JitManager::DumpToFile(fetch, "src");
309
310 #if defined(_DEBUG)
311 verifyFunction(*fetch);
312 #endif
313
314 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
315
316 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
317 setupPasses.add(createBreakCriticalEdgesPass());
318 setupPasses.add(createCFGSimplificationPass());
319 setupPasses.add(createEarlyCSEPass());
320 setupPasses.add(createPromoteMemoryToRegisterPass());
321
322 setupPasses.run(*fetch);
323
324 JitManager::DumpToFile(fetch, "se");
325
326 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
327
328 ///@todo Haven't touched these either. Need to remove some of these and add others.
329 optPasses.add(createCFGSimplificationPass());
330 optPasses.add(createEarlyCSEPass());
331 optPasses.add(createInstructionCombiningPass());
332 optPasses.add(createInstructionSimplifierPass());
333 optPasses.add(createConstantPropagationPass());
334 optPasses.add(createSCCPPass());
335 optPasses.add(createAggressiveDCEPass());
336
337 optPasses.run(*fetch);
338 optPasses.run(*fetch);
339
340 JitManager::DumpToFile(fetch, "opt");
341
342 return fetch;
343 }
344
345 //////////////////////////////////////////////////////////////////////////
346 /// @brief Loads attributes from memory using LOADs, shuffling the
347 /// components into SOA form.
348 /// *Note* currently does not support component control,
349 /// component packing, instancing
350 /// @param fetchState - info about attributes to be fetched from memory
351 /// @param streams - value pointer to the current vertex stream
352 /// @param vIndices - vector value of indices to load
353 /// @param pVtxOut - value pointer to output simdvertex struct
354 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
355 {
356 // Zack shuffles; a variant of the Charleston.
357
358 std::vector<Value*> vectors(16);
359 std::vector<Constant*> pMask(mVWidth);
360 for(uint32_t i = 0; i < mVWidth; ++i)
361 {
362 pMask[i] = (C(i < 4 ? i : 4));
363 }
364 Constant* promoteMask = ConstantVector::get(pMask);
365 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
366
367 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
368 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
369 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
370 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
371 curInstance->setName("curInstance");
372
373 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
374 {
375 Value* elements[4] = {0};
376 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
377 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
378 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
379 uint32_t numComponents = info.numComps;
380 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
381
382 // load path doesn't support component packing
383 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
384
385 vectors.clear();
386
387 if (fetchState.bInstanceIDOffsetEnable)
388 {
389 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down");
390 }
391
392 Value *vCurIndices;
393 Value *startOffset;
394 if(ied.InstanceEnable)
395 {
396 Value* stepRate = C(ied.InstanceAdvancementState);
397
398 // prevent a div by 0 for 0 step rate
399 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
400 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
401
402 // calc the current offset into instanced data buffer
403 Value* calcInstance = UDIV(curInstance, stepRate);
404
405 // if step rate is 0, every instance gets instance 0
406 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
407
408 vCurIndices = VBROADCAST(calcInstance);
409
410 startOffset = startInstance;
411 }
412 else if (ied.InstanceStrideEnable)
413 {
414 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
415 }
416 else
417 {
418 // offset indices by baseVertex
419 vCurIndices = ADD(vIndices, vBaseVertex);
420
421 startOffset = startVertex;
422 }
423
424 // load SWR_VERTEX_BUFFER_STATE::pData
425 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
426
427 // load SWR_VERTEX_BUFFER_STATE::pitch
428 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
429 stride = Z_EXT(stride, mInt64Ty);
430
431 // load SWR_VERTEX_BUFFER_STATE::size
432 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
433 size = Z_EXT(size, mInt64Ty);
434
435 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
436
437 Value *minVertex = NULL;
438 Value *minVertexOffset = NULL;
439 if (fetchState.bPartialVertexBuffer) {
440 // fetch min index for low bounds checking
441 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
442 minVertex = LOAD(minVertex);
443 if (!fetchState.bDisableIndexOOBCheck) {
444 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
445 }
446 }
447
448 // Load from the stream.
449 for(uint32_t lane = 0; lane < mVWidth; ++lane)
450 {
451 // Get index
452 Value* index = VEXTRACT(vCurIndices, C(lane));
453
454 if (fetchState.bPartialVertexBuffer) {
455 // clamp below minvertex
456 Value *isBelowMin = ICMP_SLT(index, minVertex);
457 index = SELECT(isBelowMin, minVertex, index);
458 }
459
460 index = Z_EXT(index, mInt64Ty);
461
462 Value* offset = MUL(index, stride);
463 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
464 offset = ADD(offset, startVertexOffset);
465
466 if (!fetchState.bDisableIndexOOBCheck) {
467 // check for out of bound access, including partial OOB, and replace them with minVertex
468 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
469 Value *oob = ICMP_ULE(endOffset, size);
470 if (fetchState.bPartialVertexBuffer) {
471 offset = SELECT(oob, offset, minVertexOffset);
472 } else {
473 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
474 }
475 }
476
477 Value* pointer = GEP(stream, offset);
478 // We use a full-lane, but don't actually care.
479 Value* vptr = 0;
480
481 // get a pointer to a 4 component attrib in default address space
482 switch(bpc)
483 {
484 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
485 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
486 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
487 default: SWR_INVALID("Unsupported underlying bpp!");
488 }
489
490 // load 4 components of attribute
491 Value* vec = ALIGNED_LOAD(vptr, 1, false);
492
493 // Convert To FP32 internally
494 switch(info.type[0])
495 {
496 case SWR_TYPE_UNORM:
497 switch(bpc)
498 {
499 case 8:
500 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
501 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
502 break;
503 case 16:
504 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
505 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
506 break;
507 default:
508 SWR_INVALID("Unsupported underlying type!");
509 break;
510 }
511 break;
512 case SWR_TYPE_SNORM:
513 switch(bpc)
514 {
515 case 8:
516 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
517 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
518 break;
519 case 16:
520 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
521 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
522 break;
523 default:
524 SWR_INVALID("Unsupported underlying type!");
525 break;
526 }
527 break;
528 case SWR_TYPE_UINT:
529 // Zero extend UINT types.
530 switch(bpc)
531 {
532 case 8:
533 case 16:
534 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
535 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
536 break;
537 case 32:
538 break; // Pass through unchanged.
539 default:
540 SWR_INVALID("Unsupported underlying type!");
541 break;
542 }
543 break;
544 case SWR_TYPE_SINT:
545 // Sign extend SINT types.
546 switch(bpc)
547 {
548 case 8:
549 case 16:
550 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
551 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
552 break;
553 case 32:
554 break; // Pass through unchanged.
555 default:
556 SWR_INVALID("Unsupported underlying type!");
557 break;
558 }
559 break;
560 case SWR_TYPE_FLOAT:
561 switch(bpc)
562 {
563 case 32:
564 break; // Pass through unchanged.
565 default:
566 SWR_INVALID("Unsupported underlying type!");
567 }
568 break;
569 case SWR_TYPE_USCALED:
570 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
571 break;
572 case SWR_TYPE_SSCALED:
573 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
574 break;
575 case SWR_TYPE_SFIXED:
576 vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
577 break;
578 case SWR_TYPE_UNKNOWN:
579 case SWR_TYPE_UNUSED:
580 SWR_INVALID("Unsupported type %d!", info.type[0]);
581 }
582
583 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
584 // uwvec: 4 x F32, undef value
585 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
586 vectors.push_back(wvec);
587 }
588
589 std::vector<Constant*> v01Mask(mVWidth);
590 std::vector<Constant*> v23Mask(mVWidth);
591 std::vector<Constant*> v02Mask(mVWidth);
592 std::vector<Constant*> v13Mask(mVWidth);
593
594 // Concatenate the vectors together.
595 elements[0] = VUNDEF_F();
596 elements[1] = VUNDEF_F();
597 elements[2] = VUNDEF_F();
598 elements[3] = VUNDEF_F();
599 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
600 {
601 v01Mask[4 * b + 0] = C(0 + 4 * b);
602 v01Mask[4 * b + 1] = C(1 + 4 * b);
603 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
604 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
605
606 v23Mask[4 * b + 0] = C(2 + 4 * b);
607 v23Mask[4 * b + 1] = C(3 + 4 * b);
608 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
609 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
610
611 v02Mask[4 * b + 0] = C(0 + 4 * b);
612 v02Mask[4 * b + 1] = C(2 + 4 * b);
613 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
614 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
615
616 v13Mask[4 * b + 0] = C(1 + 4 * b);
617 v13Mask[4 * b + 1] = C(3 + 4 * b);
618 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
619 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
620
621 std::vector<Constant*> iMask(mVWidth);
622 for(uint32_t i = 0; i < mVWidth; ++i)
623 {
624 if(((4 * b) <= i) && (i < (4 * (b + 1))))
625 {
626 iMask[i] = C(i % 4 + mVWidth);
627 }
628 else
629 {
630 iMask[i] = C(i);
631 }
632 }
633 Constant* insertMask = ConstantVector::get(iMask);
634 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
635 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
636 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
637 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
638 }
639
640 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
641 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
642 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
643 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
644 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
645 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
646 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
647 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
648
649 switch(numComponents + 1)
650 {
651 case 1: elements[0] = VIMMED1(0.0f);
652 case 2: elements[1] = VIMMED1(0.0f);
653 case 3: elements[2] = VIMMED1(0.0f);
654 case 4: elements[3] = VIMMED1(1.0f);
655 }
656
657 for(uint32_t c = 0; c < 4; ++c)
658 {
659 #if USE_SIMD16_SHADERS
660 Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
661 #else
662 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
663 #endif
664 STORE(elements[c], dest);
665 }
666 }
667 }
668
669 // returns true for odd formats that require special gather handling
670 bool FetchJit::IsOddFormat(SWR_FORMAT format)
671 {
672 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
673 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
674 {
675 return true;
676 }
677 return false;
678 }
679
680 // format is uniform if all components are the same size and type
681 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
682 {
683 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
684 uint32_t bpc0 = info.bpc[0];
685 uint32_t type0 = info.type[0];
686
687 for (uint32_t c = 1; c < info.numComps; ++c)
688 {
689 if (bpc0 != info.bpc[c] || type0 != info.type[c])
690 {
691 return false;
692 }
693 }
694 return true;
695 }
696
697 // unpacks components based on format
698 // foreach component in the pixel
699 // mask off everything but this component
700 // shift component to LSB
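// e.g. for a packed 10_10_10_2 layout the loop below walks bitOffset = 0, 10, 20, 30,
// applying masks 0x3FF, 0xFFC00, 0x3FF00000, 0xC0000000 before shifting each
// component down to bit 0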
701 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
702 {
703 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
704
705 uint32_t bitOffset = 0;
706 for (uint32_t c = 0; c < info.numComps; ++c)
707 {
708 uint32_t swizzledIndex = info.swizzle[c];
709 uint32_t compBits = info.bpc[c];
710 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
711 Value* comp = AND(vInput, bitmask);
712 comp = LSHR(comp, bitOffset);
713
714 result[swizzledIndex] = comp;
715 bitOffset += compBits;
716 }
717 }
718
719 // gather for odd component size formats
720 // gather SIMD full pixels per lane, then shift/mask to move each component into its
721 // own vector
722 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
723 {
724 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
725
726 // only works if pixel size is <= 32bits
727 SWR_ASSERT(info.bpp <= 32);
728
729 Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
730
731 for (uint32_t comp = 0; comp < 4; ++comp)
732 {
733 pResult[comp] = VIMMED1((int)info.defaults[comp]);
734 }
735
736 UnpackComponents(format, pGather, pResult);
737
738 // cast to fp32
739 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
740 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
741 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
742 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
743 }
744
745 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
746 {
747 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
748
749 for (uint32_t c = 0; c < info.numComps; ++c)
750 {
751 uint32_t compIndex = info.swizzle[c];
752
753 // skip any conversion on UNUSED components
754 if (info.type[c] == SWR_TYPE_UNUSED)
755 {
756 continue;
757 }
758
759 if (info.isNormalized[c])
760 {
761 if (info.type[c] == SWR_TYPE_SNORM)
762 {
763 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
764
765 /// result = c * (1.0f / (2^(n-1) - 1))
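/// e.g. an 8-bit SNORM component gives scale = 1.0f / 127.0f, so the encoded
/// value 127 maps to 1.0f and -127 maps to -1.0f (the -128 encoding is the
/// @todo case above)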
766 uint32_t n = info.bpc[c];
767 uint32_t pow2 = 1 << (n - 1);
768 float scale = 1.0f / (float)(pow2 - 1);
769 Value *vScale = VIMMED1(scale);
770 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
771 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
772 texels[compIndex] = FMUL(texels[compIndex], vScale);
773 }
774 else
775 {
776 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
777
778 /// result = c * (1.0f / (2^n - 1))
779 uint32_t n = info.bpc[c];
780 uint32_t pow2 = 1 << n;
781 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
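// (1.0f / (2^24 - 1) is not exactly representable in fp32, so the usual
//  reciprocal multiply can presumably drift outside the ULP budget for the
//  largest 24-bit values; hence the full divide)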
782 if (n == 24)
783 {
784 float scale = (float)(pow2 - 1);
785 Value* vScale = VIMMED1(scale);
786 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
787 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
788 texels[compIndex] = FDIV(texels[compIndex], vScale);
789 }
790 else
791 {
792 float scale = 1.0f / (float)(pow2 - 1);
793 Value *vScale = VIMMED1(scale);
794 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
795 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
796 texels[compIndex] = FMUL(texels[compIndex], vScale);
797 }
798 }
799 continue;
800 }
801 }
802 }
803
804 //////////////////////////////////////////////////////////////////////////
805 /// @brief Loads attributes from memory using AVX2 GATHER(s)
806 /// @param fetchState - info about attributes to be fetched from memory
807 /// @param streams - value pointer to the current vertex stream
808 /// @param vIndices - vector value of indices to gather
809 /// @param pVtxOut - value pointer to output simdvertex struct
810 #if USE_SIMD16_SHADERS
811 #if USE_SIMD16_GATHERS
812 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
813 Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2)
814 #else
815 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
816 Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
817 #endif
818 #else
819 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
820 Value* streams, Value* vIndices, Value* pVtxOut)
821 #endif
822 {
823 uint32_t currentVertexElement = 0;
824 uint32_t outputElt = 0;
825 Value* vVertexElements[4];
826 #if USE_SIMD16_GATHERS
827 Value* vVertexElements2[4];
828 #endif
829
830 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
831 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
832 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
833 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
834 curInstance->setName("curInstance");
835
836 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
837 {
838 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
839
840 // skip element if all components are disabled
841 if (ied.ComponentPacking == ComponentEnable::NONE)
842 {
843 continue;
844 }
845
846 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
847 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
848 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
849
850 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
851
852 // VGATHER* takes an *i8 src pointer
853 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
854
855 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
856 Value *vStride = VBROADCAST(stride);
857
858 // max vertex index that is fully in bounds
859 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
860 maxVertex = LOAD(maxVertex);
861
862 Value *minVertex = NULL;
863 if (fetchState.bPartialVertexBuffer)
864 {
865 // min vertex index for low bounds OOB checking
866 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
867 minVertex = LOAD(minVertex);
868 }
869
870 if (fetchState.bInstanceIDOffsetEnable)
871 {
872 // the InstanceID (curInstance) value is offset by StartInstanceLocation
873 curInstance = ADD(curInstance, startInstance);
874 }
875
876 Value *vCurIndices;
877 #if USE_SIMD16_GATHERS
878 Value *vCurIndices2;
879 #endif
880 Value *startOffset;
881 Value *vInstanceStride = VIMMED1(0);
882
883 if (ied.InstanceEnable)
884 {
885 Value* stepRate = C(ied.InstanceAdvancementState);
886
887 // prevent a div by 0 for 0 step rate
888 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
889 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
890
891 // calc the current offset into instanced data buffer
892 Value* calcInstance = UDIV(curInstance, stepRate);
893
894 // if step rate is 0, every instance gets instance 0
895 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
896
897 vCurIndices = VBROADCAST(calcInstance);
898 #if USE_SIMD16_GATHERS
899 vCurIndices2 = VBROADCAST(calcInstance);
900 #endif
901
902 startOffset = startInstance;
903 }
904 else if (ied.InstanceStrideEnable)
905 {
906 // grab the instance advancement state, determines stride in bytes from one instance to the next
907 Value* stepRate = C(ied.InstanceAdvancementState);
908 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
909
910 // offset indices by baseVertex
911 vCurIndices = ADD(vIndices, vBaseVertex);
912 #if USE_SIMD16_GATHERS
913 vCurIndices2 = ADD(vIndices2, vBaseVertex);
914 #endif
915
916 startOffset = startVertex;
917 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
918 }
919 else
920 {
921 // offset indices by baseVertex
922 vCurIndices = ADD(vIndices, vBaseVertex);
923 #if USE_SIMD16_GATHERS
924 vCurIndices2 = ADD(vIndices2, vBaseVertex);
925 #endif
926
927 startOffset = startVertex;
928 }
929
930 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
931 // do 64bit address offset calculations.
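// (the per-lane bounds tests below stay in 32-bit vertex indices; only the
//  single scalar baseOffset computation right below needs a 64-bit multiply)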
932
933 // calculate byte offset to the start of the VB
934 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
935 pStreamBase = GEP(pStreamBase, baseOffset);
936
937 // if we have a start offset, subtract from max vertex. Used for OOB check
938 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
939 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
940 // if we have a negative value, we're already OOB. clamp at 0.
941 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
942
943 if (fetchState.bPartialVertexBuffer)
944 {
945 // similarly for min vertex
946 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
947 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
948 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
949 }
950
951 // Load the in bounds size of a partially valid vertex
952 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
953 partialInboundsSize = LOAD(partialInboundsSize);
954 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
955 Value* vBpp = VBROADCAST(C(info.Bpp));
956 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
957
958 // is the element <= the partially valid size
959 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
960
961 #if USE_SIMD16_GATHERS
962 // override cur indices with 0 if pitch is 0
963 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
964 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
965
966 // are vertices partially OOB?
967 Value* vMaxVertex = VBROADCAST(maxVertex);
968 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
969 Value* vPartialOOBMask2 = ICMP_EQ(vCurIndices2, vMaxVertex);
970
971 // are vertices fully in bounds?
972 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
973 Value* vMaxGatherMask2 = ICMP_ULT(vCurIndices2, vMaxVertex);
974
975 Value *vGatherMask;
976 Value *vGatherMask2;
977 if (fetchState.bPartialVertexBuffer)
978 {
979 // are vertices below minVertex limit?
980 Value *vMinVertex = VBROADCAST(minVertex);
981 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
982 Value *vMinGatherMask2 = ICMP_UGE(vCurIndices2, vMinVertex);
983
984 // only fetch lanes that pass both tests
985 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
986 vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
987 }
988 else
989 {
990 vGatherMask = vMaxGatherMask;
991 vGatherMask2 = vMaxGatherMask2;
992 }
993
994 // blend in any partially OOB indices that have valid elements
995 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
996 vGatherMask2 = SELECT(vPartialOOBMask2, vElementInBoundsMask, vGatherMask2);
997 Value *pMask = vGatherMask;
998 Value *pMask2 = vGatherMask2;
999 vGatherMask = VMASK(vGatherMask);
1000 vGatherMask2 = VMASK(vGatherMask2);
1001
1002 // calculate the actual offsets into the VB
1003 Value* vOffsets = MUL(vCurIndices, vStride);
1004 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1005
1006 Value* vOffsets2 = MUL(vCurIndices2, vStride);
1007 vOffsets2 = ADD(vOffsets2, vAlignmentOffsets);
1008
1009 // if instance stride enable is:
1010 // true - add product of the instanceID and advancement state to the offset into the VB
1011 // false - value of vInstanceStride has been initialized to zero
1012 vOffsets = ADD(vOffsets, vInstanceStride);
1013 vOffsets2 = ADD(vOffsets2, vInstanceStride);
1014
1015 #else
1016 // override cur indices with 0 if pitch is 0
1017 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
1018 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
1019
1020 // are vertices partially OOB?
1021 Value* vMaxVertex = VBROADCAST(maxVertex);
1022 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
1023
1024 // are vertices fully in bounds?
1025 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
1026
1027 Value *vGatherMask;
1028 if (fetchState.bPartialVertexBuffer)
1029 {
1030 // are vertices below minVertex limit?
1031 Value *vMinVertex = VBROADCAST(minVertex);
1032 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
1033
1034 // only fetch lanes that pass both tests
1035 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
1036 }
1037 else
1038 {
1039 vGatherMask = vMaxGatherMask;
1040 }
1041
1042 // blend in any partially OOB indices that have valid elements
1043 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1044 Value* pMask = vGatherMask;
1045 vGatherMask = VMASK(vGatherMask);
1046
1047 // calculate the actual offsets into the VB
1048 Value* vOffsets = MUL(vCurIndices, vStride);
1049 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1050
1051 // if instance stride enable is:
1052 // true - add product of the instanceID and advancement state to the offset into the VB
1053 // false - value of vInstanceStride has been initialized to zero
1054 vOffsets = ADD(vOffsets, vInstanceStride);
1055
1056 #endif
1057 // Packing and component control
1058 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
1059 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
1060 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
1061
1062 // Special gather/conversion for formats without equal component sizes
1063 if (IsOddFormat((SWR_FORMAT)ied.Format))
1064 {
1065 #if USE_SIMD16_GATHERS
1066 Value *pResults[4];
1067 Value *pResults2[4];
1068 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1069 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
1070 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1071 ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
1072
1073 for (uint32_t c = 0; c < 4; c += 1)
1074 {
1075 if (isComponentEnabled(compMask, c))
1076 {
1077 vVertexElements[currentVertexElement] = pResults[c];
1078 vVertexElements2[currentVertexElement] = pResults2[c];
1079 currentVertexElement++;
1080
1081 if (currentVertexElement > 3)
1082 {
1083 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1084 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1085
1086 outputElt += 1;
1087
1088 // reset to the next vVertexElement to output
1089 currentVertexElement = 0;
1090 }
1091 }
1092 }
1093 #else
1094 Value* pResults[4];
1095 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1096 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1097
1098 for (uint32_t c = 0; c < 4; ++c)
1099 {
1100 if (isComponentEnabled(compMask, c))
1101 {
1102 vVertexElements[currentVertexElement++] = pResults[c];
1103 if (currentVertexElement > 3)
1104 {
1105 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1106 // reset to the next vVertexElement to output
1107 currentVertexElement = 0;
1108 }
1109 }
1110 }
1111 #endif
1112 }
1113 else if(info.type[0] == SWR_TYPE_FLOAT)
1114 {
1115 ///@todo: support 64 bit vb accesses
1116 Value* gatherSrc = VIMMED1(0.0f);
1117 #if USE_SIMD16_GATHERS
1118 Value* gatherSrc2 = VIMMED1(0.0f);
1119 #endif
1120
1121 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1122 "Unsupported format for standard gather fetch.");
1123
1124 // Gather components from memory to store in a simdvertex structure
1125 switch (bpc)
1126 {
1127 case 16:
1128 {
1129 #if USE_SIMD16_GATHERS
1130 Value* vGatherResult[2];
1131 Value* vGatherResult2[2];
1132 Value *vMask;
1133 Value *vMask2;
1134
1135 // if we have at least one component out of x or y to fetch
1136 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1137 {
1138 // save mask as it is zero'd out after each gather
1139 vMask = vGatherMask;
1140 vMask2 = vGatherMask2;
1141
1142 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
1143 vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2);
1144 // e.g. result of first 8x32bit integer gather for 16bit components
1145 // 256i - 0 1 2 3 4 5 6 7
1146 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1147 //
1148 }
1149
1150 // if we have at least one component out of z or w to fetch
1151 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1152 {
1153 // offset base to the next components(zw) in the vertex to gather
1154 pStreamBase = GEP(pStreamBase, C((char)4));
1155 vMask = vGatherMask;
1156 vMask2 = vGatherMask2;
1157
1158 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
1159 vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2);
1160 // e.g. result of second 8x32bit integer gather for 16bit components
1161 // 256i - 0 1 2 3 4 5 6 7
1162 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1163 //
1164 }
1165
1166
1167 // if we have at least one component to shuffle into place
1168 if (compMask)
1169 {
1170 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1171 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1172 Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE,
1173 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1174
1175 // Shuffle gathered components into place in simdvertex struct
1176 Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
1177 Shuffle16bpcGather(args2, true); // outputs to vVertexElements2 ref
1178 }
1179 #else
1180 Value* vGatherResult[2];
1181 Value *vMask;
1182
1183 // if we have at least one component out of x or y to fetch
1184 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1185 // save mask as it is zero'd out after each gather
1186 vMask = vGatherMask;
1187
1188 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
1189 // e.g. result of first 8x32bit integer gather for 16bit components
1190 // 256i - 0 1 2 3 4 5 6 7
1191 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1192 //
1193 }
1194
1195 // if we have at least one component out of z or w to fetch
1196 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1197 // offset base to the next components(zw) in the vertex to gather
1198 pStreamBase = GEP(pStreamBase, C((char)4));
1199 vMask = vGatherMask;
1200
1201 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
1202 // e.g. result of second 8x32bit integer gather for 16bit components
1203 // 256i - 0 1 2 3 4 5 6 7
1204 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1205 //
1206 }
1207
1208 // if we have at least one component to shuffle into place
1209 if(compMask){
1210 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1211 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1212
1213 // Shuffle gathered components into place in simdvertex struct
1214 #if USE_SIMD16_SHADERS
1215 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1216 #else
1217 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1218 #endif
1219 }
1220 #endif
1221 }
1222 break;
1223 case 32:
1224 {
1225 #if USE_SIMD16_GATHERS
1226 #if USE_SIMD16_BUILDER
1227 Value *pVtxSrc2[4];
1228
1229 #endif
1230 #endif
1231 for (uint32_t i = 0; i < 4; i += 1)
1232 {
1233 #if USE_SIMD16_GATHERS
1234 if (isComponentEnabled(compMask, i))
1235 {
1236 // if we need to gather the component
1237 if (compCtrl[i] == StoreSrc)
1238 {
1239 // save mask as it is zero'd out after each gather
1240 Value *vMask = vGatherMask;
1241 Value *vMask2 = vGatherMask2;
1242
1243 // Gather a SIMD of vertices
1244 // APIs allow a 4GB range for offsets
1245 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1246 // But, we know that elements must be aligned for FETCH. :)
1247 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
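// (e.g. a byte offset of 0x90000000 would read as a negative i32 index; the
//  offsets here are 4-byte aligned, so offset >> 1 loses no bits and the
//  gather scale of 2 below rebuilds the original byte address)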
1248 Value *vShiftedOffsets = VPSRLI(vOffsets, C(1));
1249 Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
1250 #if USE_SIMD16_BUILDER
1251 Value *src = VUNDEF2_F();
1252 src = INSERT2_F(src, gatherSrc, 0);
1253 src = INSERT2_F(src, gatherSrc2, 1);
1254
1255 Value *indices = VUNDEF2_I();
1256 indices = INSERT2_I(indices, vShiftedOffsets, 0);
1257 indices = INSERT2_I(indices, vShiftedOffsets2, 1);
1258
1259 Value *mask = VUNDEF2_I();
1260 mask = INSERT2_I(mask, vMask, 0);
1261 mask = INSERT2_I(mask, vMask2, 1);
1262
1263 pVtxSrc2[currentVertexElement] = GATHERPS2(src, pStreamBase, indices, mask, 2);
1264 #if 1
1265
1266 vVertexElements[currentVertexElement] = EXTRACT2_F(pVtxSrc2[currentVertexElement], 0);
1267 vVertexElements2[currentVertexElement] = EXTRACT2_F(pVtxSrc2[currentVertexElement], 1);
1268 #endif
1269 #else
1270 vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, 2);
1271 vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vMask2, 2);
1272
1273 #if USE_SIMD16_BUILDER
1274 // pack adjacent pairs of SIMD8s into SIMD16s
1275 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1276 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements[currentVertexElement], 0);
1277 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements2[currentVertexElement], 1);
1278
1279 #endif
1280 #endif
1281 currentVertexElement += 1;
1282 }
1283 else
1284 {
1285 #if USE_SIMD16_BUILDER
1286 pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
1287 #else
1288 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1289 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1290
1291 #if USE_SIMD16_BUILDER
1292 // pack adjacent pairs of SIMD8s into SIMD16s
1293 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1294 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements[currentVertexElement], 0);
1295 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements2[currentVertexElement], 1);
1296
1297 #endif
1298 #endif
1299 currentVertexElement += 1;
1300 }
1301
1302 if (currentVertexElement > 3)
1303 {
1304 #if USE_SIMD16_BUILDER
1305 // store SIMD16s
1306 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1307
1308 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1309
1310 #else
1311 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1312 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1313
1314 #endif
1315 outputElt += 1;
1316
1317 // reset to the next vVertexElement to output
1318 currentVertexElement = 0;
1319 }
1320 }
1321
1322 // offset base to the next component in the vertex to gather
1323 pStreamBase = GEP(pStreamBase, C((char)4));
1324 #else
1325 if (isComponentEnabled(compMask, i))
1326 {
1327 // if we need to gather the component
1328 if (compCtrl[i] == StoreSrc)
1329 {
1330 // save mask as it is zero'd out after each gather
1331 Value *vMask = vGatherMask;
1332
1333 // Gather a SIMD of vertices
1334 // APIs allow a 4GB range for offsets
1335 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1336 // But, we know that elements must be aligned for FETCH. :)
1337 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
1338 Value* vShiftedOffsets = VPSRLI(vOffsets, C(1));
1339 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, 2);
1340 }
1341 else
1342 {
1343 #if USE_SIMD16_SHADERS
1344 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1345 #else
1346 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1347 #endif
1348 }
1349
1350 if (currentVertexElement > 3)
1351 {
1352 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1353 // reset to the next vVertexElement to output
1354 currentVertexElement = 0;
1355 }
1356 }
1357
1358 // offset base to the next component in the vertex to gather
1359 pStreamBase = GEP(pStreamBase, C((char)4));
1360 #endif
1361 }
1362 }
1363 break;
1364 case 64:
1365 {
1366 for (uint32_t i = 0; i < 4; i += 1)
1367 {
1368 #if USE_SIMD16_GATHERS
1369 if (isComponentEnabled(compMask, i))
1370 {
1371 // if we need to gather the component
1372 if (compCtrl[i] == StoreSrc)
1373 {
1374 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1375 Value *vMaskLo2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1376 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1377 Value *vMaskHi2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1378 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
1379 vMaskLo2 = S_EXT(vMaskLo2, VectorType::get(mInt64Ty, 4));
1380 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
1381 vMaskHi2 = S_EXT(vMaskHi2, VectorType::get(mInt64Ty, 4));
1382 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
1383 vMaskLo2 = BITCAST(vMaskLo2, VectorType::get(mDoubleTy, 4));
1384 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
1385 vMaskHi2 = BITCAST(vMaskHi2, VectorType::get(mDoubleTy, 4));
1386
1387 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1388 Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
1389 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1390 Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
1391
1392 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1393
1394 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1395 Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2);
1396 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1397 Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2);
1398
1399 pGatherLo = VCVTPD2PS(pGatherLo);
1400 pGatherLo2 = VCVTPD2PS(pGatherLo2);
1401 pGatherHi = VCVTPD2PS(pGatherHi);
1402 pGatherHi2 = VCVTPD2PS(pGatherHi2);
1403
1404 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1405 Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1406
1407 vVertexElements[currentVertexElement] = pGather;
1408 vVertexElements2[currentVertexElement] = pGather2;
1409
1410 currentVertexElement += 1;
1411 }
1412 else
1413 {
1414 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1415 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1416
1417 currentVertexElement += 1;
1418 }
1419
1420 if (currentVertexElement > 3)
1421 {
1422 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1423 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1424
1425 outputElt += 1;
1426
1427 // reset to the next vVertexElement to output
1428 currentVertexElement = 0;
1429 }
1430 }
1431
1432 // offset base to the next component in the vertex to gather
1433 pStreamBase = GEP(pStreamBase, C((char)8));
1434 #else
1435 if (isComponentEnabled(compMask, i))
1436 {
1437 // if we need to gather the component
1438 if (compCtrl[i] == StoreSrc)
1439 {
1440 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
1441 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
1442 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
1443 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
1444 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
1445 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
1446
1447 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1448 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1449
1450 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1451
1452 Value* pGatherLo = GATHERPD(vZeroDouble,
1453 pStreamBase, vOffsetsLo, vMaskLo);
1454 Value* pGatherHi = GATHERPD(vZeroDouble,
1455 pStreamBase, vOffsetsHi, vMaskHi);
1456
1457 pGatherLo = VCVTPD2PS(pGatherLo);
1458 pGatherHi = VCVTPD2PS(pGatherHi);
1459
1460 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
1461
1462 vVertexElements[currentVertexElement++] = pGather;
1463 }
1464 else
1465 {
1466 #if USE_SIMD16_SHADERS
1467 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1468 #else
1469 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1470 #endif
1471 }
1472
1473 if (currentVertexElement > 3)
1474 {
1475 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1476 // reset to the next vVertexElement to output
1477 currentVertexElement = 0;
1478 }
1479 }
1480
1481 // offset base to the next component in the vertex to gather
1482 pStreamBase = GEP(pStreamBase, C((char)8));
1483 #endif
1484 }
1485 }
1486 break;
1487 default:
1488 SWR_INVALID("Tried to fetch invalid FP format");
1489 break;
1490 }
1491 }
1492 else
1493 {
1494 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
1495 ConversionType conversionType = CONVERT_NONE;
1496
1497 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1498 "Unsupported format for standard gather fetch.");
1499
1500 switch(info.type[0])
1501 {
1502 case SWR_TYPE_UNORM:
1503 conversionType = CONVERT_NORMALIZED;
1504 case SWR_TYPE_UINT:
1505 extendCastType = Instruction::CastOps::ZExt;
1506 break;
1507 case SWR_TYPE_SNORM:
1508 conversionType = CONVERT_NORMALIZED;
1509 case SWR_TYPE_SINT:
1510 extendCastType = Instruction::CastOps::SExt;
1511 break;
1512 case SWR_TYPE_USCALED:
1513 conversionType = CONVERT_USCALED;
1514 extendCastType = Instruction::CastOps::UIToFP;
1515 break;
1516 case SWR_TYPE_SSCALED:
1517 conversionType = CONVERT_SSCALED;
1518 extendCastType = Instruction::CastOps::SIToFP;
1519 break;
1520 case SWR_TYPE_SFIXED:
1521 conversionType = CONVERT_SFIXED;
1522 extendCastType = Instruction::CastOps::SExt;
1523 break;
1524 default:
1525 break;
1526 }
1527
1528 // value substituted when component of gather is masked
1529 Value* gatherSrc = VIMMED1(0);
1530 #if USE_SIMD16_GATHERS
1531 Value* gatherSrc2 = VIMMED1(0);
1532 #endif
1533
1534 // Gather components from memory to store in a simdvertex structure
1535 switch (bpc)
1536 {
1537 case 8:
1538 {
1539 // if we have at least one component to fetch
1540 if (compMask)
1541 {
1542 #if USE_SIMD16_GATHERS
1543 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1544 Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1545 // e.g. result of an 8x32bit integer gather for 8bit components
1546 // 256i - 0 1 2 3 4 5 6 7
1547 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1548
1549 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1550 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1551 Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1552 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2, info.swizzle);
1553
1554 // Shuffle gathered components into place in simdvertex struct
1555 Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
1556 Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref
1557 #else
1558 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1559 // e.g. result of an 8x32bit integer gather for 8bit components
1560 // 256i - 0 1 2 3 4 5 6 7
1561 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1562
1563 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1564 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1565
1566 // Shuffle gathered components into place in simdvertex struct
1567 #if USE_SIMD16_SHADERS
1568 Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
1569 #else
1570 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
1571 #endif
1572 #endif
1573 }
1574 }
1575 break;
1576 case 16:
1577 {
1578 #if USE_SIMD16_GATHERS
1579 Value* vGatherResult[2];
1580 Value *vMask;
1581 Value* vGatherResult2[2];
1582 Value *vMask2;
1583
1584 // if we have at least one component out of x or y to fetch
1585 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1586 {
1587 // save mask as it is zero'd out after each gather
1588 vMask = vGatherMask;
1589 vMask2 = vGatherMask2;
1590
1591 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1592 vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2);
1593 // e.g. result of first 8x32bit integer gather for 16bit components
1594 // 256i - 0 1 2 3 4 5 6 7
1595 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1596 //
1597 }
1598
1599 // if we have at least one component out of z or w to fetch
1600 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1601 {
1602 // offset base to the next components(zw) in the vertex to gather
1603 pStreamBase = GEP(pStreamBase, C((char)4));
1604 vMask = vGatherMask;
1605 vMask2 = vGatherMask2;
1606
1607 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1608 vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2);
1609 // e.g. result of second 8x32bit integer gather for 16bit components
1610 // 256i - 0 1 2 3 4 5 6 7
1611 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1612 //
1613 }
1614
1615 // if we have at least one component to shuffle into place
1616 if (compMask)
1617 {
1618 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1619 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1620 Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1621 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1622
1623 // Shuffle gathered components into place in simdvertex struct
1624 Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
1625 Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
1626 }
1627 #else
1628 Value* vGatherResult[2];
1629 Value *vMask;
1630
1631 // if we have at least one component out of x or y to fetch
1632 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1633 // save mask as it is zero'd out after each gather
1634 vMask = vGatherMask;
1635
1636 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1637 // e.g. result of first 8x32bit integer gather for 16bit components
1638 // 256i - 0 1 2 3 4 5 6 7
1639 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1640 //
1641 }
1642
1643 // if we have at least one component out of z or w to fetch
1644 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1645 // offset base to the next components(zw) in the vertex to gather
1646 pStreamBase = GEP(pStreamBase, C((char)4));
1647 vMask = vGatherMask;
1648
1649 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1650 // e.g. result of second 8x32bit integer gather for 16bit components
1651 // 256i - 0 1 2 3 4 5 6 7
1652 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1653 //
1654 }
1655
1656 // if we have at least one component to shuffle into place
1657 if(compMask){
1658 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1659 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1660
1661 // Shuffle gathered components into place in simdvertex struct
1662 #if USE_SIMD16_SHADERS
1663 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1664 #else
1665 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1666 #endif
1667 }
1668 #endif
1669 }
1670 break;
1671 case 32:
1672 {
1673 // Gather components into place in the simdvertex struct
1674 for (uint32_t i = 0; i < 4; i++)
1675 {
1676 if (isComponentEnabled(compMask, i))
1677 {
1678 // if we need to gather the component
1679 if (compCtrl[i] == StoreSrc)
1680 {
1681 #if USE_SIMD16_GATHERS
1682 // save mask as it is zero'd out after each gather
1683 Value *vMask = vGatherMask;
1684 Value *vMask2 = vGatherMask2;
1685
1686 Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1687 Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2);
1688
1689 if (conversionType == CONVERT_USCALED)
1690 {
1691 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1692 pGather2 = UI_TO_FP(pGather2, mSimdFP32Ty);
1693 }
1694 else if (conversionType == CONVERT_SSCALED)
1695 {
1696 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1697 pGather2 = SI_TO_FP(pGather2, mSimdFP32Ty);
1698 }
1699 else if (conversionType == CONVERT_SFIXED)
1700 {
1701 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1702 pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1703 }
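// SFIXED is 16.16 fixed point, hence the 1/65536 scale back to float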
1704
1705 vVertexElements[currentVertexElement] = pGather;
1706 vVertexElements2[currentVertexElement] = pGather2;
1707 // e.g. result of a single 8x32bit integer gather for 32bit components
1708 // 256i - 0 1 2 3 4 5 6 7
1709 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1710
1711 currentVertexElement += 1;
1712 #else
1713 // save mask as it is zero'd out after each gather
1714 Value *vMask = vGatherMask;
1715
1716 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1717
1718 if (conversionType == CONVERT_USCALED)
1719 {
1720 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1721 }
1722 else if (conversionType == CONVERT_SSCALED)
1723 {
1724 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1725 }
1726 else if (conversionType == CONVERT_SFIXED)
1727 {
1728 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1729 }
1730
1731 vVertexElements[currentVertexElement++] = pGather;
1732 // e.g. result of a single 8x32bit integer gather for 32bit components
1733 // 256i - 0 1 2 3 4 5 6 7
1734 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1735 #endif
1736 }
1737 else
1738 {
1739 #if USE_SIMD16_SHADERS
1740 #if USE_SIMD16_GATHERS
1741 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1742 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1743
1744 currentVertexElement += 1;
1745 #else
1746 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1747 #endif
1748 #else
1749 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1750 #endif
1751 }
1752
1753 if (currentVertexElement > 3)
1754 {
1755 #if USE_SIMD16_GATHERS
1756 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1757 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1758
1759 outputElt += 1;
1760 #else
1761 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1762 #endif
1763
1764 // reset to the next vVertexElement to output
1765 currentVertexElement = 0;
1766 }
1767
1768 }
1769
1770 // offset base to the next component in the vertex to gather
1771 pStreamBase = GEP(pStreamBase, C((char)4));
1772 }
1773 }
1774 break;
1775 }
1776 }
1777 }
1778
1779 // if we have a partially filled vVertexElement struct, output it
1780 if (currentVertexElement > 0)
1781 {
1782 #if USE_SIMD16_GATHERS
1783 StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements);
1784 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2);
1785
1786 outputElt += 1;
1787 #else
1788 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1789 #endif
1790 }
1791 }
1792
1793 //////////////////////////////////////////////////////////////////////////
1794 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1795 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1796 /// support
1797 /// @param pIndices - pointer to 8 bit indices
1798 /// @param pLastIndex - pointer to last valid index
1799 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1800 {
1801 // can fit 4 8 bit integers per vWidth lane
1802 Value* vIndices = VUNDEF_I();
1803
1804 // store 0 index on stack to be used to conditionally load from if index address is OOB
1805 Value* pZeroIndex = ALLOCA(mInt8Ty);
1806 STORE(C((uint8_t)0), pZeroIndex);
1807
1808 // Load a SIMD of index pointers
1809 for(int64_t lane = 0; lane < mVWidth; lane++)
1810 {
1811 // Calculate the address of the requested index
1812 Value *pIndex = GEP(pIndices, C(lane));
1813
1814 // check if the address is less than the max index,
1815 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1816
1817 // if valid, load the index. if not, load 0 from the stack
1818 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1819 Value *index = LOAD(pValid, "valid index");
1820
1821 // zero extend index to 32 bits and insert into the correct simd lane
1822 index = Z_EXT(index, mInt32Ty);
1823 vIndices = VINSERT(vIndices, index, lane);
1824 }
1825 return vIndices;
1826 }
1827
1828 //////////////////////////////////////////////////////////////////////////
1829 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1830 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1831 /// support
1832 /// @param pIndices - pointer to 16 bit indices
1833 /// @param pLastIndex - pointer to last valid index
1834 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1835 {
1836 // can fit 2 16 bit integers per vWidth lane
1837 Value* vIndices = VUNDEF_I();
1838
1839 // store 0 index on stack to be used to conditionally load from if index address is OOB
1840 Value* pZeroIndex = ALLOCA(mInt16Ty);
1841 STORE(C((uint16_t)0), pZeroIndex);
1842
1843 // Load a SIMD of index pointers
1844 for(int64_t lane = 0; lane < mVWidth; lane++)
1845 {
1846 // Calculate the address of the requested index
1847 Value *pIndex = GEP(pIndices, C(lane));
1848
1849 // check if the address is less than the max index,
1850 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1851
1852 // if valid, load the index. if not, load 0 from the stack
1853 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1854 Value *index = LOAD(pValid, "valid index");
1855
1856 // zero extend index to 32 bits and insert into the correct simd lane
1857 index = Z_EXT(index, mInt32Ty);
1858 vIndices = VINSERT(vIndices, index, lane);
1859 }
1860 return vIndices;
1861 }
1862
1863 //////////////////////////////////////////////////////////////////////////
1864 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1865 /// @param pIndices - pointer to 32 bit indices
1866 /// @param pLastIndex - pointer to last valid index
1867 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1868 {
1869 DataLayout dL(JM()->mpCurrentModule);
1870 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1871 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1872 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1873
1874 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1875 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1876 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1877 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1878
1879 // create a vector of index counts from the base index ptr passed into the fetch
1880 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1881 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1882
1883 // compare index count to the max valid index
1884 // e.g. vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1885 // vIndexOffsets 0 1 2 3 4 5 6 7
1886 // ------------------------------
1887 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1888 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1889 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1890 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1891
1892 // VMASKLOAD takes an *i8 src pointer
1893 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1894
1895 // Load the indices; OOB loads 0
1896 return MASKLOADD(pIndices,vIndexMask);
1897 }
1898
1899 //////////////////////////////////////////////////////////////////////////
1900 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1901 /// denormalizes if needed, converts to F32 if needed, and positions in
1902 /// the proper SIMD rows to be output to the simdvertex structure
1903 /// @param args: (tuple of args, listed below)
1904 /// @param vGatherResult - 8 gathered 8bpc vertices
1905 /// @param pVtxOut - base pointer to output simdvertex struct
1906 /// @param extendType - sign extend or zero extend
1907 /// @param conversionType - conversion type (none, normalized, scaled, or fixed)
1908 /// @param currentVertexElement - reference to the current vVertexElement
1909 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1910 /// @param compMask - component packing mask
1911 /// @param compCtrl - component control val
1912 /// @param vVertexElements[4] - vertex components to output
1913 /// @param swizzle[4] - component swizzle location
1914 #if USE_SIMD16_SHADERS
1915 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
1916 #else
1917 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1918 #endif
1919 {
1920 // Unpack tuple args
1921 Value*& vGatherResult = std::get<0>(args);
1922 Value* pVtxOut = std::get<1>(args);
1923 const Instruction::CastOps extendType = std::get<2>(args);
1924 const ConversionType conversionType = std::get<3>(args);
1925 uint32_t &currentVertexElement = std::get<4>(args);
1926 uint32_t &outputElt = std::get<5>(args);
1927 const ComponentEnable compMask = std::get<6>(args);
1928 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1929 Value* (&vVertexElements)[4] = std::get<8>(args);
1930 const uint32_t (&swizzle)[4] = std::get<9>(args);
1931
1932 // cast types
1933 Type* vGatherTy = mSimdInt32Ty;
1934 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1935
1936 // have to do extra work for sign extending
1937 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1938 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1939 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1940
1941 // shuffle mask, including any swizzling
1942 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1943 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
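// pshufb operates per 128-bit lane; each lane holds 4 gathered dwords, so byte indices
// x, x+4, x+8, x+12 collect the x component of every dword (likewise for y/z/w),
// with the component swizzle folded into the mask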
1944 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1945 char(y), char(y+4), char(y+8), char(y+12),
1946 char(z), char(z+4), char(z+8), char(z+12),
1947 char(w), char(w+4), char(w+8), char(w+12),
1948 char(x), char(x+4), char(x+8), char(x+12),
1949 char(y), char(y+4), char(y+8), char(y+12),
1950 char(z), char(z+4), char(z+8), char(z+12),
1951 char(w), char(w+4), char(w+8), char(w+12)});
1952
1953 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1954 // after pshufb: group components together in each 128bit lane
1955 // 256i - 0 1 2 3 4 5 6 7
1956 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1957
1958 Value* vi128XY = nullptr;
1959 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1960 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1961 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1962 // 256i - 0 1 2 3 4 5 6 7
1963 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1964 }
1965
1966 // do the same for zw components
1967 Value* vi128ZW = nullptr;
1968 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1969 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1970 }
1971
1972 // init denormalize variables if needed
1973 Instruction::CastOps fpCast;
1974 Value* conversionFactor;
1975
1976 switch (conversionType)
1977 {
1978 case CONVERT_NORMALIZED:
1979 fpCast = Instruction::CastOps::SIToFP;
1980 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1981 break;
1982 case CONVERT_SSCALED:
1983 fpCast = Instruction::CastOps::SIToFP;
1984 conversionFactor = VIMMED1((float)(1.0));
1985 break;
1986 case CONVERT_USCALED:
1987 SWR_INVALID("Type should not be sign extended!");
1988 conversionFactor = nullptr;
1989 break;
1990 default:
1991 SWR_ASSERT(conversionType == CONVERT_NONE);
1992 conversionFactor = nullptr;
1993 break;
1994 }
1995
1996 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1997 for (uint32_t i = 0; i < 4; i++)
1998 {
1999 if (isComponentEnabled(compMask, i))
2000 {
2001 if (compCtrl[i] == ComponentControl::StoreSrc)
2002 {
2003 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2004 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2005 // if x or y, use vi128XY permute result, else use vi128ZW
2006 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2007
2008 // sign extend
2009 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
2010
2011 // denormalize if needed
2012 if (conversionType != CONVERT_NONE)
2013 {
2014 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2015 }
2016 currentVertexElement++;
2017 }
2018 else
2019 {
2020 #if USE_SIMD16_SHADERS
2021 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2022 #else
2023 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2024 #endif
2025 }
2026
2027 if (currentVertexElement > 3)
2028 {
2029 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2030 // reset to the next vVertexElement to output
2031 currentVertexElement = 0;
2032 }
2033 }
2034 }
2035 }
2036 // else zero extend
2037 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2038 {
2039 // init denormalize variables if needed
2040 Instruction::CastOps fpCast;
2041 Value* conversionFactor;
2042
2043 switch (conversionType)
2044 {
2045 case CONVERT_NORMALIZED:
2046 fpCast = Instruction::CastOps::UIToFP;
2047 conversionFactor = VIMMED1((float)(1.0 / 255.0));
2048 break;
2049 case CONVERT_USCALED:
2050 fpCast = Instruction::CastOps::UIToFP;
2051 conversionFactor = VIMMED1((float)(1.0));
2052 break;
2053 case CONVERT_SSCALED:
2054 SWR_INVALID("Type should not be zero extended!");
2055 conversionFactor = nullptr;
2056 break;
2057 default:
2058 SWR_ASSERT(conversionType == CONVERT_NONE);
2059 conversionFactor = nullptr;
2060 break;
2061 }
2062
2063 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
2064 for (uint32_t i = 0; i < 4; i++)
2065 {
2066 if (isComponentEnabled(compMask, i))
2067 {
2068 if (compCtrl[i] == ComponentControl::StoreSrc)
2069 {
2070 // pshufb masks for each component
2071 Value* vConstMask;
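// a -1 byte in a pshufb mask (high bit set) zeroes that destination byte, so each selected
// 8-bit component is zero extended to 32 bits for free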
2072 switch (swizzle[i])
2073 {
2074 case 0:
2075 // x shuffle mask
2076 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2077 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2078 break;
2079 case 1:
2080 // y shuffle mask
2081 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2082 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2083 break;
2084 case 2:
2085 // z shuffle mask
2086 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2087 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2088 break;
2089 case 3:
2090 // w shuffle mask
2091 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2092 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2093 break;
2094 default:
2095 vConstMask = nullptr;
2096 break;
2097 }
2098
2099 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
2100 // after pshufb for x channel
2101 // 256i - 0 1 2 3 4 5 6 7
2102 // x000 x000 x000 x000 x000 x000 x000 x000
2103
2104 // denormalize if needed
2105 if (conversionType != CONVERT_NONE)
2106 {
2107 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2108 }
2109 currentVertexElement++;
2110 }
2111 else
2112 {
2113 #if USE_SIMD16_SHADERS
2114 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2115 #else
2116 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2117 #endif
2118 }
2119
2120 if (currentVertexElement > 3)
2121 {
2122 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2123 // reset to the next vVertexElement to output
2124 currentVertexElement = 0;
2125 }
2126 }
2127 }
2128 }
2129 else
2130 {
2131 SWR_INVALID("Unsupported conversion type");
2132 }
2133 }
2134
2135 //////////////////////////////////////////////////////////////////////////
2136 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
2137 /// denormalizes if needed, converts to F32 if needed, and positions in
2138 /// the proper SIMD rows to be output to the simdvertex structure
2139 /// @param args: (tuple of args, listed below)
2140 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
2141 /// @param pVtxOut - base pointer to output simdvertex struct
2142 /// @param extendType - sign extend or zero extend
2143 /// @param conversionType - conversion type (none, normalized, scaled, or fixed)
2144 /// @param currentVertexElement - reference to the current vVertexElement
2145 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
2146 /// @param compMask - component packing mask
2147 /// @param compCtrl - component control val
2148 /// @param vVertexElements[4] - vertex components to output
2149 #if USE_SIMD16_SHADERS
2150 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
2151 #else
2152 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
2153 #endif
2154 {
2155 // Unpack tuple args
2156 Value* (&vGatherResult)[2] = std::get<0>(args);
2157 Value* pVtxOut = std::get<1>(args);
2158 const Instruction::CastOps extendType = std::get<2>(args);
2159 const ConversionType conversionType = std::get<3>(args);
2160 uint32_t &currentVertexElement = std::get<4>(args);
2161 uint32_t &outputElt = std::get<5>(args);
2162 const ComponentEnable compMask = std::get<6>(args);
2163 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2164 Value* (&vVertexElements)[4] = std::get<8>(args);
2165
2166 // cast types
2167 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2168 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2169
2170 // have to do extra work for sign extending
2171 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
2172 (extendType == Instruction::CastOps::FPExt))
2173 {
2174 // is this a packed half-precision (FP16) float? FPExt indicates a 16-bit float source
2175 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2176
2177 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2178 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2179
2180 // shuffle mask
2181 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2182 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
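// per 128-bit lane: bytes 0,1 / 4,5 / 8,9 / 12,13 pull the low word (x or z) of each gathered
// dword to the front, and bytes 2,3 / 6,7 / 10,11 / 14,15 pull the high word (y or w) behind them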
2183 Value* vi128XY = nullptr;
2184 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
2185 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
2186 // after pshufb: group components together in each 128bit lane
2187 // 256i - 0 1 2 3 4 5 6 7
2188 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2189
2190 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2191 // after PERMD: move and pack xy components into each 128bit lane
2192 // 256i - 0 1 2 3 4 5 6 7
2193 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2194 }
2195
2196 // do the same for zw components
2197 Value* vi128ZW = nullptr;
2198 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
2199 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
2200 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2201 }
2202
2203 // init denormalize variables if needed
2204 Instruction::CastOps IntToFpCast;
2205 Value* conversionFactor;
2206
2207 switch (conversionType)
2208 {
2209 case CONVERT_NORMALIZED:
2210 IntToFpCast = Instruction::CastOps::SIToFP;
2211 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2212 break;
2213 case CONVERT_SSCALED:
2214 IntToFpCast = Instruction::CastOps::SIToFP;
2215 conversionFactor = VIMMED1((float)(1.0));
2216 break;
2217 case CONVERT_USCALED:
2218 SWR_INVALID("Type should not be sign extended!");
2219 conversionFactor = nullptr;
2220 break;
2221 default:
2222 SWR_ASSERT(conversionType == CONVERT_NONE);
2223 conversionFactor = nullptr;
2224 break;
2225 }
2226
2227 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2228 for (uint32_t i = 0; i < 4; i++)
2229 {
2230 if (isComponentEnabled(compMask, i))
2231 {
2232 if (compCtrl[i] == ComponentControl::StoreSrc)
2233 {
2234 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2235 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2236 // if x or y, use vi128XY permute result, else use vi128ZW
2237 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2238
2239 if (bFP) {
2240 // extract 128 bit lanes and convert the packed half floats to single precision
2241 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2242 }
2243 else {
2244 // extract 128 bit lanes to sign extend each component
2245 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2246
2247 // denormalize if needed
2248 if (conversionType != CONVERT_NONE) {
2249 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2250 }
2251 }
2252 currentVertexElement++;
2253 }
2254 else
2255 {
2256 #if USE_SIMD16_SHADERS
2257 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2258 #else
2259 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2260 #endif
2261 }
2262
2263 if (currentVertexElement > 3)
2264 {
2265 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2266 // reset to the next vVertexElement to output
2267 currentVertexElement = 0;
2268 }
2269 }
2270 }
2271 }
2272 // else zero extend
2273 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2274 {
2275 // pshufb masks for each component
2276 Value* vConstMask[2];
2277 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
2278 // x/z shuffle mask
2279 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2280 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2281 }
2282
2283 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
2284 // y/w shuffle mask
2285 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2286 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
2287 }
2288
2289 // init denormalize variables if needed
2290 Instruction::CastOps fpCast;
2291 Value* conversionFactor;
2292
2293 switch (conversionType)
2294 {
2295 case CONVERT_NORMALIZED:
2296 fpCast = Instruction::CastOps::UIToFP;
2297 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2298 break;
2299 case CONVERT_USCALED:
2300 fpCast = Instruction::CastOps::UIToFP;
2301 conversionFactor = VIMMED1((float)(1.0f));
2302 break;
2303 case CONVERT_SSCALED:
2304 SWR_INVALID("Type should not be zero extended!");
2305 conversionFactor = nullptr;
2306 break;
2307 default:
2308 SWR_ASSERT(conversionType == CONVERT_NONE);
2309 conversionFactor = nullptr;
2310 break;
2311 }
2312
2313 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2314 for (uint32_t i = 0; i < 4; i++)
2315 {
2316 if (isComponentEnabled(compMask, i))
2317 {
2318 if (compCtrl[i] == ComponentControl::StoreSrc)
2319 {
2320 // select correct constMask for x/z or y/w pshufb
2321 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2322 // if x or y, use vi128XY permute result, else use vi128ZW
2323 uint32_t selectedGather = (i < 2) ? 0 : 1;
2324
2325 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2326 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2327 // 256i - 0 1 2 3 4 5 6 7
2328 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2329
2330 // denormalize if needed
2331 if (conversionType != CONVERT_NONE)
2332 {
2333 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2334 }
2335 currentVertexElement++;
2336 }
2337 else
2338 {
2339 #if USE_SIMD16_SHADERS
2340 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2341 #else
2342 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2343 #endif
2344 }
2345
2346 if (currentVertexElement > 3)
2347 {
2348 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2349 // reset to the next vVertexElement to output
2350 currentVertexElement = 0;
2351 }
2352 }
2353 }
2354 }
2355 else
2356 {
2357 SWR_INVALID("Unsupported conversion type");
2358 }
2359 }
2360
2361 //////////////////////////////////////////////////////////////////////////
2362 /// @brief Output a simdvertex worth of elements to the current outputElt
2363 /// @param pVtxOut - base address of VIN output struct
2364 /// @param outputElt - simdvertex offset in VIN to write to
2365 /// @param numEltsToStore - number of simdvertex rows to write out
2366 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2367 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2368 {
2369 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2370
2371 for(uint32_t c = 0; c < numEltsToStore; ++c)
2372 {
2373 // STORE expects FP32 x vWidth type, just bitcast if needed
2374 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2375 {
2376 #if FETCH_DUMP_VERTEX
2377 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
2378 #endif
2379 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2380 }
2381 #if FETCH_DUMP_VERTEX
2382 else
2383 {
2384 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
2385 }
2386 #endif
2387 // outputElt * 4 = offsetting by the size of a simdvertex
2388 // + c offsets to a 32bit x vWidth row within the current vertex
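// With SIMD16 shaders a simdvertex row is twice as wide, so each simd8 half lands at a doubled
// stride (outputElt * 8 + c * 2); the second simd8 half of the vertex is written through an
// offset base pointer (e.g. the GEP(pVtxOut, C(1)) calls used by the SIMD16 gather path)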
2389 #if USE_SIMD16_SHADERS
2390 Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
2391 #else
2392 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2393 #endif
2394 STORE(vVertexElements[c], dest);
2395 }
2396 }
2397
2398 #if USE_SIMD16_BUILDER
2399 void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2400 {
2401 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2402
2403 for (uint32_t c = 0; c < numEltsToStore; ++c)
2404 {
2405 // STORE expects FP32 x vWidth type, just bitcast if needed
2406 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2407 {
2408 #if FETCH_DUMP_VERTEX
2409 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
2410 #endif
2411 vVertexElements[c] = BITCAST(vVertexElements[c], mSimd2FP32Ty);
2412 }
2413 #if FETCH_DUMP_VERTEX
2414 else
2415 {
2416 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
2417 }
2418 #endif
2419 // outputElt * 4 = offsetting by the size of a simdvertex
2420 // + c offsets to a 32bit x vWidth row within the current vertex
2421 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2422 STORE(vVertexElements[c], dest);
2423 }
2424 }
2425
2426 #endif
2427 //////////////////////////////////////////////////////////////////////////
2428 /// @brief Generates a constant vector of values based on the
2429 /// ComponentControl value
2430 /// @param ctrl - ComponentControl value
2431 #if USE_SIMD16_SHADERS
2432 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
2433 #else
2434 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
2435 #endif
2436 {
2437 switch(ctrl)
2438 {
2439 case NoStore: return VUNDEF_I();
2440 case Store0: return VIMMED1(0);
2441 case Store1Fp: return VIMMED1(1.0f);
2442 case Store1Int: return VIMMED1(1);
2443 case StoreVertexId:
2444 {
2445 #if USE_SIMD16_SHADERS
2446 Value* pId;
2447 if (useVertexID2)
2448 {
2449 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
2450 }
2451 else
2452 {
2453 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2454 }
2455 #else
2456 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2457 #endif
2458 return VBROADCAST(pId);
2459 }
2460 case StoreInstanceId:
2461 {
2462 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
2463 return VBROADCAST(pId);
2464 }
2465 case StoreSrc:
2466 default: SWR_INVALID("Invalid component control"); return VUNDEF_I();
2467 }
2468 }
2469
2470 #if USE_SIMD16_BUILDER
2471 Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl)
2472 {
2473 switch (ctrl)
2474 {
2475 case NoStore: return VUNDEF2_I();
2476 case Store0: return VIMMED2_1(0);
2477 case Store1Fp: return VIMMED2_1(1.0f);
2478 case Store1Int: return VIMMED2_1(1);
2479 case StoreVertexId:
2480 {
2481 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimd2FP32Ty);
2482 return VBROADCAST2(pId);
2483 }
2484 case StoreInstanceId:
2485 {
2486 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
2487 return VBROADCAST2(pId);
2488 }
2489 case StoreSrc:
2490 default: SWR_INVALID("Invalid component control"); return VUNDEF2_I();
2491 }
2492 }
2493
2494 #endif
2495 //////////////////////////////////////////////////////////////////////////
2496 /// @brief Returns the enable mask for the specified component.
2497 /// @param enableMask - enable bits
2498 /// @param component - component to check if enabled.
2499 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2500 {
2501 switch (component)
2502 {
2503 // X
2504 case 0: return (enableMask & ComponentEnable::X);
2505 // Y
2506 case 1: return (enableMask & ComponentEnable::Y);
2507 // Z
2508 case 2: return (enableMask & ComponentEnable::Z);
2509 // W
2510 case 3: return (enableMask & ComponentEnable::W);
2511
2512 default: return false;
2513 }
2514 }
2515
2516
2517 //////////////////////////////////////////////////////////////////////////
2518 /// @brief JITs from fetch shader IR
2519 /// @param hJitMgr - JitManager handle
2520 /// @param func - LLVM function IR
2521 /// @return PFN_FETCH_FUNC - pointer to fetch code
2522 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2523 {
2524 const llvm::Function* func = (const llvm::Function*)hFunc;
2525 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2526 PFN_FETCH_FUNC pfnFetch;
2527
2528 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2529 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
2530 pJitMgr->mIsModuleFinalized = true;
2531
2532 #if defined(KNOB_SWRC_TRACING)
2533 char fName[1024];
2534 const char *funcName = func->getName().data();
2535 sprintf(fName, "%s.bin", funcName);
2536 FILE *fd = fopen(fName, "wb");
2537 fwrite((void *)pfnFetch, 1, 2048, fd);
2538 fclose(fd);
2539 #endif
2540
2541 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2542
2543 return pfnFetch;
2544 }
2545
2546 //////////////////////////////////////////////////////////////////////////
2547 /// @brief JIT compiles fetch shader
2548 /// @param hJitMgr - JitManager handle
2549 /// @param state - fetch state to build function from
2550 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2551 {
2552 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2553
2554 pJitMgr->SetupNewModule();
2555
2556 FetchJit theJit(pJitMgr);
2557 HANDLE hFunc = theJit.Create(state);
2558
2559 return JitFetchFunc(hJitMgr, hFunc);
2560 }