dcfe8970f5c17068a4bfcb473876f8c7e5dca673
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / fetch_jit.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "jit_api.h"
32 #include "fetch_jit.h"
33 #include "gen_state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public Builder
56 {
57 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
58
59 Function* Create(const FETCH_COMPILE_STATE& fetchState);
60 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
61 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
62 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
63
64 // package up Shuffle*bpcGatherd args into a tuple for convenience
65 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
66 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
67 const uint32_t(&)[4]> Shuffle8bpcArgs;
68 #if USE_SIMD16_SHADERS
69 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
70 #else
71 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
72 #endif
73
74 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
75 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
76 #if USE_SIMD16_SHADERS
77 void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
78 #else
79 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
80 #endif
81
82 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
83
84 #if USE_SIMD16_SHADERS
85 Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
86 #else
87 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
88 #endif
89
90 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
91 #if USE_SIMD16_SHADERS
92 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
93 #else
94 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
95 #endif
96
97 bool IsOddFormat(SWR_FORMAT format);
98 bool IsUniformFormat(SWR_FORMAT format);
99 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
100 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
101 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
102
103 Value* mpFetchInfo;
104 };
105
106 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
107 {
108 std::stringstream fnName("FetchShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
109 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
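// the generated function is named FetchShader_<CRC of the compile state>, so each distinct fetch state gets a distinctly named shader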
110
111 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
112 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
113
114 fetch->getParent()->setModuleIdentifier(fetch->getName());
115
116 IRB()->SetInsertPoint(entry);
117
118 auto argitr = fetch->arg_begin();
119
120 // Fetch shader arguments
121 mpFetchInfo = &*argitr; ++argitr;
122 mpFetchInfo->setName("fetchInfo");
123 Value* pVtxOut = &*argitr;
124 pVtxOut->setName("vtxOutput");
125 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
126 // index 0 (just the pointer to the simdvertex structure)
127 // index 1 (which element of the simdvertex structure to offset to, in this case 0)
128 // so the indices being i32's doesn't matter
129 // TODO: generate this GEP with a VECTOR structure type so this makes sense
130 std::vector<Value*> vtxInputIndices(2, C(0));
131 // GEP
132 pVtxOut = GEP(pVtxOut, C(0));
133 #if USE_SIMD16_SHADERS
134 #if 0
135 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth * 2), 0));
136 #else
137 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
138 #endif
139 #else
140 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
141 #endif
142
143 // SWR_FETCH_CONTEXT::pStreams
144 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
145 streams->setName("pStreams");
146
147 // SWR_FETCH_CONTEXT::pIndices
148 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
149 indices->setName("pIndices");
150
151 // SWR_FETCH_CONTEXT::pLastIndex
152 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
153 pLastIndex->setName("pLastIndex");
154
155
156 Value* vIndices;
157 #if USE_SIMD16_SHADERS
158 Value* indices2;
159 Value* vIndices2;
160 #endif
161 switch(fetchState.indexType)
162 {
163 case R8_UINT:
164 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
165 #if USE_SIMD16_SHADERS
166 indices2 = GEP(indices, C(8));
167 #endif
168 if(fetchState.bDisableIndexOOBCheck)
169 {
170 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
171 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
172 #if USE_SIMD16_SHADERS
173 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
174 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
175 #endif
176 }
177 else
178 {
179 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
180 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
181 #if USE_SIMD16_SHADERS
182 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
183 vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
184 #endif
185 }
186 break;
187 case R16_UINT:
188 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
189 #if USE_SIMD16_SHADERS
190 indices2 = GEP(indices, C(8));
191 #endif
192 if(fetchState.bDisableIndexOOBCheck)
193 {
194 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
195 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
196 #if USE_SIMD16_SHADERS
197 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
198 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
199 #endif
200 }
201 else
202 {
203 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
204 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
205 #if USE_SIMD16_SHADERS
206 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
207 vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
208 #endif
209 }
210 break;
211 case R32_UINT:
212 #if USE_SIMD16_SHADERS
213 indices2 = GEP(indices, C(8));
214 #endif
215 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
216 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
217 #if USE_SIMD16_SHADERS
218 (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
219 : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
220 #endif
221 break; // incoming type is already 32bit int
222 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
223 }
224
225 Value* vVertexId = vIndices;
226 #if USE_SIMD16_SHADERS
227 Value* vVertexId2 = vIndices2;
228 #endif
229 if (fetchState.bVertexIDOffsetEnable)
230 {
231 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
232 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
233 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
234 vVertexId = ADD(vIndices, vBaseVertex);
235 vVertexId = ADD(vVertexId, vStartVertex);
236 #if USE_SIMD16_SHADERS
237 vVertexId2 = ADD(vIndices2, vBaseVertex);
238 vVertexId2 = ADD(vVertexId2, vStartVertex);
239 #endif
240 }
241
242 // store out vertex IDs
243 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
244 #if USE_SIMD16_SHADERS
245 STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
246 #endif
247
248 // store out cut mask if enabled
249 if (fetchState.bEnableCutIndex)
250 {
251 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
252 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
253 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
254 #if USE_SIMD16_SHADERS
255 Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
256 STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
257 #endif
258 }
259
260 // Fetch attributes from memory and output to a simdvertex struct
261 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
262 #if USE_SIMD16_SHADERS
263 if (fetchState.bDisableVGATHER)
264 {
265 JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
266 JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
267 }
268 else
269 {
270 JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
271 JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
272 }
273 #else
274 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
275 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
276 #endif
277
278 RET_VOID();
279
280 JitManager::DumpToFile(fetch, "src");
281
282 #if defined(_DEBUG)
283 verifyFunction(*fetch);
284 #endif
285
286 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
287
288 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
289 setupPasses.add(createBreakCriticalEdgesPass());
290 setupPasses.add(createCFGSimplificationPass());
291 setupPasses.add(createEarlyCSEPass());
292 setupPasses.add(createPromoteMemoryToRegisterPass());
293
294 setupPasses.run(*fetch);
295
296 JitManager::DumpToFile(fetch, "se");
297
298 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
299
300 ///@todo Haven't touched these either. Need to remove some of these and add others.
301 optPasses.add(createCFGSimplificationPass());
302 optPasses.add(createEarlyCSEPass());
303 optPasses.add(createInstructionCombiningPass());
304 optPasses.add(createInstructionSimplifierPass());
305 optPasses.add(createConstantPropagationPass());
306 optPasses.add(createSCCPPass());
307 optPasses.add(createAggressiveDCEPass());
308
309 optPasses.run(*fetch);
310 optPasses.run(*fetch);
311
312 JitManager::DumpToFile(fetch, "opt");
313
314 return fetch;
315 }
316
317 //////////////////////////////////////////////////////////////////////////
318 /// @brief Loads attributes from memory using LOADs, shuffling the
319 /// components into SOA form.
320 /// *Note* currently does not support component control,
321 /// component packing, instancing
322 /// @param fetchState - info about attributes to be fetched from memory
323 /// @param streams - value pointer to the current vertex stream
324 /// @param vIndices - vector value of indices to load
325 /// @param pVtxOut - value pointer to output simdvertex struct
326 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
327 {
328 // Zack shuffles; a variant of the Charleston.
329
330 std::vector<Value*> vectors(16);
331 std::vector<Constant*> pMask(mVWidth);
332 for(uint32_t i = 0; i < mVWidth; ++i)
333 {
334 pMask[i] = (C(i < 4 ? i : 4));
335 }
336 Constant* promoteMask = ConstantVector::get(pMask);
337 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
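// promoteMask widens each 4-wide attribute load to full SIMD width: lanes 0-3 select the loaded components,
// the remaining lanes pull an undef value from uwvec and are ignored by the transpose below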
338
339 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
340 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
341 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
342 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
343 curInstance->setName("curInstance");
344
345 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
346 {
347 Value* elements[4] = {0};
348 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
349 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
350 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
351 uint32_t numComponents = info.numComps;
352 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
353
354 // load path doesn't support component packing
355 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
356
357 vectors.clear();
358
359 Value *vCurIndices;
360 Value *startOffset;
361 if(ied.InstanceEnable)
362 {
363 Value* stepRate = C(ied.InstanceDataStepRate);
364
365 // prevent a div by 0 for 0 step rate
366 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
367 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
368
369 // calc the current offset into instanced data buffer
370 Value* calcInstance = UDIV(curInstance, stepRate);
371
372 // if step rate is 0, every instance gets instance 0
373 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
374
375 vCurIndices = VBROADCAST(calcInstance);
376
377 startOffset = startInstance;
378 }
379 else
380 {
381 // offset indices by baseVertex
382 vCurIndices = ADD(vIndices, vBaseVertex);
383
384 startOffset = startVertex;
385 }
386
387 // load SWR_VERTEX_BUFFER_STATE::pData
388 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
389
390 // load SWR_VERTEX_BUFFER_STATE::pitch
391 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
392 stride = Z_EXT(stride, mInt64Ty);
393
394 // load SWR_VERTEX_BUFFER_STATE::size
395 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
396 size = Z_EXT(size, mInt64Ty);
397
398 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
399
400 Value *minVertex = NULL;
401 Value *minVertexOffset = NULL;
402 if (fetchState.bPartialVertexBuffer) {
403 // fetch min index for low bounds checking
404 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
405 minVertex = LOAD(minVertex);
406 if (!fetchState.bDisableIndexOOBCheck) {
407 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
408 }
409 }
410
411 // Load from the stream.
412 for(uint32_t lane = 0; lane < mVWidth; ++lane)
413 {
414 // Get index
415 Value* index = VEXTRACT(vCurIndices, C(lane));
416
417 if (fetchState.bPartialVertexBuffer) {
418 // clamp below minvertex
419 Value *isBelowMin = ICMP_SLT(index, minVertex);
420 index = SELECT(isBelowMin, minVertex, index);
421 }
422
423 index = Z_EXT(index, mInt64Ty);
424
425 Value* offset = MUL(index, stride);
426 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
427 offset = ADD(offset, startVertexOffset);
428
429 if (!fetchState.bDisableIndexOOBCheck) {
430 // check for out-of-bounds access, including partial OOB, and replace bad offsets with minVertex
431 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
432 Value *oob = ICMP_ULE(endOffset, size);
433 if (fetchState.bPartialVertexBuffer) {
434 offset = SELECT(oob, offset, minVertexOffset);
435 } else {
436 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
437 }
438 }
439
440 Value* pointer = GEP(stream, offset);
441 // We use a full-lane, but don't actually care.
442 Value* vptr = 0;
443
444 // get a pointer to a 4 component attrib in default address space
445 switch(bpc)
446 {
447 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
448 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
449 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
450 default: SWR_INVALID("Unsupported underlying bpp!");
451 }
452
453 // load 4 components of attribute
454 Value* vec = ALIGNED_LOAD(vptr, 1, false);
455
456 // Convert To FP32 internally
457 switch(info.type[0])
458 {
459 case SWR_TYPE_UNORM:
460 switch(bpc)
461 {
462 case 8:
463 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
464 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
465 break;
466 case 16:
467 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
468 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
469 break;
470 default:
471 SWR_INVALID("Unsupported underlying type!");
472 break;
473 }
474 break;
475 case SWR_TYPE_SNORM:
476 switch(bpc)
477 {
478 case 8:
479 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
480 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
481 break;
482 case 16:
483 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
484 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
485 break;
486 default:
487 SWR_INVALID("Unsupported underlying type!");
488 break;
489 }
490 break;
491 case SWR_TYPE_UINT:
492 // Zero extend UINT types to 32 bits.
493 switch(bpc)
494 {
495 case 8:
496 case 16:
497 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
498 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
499 break;
500 case 32:
501 break; // Pass through unchanged.
502 default:
503 SWR_INVALID("Unsupported underlying type!");
504 break;
505 }
506 break;
507 case SWR_TYPE_SINT:
508 // Sign extend SINT types.
509 switch(bpc)
510 {
511 case 8:
512 case 16:
513 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
514 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
515 break;
516 case 32:
517 break; // Pass through unchanged.
518 default:
519 SWR_INVALID("Unsupported underlying type!");
520 break;
521 }
522 break;
523 case SWR_TYPE_FLOAT:
524 switch(bpc)
525 {
526 case 32:
527 break; // Pass through unchanged.
528 default:
529 SWR_INVALID("Unsupported underlying type!");
530 }
531 break;
532 case SWR_TYPE_USCALED:
533 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
534 break;
535 case SWR_TYPE_SSCALED:
536 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
537 break;
538 case SWR_TYPE_SFIXED:
539 vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
540 break;
541 case SWR_TYPE_UNKNOWN:
542 case SWR_TYPE_UNUSED:
543 SWR_INVALID("Unsupported type %d!", info.type[0]);
544 }
545
546 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
547 // uwvec: 4 x F32, undef value
548 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
549 vectors.push_back(wvec);
550 }
551
552 std::vector<Constant*> v01Mask(mVWidth);
553 std::vector<Constant*> v23Mask(mVWidth);
554 std::vector<Constant*> v02Mask(mVWidth);
555 std::vector<Constant*> v13Mask(mVWidth);
556
557 // Concatenate the vectors together.
558 elements[0] = VUNDEF_F();
559 elements[1] = VUNDEF_F();
560 elements[2] = VUNDEF_F();
561 elements[3] = VUNDEF_F();
562 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
563 {
564 v01Mask[4 * b + 0] = C(0 + 4 * b);
565 v01Mask[4 * b + 1] = C(1 + 4 * b);
566 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
567 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
568
569 v23Mask[4 * b + 0] = C(2 + 4 * b);
570 v23Mask[4 * b + 1] = C(3 + 4 * b);
571 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
572 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
573
574 v02Mask[4 * b + 0] = C(0 + 4 * b);
575 v02Mask[4 * b + 1] = C(2 + 4 * b);
576 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
577 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
578
579 v13Mask[4 * b + 0] = C(1 + 4 * b);
580 v13Mask[4 * b + 1] = C(3 + 4 * b);
581 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
582 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
583
584 std::vector<Constant*> iMask(mVWidth);
585 for(uint32_t i = 0; i < mVWidth; ++i)
586 {
587 if(((4 * b) <= i) && (i < (4 * (b + 1))))
588 {
589 iMask[i] = C(i % 4 + mVWidth);
590 }
591 else
592 {
593 iMask[i] = C(i);
594 }
595 }
596 Constant* insertMask = ConstantVector::get(iMask);
597 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
598 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
599 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
600 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
601 }
602
603 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
604 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
605 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
606 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
607 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
608 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
609 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
610 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
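// at this point elements[0..3] hold the x, y, z and w components respectively for all mVWidth vertices (AOS to SOA transpose complete)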
611
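// note: intentional case fall-through below; components not supplied by the format are filled with the defaults (0, 0, 0, 1)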
612 switch(numComponents + 1)
613 {
614 case 1: elements[0] = VIMMED1(0.0f);
615 case 2: elements[1] = VIMMED1(0.0f);
616 case 3: elements[2] = VIMMED1(0.0f);
617 case 4: elements[3] = VIMMED1(1.0f);
618 }
619
620 for(uint32_t c = 0; c < 4; ++c)
621 {
622 #if USE_SIMD16_SHADERS
623 Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
624 #else
625 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
626 #endif
627 STORE(elements[c], dest);
628 }
629 }
630 }
631
632 // returns true for odd formats that require special gather handling
633 bool FetchJit::IsOddFormat(SWR_FORMAT format)
634 {
635 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
636 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
637 {
638 return true;
639 }
640 return false;
641 }
642
643 // format is uniform if all components are the same size and type
644 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
645 {
646 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
647 uint32_t bpc0 = info.bpc[0];
648 uint32_t type0 = info.type[0];
649
650 for (uint32_t c = 1; c < info.numComps; ++c)
651 {
652 if (bpc0 != info.bpc[c] || type0 != info.type[c])
653 {
654 return false;
655 }
656 }
657 return true;
658 }
659
660 // unpacks components based on format
661 // foreach component in the pixel
662 // mask off everything but this component
663 // shift component to LSB
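// e.g. (illustration only) for a packed 10-10-10-2 format, component 0 comes from bits 0..9, component 1 from bits 10..19,
// component 2 from bits 20..29 and component 3 from bits 30..31; each is routed into result[] through the format's swizzle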
664 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
665 {
666 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
667
668 uint32_t bitOffset = 0;
669 for (uint32_t c = 0; c < info.numComps; ++c)
670 {
671 uint32_t swizzledIndex = info.swizzle[c];
672 uint32_t compBits = info.bpc[c];
673 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
674 Value* comp = AND(vInput, bitmask);
675 comp = LSHR(comp, bitOffset);
676
677 result[swizzledIndex] = comp;
678 bitOffset += compBits;
679 }
680 }
681
682 // gather for odd component size formats
683 // gather SIMD full pixels per lane then shift/mask to move each component into its
684 // own vector
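// lanes rejected by the gather mask keep the zero gather source, and components not present in the format keep the format's default values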
685 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
686 {
687 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
688
689 // only works if pixel size is <= 32bits
690 SWR_ASSERT(info.bpp <= 32);
691
692 Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask, C((char)1));
693
694 for (uint32_t comp = 0; comp < 4; ++comp)
695 {
696 pResult[comp] = VIMMED1((int)info.defaults[comp]);
697 }
698
699 UnpackComponents(format, pGather, pResult);
700
701 // cast to fp32
702 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
703 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
704 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
705 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
706 }
707
708 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
709 {
710 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
711
712 for (uint32_t c = 0; c < info.numComps; ++c)
713 {
714 uint32_t compIndex = info.swizzle[c];
715
716 // skip any conversion on UNUSED components
717 if (info.type[c] == SWR_TYPE_UNUSED)
718 {
719 continue;
720 }
721
722 if (info.isNormalized[c])
723 {
724 if (info.type[c] == SWR_TYPE_SNORM)
725 {
726 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
727
728 /// result = c * (1.0f / (2^(n-1) - 1))
729 uint32_t n = info.bpc[c];
730 uint32_t pow2 = 1 << (n - 1);
731 float scale = 1.0f / (float)(pow2 - 1);
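// e.g. for an 8-bit SNORM component, scale = 1.0f / 127.0f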
732 Value *vScale = VIMMED1(scale);
733 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
734 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
735 texels[compIndex] = FMUL(texels[compIndex], vScale);
736 }
737 else
738 {
739 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
740
741 /// result = c * (1.0f / (2^n - 1))
742 uint32_t n = info.bpc[c];
743 uint32_t pow2 = 1 << n;
744 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
745 if (n == 24)
746 {
747 float scale = (float)(pow2 - 1);
748 Value* vScale = VIMMED1(scale);
749 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
750 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
751 texels[compIndex] = FDIV(texels[compIndex], vScale);
752 }
753 else
754 {
755 float scale = 1.0f / (float)(pow2 - 1);
756 Value *vScale = VIMMED1(scale);
757 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
758 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
759 texels[compIndex] = FMUL(texels[compIndex], vScale);
760 }
761 }
762 continue;
763 }
764 }
765 }
766
767 //////////////////////////////////////////////////////////////////////////
768 /// @brief Loads attributes from memory using AVX2 GATHER(s)
769 /// @param fetchState - info about attributes to be fetched from memory
770 /// @param streams - value pointer to the current vertex stream
771 /// @param vIndices - vector value of indices to gather
772 /// @param pVtxOut - value pointer to output simdvertex struct
773 #if USE_SIMD16_SHADERS
774 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
775 Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
776 #else
777 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
778 Value* streams, Value* vIndices, Value* pVtxOut)
779 #endif
780 {
781 uint32_t currentVertexElement = 0;
782 uint32_t outputElt = 0;
783 Value* vVertexElements[4];
784
785 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
786 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
787 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
788 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
789 curInstance->setName("curInstance");
790
791 for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
792 {
793 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
794
795 // skip element if all components are disabled
796 if (ied.ComponentPacking == ComponentEnable::NONE)
797 {
798 continue;
799 }
800
801 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
802 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
803 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
804
805 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
806
807 // VGATHER* takes an *i8 src pointer
808 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
809
810 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
811 Value *vStride = VBROADCAST(stride);
812
813 // max vertex index that is fully in bounds
814 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
815 maxVertex = LOAD(maxVertex);
816
817 Value *minVertex = NULL;
818 if (fetchState.bPartialVertexBuffer) {
819 // min vertex index for low bounds OOB checking
820 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
821 minVertex = LOAD(minVertex);
822 }
823
824 Value *vCurIndices;
825 Value *startOffset;
826 if(ied.InstanceEnable)
827 {
828 Value* stepRate = C(ied.InstanceDataStepRate);
829
830 // prevent a div by 0 for 0 step rate
831 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
832 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
833
834 // calc the current offset into instanced data buffer
835 Value* calcInstance = UDIV(curInstance, stepRate);
836
837 // if step rate is 0, every instance gets instance 0
838 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
839
840 vCurIndices = VBROADCAST(calcInstance);
841
842 startOffset = startInstance;
843 }
844 else
845 {
846 // offset indices by baseVertex
847 vCurIndices = ADD(vIndices, vBaseVertex);
848
849 startOffset = startVertex;
850 }
851
852 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
853 // do 64bit address offset calculations.
854
855 // calculate byte offset to the start of the VB
856 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
857 pStreamBase = GEP(pStreamBase, baseOffset);
858
859 // if we have a start offset, subtract from max vertex. Used for OOB check
860 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
861 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
862 // if we have a negative value, we're already OOB. clamp at 0.
863 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
864
865 if (fetchState.bPartialVertexBuffer) {
866 // similarly for min vertex
867 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
868 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
869 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
870 }
871
872 // Load the in bounds size of a partially valid vertex
873 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
874 partialInboundsSize = LOAD(partialInboundsSize);
875 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
876 Value* vBpp = VBROADCAST(C(info.Bpp));
877 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
878
879 // is the element <= the partially valid size
880 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
881
882 // override cur indices with 0 if pitch is 0
883 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
884 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
885
886 // are vertices partially OOB?
887 Value* vMaxVertex = VBROADCAST(maxVertex);
888 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
889
890 // are vertices fully in bounds?
891 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
892
893 Value *vGatherMask;
894 if (fetchState.bPartialVertexBuffer) {
895 // are vertices below minVertex limit?
896 Value *vMinVertex = VBROADCAST(minVertex);
897 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
898
899 // only fetch lanes that pass both tests
900 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
901 } else {
902 vGatherMask = vMaxGatherMask;
903 }
904
905 // blend in any partially OOB indices that have valid elements
906 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
907 Value* pMask = vGatherMask;
908 vGatherMask = VMASK(vGatherMask);
909
910 // calculate the actual offsets into the VB
911 Value* vOffsets = MUL(vCurIndices, vStride);
912 vOffsets = ADD(vOffsets, vAlignmentOffsets);
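// vOffsets now holds each lane's byte offset from pStreamBase for this element; lanes masked off above
// receive the gather source value instead of a memory load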
913
914 // Packing and component control
915 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
916 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
917 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
918
919 // Special gather/conversion for formats without equal component sizes
920 if (IsOddFormat((SWR_FORMAT)ied.Format))
921 {
922 Value* pResults[4];
923 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
924 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
925
926 for (uint32_t c = 0; c < 4; ++c)
927 {
928 if (isComponentEnabled(compMask, c))
929 {
930 vVertexElements[currentVertexElement++] = pResults[c];
931 if (currentVertexElement > 3)
932 {
933 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
934 // reset to the next vVertexElement to output
935 currentVertexElement = 0;
936 }
937 }
938 }
939 }
940 else if(info.type[0] == SWR_TYPE_FLOAT)
941 {
942 ///@todo: support 64 bit vb accesses
943 Value* gatherSrc = VIMMED1(0.0f);
944
945 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
946 "Unsupported format for standard gather fetch.");
947
948 // Gather components from memory to store in a simdvertex structure
949 switch(bpc)
950 {
951 case 16:
952 {
953 Value* vGatherResult[2];
954 Value *vMask;
955
956 // if we have at least one component out of x or y to fetch
957 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
958 // save mask as it is zero'd out after each gather
959 vMask = vGatherMask;
960
961 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
962 // e.g. result of first 8x32bit integer gather for 16bit components
963 // 256i - 0 1 2 3 4 5 6 7
964 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
965 //
966 }
967
968 // if we have at least one component out of z or w to fetch
969 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
970 // offset base to the next components(zw) in the vertex to gather
971 pStreamBase = GEP(pStreamBase, C((char)4));
972 vMask = vGatherMask;
973
974 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
975 // e.g. result of second 8x32bit integer gather for 16bit components
976 // 256i - 0 1 2 3 4 5 6 7
977 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
978 //
979 }
980
981 // if we have at least one component to shuffle into place
982 if(compMask){
983 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
984 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
985
986 // Shuffle gathered components into place in simdvertex struct
987 #if USE_SIMD16_SHADERS
988 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
989 #else
990 Shuffle16bpcGather(args); // outputs to vVertexElements ref
991 #endif
992 }
993 }
994 break;
995 case 32:
996 {
997 for (uint32_t i = 0; i < 4; i++)
998 {
999 if (isComponentEnabled(compMask, i))
1000 {
1001 // if we need to gather the component
1002 if (compCtrl[i] == StoreSrc)
1003 {
1004 // save mask as it is zero'd out after each gather
1005 Value *vMask = vGatherMask;
1006
1007 // Gather a SIMD of vertices
1008 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1009 }
1010 else
1011 {
1012 #if USE_SIMD16_SHADERS
1013 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1014 #else
1015 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1016 #endif
1017 }
1018
1019 if (currentVertexElement > 3)
1020 {
1021 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1022 // reset to the next vVertexElement to output
1023 currentVertexElement = 0;
1024 }
1025
1026 }
1027
1028 // offset base to the next component in the vertex to gather
1029 pStreamBase = GEP(pStreamBase, C((char)4));
1030 }
1031 }
1032 break;
1033 case 64:
1034 {
1035 for (uint32_t i = 0; i < 4; i++)
1036 {
1037 if (isComponentEnabled(compMask, i))
1038 {
1039 // if we need to gather the component
1040 if (compCtrl[i] == StoreSrc)
1041 {
1042 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
1043 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
1044 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
1045 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
1046 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
1047 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
1048
1049 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1050 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1051
1052 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1053
1054 Value* pGatherLo = GATHERPD(vZeroDouble,
1055 pStreamBase, vOffsetsLo, vMaskLo, C((char)1));
1056 Value* pGatherHi = GATHERPD(vZeroDouble,
1057 pStreamBase, vOffsetsHi, vMaskHi, C((char)1));
1058
1059 pGatherLo = VCVTPD2PS(pGatherLo);
1060 pGatherHi = VCVTPD2PS(pGatherHi);
1061
1062 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
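// the two 4-wide double gathers are converted to packed single precision and concatenated into a full-width float result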
1063
1064 vVertexElements[currentVertexElement++] = pGather;
1065 }
1066 else
1067 {
1068 #if USE_SIMD16_SHADERS
1069 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1070 #else
1071 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1072 #endif
1073 }
1074
1075 if (currentVertexElement > 3)
1076 {
1077 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1078 // reset to the next vVertexElement to output
1079 currentVertexElement = 0;
1080 }
1081
1082 }
1083
1084 // offset base to the next component in the vertex to gather
1085 pStreamBase = GEP(pStreamBase, C((char)8));
1086 }
1087 }
1088 break;
1089 default:
1090 SWR_INVALID("Tried to fetch invalid FP format");
1091 break;
1092 }
1093 }
1094 else
1095 {
1096 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
1097 ConversionType conversionType = CONVERT_NONE;
1098
1099 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1100 "Unsupported format for standard gather fetch.");
1101
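// note: UNORM and SNORM intentionally fall through to the UINT/SINT cases to pick up the matching extend type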
1102 switch(info.type[0])
1103 {
1104 case SWR_TYPE_UNORM:
1105 conversionType = CONVERT_NORMALIZED;
1106 case SWR_TYPE_UINT:
1107 extendCastType = Instruction::CastOps::ZExt;
1108 break;
1109 case SWR_TYPE_SNORM:
1110 conversionType = CONVERT_NORMALIZED;
1111 case SWR_TYPE_SINT:
1112 extendCastType = Instruction::CastOps::SExt;
1113 break;
1114 case SWR_TYPE_USCALED:
1115 conversionType = CONVERT_USCALED;
1116 extendCastType = Instruction::CastOps::UIToFP;
1117 break;
1118 case SWR_TYPE_SSCALED:
1119 conversionType = CONVERT_SSCALED;
1120 extendCastType = Instruction::CastOps::SIToFP;
1121 break;
1122 case SWR_TYPE_SFIXED:
1123 conversionType = CONVERT_SFIXED;
1124 extendCastType = Instruction::CastOps::SExt;
1125 break;
1126 default:
1127 break;
1128 }
1129
1130 // value substituted when component of gather is masked
1131 Value* gatherSrc = VIMMED1(0);
1132
1133 // Gather components from memory to store in a simdvertex structure
1134 switch (bpc)
1135 {
1136 case 8:
1137 {
1138 // if we have at least one component to fetch
1139 if(compMask)
1140 {
1141 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
1142 // e.g. result of an 8x32bit integer gather for 8bit components
1143 // 256i - 0 1 2 3 4 5 6 7
1144 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1145
1146 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1147 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1148
1149 // Shuffle gathered components into place in simdvertex struct
1150 #if USE_SIMD16_SHADERS
1151 Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
1152 #else
1153 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
1154 #endif
1155 }
1156 }
1157 break;
1158 case 16:
1159 {
1160 Value* vGatherResult[2];
1161 Value *vMask;
1162
1163 // if we have at least one component out of x or y to fetch
1164 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1165 // save mask as it is zero'd out after each gather
1166 vMask = vGatherMask;
1167
1168 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1169 // e.g. result of first 8x32bit integer gather for 16bit components
1170 // 256i - 0 1 2 3 4 5 6 7
1171 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1172 //
1173 }
1174
1175 // if we have at least one component out of z or w to fetch
1176 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1177 // offset base to the next components(zw) in the vertex to gather
1178 pStreamBase = GEP(pStreamBase, C((char)4));
1179 vMask = vGatherMask;
1180
1181 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1182 // e.g. result of second 8x32bit integer gather for 16bit components
1183 // 256i - 0 1 2 3 4 5 6 7
1184 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1185 //
1186 }
1187
1188 // if we have at least one component to shuffle into place
1189 if(compMask){
1190 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1191 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1192
1193 // Shuffle gathered components into place in simdvertex struct
1194 #if USE_SIMD16_SHADERS
1195 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1196 #else
1197 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1198 #endif
1199 }
1200 }
1201 break;
1202 case 32:
1203 {
1204 // Gather components from memory into place in the simdvertex struct
1205 for (uint32_t i = 0; i < 4; i++)
1206 {
1207 if (isComponentEnabled(compMask, i))
1208 {
1209 // if we need to gather the component
1210 if (compCtrl[i] == StoreSrc)
1211 {
1212 // save mask as it is zero'd out after each gather
1213 Value *vMask = vGatherMask;
1214
1215 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1216
1217 if (conversionType == CONVERT_USCALED)
1218 {
1219 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1220 }
1221 else if (conversionType == CONVERT_SSCALED)
1222 {
1223 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1224 }
1225 else if (conversionType == CONVERT_SFIXED)
1226 {
1227 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1228 }
1229
1230 vVertexElements[currentVertexElement++] = pGather;
1231 // e.g. result of a single 8x32bit integer gather for 32bit components
1232 // 256i - 0 1 2 3 4 5 6 7
1233 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1234 }
1235 else
1236 {
1237 #if USE_SIMD16_SHADERS
1238 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1239 #else
1240 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1241 #endif
1242 }
1243
1244 if (currentVertexElement > 3)
1245 {
1246 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1247 // reset to the next vVertexElement to output
1248 currentVertexElement = 0;
1249 }
1250
1251 }
1252
1253 // offset base to the next component in the vertex to gather
1254 pStreamBase = GEP(pStreamBase, C((char)4));
1255 }
1256 }
1257 break;
1258 }
1259 }
1260 }
1261
1262 // if we have a partially filled vVertexElement struct, output it
1263 if(currentVertexElement > 0){
1264 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1265 }
1266 }
1267
1268 //////////////////////////////////////////////////////////////////////////
1269 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1270 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1271 /// support
1272 /// @param pIndices - pointer to 8 bit indices
1273 /// @param pLastIndex - pointer to last valid index
1274 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1275 {
1276 // can fit 4 8 bit integers per vWidth lane
1277 Value* vIndices = VUNDEF_I();
1278
1279 // store 0 index on stack to be used to conditionally load from if index address is OOB
1280 Value* pZeroIndex = ALLOCA(mInt8Ty);
1281 STORE(C((uint8_t)0), pZeroIndex);
1282
1283 // Load a SIMD of index pointers
1284 for(int64_t lane = 0; lane < mVWidth; lane++)
1285 {
1286 // Calculate the address of the requested index
1287 Value *pIndex = GEP(pIndices, C(lane));
1288
1289 // check if the address is less than the max index,
1290 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1291
1292 // if valid, load the index. if not, load 0 from the stack
1293 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1294 Value *index = LOAD(pValid, "valid index");
1295
1296 // zero extend index to 32 bits and insert into the correct simd lane
1297 index = Z_EXT(index, mInt32Ty);
1298 vIndices = VINSERT(vIndices, index, lane);
1299 }
1300 return vIndices;
1301 }
1302
1303 //////////////////////////////////////////////////////////////////////////
1304 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1305 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1306 /// support
1307 /// @param pIndices - pointer to 16 bit indices
1308 /// @param pLastIndex - pointer to last valid index
1309 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1310 {
1311 // can fit 2 16 bit integers per vWidth lane
1312 Value* vIndices = VUNDEF_I();
1313
1314 // store 0 index on stack to be used to conditionally load from if index address is OOB
1315 Value* pZeroIndex = ALLOCA(mInt16Ty);
1316 STORE(C((uint16_t)0), pZeroIndex);
1317
1318 // Load a SIMD of index pointers
1319 for(int64_t lane = 0; lane < mVWidth; lane++)
1320 {
1321 // Calculate the address of the requested index
1322 Value *pIndex = GEP(pIndices, C(lane));
1323
1324 // check if the address is less than the max index,
1325 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1326
1327 // if valid, load the index. if not, load 0 from the stack
1328 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1329 Value *index = LOAD(pValid, "valid index");
1330
1331 // zero extend index to 32 bits and insert into the correct simd lane
1332 index = Z_EXT(index, mInt32Ty);
1333 vIndices = VINSERT(vIndices, index, lane);
1334 }
1335 return vIndices;
1336 }
1337
1338 //////////////////////////////////////////////////////////////////////////
1339 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1340 /// @param pIndices - pointer to 32 bit indices
1341 /// @param pLastIndex - pointer to last valid index
1342 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1343 {
1344 DataLayout dL(JM()->mpCurrentModule);
1345 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1346 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1347 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1348
1349 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1350 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1351 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1352 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1353
1354 // create a vector of index counts from the base index ptr passed into the fetch
1355 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1356 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1357
1358 // compare index count to the max valid index
1359 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1360 // vIndexOffsets 0 1 2 3 4 5 6 7
1361 // ------------------------------
1362 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1363 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1364 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1365 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1366
1367 // VMASKLOAD takes an *i8 src pointer
1368 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1369
1370 // Load the indices; OOB loads 0
1371 return MASKLOADD(pIndices,vIndexMask);
1372 }
1373
1374 //////////////////////////////////////////////////////////////////////////
1375 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1376 /// denormalizes if needed, converts to F32 if needed, and positions in
1377 /// the proper SIMD rows to be output to the simdvertex structure
1378 /// @param args: (tuple of args, listed below)
1379 /// @param vGatherResult - 8 gathered 8bpc vertices
1380 /// @param pVtxOut - base pointer to output simdvertex struct
1381 /// @param extendType - sign extend or zero extend
1382 /// @param conversionType - type of conversion to apply (none, normalized, scaled, fixed)
1383 /// @param currentVertexElement - reference to the current vVertexElement
1384 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1385 /// @param compMask - component packing mask
1386 /// @param compCtrl - component control val
1387 /// @param vVertexElements[4] - vertex components to output
1388 /// @param swizzle[4] - component swizzle location
1389 #if USE_SIMD16_SHADERS
1390 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
1391 #else
1392 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1393 #endif
1394 {
1395 // Unpack tuple args
1396 Value*& vGatherResult = std::get<0>(args);
1397 Value* pVtxOut = std::get<1>(args);
1398 const Instruction::CastOps extendType = std::get<2>(args);
1399 const ConversionType conversionType = std::get<3>(args);
1400 uint32_t &currentVertexElement = std::get<4>(args);
1401 uint32_t &outputElt = std::get<5>(args);
1402 const ComponentEnable compMask = std::get<6>(args);
1403 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1404 Value* (&vVertexElements)[4] = std::get<8>(args);
1405 const uint32_t (&swizzle)[4] = std::get<9>(args);
1406
1407 // cast types
1408 Type* vGatherTy = mSimdInt32Ty;
1409 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1410
1411 // have to do extra work for sign extending
1412 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1413 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1414 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1415
1416 // shuffle mask, including any swizzling
1417 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1418 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1419 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1420 char(y), char(y+4), char(y+8), char(y+12),
1421 char(z), char(z+4), char(z+8), char(z+12),
1422 char(w), char(w+4), char(w+8), char(w+12),
1423 char(x), char(x+4), char(x+8), char(x+12),
1424 char(y), char(y+4), char(y+8), char(y+12),
1425 char(z), char(z+4), char(z+8), char(z+12),
1426 char(w), char(w+4), char(w+8), char(w+12)});
1427
1428 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1429 // after pshufb: group components together in each 128bit lane
1430 // 256i - 0 1 2 3 4 5 6 7
1431 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1432
1433 Value* vi128XY = nullptr;
1434 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1435 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1436 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1437 // 256i - 0 1 2 3 4 5 6 7
1438 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1439 }
1440
1441 // do the same for zw components
1442 Value* vi128ZW = nullptr;
1443 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1444 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1445 }
1446
1447 // init denormalize variables if needed
1448 Instruction::CastOps fpCast;
1449 Value* conversionFactor;
1450
1451 switch (conversionType)
1452 {
1453 case CONVERT_NORMALIZED:
1454 fpCast = Instruction::CastOps::SIToFP;
1455 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1456 break;
1457 case CONVERT_SSCALED:
1458 fpCast = Instruction::CastOps::SIToFP;
1459 conversionFactor = VIMMED1((float)(1.0));
1460 break;
1461 case CONVERT_USCALED:
1462 SWR_INVALID("Type should not be sign extended!");
1463 conversionFactor = nullptr;
1464 break;
1465 default:
1466 SWR_ASSERT(conversionType == CONVERT_NONE);
1467 conversionFactor = nullptr;
1468 break;
1469 }
1470
1471 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1472 for (uint32_t i = 0; i < 4; i++)
1473 {
1474 if (isComponentEnabled(compMask, i))
1475 {
1476 if (compCtrl[i] == ComponentControl::StoreSrc)
1477 {
1478 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1479 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1480 // if x or y, use vi128XY permute result, else use vi128ZW
1481 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1482
1483 // sign extend
1484 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1485
1486 // denormalize if needed
1487 if (conversionType != CONVERT_NONE)
1488 {
1489 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1490 }
1491 currentVertexElement++;
1492 }
1493 else
1494 {
1495 #if USE_SIMD16_SHADERS
1496 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1497 #else
1498 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1499 #endif
1500 }
1501
1502 if (currentVertexElement > 3)
1503 {
1504 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1505 // reset to the next vVertexElement to output
1506 currentVertexElement = 0;
1507 }
1508 }
1509 }
1510 }
1511 // else zero extend
1512 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1513 {
1514 // init denormalize variables if needed
1515 Instruction::CastOps fpCast;
1516 Value* conversionFactor;
1517
1518 switch (conversionType)
1519 {
1520 case CONVERT_NORMALIZED:
1521 fpCast = Instruction::CastOps::UIToFP;
1522 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1523 break;
1524 case CONVERT_USCALED:
1525 fpCast = Instruction::CastOps::UIToFP;
1526 conversionFactor = VIMMED1((float)(1.0));
1527 break;
1528 case CONVERT_SSCALED:
1529 SWR_INVALID("Type should not be zero extended!");
1530 conversionFactor = nullptr;
1531 break;
1532 default:
1533 SWR_ASSERT(conversionType == CONVERT_NONE);
1534 conversionFactor = nullptr;
1535 break;
1536 }
1537
1538 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1539 for (uint32_t i = 0; i < 4; i++)
1540 {
1541 if (isComponentEnabled(compMask, i))
1542 {
1543 if (compCtrl[i] == ComponentControl::StoreSrc)
1544 {
1545 // pshufb masks for each component
1546 Value* vConstMask;
1547 switch (swizzle[i])
1548 {
1549 case 0:
1550 // x shuffle mask
1551 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1552 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1553 break;
1554 case 1:
1555 // y shuffle mask
1556 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1557 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1558 break;
1559 case 2:
1560 // z shuffle mask
1561 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1562 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1563 break;
1564 case 3:
1565 // w shuffle mask
1566 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1567 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1568 break;
1569 default:
1570 vConstMask = nullptr;
1571 break;
1572 }
1573
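                    // The -1 entries in each mask set the high bit of that mask byte, so PSHUFB
                    // writes a zero there; the selected component byte therefore lands in the low
                    // byte of each 32bit lane with the upper three bytes already cleared, which is
                    // what performs the zero extension.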
1574 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1575 // after pshufb for x channel
1576 // 256i - 0 1 2 3 4 5 6 7
1577 // x000 x000 x000 x000 x000 x000 x000 x000
1578
1579 // denormalize if needed
1580 if (conversionType != CONVERT_NONE)
1581 {
1582 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1583 }
1584 currentVertexElement++;
1585 }
1586 else
1587 {
1588 #if USE_SIMD16_SHADERS
1589 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1590 #else
1591 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1592 #endif
1593 }
1594
1595 if (currentVertexElement > 3)
1596 {
1597 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1598 // reset to the next vVertexElement to output
1599 currentVertexElement = 0;
1600 }
1601 }
1602 }
1603 }
1604 else
1605 {
1606 SWR_INVALID("Unsupported conversion type");
1607 }
1608 }
1609
1610 //////////////////////////////////////////////////////////////////////////
1611 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1612 /// denormalizes if needed, converts to F32 if needed, and positions in
1613 /// the proper SIMD rows to be output to the simdvertex structure
1614 /// @param args: (tuple of args, listed below)
1615 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1616 /// @param pVtxOut - base pointer to output simdvertex struct
1617 /// @param extendType - sign extend or zero extend
1618 /// @param conversionType - type of conversion to apply (normalized, scaled, or none)
1619 /// @param currentVertexElement - reference to the current vVertexElement
1620 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1621 /// @param compMask - component packing mask
1622 /// @param compCtrl - component control val
1623 /// @param vVertexElements[4] - vertex components to output
1624 #if USE_SIMD16_SHADERS
1625 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
1626 #else
1627 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1628 #endif
1629 {
1630 // Unpack tuple args
1631 Value* (&vGatherResult)[2] = std::get<0>(args);
1632 Value* pVtxOut = std::get<1>(args);
1633 const Instruction::CastOps extendType = std::get<2>(args);
1634 const ConversionType conversionType = std::get<3>(args);
1635 uint32_t &currentVertexElement = std::get<4>(args);
1636 uint32_t &outputElt = std::get<5>(args);
1637 const ComponentEnable compMask = std::get<6>(args);
1638 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1639 Value* (&vVertexElements)[4] = std::get<8>(args);
1640
1641 // cast types
1642 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1643 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1644
1645 // have to do extra work for sign extending
1646 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
1647 (extendType == Instruction::CastOps::FPExt))
1648 {
1649         // does this format use an FP extend (16-bit float source) rather than an integer sign extend?
1650         bool bFP = (extendType == Instruction::CastOps::FPExt);
1651
1652 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1653 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1654
1655 // shuffle mask
1656 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1657 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1658 Value* vi128XY = nullptr;
1659 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1660 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1661 // after pshufb: group components together in each 128bit lane
1662 // 256i - 0 1 2 3 4 5 6 7
1663 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1664
1665 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1666 // after PERMD: move and pack xy components into each 128bit lane
1667 // 256i - 0 1 2 3 4 5 6 7
1668 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1669 }
1670
1671 // do the same for zw components
1672 Value* vi128ZW = nullptr;
1673 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1674 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1675 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
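            // after PERMD: move and pack zw components into each 128bit lane
            // 256i - 0  1  2  3  4  5  6  7
            //        zzzz zzzz zzzz zzzz wwww wwww wwww wwww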
1676 }
1677
1678 // init denormalize variables if needed
1679 Instruction::CastOps IntToFpCast;
1680 Value* conversionFactor;
1681
1682 switch (conversionType)
1683 {
1684 case CONVERT_NORMALIZED:
1685 IntToFpCast = Instruction::CastOps::SIToFP;
1686 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1687 break;
1688 case CONVERT_SSCALED:
1689 IntToFpCast = Instruction::CastOps::SIToFP;
1690 conversionFactor = VIMMED1((float)(1.0));
1691 break;
1692 case CONVERT_USCALED:
1693 SWR_INVALID("Type should not be sign extended!");
1694 conversionFactor = nullptr;
1695 break;
1696 default:
1697 SWR_ASSERT(conversionType == CONVERT_NONE);
1698 conversionFactor = nullptr;
1699 break;
1700 }
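    // Illustrative example (comment only): with CONVERT_NORMALIZED a 16-bit SNORM value of
    // 16384 is sign extended, converted to float, and scaled by 1/32767, storing roughly
    // 0.5f; CONVERT_SSCALED converts to float without rescaling.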
1701
1702     // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1703 for (uint32_t i = 0; i < 4; i++)
1704 {
1705 if (isComponentEnabled(compMask, i))
1706 {
1707 if (compCtrl[i] == ComponentControl::StoreSrc)
1708 {
1709 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1710 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1711 // if x or y, use vi128XY permute result, else use vi128ZW
1712 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1713
1714 if (bFP) {
1715                         // extract 128 bit lanes and convert the packed half floats to single precision
1716 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1717 }
1718 else {
1719 // extract 128 bit lanes to sign extend each component
1720 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1721
1722 // denormalize if needed
1723 if (conversionType != CONVERT_NONE) {
1724 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1725 }
1726 }
1727 currentVertexElement++;
1728 }
1729 else
1730 {
1731 #if USE_SIMD16_SHADERS
1732 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1733 #else
1734 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1735 #endif
1736 }
1737
1738 if (currentVertexElement > 3)
1739 {
1740 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1741 // reset to the next vVertexElement to output
1742 currentVertexElement = 0;
1743 }
1744 }
1745 }
1746 }
1747 // else zero extend
1748 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1749 {
1750 // pshufb masks for each component
1751 Value* vConstMask[2];
1752 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
1753 // x/z shuffle mask
1754 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1755                                     0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1});
1756 }
1757
1758 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
1759 // y/w shuffle mask
1760 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1761 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1762 }
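        // The -1 byte pairs in these masks zero the upper 16 bits of every 32bit lane, so the
        // selected word is already zero extended to 32 bits once the PSHUFB below has run.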
1763
1764 // init denormalize variables if needed
1765 Instruction::CastOps fpCast;
1766 Value* conversionFactor;
1767
1768 switch (conversionType)
1769 {
1770 case CONVERT_NORMALIZED:
1771 fpCast = Instruction::CastOps::UIToFP;
1772 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1773 break;
1774 case CONVERT_USCALED:
1775 fpCast = Instruction::CastOps::UIToFP;
1776             conversionFactor = VIMMED1((float)(1.0));
1777 break;
1778 case CONVERT_SSCALED:
1779 SWR_INVALID("Type should not be zero extended!");
1780 conversionFactor = nullptr;
1781 break;
1782 default:
1783 SWR_ASSERT(conversionType == CONVERT_NONE);
1784 conversionFactor = nullptr;
1785 break;
1786 }
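    // Illustrative example (comment only): with CONVERT_NORMALIZED the factor 1/65535 maps the
    // zero-extended word range [0, 65535] onto [0.0f, 1.0f], so 65535 stores exactly 1.0f.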
1787
1788 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1789 for (uint32_t i = 0; i < 4; i++)
1790 {
1791 if (isComponentEnabled(compMask, i))
1792 {
1793 if (compCtrl[i] == ComponentControl::StoreSrc)
1794 {
1795 // select correct constMask for x/z or y/w pshufb
1796 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1797 // if x or y, use vi128XY permute result, else use vi128ZW
1798 uint32_t selectedGather = (i < 2) ? 0 : 1;
1799
1800 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1801 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1802 // 256i - 0 1 2 3 4 5 6 7
1803 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1804
1805 // denormalize if needed
1806 if (conversionType != CONVERT_NONE)
1807 {
1808 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1809 }
1810 currentVertexElement++;
1811 }
1812 else
1813 {
1814 #if USE_SIMD16_SHADERS
1815 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1816 #else
1817 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1818 #endif
1819 }
1820
1821 if (currentVertexElement > 3)
1822 {
1823 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1824 // reset to the next vVertexElement to output
1825 currentVertexElement = 0;
1826 }
1827 }
1828 }
1829 }
1830 else
1831 {
1832 SWR_INVALID("Unsupported conversion type");
1833 }
1834 }
1835
1836 //////////////////////////////////////////////////////////////////////////
1837 /// @brief Output a simdvertex worth of elements to the current outputElt
1838 /// @param pVtxOut - base address of VIN output struct
1839 /// @param outputElt - simdvertex offset in VIN to write to
1840 /// @param numEltsToStore - number of simdvertex rows to write out
1841 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1842 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1843 {
1844 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
1845
1846 for(uint32_t c = 0; c < numEltsToStore; ++c)
1847 {
1848 // STORE expects FP32 x vWidth type, just bitcast if needed
1849 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
1850 #if FETCH_DUMP_VERTEX
1851 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
1852 #endif
1853 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1854 }
1855 #if FETCH_DUMP_VERTEX
1856 else
1857 {
1858 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
1859 }
1860 #endif
1861 // outputElt * 4 = offsetting by the size of a simdvertex
1862 // + c offsets to a 32bit x vWidth row within the current vertex
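        // In SIMD16 builds each simdvertex row is twice the width of the simd8 vectors built
        // here, so a component row spans two GEP steps - hence the outputElt * 8
        // (4 components * 2) + c * 2 scaling in that path.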
1863 #if USE_SIMD16_SHADERS
1864 Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
1865 #else
1866 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1867 #endif
1868 STORE(vVertexElements[c], dest);
1869 }
1870 }
1871
1872 //////////////////////////////////////////////////////////////////////////
1873 /// @brief Generates a constant vector of values based on the
1874 /// ComponentControl value
1875 /// @param ctrl - ComponentControl value
1876 #if USE_SIMD16_SHADERS
1877 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
1878 #else
1879 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1880 #endif
1881 {
1882 switch(ctrl)
1883 {
1884 case NoStore: return VUNDEF_I();
1885 case Store0: return VIMMED1(0);
1886 case Store1Fp: return VIMMED1(1.0f);
1887 case Store1Int: return VIMMED1(1);
1888 case StoreVertexId:
1889 {
1890 #if USE_SIMD16_SHADERS
1891 Value* pId;
1892 if (useVertexID2)
1893 {
1894 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
1895 }
1896 else
1897 {
1898 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1899 }
1900 #else
1901 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1902 #endif
1903 return VBROADCAST(pId);
1904 }
1905 case StoreInstanceId:
1906 {
1907 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
1908 return VBROADCAST(pId);
1909 }
1910 case StoreSrc:
1911 default: SWR_INVALID("Invalid component control"); return VUNDEF_I();
1912 }
1913 }
1914
1915 //////////////////////////////////////////////////////////////////////////
1916 /// @brief Returns the enable mask for the specified component.
1917 /// @param enableMask - enable bits
1918 /// @param component - component to check if enabled.
1919 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1920 {
1921 switch (component)
1922 {
1923 // X
1924 case 0: return (enableMask & ComponentEnable::X);
1925 // Y
1926 case 1: return (enableMask & ComponentEnable::Y);
1927 // Z
1928 case 2: return (enableMask & ComponentEnable::Z);
1929 // W
1930 case 3: return (enableMask & ComponentEnable::W);
1931
1932 default: return false;
1933 }
1934 }
1935
1936
1937 //////////////////////////////////////////////////////////////////////////
1938 /// @brief JITs the fetch shader IR into an executable fetch function
1939 /// @param hJitMgr - JitManager handle
1940 /// @param hFunc - handle to the LLVM fetch function IR
1941 /// @return PFN_FETCH_FUNC - pointer to fetch code
1942 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
1943 {
1944 const llvm::Function* func = (const llvm::Function*)hFunc;
1945 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1946 PFN_FETCH_FUNC pfnFetch;
1947
1948 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
1949 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
1950 pJitMgr->mIsModuleFinalized = true;
1951
1952 #if defined(KNOB_SWRC_TRACING)
1953 char fName[1024];
1954 const char *funcName = func->getName().data();
1955 sprintf(fName, "%s.bin", funcName);
1956 FILE *fd = fopen(fName, "wb");
1957 fwrite((void *)pfnFetch, 1, 2048, fd);
1958 fclose(fd);
1959 #endif
1960
1961 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
1962
1963 return pfnFetch;
1964 }
1965
1966 //////////////////////////////////////////////////////////////////////////
1967 /// @brief JIT compiles fetch shader
1968 /// @param hJitMgr - JitManager handle
1969 /// @param state - fetch state to build function from
1970 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
1971 {
1972 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1973
1974 pJitMgr->SetupNewModule();
1975
1976 FetchJit theJit(pJitMgr);
1977 HANDLE hFunc = theJit.Create(state);
1978
1979 return JitFetchFunc(hJitMgr, hFunc);
1980 }
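
// Illustrative usage sketch (comment only, not part of the driver): a caller builds a
// FETCH_COMPILE_STATE describing the vertex layout, compiles it once, and caches the
// returned function pointer for later draws, e.g.:
//
//     FETCH_COMPILE_STATE state = {};    // populated from the vertex element layout
//     PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, state);
//     // pfnFetch is then invoked per fetch with a populated SWR_FETCH_CONTEXT and an
//     // output simdvertex buffer.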