1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_api.h"
31 #include "fetch_jit.h"
32 #include "builder.h"
33 #include "state_llvm.h"
34 #include "llvm/IR/DataLayout.h"
35 #include <sstream>
36 #include <tuple>
37
38 //#define FETCH_DUMP_VERTEX 1
39
40 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
41
42 enum ConversionType
43 {
44 CONVERT_NONE,
45 CONVERT_NORMALIZED,
46 CONVERT_USCALED,
47 CONVERT_SSCALED,
48 };
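// A rough sketch of what each conversion above means for a gathered integer component
// (exact scale factors are chosen per-format where the conversion is applied):
//   CONVERT_NONE       - bits passed through (possibly just sign/zero extended)
//   CONVERT_NORMALIZED - scaled to [0,1] (UNORM) or [-1,1] (SNORM), e.g. 8-bit UNORM 255 -> 1.0f
//   CONVERT_USCALED    - unsigned integer converted directly to float, e.g. 100 -> 100.0f
//   CONVERT_SSCALED    - signed integer converted directly to float, e.g. -5 -> -5.0f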
49
50 //////////////////////////////////////////////////////////////////////////
51 /// Interface to Jitting a fetch shader
52 //////////////////////////////////////////////////////////////////////////
53 struct FetchJit : public Builder
54 {
55 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
56
57 Function* Create(const FETCH_COMPILE_STATE& fetchState);
58 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
59 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
60 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
61
62 // package up Shuffle*bpcGatherd args into a tuple for convenience
63 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
64 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
65 const uint32_t(&)[4], Value*, bool, uint32_t, bool, uint32_t> Shuffle8bpcArgs;
66 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
67
68 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
69 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
70 Value*, bool, uint32_t, bool, uint32_t> Shuffle16bpcArgs;
71 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
72
73 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
74
75 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
76
77 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
78 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
79
80 bool IsOddFormat(SWR_FORMAT format);
81 bool IsUniformFormat(SWR_FORMAT format);
82 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
83 void CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4]);
84 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
85
86 };
87
88 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
89 {
90 static std::size_t fetchNum = 0;
91
92 std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
93 fnName << fetchNum++;
94
95 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
96 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
97
98 IRB()->SetInsertPoint(entry);
99
100 auto argitr = fetch->getArgumentList().begin();
101
102 // Fetch shader arguments
103 Value* fetchInfo = &*argitr; ++argitr;
104 fetchInfo->setName("fetchInfo");
105 Value* pVtxOut = &*argitr;
106 pVtxOut->setName("vtxOutput");
107 // This is just shorthand to tell LLVM to get a pointer to the base address of the simdvertex:
108 // index 0 is just the pointer to the simdvertex structure,
109 // index 1 is which element of the simdvertex structure to offset to (in this case 0),
110 // so the indices being i32's doesn't matter.
111 // TODO: generate this GEP with a VECTOR structure type so this makes sense.
112 std::vector<Value*> vtxInputIndices(2, C(0));
113 // GEP
114 pVtxOut = GEP(pVtxOut, C(0));
115 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
116
117 // SWR_FETCH_CONTEXT::pStreams
118 Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
119 streams->setName("pStreams");
120
121 // SWR_FETCH_CONTEXT::pIndices
122 Value* indices = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
123 indices->setName("pIndices");
124
125 // SWR_FETCH_CONTEXT::pLastIndex
126 Value* pLastIndex = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
127 pLastIndex->setName("pLastIndex");
128
129
130 Value* vIndices;
131 switch(fetchState.indexType)
132 {
133 case R8_UINT:
134 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
135 if(fetchState.bDisableIndexOOBCheck){
136 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
137 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
138 }
139 else{
140 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
141 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
142 }
143 break;
144 case R16_UINT:
145 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
146 if(fetchState.bDisableIndexOOBCheck){
147 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
148 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
149 }
150 else{
151 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
152 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
153 }
154 break;
155 case R32_UINT:
156 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
157 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
158 break; // incoming type is already 32bit int
159 default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break;
160 }
161
162 // store out vertex IDs
163 STORE(vIndices, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
164
165 // store out cut mask if enabled
166 if (fetchState.bEnableCutIndex)
167 {
168 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
169 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
170 STORE(cutMask, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
171 }
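// For example (illustrative values), with cutIndex == 0xFFFF and per-lane indices
// {0, 1, 0xFFFF, 3, ...}, the cut mask stored above is all-ones in lane 2 and zero
// in the other lanes.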
172
173 // Fetch attributes from memory and output to a simdvertex struct
174 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
175 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut)
176 : JitGatherVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut);
177
178 RET_VOID();
179
180 JitManager::DumpToFile(fetch, "src");
181
182 verifyFunction(*fetch);
183
184 #if HAVE_LLVM == 0x306
185 FunctionPassManager
186 #else
187 llvm::legacy::FunctionPassManager
188 #endif
189 setupPasses(JM()->mpCurrentModule);
190
191 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
192 setupPasses.add(createBreakCriticalEdgesPass());
193 setupPasses.add(createCFGSimplificationPass());
194 setupPasses.add(createEarlyCSEPass());
195 setupPasses.add(createPromoteMemoryToRegisterPass());
196
197 setupPasses.run(*fetch);
198
199 JitManager::DumpToFile(fetch, "se");
200
201 #if HAVE_LLVM == 0x306
202 FunctionPassManager
203 #else
204 llvm::legacy::FunctionPassManager
205 #endif
206 optPasses(JM()->mpCurrentModule);
207
208 ///@todo Haven't touched these either. Need to remove some of these and add others.
209 optPasses.add(createCFGSimplificationPass());
210 optPasses.add(createEarlyCSEPass());
211 optPasses.add(createInstructionCombiningPass());
212 optPasses.add(createInstructionSimplifierPass());
213 optPasses.add(createConstantPropagationPass());
214 optPasses.add(createSCCPPass());
215 optPasses.add(createAggressiveDCEPass());
216
217 optPasses.run(*fetch);
218 optPasses.run(*fetch);
219
220 JitManager::DumpToFile(fetch, "opt");
221
222 return fetch;
223 }
224
225 //////////////////////////////////////////////////////////////////////////
226 /// @brief Loads attributes from memory using LOADs, shuffling the
227 /// components into SOA form.
228 /// *Note* currently does not support component control,
229 /// component packing, instancing, InstanceID SGVs, or VertexID SGVs
230 /// @param fetchState - info about attributes to be fetched from memory
231 /// @param streams - value pointer to the current vertex stream
232 /// @param vIndices - vector value of indices to load
233 /// @param pVtxOut - value pointer to output simdvertex struct
234 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut)
235 {
236 // Zack shuffles; a variant of the Charleston.
237
238 std::vector<Value*> vectors(16);
239 std::vector<Constant*> pMask(mVWidth);
240 for(uint32_t i = 0; i < mVWidth; ++i)
241 {
242 pMask[i] = (C(i < 4 ? i : 4));
243 }
244 Constant* promoteMask = ConstantVector::get(pMask);
245 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
246
247 Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
248 Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
249 Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
250 Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
251 curInstance->setName("curInstance");
252
253 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
254 {
255 Value* elements[4] = {0};
256 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
257 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
258 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
259 uint32_t numComponents = info.numComps;
260 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
261
262 vectors.clear();
263
264 Value *vCurIndices;
265 Value *startOffset;
266 if(ied.InstanceEnable)
267 {
268 Value* stepRate = C(ied.InstanceDataStepRate);
269
270 // prevent a div by 0 for 0 step rate
271 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
272 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
273
274 // calc the current offset into instanced data buffer
275 Value* calcInstance = UDIV(curInstance, stepRate);
276
277 // if step rate is 0, every instance gets instance 0
278 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
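// e.g. (illustrative values) stepRate == 2 and curInstance == 5 gives calcInstance == 2,
// so instances 4 and 5 both fetch element 2 of the instanced data buffer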
279
280 vCurIndices = VBROADCAST(calcInstance);
281
282 startOffset = startInstance;
283 }
284 else
285 {
286 // offset indices by baseVertex
287 vCurIndices = ADD(vIndices, vBaseVertex);
288
289 startOffset = startVertex;
290 }
291
292 // load SWR_VERTEX_BUFFER_STATE::pData
293 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
294
295 // load SWR_VERTEX_BUFFER_STATE::pitch
296 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
297 stride = Z_EXT(stride, mInt64Ty);
298
299 // load SWR_VERTEX_BUFFER_STATE::size
300 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
301 size = Z_EXT(size, mInt64Ty);
302
303 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
304
305 // Load from the stream.
306 for(uint32_t lane = 0; lane < mVWidth; ++lane)
307 {
308 // Get index
309 Value* index = VEXTRACT(vCurIndices, C(lane));
310 index = Z_EXT(index, mInt64Ty);
311
312 Value* offset = MUL(index, stride);
313 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
314 offset = ADD(offset, startVertexOffset);
315
316 if (!fetchState.bDisableIndexOOBCheck) {
317 // check for out of bound access, including partial OOB, and mask them to 0
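// e.g. (illustrative values) stride = 16, AlignedByteOffset = 8, startVertexOffset = 0,
// index = 3 -> offset = 56; if 56 + info.Bpp exceeds size, the select below redirects
// the lane to offset 0 instead of letting it read past the end of the buffer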
318 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
319 Value *oob = ICMP_ULE(endOffset, size);
320 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
321 }
322
323 Value* pointer = GEP(stream, offset);
324 // We use a full-lane, but don't actually care.
325 Value* vptr = 0;
326
327 // get a pointer to a 4 component attrib in default address space
328 switch(bpc)
329 {
330 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
331 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
332 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
333 default: SWR_ASSERT(false, "Unsupported underlying bpp!");
334 }
335
336 // load 4 components of attribute
337 Value* vec = ALIGNED_LOAD(vptr, 1, false);
338
339 // Convert To FP32 internally
340 switch(info.type[0])
341 {
342 case SWR_TYPE_UNORM:
343 switch(bpc)
344 {
345 case 8:
346 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
347 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
348 break;
349 case 16:
350 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
351 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
352 break;
353 default:
354 SWR_ASSERT(false, "Unsupported underlying type!");
355 break;
356 }
357 break;
358 case SWR_TYPE_SNORM:
359 switch(bpc)
360 {
361 case 8:
362 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
363 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
364 break;
365 case 16:
366 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
367 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
368 break;
369 default:
370 SWR_ASSERT(false, "Unsupported underlying type!");
371 break;
372 }
373 break;
374 case SWR_TYPE_UINT:
375 // Zero extend uint32_t types.
376 switch(bpc)
377 {
378 case 8:
379 case 16:
380 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
381 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
382 break;
383 case 32:
384 break; // Pass through unchanged.
385 default:
386 SWR_ASSERT(false, "Unsupported underlying type!");
387 break;
388 }
389 break;
390 case SWR_TYPE_SINT:
391 // Sign extend SINT types.
392 switch(bpc)
393 {
394 case 8:
395 case 16:
396 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
397 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
398 break;
399 case 32:
400 break; // Pass through unchanged.
401 default:
402 SWR_ASSERT(false, "Unsupported underlying type!");
403 break;
404 }
405 break;
406 case SWR_TYPE_FLOAT:
407 switch(bpc)
408 {
409 case 32:
410 break; // Pass through unchanged.
411 default:
412 SWR_ASSERT(false, "Unsupported underlying type!");
413 }
414 break;
415 case SWR_TYPE_USCALED:
416 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
417 break;
418 case SWR_TYPE_SSCALED:
419 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
420 break;
421 case SWR_TYPE_UNKNOWN:
422 case SWR_TYPE_UNUSED:
423 SWR_ASSERT(false, "Unsupported type %d!", info.type[0]);
424 }
425
426 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
427 // uwvec: 4 x F32, undef value
428 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
429 vectors.push_back(wvec);
430 }
431
432 std::vector<Constant*> v01Mask(mVWidth);
433 std::vector<Constant*> v23Mask(mVWidth);
434 std::vector<Constant*> v02Mask(mVWidth);
435 std::vector<Constant*> v13Mask(mVWidth);
436
437 // Concatenate the vectors together.
438 elements[0] = VUNDEF_F();
439 elements[1] = VUNDEF_F();
440 elements[2] = VUNDEF_F();
441 elements[3] = VUNDEF_F();
442 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
443 {
444 v01Mask[4 * b + 0] = C(0 + 4 * b);
445 v01Mask[4 * b + 1] = C(1 + 4 * b);
446 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
447 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
448
449 v23Mask[4 * b + 0] = C(2 + 4 * b);
450 v23Mask[4 * b + 1] = C(3 + 4 * b);
451 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
452 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
453
454 v02Mask[4 * b + 0] = C(0 + 4 * b);
455 v02Mask[4 * b + 1] = C(2 + 4 * b);
456 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
457 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
458
459 v13Mask[4 * b + 0] = C(1 + 4 * b);
460 v13Mask[4 * b + 1] = C(3 + 4 * b);
461 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
462 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
463
464 std::vector<Constant*> iMask(mVWidth);
465 for(uint32_t i = 0; i < mVWidth; ++i)
466 {
467 if(((4 * b) <= i) && (i < (4 * (b + 1))))
468 {
469 iMask[i] = C(i % 4 + mVWidth);
470 }
471 else
472 {
473 iMask[i] = C(i);
474 }
475 }
476 Constant* insertMask = ConstantVector::get(iMask);
477 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
478 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
479 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
480 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
481 }
482
483 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
484 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
485 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
486 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
487 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
488 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
489 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
490 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
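// At this point the data has been transposed from AOS to SOA: for an 8-wide SIMD
// (illustrative), elements[0] now holds x0..x7, elements[1] y0..y7, elements[2] z0..z7
// and elements[3] w0..w7 for the eight fetched vertices.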
491
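// Fill any components that weren't fetched with their defaults (0, 0, 0, 1). The cases
// below fall through intentionally: e.g. numComponents == 2 enters at case 3 and sets
// z = 0 and w = 1, leaving the fetched x and y untouched.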
492 switch(numComponents + 1)
493 {
494 case 1: elements[0] = VIMMED1(0.0f);
495 case 2: elements[1] = VIMMED1(0.0f);
496 case 3: elements[2] = VIMMED1(0.0f);
497 case 4: elements[3] = VIMMED1(1.0f);
498 }
499
500 for(uint32_t c = 0; c < 4; ++c)
501 {
502 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
503 STORE(elements[c], dest);
504 }
505 }
506 }
507
508 // returns true for odd formats (components that aren't 8, 16, or 32 bits) that require special gather handling
509 bool FetchJit::IsOddFormat(SWR_FORMAT format)
510 {
511 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
512 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32)
513 {
514 return true;
515 }
516 return false;
517 }
518
519 // format is uniform if all components are the same size and type
520 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
521 {
522 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
523 uint32_t bpc0 = info.bpc[0];
524 uint32_t type0 = info.type[0];
525
526 for (uint32_t c = 1; c < info.numComps; ++c)
527 {
528 if (bpc0 != info.bpc[c] || type0 != info.type[c])
529 {
530 return false;
531 }
532 }
533 return true;
534 }
535
536 // unpacks components based on format
537 // foreach component in the pixel
538 // mask off everything but this component
539 // shift component to LSB
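// e.g. (a sketch) for a packed 5-6-5 format the loop below walks bitOffset 0, 5, 11,
// extracting the components with masks 0x1F, 0x7E0 and 0xF800 and placing each one,
// shifted down to the LSB, into result[info.swizzle[c]]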
540 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
541 {
542 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
543
544 uint32_t bitOffset = 0;
545 for (uint32_t c = 0; c < info.numComps; ++c)
546 {
547 uint32_t swizzledIndex = info.swizzle[c];
548 uint32_t compBits = info.bpc[c];
549 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
550 Value* comp = AND(vInput, bitmask);
551 comp = LSHR(comp, bitOffset);
552
553 result[swizzledIndex] = comp;
554 bitOffset += compBits;
555 }
556 }
557
558 // gather for odd component size formats
559 // gather SIMD full pixels per lane, then shift/mask to move each component into its
560 // own vector
561 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4])
562 {
563 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
564
565 // only works if pixel size is <= 32bits
566 SWR_ASSERT(info.bpp <= 32);
567
568 Value* gather = VUNDEF_I();
569
570 // assign defaults
571 for (uint32_t comp = 0; comp < 4; ++comp)
572 {
573 result[comp] = VIMMED1((int)info.defaults[comp]);
574 }
575
576 // gather SIMD pixels
577 for (uint32_t e = 0; e < JM()->mVWidth; ++e)
578 {
579 Value* elemOffset = VEXTRACT(offsets, C(e));
580 Value* load = GEP(pBase, elemOffset);
581
582 // load the proper amount of data based on component size
583 switch (info.bpp)
584 {
585 case 8: load = POINTER_CAST(load, Type::getInt8PtrTy(JM()->mContext)); break;
586 case 16: load = POINTER_CAST(load, Type::getInt16PtrTy(JM()->mContext)); break;
587 case 32: load = POINTER_CAST(load, Type::getInt32PtrTy(JM()->mContext)); break;
588 default: SWR_ASSERT(0);
589 }
590
591 // load pixel
592 Value *val = LOAD(load);
593
594 // zero extend to 32bit integer
595 val = INT_CAST(val, mInt32Ty, false);
596
597 // store in simd lane
598 gather = VINSERT(gather, val, C(e));
599 }
600
601 UnpackComponents(format, gather, result);
602
603 // cast to fp32
604 result[0] = BITCAST(result[0], mSimdFP32Ty);
605 result[1] = BITCAST(result[1], mSimdFP32Ty);
606 result[2] = BITCAST(result[2], mSimdFP32Ty);
607 result[3] = BITCAST(result[3], mSimdFP32Ty);
608 }
609
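// A sketch of the conversion: only normalized components are converted here. 8-bit UNORM
// is scaled by 1/(2^8 - 1), so 255 -> 1.0f; 8-bit SNORM is scaled by 1/(2^7 - 1), so
// 127 -> 1.0f (see the @todo below about clamping the most-negative value).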
610 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
611 {
612 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
613
614 for (uint32_t c = 0; c < info.numComps; ++c)
615 {
616 uint32_t compIndex = info.swizzle[c];
617
618 // skip any conversion on UNUSED components
619 if (info.type[c] == SWR_TYPE_UNUSED)
620 {
621 continue;
622 }
623
624 if (info.isNormalized[c])
625 {
626 if (info.type[c] == SWR_TYPE_SNORM)
627 {
628 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
629
630 /// result = c * (1.0f / (2^(n-1) - 1))
631 uint32_t n = info.bpc[c];
632 uint32_t pow2 = 1 << (n - 1);
633 float scale = 1.0f / (float)(pow2 - 1);
634 Value *vScale = VIMMED1(scale);
635 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
636 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
637 texels[compIndex] = FMUL(texels[compIndex], vScale);
638 }
639 else
640 {
641 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
642
643 /// result = c * (1.0f / (2^n - 1))
644 uint32_t n = info.bpc[c];
645 uint32_t pow2 = 1 << n;
646 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
647 if (n == 24)
648 {
649 float scale = (float)(pow2 - 1);
650 Value* vScale = VIMMED1(scale);
651 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
652 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
653 texels[compIndex] = FDIV(texels[compIndex], vScale);
654 }
655 else
656 {
657 float scale = 1.0f / (float)(pow2 - 1);
658 Value *vScale = VIMMED1(scale);
659 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
660 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
661 texels[compIndex] = FMUL(texels[compIndex], vScale);
662 }
663 }
664 continue;
665 }
666 }
667 }
668
669 //////////////////////////////////////////////////////////////////////////
670 /// @brief Loads attributes from memory using AVX2 GATHER(s)
671 /// @param fetchState - info about attributes to be fetched from memory
672 /// @param fetchInfo - first argument passed to fetch shader
673 /// @param streams - value pointer to the current vertex stream
674 /// @param vIndices - vector value of indices to gather
675 /// @param pVtxOut - value pointer to output simdvertex struct
676 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo,
677 Value* streams, Value* vIndices, Value* pVtxOut)
678 {
679 uint32_t currentVertexElement = 0;
680 uint32_t outputElt = 0;
681 Value* vVertexElements[4];
682
683 Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
684 Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
685 Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
686 Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
687 curInstance->setName("curInstance");
688
689 for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
690 {
691 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
692 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
693 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
694 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
695
696 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
697
698 // VGATHER* takes an *i8 src pointer
699 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
700
701 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
702 Value *vStride = VBROADCAST(stride);
703
704 // max vertex index that is fully in bounds
705 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
706 maxVertex = LOAD(maxVertex);
707
708 Value *vCurIndices;
709 Value *startOffset;
710 if(ied.InstanceEnable)
711 {
712 Value* stepRate = C(ied.InstanceDataStepRate);
713
714 // prevent a div by 0 for 0 step rate
715 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
716 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
717
718 // calc the current offset into instanced data buffer
719 Value* calcInstance = UDIV(curInstance, stepRate);
720
721 // if step rate is 0, every instance gets instance 0
722 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
723
724 vCurIndices = VBROADCAST(calcInstance);
725
726 startOffset = startInstance;
727 }
728 else
729 {
730 // offset indices by baseVertex
731 vCurIndices = ADD(vIndices, vBaseVertex);
732
733 startOffset = startVertex;
734 }
735
736 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
737 // do 64bit address offset calculations.
738
739 // calculate byte offset to the start of the VB
740 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
741 pStreamBase = GEP(pStreamBase, baseOffset);
742
743 // if we have a start offset, subtract from max vertex. Used for OOB check
744 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
745 Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
746 // if we have a negative value, we're already OOB. clamp at 0.
747 maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
748
749 // Load the in bounds size of a partially valid vertex
750 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
751 partialInboundsSize = LOAD(partialInboundsSize);
752 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
753 Value* vBpp = VBROADCAST(C(info.Bpp));
754 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
755
756 // is the element <= the partially valid size?
757 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
758
759 // are vertices partially OOB?
760 Value* vMaxVertex = VBROADCAST(maxVertex);
761 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
762
763 // are vertices fully in bounds?
764 Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
765
766 // blend in any partially OOB indices that have valid elements
767 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
768 vGatherMask = VMASK(vGatherMask);
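// e.g. (illustrative) with maxVertex == 10: indices < 10 gather normally, an index == 10
// gathers only if this element fits inside partialInboundsSize, and indices > 10 have
// their mask bits cleared so the gather returns gatherSrc for those lanes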
769
770 // calculate the actual offsets into the VB
771 Value* vOffsets = MUL(vCurIndices, vStride);
772 vOffsets = ADD(vOffsets, vAlignmentOffsets);
773
774 // Packing and component control
775 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
776 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
777 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
778
779 // Special gather/conversion for formats without equal component sizes
780 if (IsOddFormat((SWR_FORMAT)ied.Format))
781 {
782 // Only full 4 component fetch is supported for odd formats
783 SWR_ASSERT(compMask == XYZW);
784 Value* pResults[4];
785 CreateGatherOddFormats((SWR_FORMAT)ied.Format, pStreamBase, vOffsets, pResults);
786 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
787
788 // check for InstanceID SGV
789 if (fetchState.InstanceIdEnable && (fetchState.InstanceIdElementOffset == nInputElt))
790 {
791 SWR_ASSERT(fetchState.InstanceIdComponentNumber < (sizeof(pResults) / sizeof(pResults[0])));
792
793 // Load a SIMD of InstanceIDs
794 pResults[fetchState.InstanceIdComponentNumber] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
795 }
796 // check for VertexID SGV
797 else if (fetchState.VertexIdEnable && (fetchState.VertexIdElementOffset == nInputElt))
798 {
799 SWR_ASSERT(fetchState.VertexIdComponentNumber < (sizeof(pResults) / sizeof(pResults[0])));
800
801 // Load a SIMD of VertexIDs
802 pResults[fetchState.VertexIdComponentNumber] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
803 }
804
805 StoreVertexElements(pVtxOut, outputElt++, 4, pResults);
806 currentVertexElement = 0;
807 }
808 else if(info.type[0] == SWR_TYPE_FLOAT)
809 {
810 ///@todo: support 64 bit vb accesses
811 Value* gatherSrc = VIMMED1(0.0f);
812
813 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
814 "Unsupported format for standard gather fetch.");
815
816 // Gather components from memory to store in a simdvertex structure
817 switch(bpc)
818 {
819 case 16:
820 {
821 Value* vGatherResult[2];
822 Value *vMask;
823
824 // if we have at least one component out of x or y to fetch
825 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
826 // save mask as it is zero'd out after each gather
827 vMask = vGatherMask;
828
829 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
830 // e.g. result of first 8x32bit integer gather for 16bit components
831 // 256i - 0 1 2 3 4 5 6 7
832 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
833 //
834 }
835
836 // if we have at least one component out of z or w to fetch
837 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
838 // offset base to the next components(zw) in the vertex to gather
839 pStreamBase = GEP(pStreamBase, C((char)4));
840 vMask = vGatherMask;
841
842 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
843 // e.g. result of second 8x32bit integer gather for 16bit components
844 // 256i - 0 1 2 3 4 5 6 7
845 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
846 //
847 }
848
849 // if we have at least one component to shuffle into place
850 if(compMask){
851 const bool instanceIdEnable = (fetchState.InstanceIdEnable) && (fetchState.InstanceIdElementOffset == nInputElt);
852 const bool vertexIdEnable = (fetchState.VertexIdEnable) && (fetchState.VertexIdElementOffset == nInputElt);
853
854 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
855 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, fetchInfo, instanceIdEnable,
856 fetchState.InstanceIdComponentNumber, vertexIdEnable, fetchState.VertexIdComponentNumber);
857
858 // Shuffle gathered components into place in simdvertex struct
859 Shuffle16bpcGather(args); // outputs to vVertexElements ref
860 }
861 }
862 break;
863 case 32:
864 {
865 for (uint32_t i = 0; i < 4; i++)
866 {
867 if (isComponentEnabled(compMask, i))
868 {
869 // check for InstanceID SGV
870 if ((fetchState.InstanceIdEnable) && (fetchState.InstanceIdElementOffset == nInputElt) && (fetchState.InstanceIdComponentNumber == currentVertexElement))
871 {
872 // Load a SIMD of InstanceIDs
873 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
874 }
875 // check for VertexID SGV
876 else if ((fetchState.VertexIdEnable) && (fetchState.VertexIdElementOffset == nInputElt) && (fetchState.VertexIdComponentNumber == currentVertexElement))
877 {
878 // Load a SIMD of VertexIDs
879 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
880 }
881 // if we need to gather the component
882 else if (compCtrl[i] == StoreSrc)
883 {
884 // save mask as it is zero'd out after each gather
885 Value *vMask = vGatherMask;
886
887 // Gather a SIMD of vertices
888 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
889 }
890 else
891 {
892 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
893 }
894
895 if (currentVertexElement > 3)
896 {
897 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
898 // reset to the next vVertexElement to output
899 currentVertexElement = 0;
900 }
901
902 }
903
904 // offset base to the next component in the vertex to gather
905 pStreamBase = GEP(pStreamBase, C((char)4));
906 }
907 }
908 break;
909 default:
910 SWR_ASSERT(0, "Tried to fetch invalid FP format");
911 break;
912 }
913 }
914 else
915 {
916 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
917 ConversionType conversionType = CONVERT_NONE;
918
919 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
920 "Unsupported format for standard gather fetch.");
921
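// Note: UNORM and SNORM intentionally fall through to the UINT/SINT cases below so they
// pick up the matching integer extend cast; only the conversion type differs.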
922 switch(info.type[0])
923 {
924 case SWR_TYPE_UNORM:
925 conversionType = CONVERT_NORMALIZED;
926 case SWR_TYPE_UINT:
927 extendCastType = Instruction::CastOps::ZExt;
928 break;
929 case SWR_TYPE_SNORM:
930 conversionType = CONVERT_NORMALIZED;
931 case SWR_TYPE_SINT:
932 extendCastType = Instruction::CastOps::SExt;
933 break;
934 case SWR_TYPE_USCALED:
935 conversionType = CONVERT_USCALED;
936 extendCastType = Instruction::CastOps::UIToFP;
937 break;
938 case SWR_TYPE_SSCALED:
939 conversionType = CONVERT_SSCALED;
940 extendCastType = Instruction::CastOps::SIToFP;
941 break;
942 default:
943 break;
944 }
945
946 // value substituted when component of gather is masked
947 Value* gatherSrc = VIMMED1(0);
948
949 // Gather components from memory to store in a simdvertex structure
950 switch (bpc)
951 {
952 case 8:
953 {
954 // if we have at least one component to fetch
955 if(compMask)
956 {
957 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
958 // e.g. result of an 8x32bit integer gather for 8bit components
959 // 256i - 0 1 2 3 4 5 6 7
960 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
961
962 const bool instanceIdEnable = fetchState.InstanceIdEnable && (fetchState.InstanceIdElementOffset == nInputElt);
963 const bool vertexIdEnable = fetchState.VertexIdEnable && (fetchState.VertexIdElementOffset == nInputElt);
964
965 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
966 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle, fetchInfo,
967 instanceIdEnable, fetchState.InstanceIdComponentNumber, vertexIdEnable, fetchState.VertexIdComponentNumber);
968
969 // Shuffle gathered components into place in simdvertex struct
970 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
971 }
972 }
973 break;
974 case 16:
975 {
976 Value* vGatherResult[2];
977 Value *vMask;
978
979 // if we have at least one component out of x or y to fetch
980 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
981 // save mask as it is zero'd out after each gather
982 vMask = vGatherMask;
983
984 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
985 // e.g. result of first 8x32bit integer gather for 16bit components
986 // 256i - 0 1 2 3 4 5 6 7
987 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
988 //
989 }
990
991 // if we have at least one component out of z or w to fetch
992 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
993 // offset base to the next components(zw) in the vertex to gather
994 pStreamBase = GEP(pStreamBase, C((char)4));
995 vMask = vGatherMask;
996
997 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
998 // e.g. result of second 8x32bit integer gather for 16bit components
999 // 256i - 0 1 2 3 4 5 6 7
1000 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1001 //
1002 }
1003
1004 // if we have at least one component to shuffle into place
1005 if(compMask){
1006 const bool instanceIdEnable = fetchState.InstanceIdEnable && (fetchState.InstanceIdElementOffset == nInputElt);
1007 const bool vertexIdEnable = fetchState.VertexIdEnable && (fetchState.VertexIdElementOffset == nInputElt);
1008
1009 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1010 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, fetchInfo, instanceIdEnable,
1011 fetchState.InstanceIdComponentNumber, vertexIdEnable, fetchState.VertexIdComponentNumber);
1012
1013 // Shuffle gathered components into place in simdvertex struct
1014 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1015 }
1016 }
1017 break;
1018 case 32:
1019 {
1020 SWR_ASSERT(conversionType == CONVERT_NONE);
1021
1022 // Gather 32bpc components and place them in the simdvertex struct
1023 for (uint32_t i = 0; i < 4; i++)
1024 {
1025 if (isComponentEnabled(compMask, i))
1026 {
1027 // check for InstanceID SGV
1028 if (fetchState.InstanceIdEnable && (fetchState.InstanceIdElementOffset == nInputElt) && (fetchState.InstanceIdComponentNumber == currentVertexElement))
1029 {
1030 // Load a SIMD of InstanceIDs
1031 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
1032 }
1033 // check for VertexID SGV
1034 else if (fetchState.VertexIdEnable && (fetchState.VertexIdElementOffset == nInputElt) && (fetchState.VertexIdComponentNumber == currentVertexElement))
1035 {
1036 // Load a SIMD of VertexIDs
1037 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
1038 }
1039 // if we need to gather the component
1040 else if (compCtrl[i] == StoreSrc)
1041 {
1042 // save mask as it is zero'd out after each gather
1043 Value *vMask = vGatherMask;
1044
1045 vVertexElements[currentVertexElement++] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1046
1047 // e.g. result of a single 8x32bit integer gather for 32bit components
1048 // 256i - 0 1 2 3 4 5 6 7
1049 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1050 }
1051 else
1052 {
1053 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1054 }
1055
1056 if (currentVertexElement > 3)
1057 {
1058 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1059 // reset to the next vVertexElement to output
1060 currentVertexElement = 0;
1061 }
1062
1063 }
1064
1065 // offset base to the next component in the vertex to gather
1066 pStreamBase = GEP(pStreamBase, C((char)4));
1067 }
1068 }
1069 break;
1070 }
1071 }
1072 }
1073
1074 // if we have a partially filled vVertexElement struct, output it
1075 if(currentVertexElement > 0){
1076 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1077 }
1078 }
1079
1080 //////////////////////////////////////////////////////////////////////////
1081 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1082 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1083 /// support
1084 /// @param pIndices - pointer to 8 bit indices
1085 /// @param pLastIndex - pointer to last valid index
1086 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1087 {
1088 // can fit 4 8 bit integers per vWidth lane
1089 Value* vIndices = VUNDEF_I();
1090
1091 // store 0 index on stack to be used to conditionally load from if index address is OOB
1092 Value* pZeroIndex = ALLOCA(mInt8Ty);
1093 STORE(C((uint8_t)0), pZeroIndex);
1094
1095 // Load a SIMD of index pointers
1096 for(int64_t lane = 0; lane < mVWidth; lane++)
1097 {
1098 // Calculate the address of the requested index
1099 Value *pIndex = GEP(pIndices, C(lane));
1100
1101 // check if the address is less than the max index,
1102 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1103
1104 // if valid, load the index. if not, load 0 from the stack
1105 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1106 Value *index = LOAD(pValid, "valid index");
1107
1108 // zero extended index to 32 bits and insert into the correct simd lane
1109 index = Z_EXT(index, mInt32Ty);
1110 vIndices = VINSERT(vIndices, index, lane);
1111 }
1112 return vIndices;
1113 }
1114
1115 //////////////////////////////////////////////////////////////////////////
1116 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1117 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1118 /// support
1119 /// @param pIndices - pointer to 16 bit indices
1120 /// @param pLastIndex - pointer to last valid index
1121 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1122 {
1123 // can fit 2 16 bit integers per vWidth lane
1124 Value* vIndices = VUNDEF_I();
1125
1126 // store 0 index on stack to be used to conditionally load from if index address is OOB
1127 Value* pZeroIndex = ALLOCA(mInt16Ty);
1128 STORE(C((uint16_t)0), pZeroIndex);
1129
1130 // Load a SIMD of index pointers
1131 for(int64_t lane = 0; lane < mVWidth; lane++)
1132 {
1133 // Calculate the address of the requested index
1134 Value *pIndex = GEP(pIndices, C(lane));
1135
1136 // check if the address is less than the max index,
1137 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1138
1139 // if valid, load the index. if not, load 0 from the stack
1140 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1141 Value *index = LOAD(pValid, "valid index");
1142
1143 // zero extended index to 32 bits and insert into the correct simd lane
1144 index = Z_EXT(index, mInt32Ty);
1145 vIndices = VINSERT(vIndices, index, lane);
1146 }
1147 return vIndices;
1148 }
1149
1150 //////////////////////////////////////////////////////////////////////////
1151 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1152 /// @param pIndices - pointer to 32 bit indices
1153 /// @param pLastIndex - pointer to last valid index
1154 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1155 {
1156 DataLayout dL(JM()->mpCurrentModule);
1157 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1158 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1159 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1160
1161 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1162 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1163 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1164 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1165
1166 // create a vector of index counts from the base index ptr passed into the fetch
1167 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1168 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1169
1170 // compare index count to the max valid index
1171 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1172 // vIndexOffsets 0 1 2 3 4 5 6 7
1173 // ------------------------------
1174 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1175 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1176 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1177 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1178
1179 // VMASKLOAD takes an *i8 src pointer
1180 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1181
1182 // Load the indices; OOB loads 0
1183 return MASKLOADD(pIndices,vIndexMask);
1184 }
1185
1186 //////////////////////////////////////////////////////////////////////////
1187 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1188 /// denormalizes if needed, converts to F32 if needed, and positions in
1189 /// the proper SIMD rows to be output to the simdvertex structure
1190 /// @param args: (tuple of args, listed below)
1191 /// @param vGatherResult - 8 gathered 8bpc vertices
1192 /// @param pVtxOut - base pointer to output simdvertex struct
1193 /// @param extendType - sign extend or zero extend
1194 /// @param conversionType - normalization/scaling conversion to apply (see ConversionType)
1195 /// @param currentVertexElement - reference to the current vVertexElement
1196 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1197 /// @param compMask - component packing mask
1198 /// @param compCtrl - component control val
1199 /// @param vVertexElements[4] - vertex components to output
1200 /// @param swizzle[4] - component swizzle location
1201 /// @param fetchInfo - fetch shader info
1202 /// @param instanceIdEnable - InstanceID enabled?
1203 /// @param instanceIdComponentNumber - InstanceID component override
1204 /// @param vertexIdEnable - VertexID enabled?
1205 /// @param vertexIdComponentNumber - VertexID component override
1206 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1207 {
1208 // Unpack tuple args
1209 Value*& vGatherResult = std::get<0>(args);
1210 Value* pVtxOut = std::get<1>(args);
1211 const Instruction::CastOps extendType = std::get<2>(args);
1212 const ConversionType conversionType = std::get<3>(args);
1213 uint32_t &currentVertexElement = std::get<4>(args);
1214 uint32_t &outputElt = std::get<5>(args);
1215 const ComponentEnable compMask = std::get<6>(args);
1216 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1217 Value* (&vVertexElements)[4] = std::get<8>(args);
1218 const uint32_t (&swizzle)[4] = std::get<9>(args);
1219 Value *fetchInfo = std::get<10>(args);
1220 const bool instanceIdEnable = std::get<11>(args);
1221 const uint32_t instanceIdComponentNumber = std::get<12>(args);
1222 const bool vertexIdEnable = std::get<13>(args);
1223 const uint32_t vertexIdComponentNumber = std::get<14>(args);
1224
1225 // cast types
1226 Type* vGatherTy = mSimdInt32Ty;
1227 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1228
1229 // have to do extra work for sign extending
1230 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1231 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1232 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1233
1234 // shuffle mask, including any swizzling
1235 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1236 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1237 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1238 char(y), char(y+4), char(y+8), char(y+12),
1239 char(z), char(z+4), char(z+8), char(z+12),
1240 char(w), char(w+4), char(w+8), char(w+12),
1241 char(x), char(x+4), char(x+8), char(x+12),
1242 char(y), char(y+4), char(y+8), char(y+12),
1243 char(z), char(z+4), char(z+8), char(z+12),
1244 char(w), char(w+4), char(w+8), char(w+12)});
1245
1246 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1247 // after pshufb: group components together in each 128bit lane
1248 // 256i - 0 1 2 3 4 5 6 7
1249 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
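// (the swizzle only changes which source byte feeds each group: e.g. if swizzle[0] == 2,
// the first group pulls bytes 2, 6, 10, 14 of each 128bit lane instead of 0, 4, 8, 12)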
1250
1251 Value* vi128XY = nullptr;
1252 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1253 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1254 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1255 // 256i - 0 1 2 3 4 5 6 7
1256 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1257 }
1258
1259 // do the same for zw components
1260 Value* vi128ZW = nullptr;
1261 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1262 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1263 }
1264
1265 // init denormalize variables if needed
1266 Instruction::CastOps fpCast;
1267 Value* conversionFactor;
1268
1269 switch (conversionType)
1270 {
1271 case CONVERT_NORMALIZED:
1272 fpCast = Instruction::CastOps::SIToFP;
1273 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1274 break;
1275 case CONVERT_SSCALED:
1276 fpCast = Instruction::CastOps::SIToFP;
1277 conversionFactor = VIMMED1((float)(1.0));
1278 break;
1279 case CONVERT_USCALED:
1280 SWR_ASSERT(0, "Type should not be sign extended!");
1281 conversionFactor = nullptr;
1282 break;
1283 default:
1284 SWR_ASSERT(conversionType == CONVERT_NONE);
1285 conversionFactor = nullptr;
1286 break;
1287 }
1288
1289 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1290 for (uint32_t i = 0; i < 4; i++)
1291 {
1292 if (isComponentEnabled(compMask, i))
1293 {
1294 // check for InstanceID SGV
1295 if (instanceIdEnable && (instanceIdComponentNumber == currentVertexElement))
1296 {
1297 // Load a SIMD of InstanceIDs
1298 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
1299 }
1300 // check for VertexID SGV
1301 else if (vertexIdEnable && (vertexIdComponentNumber == currentVertexElement))
1302 {
1303 // Load a SIMD of VertexIDs
1304 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
1305 }
1306 else if (compCtrl[i] == ComponentControl::StoreSrc)
1307 {
1308 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1309 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1310 // if x or y, use vi128XY permute result, else use vi128ZW
1311 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1312
1313 // sign extend
1314 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1315
1316 // denormalize if needed
1317 if (conversionType != CONVERT_NONE)
1318 {
1319 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1320 }
1321 currentVertexElement++;
1322 }
1323 else
1324 {
1325 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1326 }
1327
1328 if (currentVertexElement > 3)
1329 {
1330 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1331 // reset to the next vVertexElement to output
1332 currentVertexElement = 0;
1333 }
1334 }
1335 }
1336 }
1337 // else zero extend
1338 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1339 {
1340 // init denormalize variables if needed
1341 Instruction::CastOps fpCast;
1342 Value* conversionFactor;
1343
1344 switch (conversionType)
1345 {
1346 case CONVERT_NORMALIZED:
1347 fpCast = Instruction::CastOps::UIToFP;
1348 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1349 break;
1350 case CONVERT_USCALED:
1351 fpCast = Instruction::CastOps::UIToFP;
1352 conversionFactor = VIMMED1((float)(1.0));
1353 break;
1354 case CONVERT_SSCALED:
1355 SWR_ASSERT(0, "Type should not be zero extended!");
1356 conversionFactor = nullptr;
1357 break;
1358 default:
1359 SWR_ASSERT(conversionType == CONVERT_NONE);
1360 conversionFactor = nullptr;
1361 break;
1362 }
1363
1364 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1365 for (uint32_t i = 0; i < 4; i++)
1366 {
1367 if (isComponentEnabled(compMask, i))
1368 {
1369 // check for InstanceID SGV
1370 if (instanceIdEnable && (instanceIdComponentNumber == currentVertexElement))
1371 {
1372 // Load a SIMD of InstanceIDs
1373 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
1374 }
1375 // check for VertexID SGV
1376 else if (vertexIdEnable && (vertexIdComponentNumber == currentVertexElement))
1377 {
1378 // Load a SIMD of VertexIDs
1379 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
1380 }
1381 else if (compCtrl[i] == ComponentControl::StoreSrc)
1382 {
1383 // pshufb masks for each component
1384 Value* vConstMask;
1385 switch (swizzle[i])
1386 {
1387 case 0:
1388 // x shuffle mask
1389 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1390 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1391 break;
1392 case 1:
1393 // y shuffle mask
1394 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1395 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1396 break;
1397 case 2:
1398 // z shuffle mask
1399 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1400 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1401 break;
1402 case 3:
1403 // w shuffle mask
1404 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1405 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1406 break;
1407 default:
1408 vConstMask = nullptr;
1409 break;
1410 }
1411
1412 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1413 // after pshufb for x channel
1414 // 256i - 0 1 2 3 4 5 6 7
1415 // x000 x000 x000 x000 x000 x000 x000 x000
1416
1417 // denormalize if needed
1418 if (conversionType != CONVERT_NONE)
1419 {
1420 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1421 }
1422 currentVertexElement++;
1423 }
1424 else
1425 {
1426 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1427 }
1428
1429 if (currentVertexElement > 3)
1430 {
1431 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1432 // reset to the next vVertexElement to output
1433 currentVertexElement = 0;
1434 }
1435 }
1436 }
1437 }
1438 else
1439 {
1440 SWR_ASSERT(0, "Unsupported conversion type");
1441 }
1442 }
1443
1444 //////////////////////////////////////////////////////////////////////////
1445 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1446 /// denormalizes if needed, converts to F32 if needed, and positions in
1447 /// the proper SIMD rows to be output to the simdvertex structure
1448 /// @param args: (tuple of args, listed below)
1449 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1450 /// @param pVtxOut - base pointer to output simdvertex struct
1451 /// @param extendType - sign extend or zero extend
1452 /// @param conversionType - normalization/scaling conversion to apply (see ConversionType)
1453 /// @param currentVertexElement - reference to the current vVertexElement
1454 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1455 /// @param compMask - component packing mask
1456 /// @param compCtrl - component control val
1457 /// @param vVertexElements[4] - vertex components to output
1458 /// @param fetchInfo - fetch shader info
1459 /// @param instanceIdEnable - InstanceID enabled?
1460 /// @param instanceIdComponentNumber - InstanceID component override
1461 /// @param vertexIdEnable - VertexID enabled?
1462 /// @param vertexIdComponentNumber - VertexID component override
1463 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1464 {
1465 // Unpack tuple args
1466 Value* (&vGatherResult)[2] = std::get<0>(args);
1467 Value* pVtxOut = std::get<1>(args);
1468 const Instruction::CastOps extendType = std::get<2>(args);
1469 const ConversionType conversionType = std::get<3>(args);
1470 uint32_t &currentVertexElement = std::get<4>(args);
1471 uint32_t &outputElt = std::get<5>(args);
1472 const ComponentEnable compMask = std::get<6>(args);
1473 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1474 Value* (&vVertexElements)[4] = std::get<8>(args);
1475 Value *fetchInfo = std::get<9>(args);
1476 const bool instanceIdEnable = std::get<10>(args);
1477 const uint32_t instanceIdComponentNumber = std::get<11>(args);
1478 const bool vertexIdEnable = std::get<12>(args);
1479 const uint32_t vertexIdComponentNumber = std::get<13>(args);
1480
1481 // cast types
1482 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1483 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1484
1485 // have to do extra work for sign extending
1486 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
1487 (extendType == Instruction::CastOps::FPExt))
1488 {
1489 // is this a half-precision (FP16) float fetch?
1490 bool bFP = (extendType == Instruction::CastOps::FPExt);
1491
1492 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1493 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1494
1495 // shuffle mask
1496 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1497 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1498 Value* vi128XY = nullptr;
1499 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1500 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1501 // after pshufb: group components together in each 128bit lane
1502 // 256i - 0 1 2 3 4 5 6 7
1503 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1504
1505 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1506 // after PERMD: move and pack xy components into each 128bit lane
1507 // 256i - 0 1 2 3 4 5 6 7
1508 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1509 }
1510
1511 // do the same for zw components
1512 Value* vi128ZW = nullptr;
1513 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1514 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1515 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1516 }
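// At this point (assuming an 8-wide SIMD, i.e. mVWidth == 8) vi128XY viewed as 2 x i128
// holds the eight 16-bit x values in element 0 and the eight y values in element 1;
// vi128ZW is laid out the same way for z and w. The per-component loop below selects
// the right vector and 128-bit lane from these.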
1517
1518 // init denormalize variables if needed
1519 Instruction::CastOps IntToFpCast;
1520 Value* conversionFactor;
1521
1522 switch (conversionType)
1523 {
1524 case CONVERT_NORMALIZED:
1525 IntToFpCast = Instruction::CastOps::SIToFP;
1526 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1527 break;
1528 case CONVERT_SSCALED:
1529 IntToFpCast = Instruction::CastOps::SIToFP;
1530 conversionFactor = VIMMED1(1.0f);
1531 break;
1532 case CONVERT_USCALED:
1533 SWR_ASSERT(0, "Type should not be sign extended!");
1534 conversionFactor = nullptr;
1535 break;
1536 default:
1537 SWR_ASSERT(conversionType == CONVERT_NONE);
1538 conversionFactor = nullptr;
1539 break;
1540 }
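// Worked example for the normalized case: a raw SNORM16 value of 16384 is sign
// extended, converted with SIToFP and multiplied by 1/32767, giving roughly 0.5f;
// for SSCALED the factor is 1.0, so the value is simply converted to float.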
1541
1542 // sign extend all enabled components. If we have filled all 4 vVertexElements, output to the current simdvertex
1543 for (uint32_t i = 0; i < 4; i++)
1544 {
1545 if (isComponentEnabled(compMask, i))
1546 {
1547 // check for InstanceID SGV
1548 if (instanceIdEnable && (instanceIdComponentNumber == currentVertexElement))
1549 {
1550 // Load a SIMD of InstanceIDs
1551 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
1552 }
1553 // check for VertexID SGV
1554 else if (vertexIdEnable && (vertexIdComponentNumber == currentVertexElement))
1555 {
1556 // Load a SIMD of VertexIDs
1557 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
1558 }
1559 else if (compCtrl[i] == ComponentControl::StoreSrc)
1560 {
1561 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1562 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1563 // if x or y, use vi128XY permute result, else use vi128ZW
1564 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1565
1566 if (bFP) {
1567 // extract 128 bit lanes and convert the packed half floats to single precision
1568 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
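// CVTPH2PS converts the eight packed half floats to single precision in one step
// (presumably lowering to the F16C vcvtph2ps form on AVX targets).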
1569 }
1570 else {
1571 // extract 128 bit lanes to sign extend each component
1572 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
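// PMOVSXWD sign extends each packed 16-bit value to 32 bits,
// e.g. 0xFFFF becomes 0xFFFFFFFF (-1) while 0x7FFF stays 0x00007FFF.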
1573
1574 // denormalize if needed
1575 if (conversionType != CONVERT_NONE) {
1576 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1577 }
1578 }
1579 currentVertexElement++;
1580 }
1581 else
1582 {
1583 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1584 }
1585
1586 if (currentVertexElement > 3)
1587 {
1588 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1589 // reset to the next vVertexElement to output
1590 currentVertexElement = 0;
1591 }
1592 }
1593 }
1594 }
1595 // else zero extend
1596 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1597 {
1598 // pshufb masks for each component
1599 Value* vConstMask[2];
1600 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
1601 // x/z shuffle mask
1602 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1603 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1604 }
1605
1606 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
1607 // y/w shuffle mask
1608 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1609 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1610 }
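// Each gathered dword packs two 16-bit components. Mask 0 keeps the low word
// (bytes 0/1) of every dword and zeroes the high word; mask 1 moves the high word
// (bytes 2/3) down into the low word and zeroes the rest. E.g. a raw 16-bit value
// of 0xABCD comes out as 0x0000ABCD, i.e. zero extended to 32 bits.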
1611
1612 // init denormalize variables if needed
1613 Instruction::CastOps fpCast;
1614 Value* conversionFactor;
1615
1616 switch (conversionType)
1617 {
1618 case CONVERT_NORMALIZED:
1619 fpCast = Instruction::CastOps::UIToFP;
1620 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1621 break;
1622 case CONVERT_USCALED:
1623 fpCast = Instruction::CastOps::UIToFP;
1624 conversionFactor = VIMMED1(1.0f);
1625 break;
1626 case CONVERT_SSCALED:
1627 SWR_ASSERT(0, "Type should not be zero extended!");
1628 conversionFactor = nullptr;
1629 break;
1630 default:
1631 SWR_ASSERT(conversionType == CONVERT_NONE);
1632 conversionFactor = nullptr;
1633 break;
1634 }
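// Worked example for the normalized case: a raw UNORM16 value of 32768 is zero
// extended, converted with UIToFP and multiplied by 1/65535, giving roughly 0.5f;
// USCALED just converts the integer value to float unchanged.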
1635
1636 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1637 for (uint32_t i = 0; i < 4; i++)
1638 {
1639 if (isComponentEnabled(compMask, i))
1640 {
1641 // check for InstanceID SGV
1642 if (instanceIdEnable && (instanceIdComponentNumber == currentVertexElement))
1643 {
1644 // Load a SIMD of InstanceIDs
1645 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
1646 }
1647 // check for VertexID SGV
1648 else if (vertexIdEnable && (vertexIdComponentNumber == currentVertexElement))
1649 {
1650 // Load a SIMD of VertexIDs
1651 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
1652 }
1653 else if (compCtrl[i] == ComponentControl::StoreSrc)
1654 {
1655 // select correct constMask for x/z or y/w pshufb
1656 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1657 // if x or y, use the first gather result (xy), else use the second (zw)
1658 uint32_t selectedGather = (i < 2) ? 0 : 1;
1659
1660 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1661 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1662 // 256i - 0 1 2 3 4 5 6 7
1663 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1664
1665 // denormalize if needed
1666 if (conversionType != CONVERT_NONE)
1667 {
1668 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1669 }
1670 currentVertexElement++;
1671 }
1672 else
1673 {
1674 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1675 }
1676
1677 if (currentVertexElement > 3)
1678 {
1679 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1680 // reset to the next vVertexElement to output
1681 currentVertexElement = 0;
1682 }
1683 }
1684 }
1685 }
1686 else
1687 {
1688 SWR_ASSERT(0, "Unsupported extend type");
1689 }
1690 }
1691
1692 //////////////////////////////////////////////////////////////////////////
1693 /// @brief Output a simdvertex worth of elements to the current outputElt
1694 /// @param pVtxOut - base address of VIN output struct
1695 /// @param outputElt - simdvertex offset in VIN to write to
1696 /// @param numEltsToStore - number of simdvertex rows to write out
1697 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1698 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1699 {
1700 for(uint32_t c = 0; c < numEltsToStore; ++c)
1701 {
1702 // STORE expects FP32 x vWidth type, just bitcast if needed
1703 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
1704 #if FETCH_DUMP_VERTEX
1705 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
1706 #endif
1707 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1708 }
1709 #if FETCH_DUMP_VERTEX
1710 else
1711 {
1712 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
1713 }
1714 #endif
1715 // outputElt * 4 = offsetting by the size of a simdvertex
1716 // + c offsets to a 32bit x vWidth row within the current vertex
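// e.g. outputElt == 1 and c == 2 yields a GEP index of 1 * 4 + 2 == 6,
// i.e. the seventh FP32 x vWidth row from the start of pVtxOut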
1717 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1718 STORE(vVertexElements[c], dest);
1719 }
1720 }
1721
1722 //////////////////////////////////////////////////////////////////////////
1723 /// @brief Generates a constant vector of values based on the
1724 /// ComponentControl value
1725 /// @param ctrl - ComponentControl value
1726 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1727 {
1728 switch(ctrl)
1729 {
1730 case NoStore: return VUNDEF_I();
1731 case Store0: return VIMMED1(0);
1732 case Store1Fp: return VIMMED1(1.0f);
1733 case Store1Int: return VIMMED1(1);
1734 case StoreSrc:
1735 default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
1736 }
1737 }
1738
1739 //////////////////////////////////////////////////////////////////////////
1740 /// @brief Returns the enable mask for the specified component.
1741 /// @param enableMask - enable bits
1742 /// @param component - component to check if enabled.
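/// e.g. an enableMask of (ComponentEnable::X | ComponentEnable::Z) reports
/// components 0 and 2 as enabled and components 1 and 3 as disabled.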
1743 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1744 {
1745 switch (component)
1746 {
1747 // X
1748 case 0: return (enableMask & ComponentEnable::X);
1749 // Y
1750 case 1: return (enableMask & ComponentEnable::Y);
1751 // Z
1752 case 2: return (enableMask & ComponentEnable::Z);
1753 // W
1754 case 3: return (enableMask & ComponentEnable::W);
1755
1756 default: return false;
1757 }
1758 }
1759
1760
1761 //////////////////////////////////////////////////////////////////////////
1762 /// @brief JITs from fetch shader IR
1763 /// @param hJitMgr - JitManager handle
1764 /// @param hFunc - handle to the LLVM function IR
1765 /// @return PFN_FETCH_FUNC - pointer to fetch code
1766 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
1767 {
1768 const llvm::Function* func = (const llvm::Function*)hFunc;
1769 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1770 PFN_FETCH_FUNC pfnFetch;
1771
1772 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
1773 // MCJIT finalizes modules the first time code is JITted from them. Once a module is finalized, no new IR can be added to it.
1774 pJitMgr->mIsModuleFinalized = true;
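// Because of this, each fetch shader is built in its own module; JitCompileFetch
// below calls SetupNewModule before creating the next function.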
1775
1776 #if defined(KNOB_SWRC_TRACING)
1777 char fName[1024];
1778 const char *funcName = func->getName().data();
1779 snprintf(fName, sizeof(fName), "%s.bin", funcName);
1780 FILE *fd = fopen(fName, "wb");
1781 if (fd)
1782 { fwrite((void *)pfnFetch, 1, 2048, fd); fclose(fd); }
1783 #endif
1784
1785 return pfnFetch;
1786 }
1787
1788 //////////////////////////////////////////////////////////////////////////
1789 /// @brief JIT compiles fetch shader
1790 /// @param hJitMgr - JitManager handle
1791 /// @param state - fetch state to build function from
1792 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
1793 {
1794 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1795
1796 pJitMgr->SetupNewModule();
1797
1798 FetchJit theJit(pJitMgr);
1799 HANDLE hFunc = theJit.Create(state);
1800
1801 return JitFetchFunc(hJitMgr, hFunc);
1802 }
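// Rough usage sketch (illustrative only; populating FETCH_COMPILE_STATE is elided
// and depends on the draw's vertex layout and index buffer format):
//
//     FETCH_COMPILE_STATE state = {};
//     // ... describe the vertex elements / index type for the draw here ...
//     PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, state);
//     // the returned pointer is typically cached by the driver and invoked per
//     // draw with a populated SWR_FETCH_CONTEXT and a simdvertex output buffer.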