swr: [rasterizer jitter] support llvm-svn
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / fetch_jit.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_api.h"
31 #include "fetch_jit.h"
32 #include "builder.h"
33 #include "state_llvm.h"
34 #include "common/containers.hpp"
35 #include "llvm/IR/DataLayout.h"
36 #include <sstream>
37 #include <tuple>
38
39 //#define FETCH_DUMP_VERTEX 1
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 };
50
51 //////////////////////////////////////////////////////////////////////////
52 /// Interface to Jitting a fetch shader
53 //////////////////////////////////////////////////////////////////////////
54 struct FetchJit : public Builder
55 {
56 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
57
58 Function* Create(const FETCH_COMPILE_STATE& fetchState);
59 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
60 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
61 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
62
63 // package up Shuffle*bpcGatherd args into a tuple for convenience
64 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
65 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
66 const uint32_t (&)[4]> Shuffle8bpcArgs;
67 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
68
69 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
70 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
71 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
72
73 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
74
75 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
76
77 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
78 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
79 };
80
81 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
82 {
83 static std::size_t fetchNum = 0;
84
85 std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
86 fnName << fetchNum++;
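    // yields a unique symbol per jitted fetch shader: "FetchShader0", "FetchShader1", ...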
87
88 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
89 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
90
91 IRB()->SetInsertPoint(entry);
92
93 auto argitr = fetch->getArgumentList().begin();
94
95 // Fetch shader arguments
96 Value* fetchInfo = &*argitr; ++argitr;
97 fetchInfo->setName("fetchInfo");
98 Value* pVtxOut = &*argitr;
99 pVtxOut->setName("vtxOutput");
100 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
101 // index 0 (just the pointer to the simdvertex structure)
102 // index 1 (which element of the simdvertex structure to offset to; in this case 0)
103 // so the indices being i32's doesn't matter
104 // TODO: generate this GEP with a VECTOR structure type so this makes sense
105 std::vector<Value*> vtxInputIndices(2, C(0));
106 // GEP
107 pVtxOut = GEP(pVtxOut, C(0));
108 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
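    // after this cast, each GEP index on pVtxOut steps over one <vWidth x float> row,
    // which is how StoreVertexElements addresses rows later (outputElt * 4 + component)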
109
110 // SWR_FETCH_CONTEXT::pStreams
111 Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
112 streams->setName("pStreams");
113
114 // SWR_FETCH_CONTEXT::pIndices
115 Value* indices = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
116 indices->setName("pIndices");
117
118 // SWR_FETCH_CONTEXT::pLastIndex
119 Value* pLastIndex = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
120 pLastIndex->setName("pLastIndex");
121
122
123 Value* vIndices;
124 switch(fetchState.indexType)
125 {
126 case R8_UINT:
127 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
128 if(fetchState.bDisableIndexOOBCheck){
129 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
130 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
131 }
132 else{
133 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
134 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
135 }
136 break;
137 case R16_UINT:
138 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
139 if(fetchState.bDisableIndexOOBCheck){
140 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
141 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
142 }
143 else{
144 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
145 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
146 }
147 break;
148 case R32_UINT:
149 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
150 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
151 break; // incoming type is already 32bit int
152 default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break;
153 }
154
155 // store out vertex IDs
156 STORE(vIndices, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
157
158 // store out cut mask if enabled
159 if (fetchState.bEnableCutIndex)
160 {
161 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
162 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
163 STORE(cutMask, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
164 }
165
166 // Fetch attributes from memory and output to a simdvertex struct
167 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
168 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut)
169 : JitGatherVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut);
170
171 RET_VOID();
172
173 JitManager::DumpToFile(fetch, "src");
174
175 verifyFunction(*fetch);
176
177 #if HAVE_LLVM == 0x306
178 FunctionPassManager
179 #else
180 llvm::legacy::FunctionPassManager
181 #endif
182 setupPasses(JM()->mpCurrentModule);
183
184 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
185 setupPasses.add(createBreakCriticalEdgesPass());
186 setupPasses.add(createCFGSimplificationPass());
187 setupPasses.add(createEarlyCSEPass());
188 setupPasses.add(createPromoteMemoryToRegisterPass());
189
190 setupPasses.run(*fetch);
191
192 JitManager::DumpToFile(fetch, "se");
193
194 #if HAVE_LLVM == 0x306
195 FunctionPassManager
196 #else
197 llvm::legacy::FunctionPassManager
198 #endif
199 optPasses(JM()->mpCurrentModule);
200
201 ///@todo Haven't touched these either. Need to remove some of these and add others.
202 optPasses.add(createCFGSimplificationPass());
203 optPasses.add(createEarlyCSEPass());
204 optPasses.add(createInstructionCombiningPass());
205 optPasses.add(createInstructionSimplifierPass());
206 optPasses.add(createConstantPropagationPass());
207 optPasses.add(createSCCPPass());
208 optPasses.add(createAggressiveDCEPass());
209
210 optPasses.run(*fetch);
211 optPasses.run(*fetch);
212
213 JitManager::DumpToFile(fetch, "opt");
214
215 return fetch;
216 }
217
218 //////////////////////////////////////////////////////////////////////////
219 /// @brief Loads attributes from memory using LOADs, shuffling the
220 /// components into SOA form.
221 /// *Note* currently does not support component control,
222 /// component packing, or instancing
223 /// @param fetchState - info about attributes to be fetched from memory
224 /// @param streams - value pointer to the current vertex stream
225 /// @param vIndices - vector value of indices to load
226 /// @param pVtxOut - value pointer to output simdvertex struct
227 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut)
228 {
229 // Zack shuffles; a variant of the Charleston.
230
231 SWRL::UncheckedFixedVector<Value*, 16> vectors;
232
233 std::vector<Constant*> pMask(mVWidth);
234 for(uint32_t i = 0; i < mVWidth; ++i)
235 {
236 pMask[i] = (C(i < 4 ? i : 4));
237 }
238 Constant* promoteMask = ConstantVector::get(pMask);
239 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
240
241 Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
242
243 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
244 {
245 Value* elements[4] = {0};
246 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
247 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
248 uint32_t numComponents = info.numComps;
249 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
250
251 vectors.clear();
252
253 // load SWR_VERTEX_BUFFER_STATE::pData
254 Value *stream = LOAD(streams, {ied.StreamIndex, 2});
255
256 // load SWR_VERTEX_BUFFER_STATE::pitch
257 Value *stride = LOAD(streams, {ied.StreamIndex, 1});
258 stride = Z_EXT(stride, mInt64Ty);
259
260 // load SWR_VERTEX_BUFFER_STATE::size
261 Value *size = LOAD(streams, {ied.StreamIndex, 3});
262 size = Z_EXT(size, mInt64Ty);
263
264 Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride);
265
266 // Load from the stream.
267 for(uint32_t lane = 0; lane < mVWidth; ++lane)
268 {
269 // Get index
270 Value* index = VEXTRACT(vIndices, C(lane));
271 index = Z_EXT(index, mInt64Ty);
272
273 Value* offset = MUL(index, stride);
274 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
275 offset = ADD(offset, startVertexOffset);
276
277 if (!fetchState.bDisableIndexOOBCheck) {
278 // check for out of bound access, including partial OOB, and mask them to 0
279 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
280 Value *oob = ICMP_ULE(endOffset, size);
281 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
282 }
283
284 Value* pointer = GEP(stream, offset);
285 // We use a full-lane, but don't actually care.
286 Value* vptr = 0;
287
288 // get a pointer to a 4 component attrib in default address space
289 switch(bpc)
290 {
291 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
292 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
293 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
294 default: SWR_ASSERT(false, "Unsupported underlying bpp!");
295 }
296
297 // load 4 components of attribute
298 Value* vec = ALIGNED_LOAD(vptr, 1, false);
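    // 1-byte alignment: attribute data in the stream is not guaranteed to be naturally
    // aligned, so the load makes no alignment assumption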
299
300 // Convert To FP32 internally
301 switch(info.type[0])
302 {
303 case SWR_TYPE_UNORM:
304 switch(bpc)
305 {
306 case 8:
307 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
308 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
309 break;
310 case 16:
311 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
312 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
313 break;
314 default:
315 SWR_ASSERT(false, "Unsupported underlying type!");
316 break;
317 }
318 break;
319 case SWR_TYPE_SNORM:
320 switch(bpc)
321 {
322 case 8:
323 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
324 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
325 break;
326 case 16:
327 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
328 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
329 break;
330 default:
331 SWR_ASSERT(false, "Unsupported underlying type!");
332 break;
333 }
334 break;
335 case SWR_TYPE_UINT:
336 // Zero extend 8/16-bit UINT types to 32 bits.
337 switch(bpc)
338 {
339 case 8:
340 case 16:
341 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
342 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
343 break;
344 case 32:
345 break; // Pass through unchanged.
346 default:
347 SWR_ASSERT(false, "Unsupported underlying type!");
348 break;
349 }
350 break;
351 case SWR_TYPE_SINT:
352 // Sign extend SINT types.
353 switch(bpc)
354 {
355 case 8:
356 case 16:
357 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
358 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
359 break;
360 case 32:
361 break; // Pass through unchanged.
362 default:
363 SWR_ASSERT(false, "Unsupported underlying type!");
364 break;
365 }
366 break;
367 case SWR_TYPE_FLOAT:
368 switch(bpc)
369 {
370 case 32:
371 break; // Pass through unchanged.
372 default:
373 SWR_ASSERT(false, "Unsupported underlying type!");
374 }
375 break;
376 case SWR_TYPE_USCALED:
377 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
378 break;
379 case SWR_TYPE_SSCALED:
380 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
381 break;
382 case SWR_TYPE_UNKNOWN:
383 case SWR_TYPE_UNUSED:
384 SWR_ASSERT(false, "Unsupported type %d!", info.type[0]);
385 }
386
387 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
388 // uwvec: 4 x F32, undef value
389 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
390 vectors.push_back(wvec);
391 }
392
393 std::vector<Constant*> v01Mask(mVWidth);
394 std::vector<Constant*> v23Mask(mVWidth);
395 std::vector<Constant*> v02Mask(mVWidth);
396 std::vector<Constant*> v13Mask(mVWidth);
397
398 // Concatenate the vectors together.
399 elements[0] = VUNDEF_F();
400 elements[1] = VUNDEF_F();
401 elements[2] = VUNDEF_F();
402 elements[3] = VUNDEF_F();
403 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
404 {
405 v01Mask[4 * b + 0] = C(0 + 4 * b);
406 v01Mask[4 * b + 1] = C(1 + 4 * b);
407 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
408 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
409
410 v23Mask[4 * b + 0] = C(2 + 4 * b);
411 v23Mask[4 * b + 1] = C(3 + 4 * b);
412 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
413 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
414
415 v02Mask[4 * b + 0] = C(0 + 4 * b);
416 v02Mask[4 * b + 1] = C(2 + 4 * b);
417 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
418 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
419
420 v13Mask[4 * b + 0] = C(1 + 4 * b);
421 v13Mask[4 * b + 1] = C(3 + 4 * b);
422 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
423 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
424
425 std::vector<Constant*> iMask(mVWidth);
426 for(uint32_t i = 0; i < mVWidth; ++i)
427 {
428 if(((4 * b) <= i) && (i < (4 * (b + 1))))
429 {
430 iMask[i] = C(i % 4 + mVWidth);
431 }
432 else
433 {
434 iMask[i] = C(i);
435 }
436 }
437 Constant* insertMask = ConstantVector::get(iMask);
438 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
439 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
440 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
441 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
442 }
443
444 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
445 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
446 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
447 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
448 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
449 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
450 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
451 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
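    // 4x4 transpose complete: elements[0..3] now hold the x, y, z and w components
    // respectively across all SIMD lanes (AOS converted to SOA)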
452
453 switch(numComponents + 1)
454 {
455 case 1: elements[0] = VIMMED1(0.0f);
456 case 2: elements[1] = VIMMED1(0.0f);
457 case 3: elements[2] = VIMMED1(0.0f);
458 case 4: elements[3] = VIMMED1(1.0f);
459 }
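    // note: the case fall-through above fills defaults for components the format does
    // not supply: 0.0f for missing x/y/z, 1.0f for a missing w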
460
461 for(uint32_t c = 0; c < 4; ++c)
462 {
463 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
464 STORE(elements[c], dest);
465 }
466 }
467 }
468
469 //////////////////////////////////////////////////////////////////////////
470 /// @brief Loads attributes from memory using AVX2 GATHER(s)
471 /// @param fetchState - info about attributes to be fetched from memory
472 /// @param fetchInfo - first argument passed to fetch shader
473 /// @param streams - value pointer to the current vertex stream
474 /// @param vIndices - vector value of indices to gather
475 /// @param pVtxOut - value pointer to output simdvertex struct
476 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo,
477 Value* streams, Value* vIndices, Value* pVtxOut)
478 {
479 uint32_t currentVertexElement = 0;
480 uint32_t outputElt = 0;
481 Value* vVertexElements[4];
482
483 Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
484 Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
485 Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
486 Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
487 curInstance->setName("curInstance");
488
489 for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
490 {
491 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
492 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
493 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
494
495 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
496
497 // VGATHER* takes an *i8 src pointer
498 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
499
500 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
501 Value *vStride = VBROADCAST(stride);
502
503 // max vertex index that is fully in bounds
504 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
505 maxVertex = LOAD(maxVertex);
506
507 Value *vCurIndices;
508 Value *startOffset;
509 if(ied.InstanceEnable)
510 {
511 Value* stepRate = C(ied.InstanceDataStepRate);
512
513 // prevent a div by 0 for 0 step rate
514 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
515 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
516
517 // calc the current offset into instanced data buffer
518 Value* calcInstance = UDIV(curInstance, stepRate);
519
520 // if step rate is 0, every instance gets instance 0
521 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
522
523 vCurIndices = VBROADCAST(calcInstance);
524
525 startOffset = startInstance;
526 }
527 else
528 {
529 // offset indices by baseVertex
530 vCurIndices = ADD(vIndices, vBaseVertex);
531
532 startOffset = startVertex;
533 }
534
535 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
536 // do 64bit address offset calculations.
537
538 // calculate byte offset to the start of the VB
539 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
540 pStreamBase = GEP(pStreamBase, baseOffset);
541
542 // if we have a start offset, subtract from max vertex. Used for OOB check
543 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
544 Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
545 // if we have a negative value, we're already OOB. clamp at 0.
546 maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
547
548 // Load the in bounds size of a partially valid vertex
549 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
550 partialInboundsSize = LOAD(partialInboundsSize);
551 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
552 Value* vBpp = VBROADCAST(C(info.Bpp));
553 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
554
555 // is the element <= the partially valid size?
556 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
557
558 // are vertices partially OOB?
559 Value* vMaxVertex = VBROADCAST(maxVertex);
560 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
561
562 // are vertices fully in bounds?
563 Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
564
565 // blend in any partially OOB indices that have valid elements
566 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
567 vGatherMask = VMASK(vGatherMask);
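    // vGatherMask is now the per-lane mask consumed by the gathers below; fully OOB
    // lanes keep the gatherSrc default value instead of loading from memory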
568
569 // calculate the actual offsets into the VB
570 Value* vOffsets = MUL(vCurIndices, vStride);
571 vOffsets = ADD(vOffsets, vAlignmentOffsets);
572
573 // Packing and component control
574 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
575 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
576 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
577
578 if(info.type[0] == SWR_TYPE_FLOAT)
579 {
580 ///@todo: support 64 bit vb accesses
581 Value* gatherSrc = VIMMED1(0.0f);
582
583 // Gather components from memory to store in a simdvertex structure
584 switch(bpc)
585 {
586 case 16:
587 {
588 Value* vGatherResult[2];
589 Value *vMask;
590
591 // if we have at least one component out of x or y to fetch
592 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
593 // save mask as it is zero'd out after each gather
594 vMask = vGatherMask;
595
596 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
597 // e.g. result of first 8x32bit integer gather for 16bit components
598 // 256i - 0 1 2 3 4 5 6 7
599 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
600 //
601 }
602
603 // if we have at least one component out of z or w to fetch
604 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
605 // offset base to the next components(zw) in the vertex to gather
606 pStreamBase = GEP(pStreamBase, C((char)4));
607 vMask = vGatherMask;
608
609 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
610 // e.g. result of second 8x32bit integer gather for 16bit components
611 // 256i - 0 1 2 3 4 5 6 7
612 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
613 //
614 }
615
616 // if we have at least one component to shuffle into place
617 if(compMask){
618 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
619 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
620 // Shuffle gathered components into place in simdvertex struct
621 Shuffle16bpcGather(args); // outputs to vVertexElements ref
622 }
623 }
624 break;
625 case 32:
626 {
627 for(uint32_t i = 0; i < 4; i++)
628 {
629 if(!isComponentEnabled(compMask, i)){
630 // offset base to the next component in the vertex to gather
631 pStreamBase = GEP(pStreamBase, C((char)4));
632 continue;
633 }
634
635 // if we need to gather the component
636 if(compCtrl[i] == StoreSrc){
637 // save mask as it is zero'd out after each gather
638 Value *vMask = vGatherMask;
639
640 // Gather a SIMD of vertices
641 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
642 }
643 else{
644 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
645 }
646
647 if(currentVertexElement > 3){
648 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
649 // reset to the next vVertexElement to output
650 currentVertexElement = 0;
651 }
652
653 // offset base to the next component in the vertex to gather
654 pStreamBase = GEP(pStreamBase, C((char)4));
655 }
656 }
657 break;
658 default:
659 SWR_ASSERT(0, "Tried to fetch invalid FP format");
660 break;
661 }
662 }
663 else
664 {
665 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
666 ConversionType conversionType = CONVERT_NONE;
667
668 switch(info.type[0])
669 {
670 case SWR_TYPE_UNORM:
671 conversionType = CONVERT_NORMALIZED;
672 case SWR_TYPE_UINT:
673 extendCastType = Instruction::CastOps::ZExt;
674 break;
675 case SWR_TYPE_SNORM:
676 conversionType = CONVERT_NORMALIZED;
677 case SWR_TYPE_SINT:
678 extendCastType = Instruction::CastOps::SExt;
679 break;
680 case SWR_TYPE_USCALED:
681 conversionType = CONVERT_USCALED;
682 extendCastType = Instruction::CastOps::UIToFP;
683 break;
684 case SWR_TYPE_SSCALED:
685 conversionType = CONVERT_SSCALED;
686 extendCastType = Instruction::CastOps::SIToFP;
687 break;
688 default:
689 break;
690 }
691
692 // value substituted when component of gather is masked
693 Value* gatherSrc = VIMMED1(0);
694
695 // Gather components from memory to store in a simdvertex structure
696 switch (bpc)
697 {
698 case 8:
699 {
700 // if we have at least one component to fetch
701 if(compMask){
702 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
703 // e.g. result of an 8x32bit integer gather for 8bit components
704 // 256i - 0 1 2 3 4 5 6 7
705 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
706
707 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
708 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
709 // Shuffle gathered components into place in simdvertex struct
710 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
711 }
712 }
713 break;
714 case 16:
715 {
716 Value* vGatherResult[2];
717 Value *vMask;
718
719 // if we have at least one component out of x or y to fetch
720 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
721 // save mask as it is zero'd out after each gather
722 vMask = vGatherMask;
723
724 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
725 // e.g. result of first 8x32bit integer gather for 16bit components
726 // 256i - 0 1 2 3 4 5 6 7
727 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
728 //
729 }
730
731 // if we have at least one component out of z or w to fetch
732 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
733 // offset base to the next components(zw) in the vertex to gather
734 pStreamBase = GEP(pStreamBase, C((char)4));
735 vMask = vGatherMask;
736
737 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
738 // e.g. result of second 8x32bit integer gather for 16bit components
739 // 256i - 0 1 2 3 4 5 6 7
740 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
741 //
742 }
743
744 // if we have at least one component to shuffle into place
745 if(compMask){
746 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
747 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
748 // Shuffle gathered components into place in simdvertex struct
749 Shuffle16bpcGather(args); // outputs to vVertexElements ref
750 }
751 }
752 break;
753 case 32:
754 {
755 SWR_ASSERT(conversionType == CONVERT_NONE);
756
757 // Gather components into place in the simdvertex struct
758 for(uint32_t i = 0; i < 4; i++)
759 {
760 if(!isComponentEnabled(compMask, i)){
761 // offset base to the next component in the vertex to gather
762 pStreamBase = GEP(pStreamBase, C((char)4));
763 continue;
764 }
765
766 // if we need to gather the component
767 if(compCtrl[i] == StoreSrc){
768 // save mask as it is zero'd out after each gather
769 Value *vMask = vGatherMask;
770
771 vVertexElements[currentVertexElement++] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
772
773 // e.g. result of a single 8x32bit integer gather for 32bit components
774 // 256i - 0 1 2 3 4 5 6 7
775 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
776 }
777 else{
778 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
779 }
780
781 if(currentVertexElement > 3){
782 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
783 // reset to the next vVertexElement to output
784 currentVertexElement = 0;
785 }
786
787 // offset base to the next component in the vertex to gather
788 pStreamBase = GEP(pStreamBase, C((char)4));
789 }
790 }
791 break;
792 }
793 }
794 }
795
796 // if we have a partially filled vVertexElement struct, output it
797 if(currentVertexElement > 0){
798 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
799 }
800 }
801
802 //////////////////////////////////////////////////////////////////////////
803 /// @brief Loads a simd of valid indices. OOB indices are set to 0
804 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
805 /// support
806 /// @param pIndices - pointer to 8 bit indices
807 /// @param pLastIndex - pointer to last valid index
808 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
809 {
810 // can fit 4 8 bit integers per vWidth lane
811 Value* vIndices = VUNDEF_I();
812
813 // store 0 index on stack to be used to conditionally load from if index address is OOB
814 Value* pZeroIndex = ALLOCA(mInt8Ty);
815 STORE(C((uint8_t)0), pZeroIndex);
816
817 // Load a SIMD of index pointers
818 for(int64_t lane = 0; lane < mVWidth; lane++)
819 {
820 // Calculate the address of the requested index
821 Value *pIndex = GEP(pIndices, C(lane));
822
823 // check if the address is less than the max index,
824 Value* mask = ICMP_ULT(pIndex, pLastIndex);
825
826 // if valid, load the index. if not, load 0 from the stack
827 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
828 Value *index = LOAD(pValid, "valid index");
829
830 // zero extend index to 32 bits and insert into the correct simd lane
831 index = Z_EXT(index, mInt32Ty);
832 vIndices = VINSERT(vIndices, index, lane);
833 }
834 return vIndices;
835 }
836
837 //////////////////////////////////////////////////////////////////////////
838 /// @brief Loads a simd of valid indices. OOB indices are set to 0
839 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
840 /// support
841 /// @param pIndices - pointer to 16 bit indices
842 /// @param pLastIndex - pointer to last valid index
843 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
844 {
845 // can fit 2 16 bit integers per vWidth lane
846 Value* vIndices = VUNDEF_I();
847
848 // store 0 index on stack to be used to conditionally load from if index address is OOB
849 Value* pZeroIndex = ALLOCA(mInt16Ty);
850 STORE(C((uint16_t)0), pZeroIndex);
851
852 // Load a SIMD of index pointers
853 for(int64_t lane = 0; lane < mVWidth; lane++)
854 {
855 // Calculate the address of the requested index
856 Value *pIndex = GEP(pIndices, C(lane));
857
858 // check if the address is less than the max index,
859 Value* mask = ICMP_ULT(pIndex, pLastIndex);
860
861 // if valid, load the index. if not, load 0 from the stack
862 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
863 Value *index = LOAD(pValid, "valid index");
864
865 // zero extend index to 32 bits and insert into the correct simd lane
866 index = Z_EXT(index, mInt32Ty);
867 vIndices = VINSERT(vIndices, index, lane);
868 }
869 return vIndices;
870 }
871
872 //////////////////////////////////////////////////////////////////////////
873 /// @brief Loads a simd of valid indices. OOB indices are set to 0
874 /// @param pIndices - pointer to 32 bit indices
875 /// @param pLastIndex - pointer to last valid index
876 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
877 {
878 DataLayout dL(JM()->mpCurrentModule);
879 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
880 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
881 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
882
883 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
884 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
885 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
886 numIndicesLeft = SDIV(numIndicesLeft, C(4));
887
888 // create a vector of index counts from the base index ptr passed into the fetch
889 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
890 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
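    // note: this offset vector is hard-coded for an 8-wide SIMD; it would need to be
    // built from mVWidth to support other vector widths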
891
892 // compare index count to the max valid index
893 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
894 // vIndexOffsets 0 1 2 3 4 5 6 7
895 // ------------------------------
896 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
897 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
898 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
899 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
900
901 // VMASKLOAD takes an *i8 src pointer
902 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
903
904 // Load the indices; OOB loads 0
905 return MASKLOADD(pIndices,vIndexMask);
906 }
907
908 //////////////////////////////////////////////////////////////////////////
909 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
910 /// denormalizes if needed, converts to F32 if needed, and positions in
911 /// the proper SIMD rows to be output to the simdvertex structure
912 /// @param args: (tuple of args, listed below)
913 /// @param vGatherResult - 8 gathered 8bpc vertices
914 /// @param pVtxOut - base pointer to output simdvertex struct
915 /// @param extendType - sign extend or zero extend
916 /// @param conversionType - conversion to apply (none, normalized, or scaled)
917 /// @param currentVertexElement - reference to the current vVertexElement
918 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
919 /// @param compMask - component packing mask
920 /// @param compCtrl - component control val
921 /// @param vVertexElements[4] - vertex components to output
922 /// @param swizzle[4] - component swizzle location
923 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
924 {
925 // Unpack tuple args
926 Value*& vGatherResult = std::get<0>(args);
927 Value* pVtxOut = std::get<1>(args);
928 const Instruction::CastOps extendType = std::get<2>(args);
929 const ConversionType conversionType = std::get<3>(args);
930 uint32_t &currentVertexElement = std::get<4>(args);
931 uint32_t &outputElt = std::get<5>(args);
932 const ComponentEnable compMask = std::get<6>(args);
933 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
934 Value* (&vVertexElements)[4] = std::get<8>(args);
935 const uint32_t (&swizzle)[4] = std::get<9>(args);
936
937 // cast types
938 Type* vGatherTy = mSimdInt32Ty;
939 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
940
941 // have to do extra work for sign extending
942 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
943 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
944 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
945
946 // shuffle mask, including any swizzling
947 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
948 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
949 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
950 char(y), char(y+4), char(y+8), char(y+12),
951 char(z), char(z+4), char(z+8), char(z+12),
952 char(w), char(w+4), char(w+8), char(w+12),
953 char(x), char(x+4), char(x+8), char(x+12),
954 char(y), char(y+4), char(y+8), char(y+12),
955 char(z), char(z+4), char(z+8), char(z+12),
956 char(w), char(w+4), char(w+8), char(w+12)});
957
958 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
959 // after pshufb: group components together in each 128bit lane
960 // 256i - 0 1 2 3 4 5 6 7
961 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
962
963 Value* vi128XY = nullptr;
964 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
965 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
966 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
967 // 256i - 0 1 2 3 4 5 6 7
968 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
969 }
970
971 // do the same for zw components
972 Value* vi128ZW = nullptr;
973 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
974 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
975 }
976
977 // init denormalize variables if needed
978 Instruction::CastOps fpCast;
979 Value* conversionFactor;
980
981 switch (conversionType)
982 {
983 case CONVERT_NORMALIZED:
984 fpCast = Instruction::CastOps::SIToFP;
985 conversionFactor = VIMMED1((float)(1.0 / 127.0));
986 break;
987 case CONVERT_SSCALED:
988 fpCast = Instruction::CastOps::SIToFP;
989 conversionFactor = VIMMED1((float)(1.0));
990 break;
991 case CONVERT_USCALED:
992 SWR_ASSERT(0, "Type should not be sign extended!");
993 conversionFactor = nullptr;
994 break;
995 default:
996 SWR_ASSERT(conversionType == CONVERT_NONE);
997 conversionFactor = nullptr;
998 break;
999 }
1000
1001 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1002 for(uint32_t i = 0; i < 4; i++){
1003 if(!isComponentEnabled(compMask, i)){
1004 continue;
1005 }
1006
1007 if(compCtrl[i] == ComponentControl::StoreSrc){
1008 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1009 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1010 // if x or y, use vi128XY permute result, else use vi128ZW
1011 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1012
1013 // sign extend
1014 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1015
1016 // denormalize if needed
1017 if(conversionType != CONVERT_NONE){
1018 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1019 }
1020 currentVertexElement++;
1021 }
1022 else{
1023 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1024 }
1025
1026 if(currentVertexElement > 3){
1027 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1028 // reset to the next vVertexElement to output
1029 currentVertexElement = 0;
1030 }
1031 }
1032 }
1033 // else zero extend
1034 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1035 {
1036 // init denormalize variables if needed
1037 Instruction::CastOps fpCast;
1038 Value* conversionFactor;
1039
1040 switch (conversionType)
1041 {
1042 case CONVERT_NORMALIZED:
1043 fpCast = Instruction::CastOps::UIToFP;
1044 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1045 break;
1046 case CONVERT_USCALED:
1047 fpCast = Instruction::CastOps::UIToFP;
1048 conversionFactor = VIMMED1((float)(1.0));
1049 break;
1050 case CONVERT_SSCALED:
1051 SWR_ASSERT(0, "Type should not be zero extended!");
1052 conversionFactor = nullptr;
1053 break;
1054 default:
1055 SWR_ASSERT(conversionType == CONVERT_NONE);
1056 conversionFactor = nullptr;
1057 break;
1058 }
1059
1060 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1061 for(uint32_t i = 0; i < 4; i++){
1062 if(!isComponentEnabled(compMask, i)){
1063 continue;
1064 }
1065
1066 if(compCtrl[i] == ComponentControl::StoreSrc){
1067 // pshufb masks for each component
1068 Value* vConstMask;
1069 switch(swizzle[i]){
1070 case 0:
1071 // x shuffle mask
1072 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1073 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1074 break;
1075 case 1:
1076 // y shuffle mask
1077 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1078 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1079 break;
1080 case 2:
1081 // z shuffle mask
1082 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1083 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1084 break;
1085 case 3:
1086 // w shuffle mask
1087 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1088 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1089 break;
1090 default:
1091 vConstMask = nullptr;
1092 break;
1093 }
1094
1095 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1096 // after pshufb for x channel
1097 // 256i - 0 1 2 3 4 5 6 7
1098 // x000 x000 x000 x000 x000 x000 x000 x000
1099
1100 // denormalize if needed
1101 if (conversionType != CONVERT_NONE){
1102 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1103 }
1104 currentVertexElement++;
1105 }
1106 else{
1107 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1108 }
1109
1110 if(currentVertexElement > 3){
1111 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1112 // reset to the next vVertexElement to output
1113 currentVertexElement = 0;
1114 }
1115 }
1116 }
1117 else
1118 {
1119 SWR_ASSERT(0, "Unsupported conversion type");
1120 }
1121 }
1122
1123 //////////////////////////////////////////////////////////////////////////
1124 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1125 /// denormalizes if needed, converts to F32 if needed, and positions in
1126 /// the proper SIMD rows to be output to the simdvertex structure
1127 /// @param args: (tuple of args, listed below)
1128 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1129 /// @param pVtxOut - base pointer to output simdvertex struct
1130 /// @param extendType - sign extend or zero extend
1131 /// @param conversionType - conversion to apply (none, normalized, or scaled)
1132 /// @param currentVertexElement - reference to the current vVertexElement
1133 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1134 /// @param compMask - component packing mask
1135 /// @param compCtrl - component control val
1136 /// @param vVertexElements[4] - vertex components to output
1137 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1138 {
1139 // Unpack tuple args
1140 Value* (&vGatherResult)[2] = std::get<0>(args);
1141 Value* pVtxOut = std::get<1>(args);
1142 const Instruction::CastOps extendType = std::get<2>(args);
1143 const ConversionType conversionType = std::get<3>(args);
1144 uint32_t &currentVertexElement = std::get<4>(args);
1145 uint32_t &outputElt = std::get<5>(args);
1146 const ComponentEnable compMask = std::get<6>(args);
1147 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1148 Value* (&vVertexElements)[4] = std::get<8>(args);
1149
1150 // cast types
1151 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1152 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1153
1154 // have to do extra work for sign extending
1155 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
1156 (extendType == Instruction::CastOps::FPExt))
1157 {
1158 // is this a half-precision (16-bit) float?
1159 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1160
1161 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1162 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1163
1164 // shuffle mask
1165 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1166 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1167 Value* vi128XY = nullptr;
1168 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1169 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1170 // after pshufb: group components together in each 128bit lane
1171 // 256i - 0 1 2 3 4 5 6 7
1172 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1173
1174 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1175 // after PERMD: move and pack xy components into each 128bit lane
1176 // 256i - 0 1 2 3 4 5 6 7
1177 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1178 }
1179
1180 // do the same for zw components
1181 Value* vi128ZW = nullptr;
1182 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1183 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1184 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1185 }
1186
1187 // init denormalize variables if needed
1188 Instruction::CastOps IntToFpCast;
1189 Value* conversionFactor;
1190
1191 switch (conversionType)
1192 {
1193 case CONVERT_NORMALIZED:
1194 IntToFpCast = Instruction::CastOps::SIToFP;
1195 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1196 break;
1197 case CONVERT_SSCALED:
1198 IntToFpCast = Instruction::CastOps::SIToFP;
1199 conversionFactor = VIMMED1((float)(1.0));
1200 break;
1201 case CONVERT_USCALED:
1202 SWR_ASSERT(0, "Type should not be sign extended!");
1203 conversionFactor = nullptr;
1204 break;
1205 default:
1206 SWR_ASSERT(conversionType == CONVERT_NONE);
1207 conversionFactor = nullptr;
1208 break;
1209 }
1210
1211 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1212 for(uint32_t i = 0; i < 4; i++){
1213 if(!isComponentEnabled(compMask, i)){
1214 continue;
1215 }
1216
1217 if(compCtrl[i] == ComponentControl::StoreSrc){
1218 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1219 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1220 // if x or y, use vi128XY permute result, else use vi128ZW
1221 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1222
1223 if(bFP) {
1224 // extract 128 bit lanes and convert the half-precision floats to full FP32
1225 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1226 }
1227 else {
1228 // extract 128 bit lanes to sign extend each component
1229 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1230
1231 // denormalize if needed
1232 if(conversionType != CONVERT_NONE){
1233 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1234 }
1235 }
1236 currentVertexElement++;
1237 }
1238 else{
1239 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1240 }
1241
1242 if(currentVertexElement > 3){
1243 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1244 // reset to the next vVertexElement to output
1245 currentVertexElement = 0;
1246 }
1247 }
1248
1249 }
1250 // else zero extend
1251 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1252 {
1253 // pshufb masks for each component
1254 Value* vConstMask[2];
1255 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
1256 // x/z shuffle mask
1257 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1258 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1259 }
1260
1261 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
1262 // y/w shuffle mask
1263 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1264 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1265 }
1266
1267 // init denormalize variables if needed
1268 Instruction::CastOps fpCast;
1269 Value* conversionFactor;
1270
1271 switch (conversionType)
1272 {
1273 case CONVERT_NORMALIZED:
1274 fpCast = Instruction::CastOps::UIToFP;
1275 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1276 break;
1277 case CONVERT_USCALED:
1278 fpCast = Instruction::CastOps::UIToFP;
1279 conversionFactor = VIMMED1((float)(1.0f));
1280 break;
1281 case CONVERT_SSCALED:
1282 SWR_ASSERT(0, "Type should not be zero extended!");
1283 conversionFactor = nullptr;
1284 break;
1285 default:
1286 SWR_ASSERT(conversionType == CONVERT_NONE);
1287 conversionFactor = nullptr;
1288 break;
1289 }
1290
1291 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1292 for(uint32_t i = 0; i < 4; i++){
1293 if(!isComponentEnabled(compMask, i)){
1294 continue;
1295 }
1296
1297 if(compCtrl[i] == ComponentControl::StoreSrc){
1298 // select correct constMask for x/z or y/w pshufb
1299 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1300 // if x or y, use vi128XY permute result, else use vi128ZW
1301 uint32_t selectedGather = (i < 2) ? 0 : 1;
1302
1303 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1304 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1305 // 256i - 0 1 2 3 4 5 6 7
1306 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1307
1308 // denormalize if needed
1309 if(conversionType != CONVERT_NONE){
1310 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1311 }
1312 currentVertexElement++;
1313 }
1314 else{
1315 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1316 }
1317
1318 if(currentVertexElement > 3){
1319 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1320 // reset to the next vVertexElement to output
1321 currentVertexElement = 0;
1322 }
1323 }
1324 }
1325 else
1326 {
1327 SWR_ASSERT(0, "Unsupported conversion type");
1328 }
1329 }
1330
1331 //////////////////////////////////////////////////////////////////////////
1332 /// @brief Output a simdvertex worth of elements to the current outputElt
1333 /// @param pVtxOut - base address of VIN output struct
1334 /// @param outputElt - simdvertex offset in VIN to write to
1335 /// @param numEltsToStore - number of simdvertex rows to write out
1336 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1337 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1338 {
1339 for(uint32_t c = 0; c < numEltsToStore; ++c)
1340 {
1341 // STORE expects FP32 x vWidth type, just bitcast if needed
1342 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
1343 #if FETCH_DUMP_VERTEX
1344 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
1345 #endif
1346 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1347 }
1348 #if FETCH_DUMP_VERTEX
1349 else
1350 {
1351 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
1352 }
1353 #endif
1354 // outputElt * 4 = offsetting by the size of a simdvertex
1355 // + c offsets to a 32bit x vWidth row within the current vertex
1356 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1357 STORE(vVertexElements[c], dest);
1358 }
1359 }
1360
1361 //////////////////////////////////////////////////////////////////////////
1362 /// @brief Generates a constant vector of values based on the
1363 /// ComponentControl value
1364 /// @param ctrl - ComponentControl value
1365 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1366 {
1367 switch(ctrl)
1368 {
1369 case NoStore: return VUNDEF_I();
1370 case Store0: return VIMMED1(0);
1371 case Store1Fp: return VIMMED1(1.0f);
1372 case Store1Int: return VIMMED1(1);
1373 case StoreSrc:
1374 default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
1375 }
1376 }
1377
1378 //////////////////////////////////////////////////////////////////////////
1379 /// @brief Returns the enable mask for the specified component.
1380 /// @param enableMask - enable bits
1381 /// @param component - component to check if enabled.
1382 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1383 {
1384 switch (component)
1385 {
1386 // X
1387 case 0: return (enableMask & ComponentEnable::X);
1388 // Y
1389 case 1: return (enableMask & ComponentEnable::Y);
1390 // Z
1391 case 2: return (enableMask & ComponentEnable::Z);
1392 // W
1393 case 3: return (enableMask & ComponentEnable::W);
1394
1395 default: return false;
1396 }
1397 }
1398
1399
1400 //////////////////////////////////////////////////////////////////////////
1401 /// @brief JITs from fetch shader IR
1402 /// @param hJitMgr - JitManager handle
1403 /// @param func - LLVM function IR
1404 /// @return PFN_FETCH_FUNC - pointer to fetch code
1405 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
1406 {
1407 const llvm::Function* func = (const llvm::Function*)hFunc;
1408 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1409 PFN_FETCH_FUNC pfnFetch;
1410
1411 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
1412 // MCJIT finalizes modules the first time you JIT code from them. Once finalized, you cannot add new IR to the module
1413 pJitMgr->mIsModuleFinalized = true;
1414
1415 #if defined(KNOB_SWRC_TRACING)
1416 char fName[1024];
1417 const char *funcName = func->getName().data();
1418 sprintf(fName, "%s.bin", funcName);
1419 FILE *fd = fopen(fName, "wb");
1420 fwrite((void *)pfnFetch, 1, 2048, fd);
1421 fclose(fd);
1422 #endif
1423
1424 return pfnFetch;
1425 }
1426
1427 //////////////////////////////////////////////////////////////////////////
1428 /// @brief JIT compiles fetch shader
1429 /// @param hJitMgr - JitManager handle
1430 /// @param state - fetch state to build function from
1431 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
1432 {
1433 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1434
1435 pJitMgr->SetupNewModule();
1436
1437 FetchJit theJit(pJitMgr);
1438 HANDLE hFunc = theJit.Create(state);
1439
1440 return JitFetchFunc(hJitMgr, hFunc);
1441 }