1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_api.h"
31 #include "fetch_jit.h"
32 #include "builder.h"
33 #include "state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38
39 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
40
41 enum ConversionType
42 {
43 CONVERT_NONE,
44 CONVERT_NORMALIZED,
45 CONVERT_USCALED,
46 CONVERT_SSCALED,
47 };
48
49 //////////////////////////////////////////////////////////////////////////
50 /// Interface to Jitting a fetch shader
51 //////////////////////////////////////////////////////////////////////////
52 struct FetchJit : public Builder
53 {
54 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
55
56 Function* Create(const FETCH_COMPILE_STATE& fetchState);
57 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
58 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
59 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
60
61 // package up Shuffle*bpcGatherd args into a tuple for convenience
62 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
63 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
64 const uint32_t(&)[4]> Shuffle8bpcArgs;
65 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
66
67 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
68 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
69 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
70
71 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
72
73 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
74
75 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
76 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
77
78 bool IsOddFormat(SWR_FORMAT format);
79 bool IsUniformFormat(SWR_FORMAT format);
80 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
81 void CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4]);
82 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
83
84 Value* mpFetchInfo;
85 };
86
87 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
88 {
89 static std::size_t fetchNum = 0;
90
91 std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
92 fnName << fetchNum++;
93
94 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
95 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
96
97 IRB()->SetInsertPoint(entry);
98
99 auto argitr = fetch->getArgumentList().begin();
100
101 // Fetch shader arguments
102 mpFetchInfo = &*argitr; ++argitr;
103 mpFetchInfo->setName("fetchInfo");
104 Value* pVtxOut = &*argitr;
105 pVtxOut->setName("vtxOutput");
106 // this is just shorthand to tell LLVM to get a pointer to the base address of the simdvertex:
107 //   index 0 - the pointer to the simdvertex structure itself
108 //   index 1 - which element of the simdvertex structure to offset to (in this case 0),
109 //             so the indices being i32's doesn't matter
110 // TODO: generate this GEP with a VECTOR structure type so this makes sense
111 std::vector<Value*> vtxInputIndices(2, C(0));
112 // GEP
113 pVtxOut = GEP(pVtxOut, C(0));
114 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
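// from here on, pVtxOut is treated as an array of 32bit x vWidth rows; each simdvertex
// element occupies 4 consecutive rows (see StoreVertexElements)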
115
116 // SWR_FETCH_CONTEXT::pStreams
117 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
118 streams->setName("pStreams");
119
120 // SWR_FETCH_CONTEXT::pIndices
121 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
122 indices->setName("pIndices");
123
124 // SWR_FETCH_CONTEXT::pLastIndex
125 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
126 pLastIndex->setName("pLastIndex");
127
128
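// build a SIMD of 32-bit indices from the index buffer, widening 8/16-bit indices as needed.
// unless bDisableIndexOOBCheck is set, indices past pLastIndex are read as 0.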
129 Value* vIndices;
130 switch(fetchState.indexType)
131 {
132 case R8_UINT:
133 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
134 if(fetchState.bDisableIndexOOBCheck){
135 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
136 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
137 }
138 else{
139 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
140 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
141 }
142 break;
143 case R16_UINT:
144 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
145 if(fetchState.bDisableIndexOOBCheck){
146 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
147 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
148 }
149 else{
150 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
151 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
152 }
153 break;
154 case R32_UINT:
155 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
156 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
157 break; // incoming type is already 32bit int
158 default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break;
159 }
160
161 // store out vertex IDs
162 STORE(vIndices, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
163
164 // store out cut mask if enabled
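// (the cut index is the primitive-restart value; the mask marks lanes whose index equals it)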
165 if (fetchState.bEnableCutIndex)
166 {
167 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
168 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
169 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
170 }
171
172 // Fetch attributes from memory and output to a simdvertex struct
173 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
174 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
175 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
176
177 RET_VOID();
178
179 JitManager::DumpToFile(fetch, "src");
180
181 #if defined(_DEBUG)
182 verifyFunction(*fetch);
183 #endif
184
185 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
186
187 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
188 setupPasses.add(createBreakCriticalEdgesPass());
189 setupPasses.add(createCFGSimplificationPass());
190 setupPasses.add(createEarlyCSEPass());
191 setupPasses.add(createPromoteMemoryToRegisterPass());
192
193 setupPasses.run(*fetch);
194
195 JitManager::DumpToFile(fetch, "se");
196
197 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
198
199 ///@todo Haven't touched these either. Need to remove some of these and add others.
200 optPasses.add(createCFGSimplificationPass());
201 optPasses.add(createEarlyCSEPass());
202 optPasses.add(createInstructionCombiningPass());
203 optPasses.add(createInstructionSimplifierPass());
204 optPasses.add(createConstantPropagationPass());
205 optPasses.add(createSCCPPass());
206 optPasses.add(createAggressiveDCEPass());
207
208 optPasses.run(*fetch);
209 optPasses.run(*fetch);
210
211 JitManager::DumpToFile(fetch, "opt");
212
213 return fetch;
214 }
215
216 //////////////////////////////////////////////////////////////////////////
217 /// @brief Loads attributes from memory using LOADs, shuffling the
218 /// components into SOA form.
219 /// *Note* currently does not support component control,
220 /// component packing, instancing
221 /// @param fetchState - info about attributes to be fetched from memory
222 /// @param streams - value pointer to the current vertex stream
223 /// @param vIndices - vector value of indices to load
224 /// @param pVtxOut - value pointer to output simdvertex struct
225 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
226 {
227 // Zack shuffles; a variant of the Charleston.
228
229 std::vector<Value*> vectors(16);
230 std::vector<Constant*> pMask(mVWidth);
231 for(uint32_t i = 0; i < mVWidth; ++i)
232 {
233 pMask[i] = (C(i < 4 ? i : 4));
234 }
235 Constant* promoteMask = ConstantVector::get(pMask);
236 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
237
238 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
239 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
240 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
241 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
242 curInstance->setName("curInstance");
243
244 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
245 {
246 Value* elements[4] = {0};
247 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
248 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
249 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
250 uint32_t numComponents = info.numComps;
251 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
252
253 // load path doesn't support component packing
254 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
255
256 vectors.clear();
257
258 Value *vCurIndices;
259 Value *startOffset;
260 if(ied.InstanceEnable)
261 {
262 Value* stepRate = C(ied.InstanceDataStepRate);
263
264 // prevent a div by 0 for 0 step rate
265 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
266 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
267
268 // calc the current offset into instanced data buffer
269 Value* calcInstance = UDIV(curInstance, stepRate);
270
271 // if step rate is 0, every instance gets instance 0
272 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
273
274 vCurIndices = VBROADCAST(calcInstance);
275
276 startOffset = startInstance;
277 }
278 else
279 {
280 // offset indices by baseVertex
281 vCurIndices = ADD(vIndices, vBaseVertex);
282
283 startOffset = startVertex;
284 }
285
286 // load SWR_VERTEX_BUFFER_STATE::pData
287 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
288
289 // load SWR_VERTEX_BUFFER_STATE::pitch
290 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
291 stride = Z_EXT(stride, mInt64Ty);
292
293 // load SWR_VERTEX_BUFFER_STATE::size
294 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
295 size = Z_EXT(size, mInt64Ty);
296
297 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
298
299 // Load from the stream.
300 for(uint32_t lane = 0; lane < mVWidth; ++lane)
301 {
302 // Get index
303 Value* index = VEXTRACT(vCurIndices, C(lane));
304 index = Z_EXT(index, mInt64Ty);
305
306 Value* offset = MUL(index, stride);
307 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
308 offset = ADD(offset, startVertexOffset);
309
310 if (!fetchState.bDisableIndexOOBCheck) {
311 // check for out of bound access, including partial OOB, and mask them to 0
312 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
313 Value *oob = ICMP_ULE(endOffset, size);
314 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
315 }
316
317 Value* pointer = GEP(stream, offset);
318 // We use a full-lane, but don't actually care.
319 Value* vptr = 0;
320
321 // get a pointer to a 4 component attrib in default address space
322 switch(bpc)
323 {
324 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
325 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
326 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
327 default: SWR_ASSERT(false, "Unsupported underlying bpp!");
328 }
329
330 // load 4 components of attribute
331 Value* vec = ALIGNED_LOAD(vptr, 1, false);
332
333 // Convert To FP32 internally
334 switch(info.type[0])
335 {
336 case SWR_TYPE_UNORM:
337 switch(bpc)
338 {
339 case 8:
340 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
341 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
342 break;
343 case 16:
344 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
345 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
346 break;
347 default:
348 SWR_ASSERT(false, "Unsupported underlying type!");
349 break;
350 }
351 break;
352 case SWR_TYPE_SNORM:
353 switch(bpc)
354 {
355 case 8:
356 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
357 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
358 break;
359 case 16:
360 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
361 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
362 break;
363 default:
364 SWR_ASSERT(false, "Unsupported underlying type!");
365 break;
366 }
367 break;
368 case SWR_TYPE_UINT:
369 // Zero extend uint32_t types.
370 switch(bpc)
371 {
372 case 8:
373 case 16:
374 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
375 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
376 break;
377 case 32:
378 break; // Pass through unchanged.
379 default:
380 SWR_ASSERT(false, "Unsupported underlying type!");
381 break;
382 }
383 break;
384 case SWR_TYPE_SINT:
385 // Sign extend SINT types.
386 switch(bpc)
387 {
388 case 8:
389 case 16:
390 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
391 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
392 break;
393 case 32:
394 break; // Pass through unchanged.
395 default:
396 SWR_ASSERT(false, "Unsupported underlying type!");
397 break;
398 }
399 break;
400 case SWR_TYPE_FLOAT:
401 switch(bpc)
402 {
403 case 32:
404 break; // Pass through unchanged.
405 default:
406 SWR_ASSERT(false, "Unsupported underlying type!");
407 }
408 break;
409 case SWR_TYPE_USCALED:
410 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
411 break;
412 case SWR_TYPE_SSCALED:
413 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
414 break;
415 case SWR_TYPE_UNKNOWN:
416 case SWR_TYPE_UNUSED:
417 SWR_ASSERT(false, "Unsupported type %d!", info.type[0]);
418 }
419
420 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
421 // uwvec: 4 x F32, undef value
422 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
423 vectors.push_back(wvec);
424 }
425
426 std::vector<Constant*> v01Mask(mVWidth);
427 std::vector<Constant*> v23Mask(mVWidth);
428 std::vector<Constant*> v02Mask(mVWidth);
429 std::vector<Constant*> v13Mask(mVWidth);
430
431 // Concatenate the vectors together.
432 elements[0] = VUNDEF_F();
433 elements[1] = VUNDEF_F();
434 elements[2] = VUNDEF_F();
435 elements[3] = VUNDEF_F();
436 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
437 {
438 v01Mask[4 * b + 0] = C(0 + 4 * b);
439 v01Mask[4 * b + 1] = C(1 + 4 * b);
440 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
441 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
442
443 v23Mask[4 * b + 0] = C(2 + 4 * b);
444 v23Mask[4 * b + 1] = C(3 + 4 * b);
445 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
446 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
447
448 v02Mask[4 * b + 0] = C(0 + 4 * b);
449 v02Mask[4 * b + 1] = C(2 + 4 * b);
450 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
451 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
452
453 v13Mask[4 * b + 0] = C(1 + 4 * b);
454 v13Mask[4 * b + 1] = C(3 + 4 * b);
455 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
456 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
457
458 std::vector<Constant*> iMask(mVWidth);
459 for(uint32_t i = 0; i < mVWidth; ++i)
460 {
461 if(((4 * b) <= i) && (i < (4 * (b + 1))))
462 {
463 iMask[i] = C(i % 4 + mVWidth);
464 }
465 else
466 {
467 iMask[i] = C(i);
468 }
469 }
470 Constant* insertMask = ConstantVector::get(iMask);
471 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
472 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
473 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
474 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
475 }
476
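// the shuffles below perform a 4x4 transpose per 128-bit block: on entry elements[k]
// holds whole xyzw attributes of vertices k, k+4, ... (AOS); on exit elements[0..3]
// hold the x, y, z and w components respectively across all SIMD lanes (SOA)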
477 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
478 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
479 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
480 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
481 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
482 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
483 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
484 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
485
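// write default values into any components the format doesn't supply (0.0f for x/y/z,
// 1.0f for w); the case fallthrough below is intentional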
486 switch(numComponents + 1)
487 {
488 case 1: elements[0] = VIMMED1(0.0f);
489 case 2: elements[1] = VIMMED1(0.0f);
490 case 3: elements[2] = VIMMED1(0.0f);
491 case 4: elements[3] = VIMMED1(1.0f);
492 }
493
494 for(uint32_t c = 0; c < 4; ++c)
495 {
496 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
497 STORE(elements[c], dest);
498 }
499 }
500 }
501
502 // returns true for odd formats that require special gather handling
503 bool FetchJit::IsOddFormat(SWR_FORMAT format)
504 {
505 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
506 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32)
507 {
508 return true;
509 }
510 return false;
511 }
512
513 // format is uniform if all components are the same size and type
514 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
515 {
516 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
517 uint32_t bpc0 = info.bpc[0];
518 uint32_t type0 = info.type[0];
519
520 for (uint32_t c = 1; c < info.numComps; ++c)
521 {
522 if (bpc0 != info.bpc[c] || type0 != info.type[c])
523 {
524 return false;
525 }
526 }
527 return true;
528 }
529
530 // unpacks components based on format
531 // foreach component in the pixel
532 // mask off everything but this component
533 // shift component to LSB
534 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
535 {
536 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
537
538 uint32_t bitOffset = 0;
539 for (uint32_t c = 0; c < info.numComps; ++c)
540 {
541 uint32_t swizzledIndex = info.swizzle[c];
542 uint32_t compBits = info.bpc[c];
543 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
544 Value* comp = AND(vInput, bitmask);
545 comp = LSHR(comp, bitOffset);
546
547 result[swizzledIndex] = comp;
548 bitOffset += compBits;
549 }
550 }
551
552 // gather for odd component size formats
553 // gather a SIMD of full pixels per lane, then shift/mask to move each component to its
554 // own vector
555 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4])
556 {
557 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
558
559 // only works if pixel size is <= 32bits
560 SWR_ASSERT(info.bpp <= 32);
561
562 Value* gather = VUNDEF_I();
563
564 // assign defaults
565 for (uint32_t comp = 0; comp < 4; ++comp)
566 {
567 result[comp] = VIMMED1((int)info.defaults[comp]);
568 }
569
570 // gather SIMD pixels
571 for (uint32_t e = 0; e < JM()->mVWidth; ++e)
572 {
573 Value* elemOffset = VEXTRACT(offsets, C(e));
574 Value* load = GEP(pBase, elemOffset);
575
576 // load the proper amount of data based on component size
577 switch (info.bpp)
578 {
579 case 8: load = POINTER_CAST(load, Type::getInt8PtrTy(JM()->mContext)); break;
580 case 16: load = POINTER_CAST(load, Type::getInt16PtrTy(JM()->mContext)); break;
581 case 32: load = POINTER_CAST(load, Type::getInt32PtrTy(JM()->mContext)); break;
582 default: SWR_ASSERT(0);
583 }
584
585 // load pixel
586 Value *val = LOAD(load);
587
588 // zero extend to 32bit integer
589 val = INT_CAST(val, mInt32Ty, false);
590
591 // store in simd lane
592 gather = VINSERT(gather, val, C(e));
593 }
594
595 UnpackComponents(format, gather, result);
596
597 // cast to fp32
598 result[0] = BITCAST(result[0], mSimdFP32Ty);
599 result[1] = BITCAST(result[1], mSimdFP32Ty);
600 result[2] = BITCAST(result[2], mSimdFP32Ty);
601 result[3] = BITCAST(result[3], mSimdFP32Ty);
602 }
603
604 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
605 {
606 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
607
608 for (uint32_t c = 0; c < info.numComps; ++c)
609 {
610 uint32_t compIndex = info.swizzle[c];
611
612 // skip any conversion on UNUSED components
613 if (info.type[c] == SWR_TYPE_UNUSED)
614 {
615 continue;
616 }
617
618 if (info.isNormalized[c])
619 {
620 if (info.type[c] == SWR_TYPE_SNORM)
621 {
622 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
623
624 /// result = c * (1.0f / (2^(n-1) - 1))
625 uint32_t n = info.bpc[c];
626 uint32_t pow2 = 1 << (n - 1);
627 float scale = 1.0f / (float)(pow2 - 1);
628 Value *vScale = VIMMED1(scale);
629 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
630 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
631 texels[compIndex] = FMUL(texels[compIndex], vScale);
632 }
633 else
634 {
635 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
636
637 /// result = c * (1.0f / (2^n - 1))
638 uint32_t n = info.bpc[c];
639 uint32_t pow2 = 1 << n;
640 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
641 if (n == 24)
642 {
643 float scale = (float)(pow2 - 1);
644 Value* vScale = VIMMED1(scale);
645 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
646 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
647 texels[compIndex] = FDIV(texels[compIndex], vScale);
648 }
649 else
650 {
651 float scale = 1.0f / (float)(pow2 - 1);
652 Value *vScale = VIMMED1(scale);
653 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
654 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
655 texels[compIndex] = FMUL(texels[compIndex], vScale);
656 }
657 }
658 continue;
659 }
660 }
661 }
662
663 //////////////////////////////////////////////////////////////////////////
664 /// @brief Loads attributes from memory using AVX2 GATHER(s)
665 /// @param fetchState - info about attributes to be fetched from memory
666 /// @param streams - value pointer to the current vertex stream
667 /// @param vIndices - vector value of indices to gather
668 /// @param pVtxOut - value pointer to output simdvertex struct
669 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
670 Value* streams, Value* vIndices, Value* pVtxOut)
671 {
672 uint32_t currentVertexElement = 0;
673 uint32_t outputElt = 0;
674 Value* vVertexElements[4];
675
676 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
677 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
678 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
679 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
680 curInstance->setName("curInstance");
681
682 for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
683 {
684 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
685
686 // skip element if all components are disabled
687 if (ied.ComponentPacking == ComponentEnable::NONE)
688 {
689 continue;
690 }
691
692 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
693 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
694 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
695
696 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
697
698 // VGATHER* takes an *i8 src pointer
699 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
700
701 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
702 Value *vStride = VBROADCAST(stride);
703
704 // max vertex index that is fully in bounds
705 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
706 maxVertex = LOAD(maxVertex);
707
708 Value *vCurIndices;
709 Value *startOffset;
710 if(ied.InstanceEnable)
711 {
712 Value* stepRate = C(ied.InstanceDataStepRate);
713
714 // prevent a div by 0 for 0 step rate
715 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
716 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
717
718 // calc the current offset into instanced data buffer
719 Value* calcInstance = UDIV(curInstance, stepRate);
720
721 // if step rate is 0, every instance gets instance 0
722 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
723
724 vCurIndices = VBROADCAST(calcInstance);
725
726 startOffset = startInstance;
727 }
728 else
729 {
730 // offset indices by baseVertex
731 vCurIndices = ADD(vIndices, vBaseVertex);
732
733 startOffset = startVertex;
734 }
735
736 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
737 // do 64bit address offset calculations.
738
739 // calculate byte offset to the start of the VB
740 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
741 pStreamBase = GEP(pStreamBase, baseOffset);
742
743 // if we have a start offset, subtract from max vertex. Used for OOB check
744 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
745 Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
746 // if we have a negative value, we're already OOB. clamp at 0.
747 maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
748
749 // Load the in bounds size of a partially valid vertex
750 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
751 partialInboundsSize = LOAD(partialInboundsSize);
752 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
753 Value* vBpp = VBROADCAST(C(info.Bpp));
754 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
755
756 // is the element <= the partially valid size
757 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
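// a vertex exactly at maxVertex may still be partially valid: its element is fetched
// only if it lies within the partially-in-bounds size (blended into the mask below)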
758
759 // override cur indices with 0 if pitch is 0
760 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
761 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
762
763 // are vertices partially OOB?
764 Value* vMaxVertex = VBROADCAST(maxVertex);
765 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
766
767 // are vertices fully in bounds?
768 Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
769
770 // blend in any partially OOB indices that have valid elements
771 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
772 vGatherMask = VMASK(vGatherMask);
773
774 // calculate the actual offsets into the VB
775 Value* vOffsets = MUL(vCurIndices, vStride);
776 vOffsets = ADD(vOffsets, vAlignmentOffsets);
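// per lane this is: offset = index * pitch + AlignedByteOffset, relative to pStreamBase
// (which already includes startOffset * pitch)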
777
778 // Packing and component control
779 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
780 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
781 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
782
783 // Special gather/conversion for formats without equal component sizes
784 if (IsOddFormat((SWR_FORMAT)ied.Format))
785 {
786 Value* pResults[4];
787 CreateGatherOddFormats((SWR_FORMAT)ied.Format, pStreamBase, vOffsets, pResults);
788 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
789
790 for (uint32_t c = 0; c < 4; ++c)
791 {
792 if (isComponentEnabled(compMask, c))
793 {
794 vVertexElements[currentVertexElement++] = pResults[c];
795 if (currentVertexElement > 3)
796 {
797 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
798 // reset to the next vVertexElement to output
799 currentVertexElement = 0;
800 }
801 }
802 }
803 }
804 else if(info.type[0] == SWR_TYPE_FLOAT)
805 {
806 ///@todo: support 64 bit vb accesses
807 Value* gatherSrc = VIMMED1(0.0f);
808
809 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
810 "Unsupported format for standard gather fetch.");
811
812 // Gather components from memory to store in a simdvertex structure
813 switch(bpc)
814 {
815 case 16:
816 {
817 Value* vGatherResult[2];
818 Value *vMask;
819
820 // if we have at least one component out of x or y to fetch
821 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
822 // save mask as it is zero'd out after each gather
823 vMask = vGatherMask;
824
825 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
826 // e.g. result of first 8x32bit integer gather for 16bit components
827 // 256i - 0 1 2 3 4 5 6 7
828 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
829 //
830 }
831
832 // if we have at least one component out of z or w to fetch
833 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
834 // offset base to the next components(zw) in the vertex to gather
835 pStreamBase = GEP(pStreamBase, C((char)4));
836 vMask = vGatherMask;
837
838 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
839 // e.g. result of second 8x32bit integer gather for 16bit components
840 // 256i - 0 1 2 3 4 5 6 7
841 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
842 //
843 }
844
845 // if we have at least one component to shuffle into place
846 if(compMask){
847 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
848 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
849
850 // Shuffle gathered components into place in simdvertex struct
851 Shuffle16bpcGather(args); // outputs to vVertexElements ref
852 }
853 }
854 break;
855 case 32:
856 {
857 for (uint32_t i = 0; i < 4; i++)
858 {
859 if (isComponentEnabled(compMask, i))
860 {
861 // if we need to gather the component
862 if (compCtrl[i] == StoreSrc)
863 {
864 // save mask as it is zero'd out after each gather
865 Value *vMask = vGatherMask;
866
867 // Gather a SIMD of vertices
868 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
869 }
870 else
871 {
872 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
873 }
874
875 if (currentVertexElement > 3)
876 {
877 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
878 // reset to the next vVertexElement to output
879 currentVertexElement = 0;
880 }
881
882 }
883
884 // offset base to the next component in the vertex to gather
885 pStreamBase = GEP(pStreamBase, C((char)4));
886 }
887 }
888 break;
889 default:
890 SWR_ASSERT(0, "Tried to fetch invalid FP format");
891 break;
892 }
893 }
894 else
895 {
896 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
897 ConversionType conversionType = CONVERT_NONE;
898
899 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
900 "Unsupported format for standard gather fetch.");
901
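// derive the integer extension and conversion behavior from the component type; the
// UNORM->UINT and SNORM->SINT case fallthroughs below are intentional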
902 switch(info.type[0])
903 {
904 case SWR_TYPE_UNORM:
905 conversionType = CONVERT_NORMALIZED;
906 case SWR_TYPE_UINT:
907 extendCastType = Instruction::CastOps::ZExt;
908 break;
909 case SWR_TYPE_SNORM:
910 conversionType = CONVERT_NORMALIZED;
911 case SWR_TYPE_SINT:
912 extendCastType = Instruction::CastOps::SExt;
913 break;
914 case SWR_TYPE_USCALED:
915 conversionType = CONVERT_USCALED;
916 extendCastType = Instruction::CastOps::UIToFP;
917 break;
918 case SWR_TYPE_SSCALED:
919 conversionType = CONVERT_SSCALED;
920 extendCastType = Instruction::CastOps::SIToFP;
921 break;
922 default:
923 break;
924 }
925
926 // value substituted when component of gather is masked
927 Value* gatherSrc = VIMMED1(0);
928
929 // Gather components from memory to store in a simdvertex structure
930 switch (bpc)
931 {
932 case 8:
933 {
934 // if we have at least one component to fetch
935 if(compMask)
936 {
937 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
938 // e.g. result of an 8x32bit integer gather for 8bit components
939 // 256i - 0 1 2 3 4 5 6 7
940 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
941
942 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
943 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
944
945 // Shuffle gathered components into place in simdvertex struct
946 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
947 }
948 }
949 break;
950 case 16:
951 {
952 Value* vGatherResult[2];
953 Value *vMask;
954
955 // if we have at least one component out of x or y to fetch
956 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
957 // save mask as it is zero'd out after each gather
958 vMask = vGatherMask;
959
960 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
961 // e.g. result of first 8x32bit integer gather for 16bit components
962 // 256i - 0 1 2 3 4 5 6 7
963 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
964 //
965 }
966
967 // if we have at least one component out of z or w to fetch
968 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
969 // offset base to the next components(zw) in the vertex to gather
970 pStreamBase = GEP(pStreamBase, C((char)4));
971 vMask = vGatherMask;
972
973 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
974 // e.g. result of second 8x32bit integer gather for 16bit components
975 // 256i - 0 1 2 3 4 5 6 7
976 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
977 //
978 }
979
980 // if we have at least one component to shuffle into place
981 if(compMask){
982 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
983 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
984
985 // Shuffle gathered components into place in simdvertex struct
986 Shuffle16bpcGather(args); // outputs to vVertexElements ref
987 }
988 }
989 break;
990 case 32:
991 {
992 // Gather components from memory and place them in the simdvertex struct
993 for (uint32_t i = 0; i < 4; i++)
994 {
995 if (isComponentEnabled(compMask, i))
996 {
997 // if we need to gather the component
998 if (compCtrl[i] == StoreSrc)
999 {
1000 // save mask as it is zero'd out after each gather
1001 Value *vMask = vGatherMask;
1002
1003 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1004
1005 if (conversionType == CONVERT_USCALED)
1006 {
1007 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1008 }
1009 else if (conversionType == CONVERT_SSCALED)
1010 {
1011 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1012 }
1013
1014 vVertexElements[currentVertexElement++] = pGather;
1015 // e.g. result of a single 8x32bit integer gather for 32bit components
1016 // 256i - 0 1 2 3 4 5 6 7
1017 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1018 }
1019 else
1020 {
1021 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1022 }
1023
1024 if (currentVertexElement > 3)
1025 {
1026 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1027 // reset to the next vVertexElement to output
1028 currentVertexElement = 0;
1029 }
1030
1031 }
1032
1033 // offset base to the next component in the vertex to gather
1034 pStreamBase = GEP(pStreamBase, C((char)4));
1035 }
1036 }
1037 break;
1038 }
1039 }
1040 }
1041
1042 // if we have a partially filled vVertexElement struct, output it
1043 if(currentVertexElement > 0){
1044 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1045 }
1046 }
1047
1048 //////////////////////////////////////////////////////////////////////////
1049 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1050 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1051 /// support
1052 /// @param pIndices - pointer to 8 bit indices
1053 /// @param pLastIndex - pointer to last valid index
1054 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1055 {
1056 // can fit 4 8 bit integers per vWidth lane
1057 Value* vIndices = VUNDEF_I();
1058
1059 // store 0 index on stack to be used to conditionally load from if index address is OOB
1060 Value* pZeroIndex = ALLOCA(mInt8Ty);
1061 STORE(C((uint8_t)0), pZeroIndex);
1062
1063 // Load a SIMD of index pointers
1064 for(int64_t lane = 0; lane < mVWidth; lane++)
1065 {
1066 // Calculate the address of the requested index
1067 Value *pIndex = GEP(pIndices, C(lane));
1068
1069 // check if the address is less than the max index,
1070 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1071
1072 // if valid, load the index. if not, load 0 from the stack
1073 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1074 Value *index = LOAD(pValid, "valid index");
1075
1076 // zero extend index to 32 bits and insert into the correct simd lane
1077 index = Z_EXT(index, mInt32Ty);
1078 vIndices = VINSERT(vIndices, index, lane);
1079 }
1080 return vIndices;
1081 }
1082
1083 //////////////////////////////////////////////////////////////////////////
1084 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1085 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1086 /// support
1087 /// @param pIndices - pointer to 16 bit indices
1088 /// @param pLastIndex - pointer to last valid index
1089 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1090 {
1091 // can fit 2 16 bit integers per vWidth lane
1092 Value* vIndices = VUNDEF_I();
1093
1094 // store 0 index on stack to be used to conditionally load from if index address is OOB
1095 Value* pZeroIndex = ALLOCA(mInt16Ty);
1096 STORE(C((uint16_t)0), pZeroIndex);
1097
1098 // Load a SIMD of index pointers
1099 for(int64_t lane = 0; lane < mVWidth; lane++)
1100 {
1101 // Calculate the address of the requested index
1102 Value *pIndex = GEP(pIndices, C(lane));
1103
1104 // check if the address is less than the max index,
1105 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1106
1107 // if valid, load the index. if not, load 0 from the stack
1108 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1109 Value *index = LOAD(pValid, "valid index");
1110
1111 // zero extended index to 32 bits and insert into the correct simd lane
1112 index = Z_EXT(index, mInt32Ty);
1113 vIndices = VINSERT(vIndices, index, lane);
1114 }
1115 return vIndices;
1116 }
1117
1118 //////////////////////////////////////////////////////////////////////////
1119 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1120 /// @param pIndices - pointer to 32 bit indices
1121 /// @param pLastIndex - pointer to last valid index
1122 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1123 {
1124 DataLayout dL(JM()->mpCurrentModule);
1125 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1126 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1127 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1128
1129 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1130 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1131 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1132 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1133
1134 // create a vector of index counts from the base index ptr passed into the fetch
1135 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1136 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1137
1138 // compare index count to the max valid index
1139 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1140 // vIndexOffsets 0 1 2 3 4 5 6 7
1141 // ------------------------------
1142 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1143 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1144 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1145 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1146
1147 // VMASKLOAD takes an *i8 src pointer
1148 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1149
1150 // Load the indices; OOB loads 0
1151 return MASKLOADD(pIndices,vIndexMask);
1152 }
1153
1154 //////////////////////////////////////////////////////////////////////////
1155 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1156 /// denormalizes if needed, converts to F32 if needed, and positions in
1157 /// the proper SIMD rows to be output to the simdvertex structure
1158 /// @param args: (tuple of args, listed below)
1159 /// @param vGatherResult - 8 gathered 8bpc vertices
1160 /// @param pVtxOut - base pointer to output simdvertex struct
1161 /// @param extendType - sign extend or zero extend
1162 /// @param conversionType - conversion to apply (none, normalized, uscaled, sscaled)
1163 /// @param currentVertexElement - reference to the current vVertexElement
1164 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1165 /// @param compMask - component packing mask
1166 /// @param compCtrl - component control val
1167 /// @param vVertexElements[4] - vertex components to output
1168 /// @param swizzle[4] - component swizzle location
1169 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1170 {
1171 // Unpack tuple args
1172 Value*& vGatherResult = std::get<0>(args);
1173 Value* pVtxOut = std::get<1>(args);
1174 const Instruction::CastOps extendType = std::get<2>(args);
1175 const ConversionType conversionType = std::get<3>(args);
1176 uint32_t &currentVertexElement = std::get<4>(args);
1177 uint32_t &outputElt = std::get<5>(args);
1178 const ComponentEnable compMask = std::get<6>(args);
1179 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1180 Value* (&vVertexElements)[4] = std::get<8>(args);
1181 const uint32_t (&swizzle)[4] = std::get<9>(args);
1182
1183 // cast types
1184 Type* vGatherTy = mSimdInt32Ty;
1185 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1186
1187 // have to do extra work for sign extending
1188 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1189 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1190 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
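// v128Ty views the shuffle result as 128-bit lanes so VEXTRACT below can pull out
// one lane at a time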
1191
1192 // shuffle mask, including any swizzling
1193 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1194 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1195 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1196 char(y), char(y+4), char(y+8), char(y+12),
1197 char(z), char(z+4), char(z+8), char(z+12),
1198 char(w), char(w+4), char(w+8), char(w+12),
1199 char(x), char(x+4), char(x+8), char(x+12),
1200 char(y), char(y+4), char(y+8), char(y+12),
1201 char(z), char(z+4), char(z+8), char(z+12),
1202 char(w), char(w+4), char(w+8), char(w+12)});
1203
1204 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1205 // after pshufb: group components together in each 128bit lane
1206 // 256i - 0 1 2 3 4 5 6 7
1207 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1208
1209 Value* vi128XY = nullptr;
1210 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1211 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1212 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1213 // 256i - 0 1 2 3 4 5 6 7
1214 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1215 }
1216
1217 // do the same for zw components
1218 Value* vi128ZW = nullptr;
1219 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1220 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1221 }
1222
1223 // init denormalize variables if needed
1224 Instruction::CastOps fpCast;
1225 Value* conversionFactor;
1226
1227 switch (conversionType)
1228 {
1229 case CONVERT_NORMALIZED:
1230 fpCast = Instruction::CastOps::SIToFP;
1231 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1232 break;
1233 case CONVERT_SSCALED:
1234 fpCast = Instruction::CastOps::SIToFP;
1235 conversionFactor = VIMMED1((float)(1.0));
1236 break;
1237 case CONVERT_USCALED:
1238 SWR_ASSERT(0, "Type should not be sign extended!");
1239 conversionFactor = nullptr;
1240 break;
1241 default:
1242 SWR_ASSERT(conversionType == CONVERT_NONE);
1243 conversionFactor = nullptr;
1244 break;
1245 }
1246
1247 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1248 for (uint32_t i = 0; i < 4; i++)
1249 {
1250 if (isComponentEnabled(compMask, i))
1251 {
1252 if (compCtrl[i] == ComponentControl::StoreSrc)
1253 {
1254 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1255 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1256 // if x or y, use vi128XY permute result, else use vi128ZW
1257 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1258
1259 // sign extend
1260 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1261
1262 // denormalize if needed
1263 if (conversionType != CONVERT_NONE)
1264 {
1265 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1266 }
1267 currentVertexElement++;
1268 }
1269 else
1270 {
1271 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1272 }
1273
1274 if (currentVertexElement > 3)
1275 {
1276 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1277 // reset to the next vVertexElement to output
1278 currentVertexElement = 0;
1279 }
1280 }
1281 }
1282 }
1283 // else zero extend
1284 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1285 {
1286 // init denormalize variables if needed
1287 Instruction::CastOps fpCast;
1288 Value* conversionFactor;
1289
1290 switch (conversionType)
1291 {
1292 case CONVERT_NORMALIZED:
1293 fpCast = Instruction::CastOps::UIToFP;
1294 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1295 break;
1296 case CONVERT_USCALED:
1297 fpCast = Instruction::CastOps::UIToFP;
1298 conversionFactor = VIMMED1((float)(1.0));
1299 break;
1300 case CONVERT_SSCALED:
1301 SWR_ASSERT(0, "Type should not be zero extended!");
1302 conversionFactor = nullptr;
1303 break;
1304 default:
1305 SWR_ASSERT(conversionType == CONVERT_NONE);
1306 conversionFactor = nullptr;
1307 break;
1308 }
1309
1310 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1311 for (uint32_t i = 0; i < 4; i++)
1312 {
1313 if (isComponentEnabled(compMask, i))
1314 {
1315 if (compCtrl[i] == ComponentControl::StoreSrc)
1316 {
1317 // pshufb masks for each component
1318 Value* vConstMask;
1319 switch (swizzle[i])
1320 {
1321 case 0:
1322 // x shuffle mask
1323 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1324 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1325 break;
1326 case 1:
1327 // y shuffle mask
1328 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1329 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1330 break;
1331 case 2:
1332 // z shuffle mask
1333 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1334 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1335 break;
1336 case 3:
1337 // w shuffle mask
1338 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1339 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1340 break;
1341 default:
1342 vConstMask = nullptr;
1343 break;
1344 }
1345
1346 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1347 // after pshufb for x channel
1348 // 256i - 0 1 2 3 4 5 6 7
1349 // x000 x000 x000 x000 x000 x000 x000 x000
1350
1351 // denormalize if needed
1352 if (conversionType != CONVERT_NONE)
1353 {
1354 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1355 }
1356 currentVertexElement++;
1357 }
1358 else
1359 {
1360 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1361 }
1362
1363 if (currentVertexElement > 3)
1364 {
1365 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1366 // reset to the next vVertexElement to output
1367 currentVertexElement = 0;
1368 }
1369 }
1370 }
1371 }
1372 else
1373 {
1374 SWR_ASSERT(0, "Unsupported conversion type");
1375 }
1376 }
1377
1378 //////////////////////////////////////////////////////////////////////////
1379 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1380 /// denormalizes if needed, converts to F32 if needed, and positions in
1381 /// the proper SIMD rows to be output to the simdvertex structure
1382 /// @param args: (tuple of args, listed below)
1383 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1384 /// @param pVtxOut - base pointer to output simdvertex struct
1385 /// @param extendType - sign extend or zero extend
1386 /// @param conversionType - conversion to apply (none, normalized, uscaled, sscaled)
1387 /// @param currentVertexElement - reference to the current vVertexElement
1388 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1389 /// @param compMask - component packing mask
1390 /// @param compCtrl - component control val
1391 /// @param vVertexElements[4] - vertex components to output
1392 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1393 {
1394 // Unpack tuple args
1395 Value* (&vGatherResult)[2] = std::get<0>(args);
1396 Value* pVtxOut = std::get<1>(args);
1397 const Instruction::CastOps extendType = std::get<2>(args);
1398 const ConversionType conversionType = std::get<3>(args);
1399 uint32_t &currentVertexElement = std::get<4>(args);
1400 uint32_t &outputElt = std::get<5>(args);
1401 const ComponentEnable compMask = std::get<6>(args);
1402 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1403 Value* (&vVertexElements)[4] = std::get<8>(args);
1404
1405 // cast types
1406 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1407 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1408
1409 // have to do extra work for sign extending
1410 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
1411 (extendType == Instruction::CastOps::FPExt))
1412 {
1413 // is this a half-precision (PP) float? If so, it's converted with CVTPH2PS below instead of sign extended
1414 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1415
1416 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1417 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1418
1419 // shuffle mask
1420 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1421 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1422 Value* vi128XY = nullptr;
1423 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1424 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1425 // after pshufb: group components together in each 128bit lane
1426 // 256i - 0 1 2 3 4 5 6 7
1427 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1428
1429 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1430 // after PERMD: move and pack xy components into each 128bit lane
1431 // 256i - 0 1 2 3 4 5 6 7
1432 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1433 }
1434
1435 // do the same for zw components
1436 Value* vi128ZW = nullptr;
1437 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1438 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1439 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1440 }
1441
1442 // init denormalize variables if needed
1443 Instruction::CastOps IntToFpCast;
1444 Value* conversionFactor;
1445
1446 switch (conversionType)
1447 {
1448 case CONVERT_NORMALIZED:
1449 IntToFpCast = Instruction::CastOps::SIToFP;
1450 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1451 break;
1452 case CONVERT_SSCALED:
1453 IntToFpCast = Instruction::CastOps::SIToFP;
1454 conversionFactor = VIMMED1((float)(1.0));
1455 break;
1456 case CONVERT_USCALED:
1457 SWR_ASSERT(0, "Type should not be sign extended!");
1458 conversionFactor = nullptr;
1459 break;
1460 default:
1461 SWR_ASSERT(conversionType == CONVERT_NONE);
1462 conversionFactor = nullptr;
1463 break;
1464 }
1465
1466 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1467 for (uint32_t i = 0; i < 4; i++)
1468 {
1469 if (isComponentEnabled(compMask, i))
1470 {
1471 if (compCtrl[i] == ComponentControl::StoreSrc)
1472 {
1473 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1474 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1475 // if x or y, use vi128XY permute result, else use vi128ZW
1476 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1477
1478 if (bFP) {
1479 // extract 128 bit lanes and convert each half-float component to 32-bit float
1480 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1481 }
1482 else {
1483 // extract 128 bit lanes to sign extend each component
1484 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1485
1486 // denormalize if needed
1487 if (conversionType != CONVERT_NONE) {
1488 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1489 }
1490 }
1491 currentVertexElement++;
1492 }
1493 else
1494 {
1495 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1496 }
1497
1498 if (currentVertexElement > 3)
1499 {
1500 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1501 // reset to the next vVertexElement to output
1502 currentVertexElement = 0;
1503 }
1504 }
1505 }
1506 }
1507 // else zero extend
1508 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1509 {
1510 // pshufb masks for each component
1511 Value* vConstMask[2];
1512 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
1513 // x/z shuffle mask
1514 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1515 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1516 }
1517
1518 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
1519 // y/w shuffle mask
1520 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1521 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1522 }
1523
1524 // init denormalize variables if needed
1525 Instruction::CastOps fpCast;
1526 Value* conversionFactor;
1527
1528 switch (conversionType)
1529 {
1530 case CONVERT_NORMALIZED:
1531 fpCast = Instruction::CastOps::UIToFP;
1532 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1533 break;
1534 case CONVERT_USCALED:
1535 fpCast = Instruction::CastOps::UIToFP;
1536 conversionFactor = VIMMED1((float)(1.0f));
1537 break;
1538 case CONVERT_SSCALED:
1539 SWR_ASSERT(0, "Type should not be zero extended!");
1540 conversionFactor = nullptr;
1541 break;
1542 default:
1543 SWR_ASSERT(conversionType == CONVERT_NONE);
1544 conversionFactor = nullptr;
1545 break;
1546 }
1547
1548 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1549 for (uint32_t i = 0; i < 4; i++)
1550 {
1551 if (isComponentEnabled(compMask, i))
1552 {
1553 if (compCtrl[i] == ComponentControl::StoreSrc)
1554 {
1555 // select correct constMask for x/z or y/w pshufb
1556 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1557 // if x or y, use vi128XY permute result, else use vi128ZW
1558 uint32_t selectedGather = (i < 2) ? 0 : 1;
1559
1560 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1561 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1562 // 256i - 0 1 2 3 4 5 6 7
1563 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1564
1565 // denormalize if needed
1566 if (conversionType != CONVERT_NONE)
1567 {
1568 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1569 }
1570 currentVertexElement++;
1571 }
1572 else
1573 {
1574 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1575 }
1576
1577 if (currentVertexElement > 3)
1578 {
1579 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1580 // reset to the next vVertexElement to output
1581 currentVertexElement = 0;
1582 }
1583 }
1584 }
1585 }
1586 else
1587 {
1588 SWR_ASSERT(0, "Unsupported conversion type");
1589 }
1590 }
1591
1592 //////////////////////////////////////////////////////////////////////////
1593 /// @brief Output a simdvertex worth of elements to the current outputElt
1594 /// @param pVtxOut - base address of VIN output struct
1595 /// @param outputElt - simdvertex offset in VIN to write to
1596 /// @param numEltsToStore - number of simdvertex rows to write out
1597 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1598 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1599 {
1600 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
1601
1602 for(uint32_t c = 0; c < numEltsToStore; ++c)
1603 {
1604 // STORE expects FP32 x vWidth type, just bitcast if needed
1605 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
1606 #if FETCH_DUMP_VERTEX
1607 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
1608 #endif
1609 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1610 }
1611 #if FETCH_DUMP_VERTEX
1612 else
1613 {
1614 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
1615 }
1616 #endif
1617 // outputElt * 4 = offsetting by the size of a simdvertex
1618 // + c offsets to a 32bit x vWidth row within the current vertex
1619 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1620 STORE(vVertexElements[c], dest);
1621 }
1622 }
1623
1624 //////////////////////////////////////////////////////////////////////////
1625 /// @brief Generates a constant vector of values based on the
1626 /// ComponentControl value
1627 /// @param ctrl - ComponentControl value
1628 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1629 {
1630 switch(ctrl)
1631 {
1632 case NoStore: return VUNDEF_I();
1633 case Store0: return VIMMED1(0);
1634 case Store1Fp: return VIMMED1(1.0f);
1635 case Store1Int: return VIMMED1(1);
1636 case StoreVertexId:
1637 {
1638 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1639 return VBROADCAST(pId);
1640 }
1641 case StoreInstanceId:
1642 {
1643 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
1644 return VBROADCAST(pId);
1645 }
1646 case StoreSrc:
1647 default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
1648 }
1649 }
1650
1651 //////////////////////////////////////////////////////////////////////////
1652 /// @brief Returns true if the specified component is enabled.
1653 /// @param enableMask - enable bits
1654 /// @param component - component to check if enabled.
1655 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1656 {
1657 switch (component)
1658 {
1659 // X
1660 case 0: return (enableMask & ComponentEnable::X);
1661 // Y
1662 case 1: return (enableMask & ComponentEnable::Y);
1663 // Z
1664 case 2: return (enableMask & ComponentEnable::Z);
1665 // W
1666 case 3: return (enableMask & ComponentEnable::W);
1667
1668 default: return false;
1669 }
1670 }
1671
1672
1673 //////////////////////////////////////////////////////////////////////////
1674 /// @brief JITs from fetch shader IR
1675 /// @param hJitMgr - JitManager handle
1676 /// @param func - LLVM function IR
1677 /// @return PFN_FETCH_FUNC - pointer to fetch code
1678 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
1679 {
1680 const llvm::Function* func = (const llvm::Function*)hFunc;
1681 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1682 PFN_FETCH_FUNC pfnFetch;
1683
1684 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
1685 // MCJIT finalizes modules the first time you JIT code from them. Once finalized, you cannot add new IR to the module
1686 pJitMgr->mIsModuleFinalized = true;
1687
1688 #if defined(KNOB_SWRC_TRACING)
1689 char fName[1024];
1690 const char *funcName = func->getName().data();
1691 sprintf(fName, "%s.bin", funcName);
1692 FILE *fd = fopen(fName, "wb");
1693 fwrite((void *)pfnFetch, 1, 2048, fd);
1694 fclose(fd);
1695 #endif
1696
1697 return pfnFetch;
1698 }
1699
1700 //////////////////////////////////////////////////////////////////////////
1701 /// @brief JIT compiles fetch shader
1702 /// @param hJitMgr - JitManager handle
1703 /// @param state - fetch state to build function from
1704 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
1705 {
1706 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1707
1708 pJitMgr->SetupNewModule();
1709
1710 FetchJit theJit(pJitMgr);
1711 HANDLE hFunc = theJit.Create(state);
1712
1713 return JitFetchFunc(hJitMgr, hFunc);
1714 }