src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file fetch_jit.cpp
  24 *
  25 * @brief Implementation of the fetch jitter
  26 *
  27 * Notes:
  28 *
  29 ******************************************************************************/
  30 #include "jit_api.h"
  31 #include "fetch_jit.h"
  32 #include "builder.h"
  33 #include "state_llvm.h"
  34 #include <sstream>
  35 #include <tuple>
  36
  37 //#define FETCH_DUMP_VERTEX 1
  38 using namespace llvm;
  39 using namespace SwrJit;
  40
  41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component); // forward declaration only; definition is not in this part of the file. NOTE(review): presumably reports whether `component` is selected by `enableMask` - confirm against the definition.
  42
  43 enum ConversionType
  44 {
  45     CONVERT_NONE,
  46     CONVERT_NORMALIZED,
  47     CONVERT_USCALED,
  48     CONVERT_SSCALED,
  49     CONVERT_SFIXED,
  50 };
  51
  52 //////////////////////////////////////////////////////////////////////////
  53 /// Interface to Jitting a fetch shader
  54 //////////////////////////////////////////////////////////////////////////
  55 struct FetchJit : public Builder
  56 {
  57     FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
  58
  59     Function* Create(const FETCH_COMPILE_STATE& fetchState);
  60     Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
  61     Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
  62     Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
  63
  64     // package up Shuffle*bpcGatherd args into a tuple for convenience
  65     typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
  66         uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
  67         const uint32_t(&)[4]> Shuffle8bpcArgs;
  68     void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
  69
  70     typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
  71         uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
  72     void Shuffle16bpcGather(Shuffle16bpcArgs &args);
  73
  74     void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
  75
  76     Value* GenerateCompCtrlVector(const ComponentControl ctrl);
  77
  78     void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
  79     void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
  80
  81     bool IsOddFormat(SWR_FORMAT format);
  82     bool IsUniformFormat(SWR_FORMAT format);
  83     void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
  84     void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
  85     void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
  86
  87     Value* mpFetchInfo;
  88 };
  89
  90 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
  91 {
  92     static std::size_t fetchNum = 0;
  93
  94     std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
  95     fnName << fetchNum++;
  96
  97     Function*    fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
  98     BasicBlock*    entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
  99
 100     IRB()->SetInsertPoint(entry);
 101
 102     auto    argitr = fetch->getArgumentList().begin();
 103
 104     // Fetch shader arguments
 105     mpFetchInfo = &*argitr; ++argitr;
 106     mpFetchInfo->setName("fetchInfo");
 107     Value*    pVtxOut = &*argitr;
 108     pVtxOut->setName("vtxOutput");
 109     // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
 110     // index 0(just the pointer to the simdvertex structure
 111     // index 1(which element of the simdvertex structure to offset to(in this case 0)
 112     // so the indices being i32's doesn't matter
 113     // TODO: generated this GEP with a VECTOR structure type so this makes sense
 114     std::vector<Value*>    vtxInputIndices(2, C(0));
 115     // GEP
 116     pVtxOut = GEP(pVtxOut, C(0));
 117     pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
 118
 119     // SWR_FETCH_CONTEXT::pStreams
 120     Value*    streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
 121     streams->setName("pStreams");
 122
 123     // SWR_FETCH_CONTEXT::pIndices
 124     Value*    indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
 125     indices->setName("pIndices");
 126
 127     // SWR_FETCH_CONTEXT::pLastIndex
 128     Value*    pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
 129     pLastIndex->setName("pLastIndex");
 130
 131
 132     Value* vIndices;
 133     switch(fetchState.indexType)
 134     {
 135         case R8_UINT:
 136             indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
 137             if(fetchState.bDisableIndexOOBCheck){
 138                 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
 139                 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
 140             }
 141             else{
 142                 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
 143                 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
 144             }
 145             break;
 146         case R16_UINT:
 147             indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
 148             if(fetchState.bDisableIndexOOBCheck){
 149                 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
 150                 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
 151             }
 152             else{
 153                 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
 154                 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
 155             }
 156             break;
 157         case R32_UINT:
 158             (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
 159                                                : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
 160             break; // incoming type is already 32bit int
 161         default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break;
 162     }
 163
 164     Value* vVertexId = vIndices;
 165     if (fetchState.bVertexIDOffsetEnable)
 166     {
 167         // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
 168         Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
 169         Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
 170         vVertexId = ADD(vIndices, vBaseVertex);
 171         vVertexId = ADD(vVertexId, vStartVertex);
 172     }
 173
 174     // store out vertex IDs
 175     STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
 176
 177     // store out cut mask if enabled
 178     if (fetchState.bEnableCutIndex)
 179     {
 180         Value* vCutIndex = VIMMED1(fetchState.cutIndex);
 181         Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
 182         STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
 183     }
 184
 185     // Fetch attributes from memory and output to a simdvertex struct
 186     // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
 187     (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
 188                                  : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
 189
 190     RET_VOID();
 191
 192     JitManager::DumpToFile(fetch, "src");
 193
 194 #if defined(_DEBUG)
 195     verifyFunction(*fetch);
 196 #endif
 197
 198     ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
 199
 200     ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
 201     setupPasses.add(createBreakCriticalEdgesPass());
 202     setupPasses.add(createCFGSimplificationPass());
 203     setupPasses.add(createEarlyCSEPass());
 204     setupPasses.add(createPromoteMemoryToRegisterPass());
 205
 206     setupPasses.run(*fetch);
 207
 208     JitManager::DumpToFile(fetch, "se");
 209
 210     ::FunctionPassManager optPasses(JM()->mpCurrentModule);
 211
 212     ///@todo Haven't touched these either. Need to remove some of these and add others.
 213     optPasses.add(createCFGSimplificationPass());
 214     optPasses.add(createEarlyCSEPass());
 215     optPasses.add(createInstructionCombiningPass());
 216     optPasses.add(createInstructionSimplifierPass());
 217     optPasses.add(createConstantPropagationPass());
 218     optPasses.add(createSCCPPass());
 219     optPasses.add(createAggressiveDCEPass());
 220
 221     optPasses.run(*fetch);
 222     optPasses.run(*fetch);
 223
 224     JitManager::DumpToFile(fetch, "opt");
 225
 226     return fetch;
 227 }
 228
 229 //////////////////////////////////////////////////////////////////////////
 230 /// @brief Loads attributes from memory using LOADs, shuffling the
 231 /// components into SOA form.
 232 /// *Note* currently does not support component control,
 233 /// component packing, instancing
 234 /// @param fetchState - info about attributes to be fetched from memory
 235 /// @param streams - value pointer to the current vertex stream
 236 /// @param vIndices - vector value of indices to load
 237 /// @param pVtxOut - value pointer to output simdvertex struct
 238 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
 239 {
 240     // Zack shuffles; a variant of the Charleston.
 241
 242     std::vector<Value*> vectors(16);
 243     std::vector<Constant*>    pMask(mVWidth);
 244     for(uint32_t i = 0; i < mVWidth; ++i)
 245     {
 246         pMask[i] = (C(i < 4 ? i : 4));
 247     }
 248     Constant* promoteMask = ConstantVector::get(pMask);
 249     Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
 250
 251     Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
 252     Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
 253     Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
 254     Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
 255     curInstance->setName("curInstance");
 256
 257     for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
 258     {
 259         Value*    elements[4] = {0};
 260         const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
 261         const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
 262         SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
 263         uint32_t    numComponents = info.numComps;
 264         uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.
 265
 266         // load path doesn't support component packing
 267         SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
 268
 269         vectors.clear();
 270
 271         Value *vCurIndices;
 272         Value *startOffset;
 273         if(ied.InstanceEnable)
 274         {
 275             Value* stepRate = C(ied.InstanceDataStepRate);
 276
 277             // prevent a div by 0 for 0 step rate
 278             Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
 279             stepRate = SELECT(isNonZeroStep, stepRate, C(1));
 280
 281             // calc the current offset into instanced data buffer
 282             Value* calcInstance = UDIV(curInstance, stepRate);
 283
 284             // if step rate is 0, every instance gets instance 0
 285             calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
 286
 287             vCurIndices = VBROADCAST(calcInstance);
 288
 289             startOffset = startInstance;
 290         }
 291         else
 292         {
 293             // offset indices by baseVertex
 294             vCurIndices = ADD(vIndices, vBaseVertex);
 295
 296             startOffset = startVertex;
 297         }
 298
 299         // load SWR_VERTEX_BUFFER_STATE::pData
 300         Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
 301
 302         // load SWR_VERTEX_BUFFER_STATE::pitch
 303         Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
 304         stride = Z_EXT(stride, mInt64Ty);
 305
 306         // load SWR_VERTEX_BUFFER_STATE::size
 307         Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
 308         size = Z_EXT(size, mInt64Ty);
 309
 310         Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
 311
 312         // Load from the stream.
 313         for(uint32_t lane = 0; lane < mVWidth; ++lane)
 314         {
 315             // Get index
 316             Value* index = VEXTRACT(vCurIndices, C(lane));
 317             index = Z_EXT(index, mInt64Ty);
 318
 319             Value*    offset = MUL(index, stride);
 320             offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
 321             offset = ADD(offset, startVertexOffset);
 322
 323             if (!fetchState.bDisableIndexOOBCheck) {
 324                 // check for out of bound access, including partial OOB, and mask them to 0
 325                 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
 326                 Value *oob = ICMP_ULE(endOffset, size);
 327                 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
 328             }
 329
 330             Value*    pointer = GEP(stream, offset);
 331             // We use a full-lane, but don't actually care.
 332             Value*    vptr = 0;
 333
 334             // get a pointer to a 4 component attrib in default address space
 335             switch(bpc)
 336             {
 337                 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
 338                 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
 339                 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
 340                 default: SWR_ASSERT(false, "Unsupported underlying bpp!");
 341             }
 342
 343             // load 4 components of attribute
 344             Value*    vec = ALIGNED_LOAD(vptr, 1, false);
 345
 346             // Convert To FP32 internally
 347             switch(info.type[0])
 348             {
 349                 case SWR_TYPE_UNORM:
 350                     switch(bpc)
 351                     {
 352                         case 8:
 353                             vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 354                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
 355                             break;
 356                         case 16:
 357                             vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 358                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
 359                             break;
 360                         default:
 361                             SWR_ASSERT(false, "Unsupported underlying type!");
 362                             break;
 363                     }
 364                     break;
 365                 case SWR_TYPE_SNORM:
 366                     switch(bpc)
 367                     {
 368                         case 8:
 369                             vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 370                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
 371                             break;
 372                         case 16:
 373                             vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 374                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
 375                             break;
 376                         default:
 377                             SWR_ASSERT(false, "Unsupported underlying type!");
 378                             break;
 379                     }
 380                     break;
 381                 case SWR_TYPE_UINT:
 382                     // Zero extend uint32_t types.
 383                     switch(bpc)
 384                     {
 385                         case 8:
 386                         case 16:
 387                             vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
 388                             vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
 389                             break;
 390                         case 32:
 391                             break; // Pass through unchanged.
 392                         default:
 393                             SWR_ASSERT(false, "Unsupported underlying type!");
 394                             break;
 395                     }
 396                     break;
 397                 case SWR_TYPE_SINT:
 398                     // Sign extend SINT types.
 399                     switch(bpc)
 400                     {
 401                         case 8:
 402                         case 16:
 403                             vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
 404                             vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
 405                             break;
 406                         case 32:
 407                             break; // Pass through unchanged.
 408                         default:
 409                             SWR_ASSERT(false, "Unsupported underlying type!");
 410                             break;
 411                     }
 412                     break;
 413                 case SWR_TYPE_FLOAT:
 414                     switch(bpc)
 415                     {
 416                         case 32:
 417                             break; // Pass through unchanged.
 418                         default:
 419                             SWR_ASSERT(false, "Unsupported underlying type!");
 420                     }
 421                     break;
 422                 case SWR_TYPE_USCALED:
 423                     vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 424                     break;
 425                 case SWR_TYPE_SSCALED:
 426                     vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 427                     break;
 428                 case SWR_TYPE_SFIXED:
 429                     vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
 430                     break;
 431                 case SWR_TYPE_UNKNOWN:
 432                 case SWR_TYPE_UNUSED:
 433                     SWR_ASSERT(false, "Unsupported type %d!", info.type[0]);
 434             }
 435
 436             // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
 437             // uwvec: 4 x F32, undef value
 438             Value*    wvec = VSHUFFLE(vec, uwvec, promoteMask);
 439             vectors.push_back(wvec);
 440         }
 441
 442         std::vector<Constant*>        v01Mask(mVWidth);
 443         std::vector<Constant*>        v23Mask(mVWidth);
 444         std::vector<Constant*>        v02Mask(mVWidth);
 445         std::vector<Constant*>        v13Mask(mVWidth);
 446
 447         // Concatenate the vectors together.
 448         elements[0] = VUNDEF_F();
 449         elements[1] = VUNDEF_F();
 450         elements[2] = VUNDEF_F();
 451         elements[3] = VUNDEF_F();
 452         for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
 453         {
 454             v01Mask[4 * b + 0] = C(0 + 4 * b);
 455             v01Mask[4 * b + 1] = C(1 + 4 * b);
 456             v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
 457             v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
 458
 459             v23Mask[4 * b + 0] = C(2 + 4 * b);
 460             v23Mask[4 * b + 1] = C(3 + 4 * b);
 461             v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
 462             v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
 463
 464             v02Mask[4 * b + 0] = C(0 + 4 * b);
 465             v02Mask[4 * b + 1] = C(2 + 4 * b);
 466             v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
 467             v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
 468
 469             v13Mask[4 * b + 0] = C(1 + 4 * b);
 470             v13Mask[4 * b + 1] = C(3 + 4 * b);
 471             v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
 472             v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
 473
 474             std::vector<Constant*>    iMask(mVWidth);
 475             for(uint32_t i = 0; i < mVWidth; ++i)
 476             {
 477                 if(((4 * b) <= i) && (i < (4 * (b + 1))))
 478                 {
 479                     iMask[i] = C(i % 4 + mVWidth);
 480                 }
 481                 else
 482                 {
 483                     iMask[i] = C(i);
 484                 }
 485             }
 486             Constant* insertMask = ConstantVector::get(iMask);
 487             elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
 488             elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
 489             elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
 490             elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
 491         }
 492
 493         Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
 494         Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
 495         Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
 496         Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
 497         elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
 498         elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
 499         elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
 500         elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
 501
 502         switch(numComponents + 1)
 503         {
 504             case    1: elements[0] = VIMMED1(0.0f);
 505             case    2: elements[1] = VIMMED1(0.0f);
 506             case    3: elements[2] = VIMMED1(0.0f);
 507             case    4: elements[3] = VIMMED1(1.0f);
 508         }
 509
 510         for(uint32_t c = 0; c < 4; ++c)
 511         {
 512             Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
 513             STORE(elements[c], dest);
 514         }
 515     }
 516 }
 517
 518 // returns true for odd formats that require special state.gather handling
 519 bool FetchJit::IsOddFormat(SWR_FORMAT format)
 520 {
 521     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
 522     if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32)
 523     {
 524         return true;
 525     }
 526     return false;
 527 }
 528
 529 // format is uniform if all components are the same size and type
 530 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
 531 {
 532     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
 533     uint32_t bpc0 = info.bpc[0];
 534     uint32_t type0 = info.type[0];
 535
 536     for (uint32_t c = 1; c < info.numComps; ++c)
 537     {
 538         if (bpc0 != info.bpc[c] || type0 != info.type[c])
 539         {
 540             return false;
 541         }
 542     }
 543     return true;
 544 }
 545
 546 // unpacks components based on format
 547 // foreach component in the pixel
 548 //   mask off everything but this component
 549 //   shift component to LSB
 550 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
 551 {
 552     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
 553
 554     uint32_t bitOffset = 0;
 555     for (uint32_t c = 0; c < info.numComps; ++c)
 556     {
 557         uint32_t swizzledIndex = info.swizzle[c];
 558         uint32_t compBits = info.bpc[c];
 559         uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
 560         Value* comp = AND(vInput, bitmask);
 561         comp = LSHR(comp, bitOffset);
 562
 563         result[swizzledIndex] = comp;
 564         bitOffset += compBits;
 565     }
 566 }
 567
 568 // gather for odd component size formats
 569 // gather SIMD full pixels per lane then shift/mask to move each component to their
 570 // own vector
 571 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4])
 572 {
 573     const SWR_FORMAT_INFO &info = GetFormatInfo(format);
 574
 575     // only works if pixel size is <= 32bits
 576     SWR_ASSERT(info.bpp <= 32);
 577
 578     Value* gather = VUNDEF_I();
 579
 580     // assign defaults
 581     for (uint32_t comp = 0; comp < 4; ++comp)
 582     {
 583         result[comp] = VIMMED1((int)info.defaults[comp]);
 584     }
 585
 586     // load the proper amount of data based on component size
 587     PointerType* pLoadTy = nullptr;
 588     switch (info.bpp)
 589     {
 590     case 8: pLoadTy = Type::getInt8PtrTy(JM()->mContext); break;
 591     case 16: pLoadTy = Type::getInt16PtrTy(JM()->mContext); break;
 592     case 24:
 593     case 32: pLoadTy = Type::getInt32PtrTy(JM()->mContext); break;
 594     default: SWR_ASSERT(0);
 595     }
 596
 597     // allocate temporary memory for masked off lanes
 598     Value* pTmp = ALLOCA(pLoadTy->getElementType());
 599
 600     // gather SIMD pixels
 601     for (uint32_t e = 0; e < JM()->mVWidth; ++e)
 602     {
 603         Value* pElemOffset = VEXTRACT(offsets, C(e));
 604         Value* pLoad = GEP(pBase, pElemOffset);
 605         Value* pLaneMask = VEXTRACT(pMask, C(e));
 606
 607         pLoad = POINTER_CAST(pLoad, pLoadTy);
 608
 609         // mask in tmp pointer for disabled lanes
 610         pLoad = SELECT(pLaneMask, pLoad, pTmp);
 611
 612         // load pixel
 613         Value *val = LOAD(pLoad);
 614
 615         // zero extend to 32bit integer
 616         val = INT_CAST(val, mInt32Ty, false);
 617
 618         // store in simd lane
 619         gather = VINSERT(gather, val, C(e));
 620     }
 621
 622     UnpackComponents(format, gather, result);
 623
 624     // cast to fp32
 625     result[0] = BITCAST(result[0], mSimdFP32Ty);
 626     result[1] = BITCAST(result[1], mSimdFP32Ty);
 627     result[2] = BITCAST(result[2], mSimdFP32Ty);
 628     result[3] = BITCAST(result[3], mSimdFP32Ty);
 629 }
 630
 631 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
 632 {
 633     const SWR_FORMAT_INFO &info = GetFormatInfo(format);
 634
 635     for (uint32_t c = 0; c < info.numComps; ++c)
 636     {
 637         uint32_t compIndex = info.swizzle[c];
 638
 639         // skip any conversion on UNUSED components
 640         if (info.type[c] == SWR_TYPE_UNUSED)
 641         {
 642             continue;
 643         }
 644
 645         if (info.isNormalized[c])
 646         {
 647             if (info.type[c] == SWR_TYPE_SNORM)
 648             {
 649                 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
 650
 651                 /// result = c * (1.0f / (2^(n-1) - 1);
 652                 uint32_t n = info.bpc[c];
 653                 uint32_t pow2 = 1 << (n - 1);
 654                 float scale = 1.0f / (float)(pow2 - 1);
 655                 Value *vScale = VIMMED1(scale);
 656                 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
 657                 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
 658                 texels[compIndex] = FMUL(texels[compIndex], vScale);
 659             }
 660             else
 661             {
 662                 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
 663
 664                 /// result = c * (1.0f / (2^n - 1))
 665                 uint32_t n = info.bpc[c];
 666                 uint32_t pow2 = 1 << n;
 667                 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
 668                 if (n == 24)
 669                 {
 670                     float scale = (float)(pow2 - 1);
 671                     Value* vScale = VIMMED1(scale);
 672                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
 673                     texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
 674                     texels[compIndex] = FDIV(texels[compIndex], vScale);
 675                 }
 676                 else
 677                 {
 678                     float scale = 1.0f / (float)(pow2 - 1);
 679                     Value *vScale = VIMMED1(scale);
 680                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
 681                     texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
 682                     texels[compIndex] = FMUL(texels[compIndex], vScale);
 683                 }
 684             }
 685             continue;
 686         }
 687     }
 688 }
 689
 690 //////////////////////////////////////////////////////////////////////////
 691 /// @brief Loads attributes from memory using AVX2 GATHER(s)
 692 /// @param fetchState - info about attributes to be fetched from memory
 693 /// @param streams - value pointer to the current vertex stream
 694 /// @param vIndices - vector value of indices to gather
 695 /// @param pVtxOut - value pointer to output simdvertex struct
 696 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
 697                                  Value* streams, Value* vIndices, Value* pVtxOut)
 698 {
 699     uint32_t currentVertexElement = 0;
 700     uint32_t outputElt = 0;
 701     Value* vVertexElements[4];
 702
 703     Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
 704     Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
 705     Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
 706     Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
 707     curInstance->setName("curInstance");
 708
 709     for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
 710     {
 711         const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
 712
 713         // skip element if all components are disabled
 714         if (ied.ComponentPacking == ComponentEnable::NONE)
 715         {
 716             continue;
 717         }
 718
 719         const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
 720         SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
 721         uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.
 722
 723         Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
 724
 725         // VGATHER* takes an *i8 src pointer
 726         Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
 727
 728         Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
 729         Value *vStride = VBROADCAST(stride);
 730
 731         // max vertex index that is fully in bounds
 732         Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
 733         maxVertex = LOAD(maxVertex);
 734
 735         Value *vCurIndices;
 736         Value *startOffset;
 737         if(ied.InstanceEnable)
 738         {
 739             Value* stepRate = C(ied.InstanceDataStepRate);
 740
 741             // prevent a div by 0 for 0 step rate
 742             Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
 743             stepRate = SELECT(isNonZeroStep, stepRate, C(1));
 744
 745             // calc the current offset into instanced data buffer
 746             Value* calcInstance = UDIV(curInstance, stepRate);
 747
 748             // if step rate is 0, every instance gets instance 0
 749             calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
 750
 751             vCurIndices = VBROADCAST(calcInstance);
 752
 753             startOffset = startInstance;
 754         }
 755         else
 756         {
 757             // offset indices by baseVertex
 758             vCurIndices = ADD(vIndices, vBaseVertex);
 759
 760             startOffset = startVertex;
 761         }
 762
 763         // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
 764         // do 64bit address offset calculations.
 765
 766         // calculate byte offset to the start of the VB
 767         Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
 768         pStreamBase = GEP(pStreamBase, baseOffset);
 769
 770         // if we have a start offset, subtract from max vertex. Used for OOB check
 771         maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
 772         Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
 773         // if we have a negative value, we're already OOB. clamp at 0.
 774         maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
 775
 776         // Load the in bounds size of a partially valid vertex
 777         Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
 778         partialInboundsSize = LOAD(partialInboundsSize);
 779         Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
 780         Value* vBpp = VBROADCAST(C(info.Bpp));
 781         Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
 782
 783         // is the element is <= the partially valid size
 784         Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
 785
 786         // override cur indices with 0 if pitch is 0
 787         Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
 788         vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
 789
 790         // are vertices partially OOB?
 791         Value* vMaxVertex = VBROADCAST(maxVertex);
 792         Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
 793
 794         // are vertices are fully in bounds?
 795         Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
 796
 797         // blend in any partially OOB indices that have valid elements
 798         vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
 799         Value* pMask = vGatherMask;
 800         vGatherMask = VMASK(vGatherMask);
 801
 802         // calculate the actual offsets into the VB
 803         Value* vOffsets = MUL(vCurIndices, vStride);
 804         vOffsets = ADD(vOffsets, vAlignmentOffsets);
 805
 806         // Packing and component control
 807         ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
 808         const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
 809                                              (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
 810
 811         // Special gather/conversion for formats without equal component sizes
 812         if (IsOddFormat((SWR_FORMAT)ied.Format))
 813         {
 814             Value* pResults[4];
 815             CreateGatherOddFormats((SWR_FORMAT)ied.Format, pMask, pStreamBase, vOffsets, pResults);
 816             ConvertFormat((SWR_FORMAT)ied.Format, pResults);
 817
 818             for (uint32_t c = 0; c < 4; ++c)
 819             {
 820                 if (isComponentEnabled(compMask, c))
 821                 {
 822                     vVertexElements[currentVertexElement++] = pResults[c];
 823                     if (currentVertexElement > 3)
 824                     {
 825                         StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
 826                         // reset to the next vVertexElement to output
 827                         currentVertexElement = 0;
 828                     }
 829                 }
 830             }
 831         }
 832         else if(info.type[0] == SWR_TYPE_FLOAT)
 833         {
 834             ///@todo: support 64 bit vb accesses
 835             Value* gatherSrc = VIMMED1(0.0f);
 836
 837             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
 838                 "Unsupported format for standard gather fetch.");
 839
 840             // Gather components from memory to store in a simdvertex structure
 841             switch(bpc)
 842             {
 843                 case 16:
 844                 {
 845                     Value* vGatherResult[2];
 846                     Value *vMask;
 847
 848                     // if we have at least one component out of x or y to fetch
 849                     if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
 850                         // save mask as it is zero'd out after each gather
 851                         vMask = vGatherMask;
 852
 853                         vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
 854                         // e.g. result of first 8x32bit integer gather for 16bit components
 855                         // 256i - 0    1    2    3    4    5    6    7
 856                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
 857                         //
 858                     }
 859
 860                     // if we have at least one component out of z or w to fetch
 861                     if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
 862                         // offset base to the next components(zw) in the vertex to gather
 863                         pStreamBase = GEP(pStreamBase, C((char)4));
 864                         vMask = vGatherMask;
 865
 866                         vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
 867                         // e.g. result of second 8x32bit integer gather for 16bit components
 868                         // 256i - 0    1    2    3    4    5    6    7
 869                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
 870                         //
 871                     }
 872
 873                     // if we have at least one component to shuffle into place
 874                     if(compMask){
 875                         Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
 876                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
 877
 878                         // Shuffle gathered components into place in simdvertex struct
 879                         Shuffle16bpcGather(args);  // outputs to vVertexElements ref
 880                     }
 881                 }
 882                     break;
 883                 case 32:
 884                 {
 885                     for (uint32_t i = 0; i < 4; i++)
 886                     {
 887                         if (isComponentEnabled(compMask, i))
 888                         {
 889                             // if we need to gather the component
 890                             if (compCtrl[i] == StoreSrc)
 891                             {
 892                                 // save mask as it is zero'd out after each gather
 893                                 Value *vMask = vGatherMask;
 894
 895                                 // Gather a SIMD of vertices
 896                                 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
 897                             }
 898                             else
 899                             {
 900                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
 901                             }
 902
 903                             if (currentVertexElement > 3)
 904                             {
 905                                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
 906                                 // reset to the next vVertexElement to output
 907                                 currentVertexElement = 0;
 908                             }
 909
 910                         }
 911
 912                         // offset base to the next component in the vertex to gather
 913                         pStreamBase = GEP(pStreamBase, C((char)4));
 914                     }
 915                 }
 916                     break;
 917                 default:
 918                     SWR_ASSERT(0, "Tried to fetch invalid FP format");
 919                     break;
 920             }
 921         }
 922         else
 923         {
 924             Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
 925             ConversionType conversionType = CONVERT_NONE;
 926
 927             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
 928                 "Unsupported format for standard gather fetch.");
 929
 930             switch(info.type[0])
 931             {
 932                 case SWR_TYPE_UNORM:
 933                     conversionType = CONVERT_NORMALIZED;
 934                 case SWR_TYPE_UINT:
 935                     extendCastType = Instruction::CastOps::ZExt;
 936                     break;
 937                 case SWR_TYPE_SNORM:
 938                     conversionType = CONVERT_NORMALIZED;
 939                 case SWR_TYPE_SINT:
 940                     extendCastType = Instruction::CastOps::SExt;
 941                     break;
 942                 case SWR_TYPE_USCALED:
 943                     conversionType = CONVERT_USCALED;
 944                     extendCastType = Instruction::CastOps::UIToFP;
 945                     break;
 946                 case SWR_TYPE_SSCALED:
 947                     conversionType = CONVERT_SSCALED;
 948                     extendCastType = Instruction::CastOps::SIToFP;
 949                     break;
 950                 case SWR_TYPE_SFIXED:
 951                     conversionType = CONVERT_SFIXED;
 952                     extendCastType = Instruction::CastOps::SExt;
 953                     break;
 954                 default:
 955                     break;
 956             }
 957
 958             // value substituted when component of gather is masked
 959             Value* gatherSrc = VIMMED1(0);
 960
 961             // Gather components from memory to store in a simdvertex structure
 962             switch (bpc)
 963             {
 964                 case 8:
 965                 {
 966                     // if we have at least one component to fetch
 967                     if(compMask)
 968                     {
 969                         Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
 970                         // e.g. result of an 8x32bit integer gather for 8bit components
 971                         // 256i - 0    1    2    3    4    5    6    7
 972                         //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
 973
 974                         Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
 975                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
 976
 977                         // Shuffle gathered components into place in simdvertex struct
 978                         Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
 979                     }
 980                 }
 981                 break;
 982                 case 16:
 983                 {
 984                     Value* vGatherResult[2];
 985                     Value *vMask;
 986
 987                     // if we have at least one component out of x or y to fetch
 988                     if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
 989                         // save mask as it is zero'd out after each gather
 990                         vMask = vGatherMask;
 991
 992                         vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
 993                         // e.g. result of first 8x32bit integer gather for 16bit components
 994                         // 256i - 0    1    2    3    4    5    6    7
 995                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
 996                         //
 997                     }
 998
 999                     // if we have at least one component out of z or w to fetch
1000                     if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1001                         // offset base to the next components(zw) in the vertex to gather
1002                         pStreamBase = GEP(pStreamBase, C((char)4));
1003                         vMask = vGatherMask;
1004
1005                         vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1006                         // e.g. result of second 8x32bit integer gather for 16bit components
1007                         // 256i - 0    1    2    3    4    5    6    7
1008                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1009                         //
1010                     }
1011
1012                     // if we have at least one component to shuffle into place
1013                     if(compMask){
1014                         Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1015                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1016
1017                         // Shuffle gathered components into place in simdvertex struct
1018                         Shuffle16bpcGather(args);  // outputs to vVertexElements ref
1019                     }
1020                 }
1021                 break;
1022                 case 32:
1023                 {
1024                     // Gathered components into place in simdvertex struct
1025                     for (uint32_t i = 0; i < 4; i++)
1026                     {
1027                         if (isComponentEnabled(compMask, i))
1028                         {
1029                             // if we need to gather the component
1030                             if (compCtrl[i] == StoreSrc)
1031                             {
1032                                 // save mask as it is zero'd out after each gather
1033                                 Value *vMask = vGatherMask;
1034
1035                                 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1036
1037                                 if (conversionType == CONVERT_USCALED)
1038                                 {
1039                                     pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1040                                 }
1041                                 else if (conversionType == CONVERT_SSCALED)
1042                                 {
1043                                     pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1044                                 }
1045                                 else if (conversionType == CONVERT_SFIXED)
1046                                 {
1047                                     pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1048                                 }
1049
1050                                 vVertexElements[currentVertexElement++] = pGather;
1051                                 // e.g. result of a single 8x32bit integer gather for 32bit components
1052                                 // 256i - 0    1    2    3    4    5    6    7
1053                                 //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1054                             }
1055                             else
1056                             {
1057                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1058                             }
1059
1060                             if (currentVertexElement > 3)
1061                             {
1062                                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1063                                 // reset to the next vVertexElement to output
1064                                 currentVertexElement = 0;
1065                             }
1066
1067                         }
1068
1069                         // offset base to the next component  in the vertex to gather
1070                         pStreamBase = GEP(pStreamBase, C((char)4));
1071                     }
1072                 }
1073                 break;
1074             }
1075         }
1076     }
1077
1078     // if we have a partially filled vVertexElement struct, output it
1079     if(currentVertexElement > 0){
1080         StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1081     }
1082 }
1083
1084 //////////////////////////////////////////////////////////////////////////
1085 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1086 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1087 /// support
1088 /// @param pIndices - pointer to 8 bit indices
1089 /// @param pLastIndex - pointer to last valid index
1090 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1091 {
1092     // can fit 2 16 bit integers per vWidth lane
1093     Value* vIndices =  VUNDEF_I();
1094
1095     // store 0 index on stack to be used to conditionally load from if index address is OOB
1096     Value* pZeroIndex = ALLOCA(mInt8Ty);
1097     STORE(C((uint8_t)0), pZeroIndex);
1098
1099     // Load a SIMD of index pointers
1100     for(int64_t lane = 0; lane < mVWidth; lane++)
1101     {
1102         // Calculate the address of the requested index
1103         Value *pIndex = GEP(pIndices, C(lane));
1104
1105         // check if the address is less than the max index,
1106         Value* mask = ICMP_ULT(pIndex, pLastIndex);
1107
1108         // if valid, load the index. if not, load 0 from the stack
1109         Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1110         Value *index = LOAD(pValid, "valid index");
1111
1112         // zero extended index to 32 bits and insert into the correct simd lane
1113         index = Z_EXT(index, mInt32Ty);
1114         vIndices = VINSERT(vIndices, index, lane);
1115     }
1116     return vIndices;
1117 }
1118
1119 //////////////////////////////////////////////////////////////////////////
1120 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1121 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1122 /// support
1123 /// @param pIndices - pointer to 16 bit indices
1124 /// @param pLastIndex - pointer to last valid index
1125 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1126 {
1127     // can fit 2 16 bit integers per vWidth lane
1128     Value* vIndices =  VUNDEF_I();
1129
1130     // store 0 index on stack to be used to conditionally load from if index address is OOB
1131     Value* pZeroIndex = ALLOCA(mInt16Ty);
1132     STORE(C((uint16_t)0), pZeroIndex);
1133
1134     // Load a SIMD of index pointers
1135     for(int64_t lane = 0; lane < mVWidth; lane++)
1136     {
1137         // Calculate the address of the requested index
1138         Value *pIndex = GEP(pIndices, C(lane));
1139
1140         // check if the address is less than the max index,
1141         Value* mask = ICMP_ULT(pIndex, pLastIndex);
1142
1143         // if valid, load the index. if not, load 0 from the stack
1144         Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1145         Value *index = LOAD(pValid, "valid index");
1146
1147         // zero extended index to 32 bits and insert into the correct simd lane
1148         index = Z_EXT(index, mInt32Ty);
1149         vIndices = VINSERT(vIndices, index, lane);
1150     }
1151     return vIndices;
1152 }
1153
1154 //////////////////////////////////////////////////////////////////////////
1155 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1156 /// @param pIndices - pointer to 32 bit indices
1157 /// @param pLastIndex - pointer to last valid index
1158 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1159 {
1160     DataLayout dL(JM()->mpCurrentModule);
1161     unsigned int ptrSize = dL.getPointerSize() * 8;  // ptr size in bits
1162     Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1163     Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1164
1165     // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1166     Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1167     numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1168     numIndicesLeft = SDIV(numIndicesLeft, C(4));
1169
1170     // create a vector of index counts from the base index ptr passed into the fetch
1171     const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1172     Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1173
1174     // compare index count to the max valid index
1175     // e.g vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
1176     //     vIndexOffsets  0 1 2 3 4 5 6 7
1177     //     ------------------------------
1178     //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
1179     //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1180     Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1181     Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1182
1183     // VMASKLOAD takes an *i8 src pointer
1184     pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1185
1186     // Load the indices; OOB loads 0
1187     return MASKLOADD(pIndices,vIndexMask);
1188 }
1189
1190 //////////////////////////////////////////////////////////////////////////
1191 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1192 /// denormalizes if needed, converts to F32 if needed, and positions in
1193 //  the proper SIMD rows to be output to the simdvertex structure
1194 /// @param args: (tuple of args, listed below)
1195 ///   @param vGatherResult - 8 gathered 8bpc vertices
1196 ///   @param pVtxOut - base pointer to output simdvertex struct
1197 ///   @param extendType - sign extend or zero extend
1198 ///   @param bNormalized - do we need to denormalize?
1199 ///   @param currentVertexElement - reference to the current vVertexElement
1200 ///   @param outputElt - reference to the current offset from simdvertex we're o
1201 ///   @param compMask - component packing mask
1202 ///   @param compCtrl - component control val
1203 ///   @param vVertexElements[4] - vertex components to output
1204 ///   @param swizzle[4] - component swizzle location
1205 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1206 {
1207     // Unpack tuple args
1208     Value*& vGatherResult = std::get<0>(args);
1209     Value* pVtxOut = std::get<1>(args);
1210     const Instruction::CastOps extendType = std::get<2>(args);
1211     const ConversionType conversionType = std::get<3>(args);
1212     uint32_t &currentVertexElement = std::get<4>(args);
1213     uint32_t &outputElt =  std::get<5>(args);
1214     const ComponentEnable compMask = std::get<6>(args);
1215     const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1216     Value* (&vVertexElements)[4] = std::get<8>(args);
1217     const uint32_t (&swizzle)[4] = std::get<9>(args);
1218
1219     // cast types
1220     Type* vGatherTy = mSimdInt32Ty;
1221     Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1222
1223     // have to do extra work for sign extending
1224     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1225         Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1226         Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1227
1228         // shuffle mask, including any swizzling
1229         const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1230         const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1231         Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1232                     char(y), char(y+4), char(y+8), char(y+12),
1233                     char(z), char(z+4), char(z+8), char(z+12),
1234                     char(w), char(w+4), char(w+8), char(w+12),
1235                     char(x), char(x+4), char(x+8), char(x+12),
1236                     char(y), char(y+4), char(y+8), char(y+12),
1237                     char(z), char(z+4), char(z+8), char(z+12),
1238                     char(w), char(w+4), char(w+8), char(w+12)});
1239
1240         Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1241         // after pshufb: group components together in each 128bit lane
1242         // 256i - 0    1    2    3    4    5    6    7
1243         //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1244
1245         Value* vi128XY = nullptr;
1246         if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1247             vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1248             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1249             // 256i - 0    1    2    3    4    5    6    7
1250             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1251         }
1252
1253         // do the same for zw components
1254         Value* vi128ZW = nullptr;
1255         if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1256             vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1257         }
1258
1259         // init denormalize variables if needed
1260         Instruction::CastOps fpCast;
1261         Value* conversionFactor;
1262
1263         switch (conversionType)
1264         {
1265         case CONVERT_NORMALIZED:
1266             fpCast = Instruction::CastOps::SIToFP;
1267             conversionFactor = VIMMED1((float)(1.0 / 127.0));
1268             break;
1269         case CONVERT_SSCALED:
1270             fpCast = Instruction::CastOps::SIToFP;
1271             conversionFactor = VIMMED1((float)(1.0));
1272             break;
1273         case CONVERT_USCALED:
1274             SWR_ASSERT(0, "Type should not be sign extended!");
1275             conversionFactor = nullptr;
1276             break;
1277         default:
1278             SWR_ASSERT(conversionType == CONVERT_NONE);
1279             conversionFactor = nullptr;
1280             break;
1281         }
1282
1283         // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
1284         for (uint32_t i = 0; i < 4; i++)
1285         {
1286             if (isComponentEnabled(compMask, i))
1287             {
1288                 if (compCtrl[i] == ComponentControl::StoreSrc)
1289                 {
1290                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1291                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1292                     // if x or y, use vi128XY permute result, else use vi128ZW
1293                     Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1294
1295                     // sign extend
1296                     vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1297
1298                     // denormalize if needed
1299                     if (conversionType != CONVERT_NONE)
1300                     {
1301                         vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1302                     }
1303                     currentVertexElement++;
1304                 }
1305                 else
1306                 {
1307                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1308                 }
1309
1310                 if (currentVertexElement > 3)
1311                 {
1312                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1313                     // reset to the next vVertexElement to output
1314                     currentVertexElement = 0;
1315                 }
1316             }
1317         }
1318     }
1319     // else zero extend
1320     else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1321     {
1322         // init denormalize variables if needed
1323         Instruction::CastOps fpCast;
1324         Value* conversionFactor;
1325
1326         switch (conversionType)
1327         {
1328         case CONVERT_NORMALIZED:
1329             fpCast = Instruction::CastOps::UIToFP;
1330             conversionFactor = VIMMED1((float)(1.0 / 255.0));
1331             break;
1332         case CONVERT_USCALED:
1333             fpCast = Instruction::CastOps::UIToFP;
1334             conversionFactor = VIMMED1((float)(1.0));
1335             break;
1336         case CONVERT_SSCALED:
1337             SWR_ASSERT(0, "Type should not be zero extended!");
1338             conversionFactor = nullptr;
1339             break;
1340         default:
1341             SWR_ASSERT(conversionType == CONVERT_NONE);
1342             conversionFactor = nullptr;
1343             break;
1344         }
1345
1346         // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1347         for (uint32_t i = 0; i < 4; i++)
1348         {
1349             if (isComponentEnabled(compMask, i))
1350             {
1351                 if (compCtrl[i] == ComponentControl::StoreSrc)
1352                 {
1353                     // pshufb masks for each component
1354                     Value* vConstMask;
1355                     switch (swizzle[i])
1356                     {
1357                     case 0:
1358                         // x shuffle mask
1359                         vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1360                                                0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1361                         break;
1362                     case 1:
1363                         // y shuffle mask
1364                         vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1365                                                1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1366                         break;
1367                     case 2:
1368                         // z shuffle mask
1369                         vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1370                                                2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1371                         break;
1372                     case 3:
1373                         // w shuffle mask
1374                         vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1375                                                3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1376                         break;
1377                     default:
1378                         vConstMask = nullptr;
1379                         break;
1380                     }
1381
1382                     vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1383                     // after pshufb for x channel
1384                     // 256i - 0    1    2    3    4    5    6    7
1385                     //        x000 x000 x000 x000 x000 x000 x000 x000
1386
1387                     // denormalize if needed
1388                     if (conversionType != CONVERT_NONE)
1389                     {
1390                         vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1391                     }
1392                     currentVertexElement++;
1393                 }
1394                 else
1395                 {
1396                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1397                 }
1398
1399                 if (currentVertexElement > 3)
1400                 {
1401                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1402                     // reset to the next vVertexElement to output
1403                     currentVertexElement = 0;
1404                 }
1405             }
1406         }
1407     }
1408     else
1409     {
1410         SWR_ASSERT(0, "Unsupported conversion type");
1411     }
1412 }
1413
1414 //////////////////////////////////////////////////////////////////////////
1415 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1416 /// denormalizes if needed, converts to F32 if needed, and positions in
1417 //  the proper SIMD rows to be output to the simdvertex structure
1418 /// @param args: (tuple of args, listed below)
1419 ///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1420 ///   @param pVtxOut - base pointer to output simdvertex struct
1421 ///   @param extendType - sign extend or zero extend
1422 ///   @param bNormalized - do we need to denormalize?
1423 ///   @param currentVertexElement - reference to the current vVertexElement
1424 ///   @param outputElt - reference to the current offset from simdvertex we're o
1425 ///   @param compMask - component packing mask
1426 ///   @param compCtrl - component control val
1427 ///   @param vVertexElements[4] - vertex components to output
1428 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1429 {
1430     // Unpack tuple args
1431     Value* (&vGatherResult)[2] = std::get<0>(args);
1432     Value* pVtxOut = std::get<1>(args);
1433     const Instruction::CastOps extendType = std::get<2>(args);
1434     const ConversionType conversionType = std::get<3>(args);
1435     uint32_t &currentVertexElement = std::get<4>(args);
1436     uint32_t &outputElt = std::get<5>(args);
1437     const ComponentEnable compMask = std::get<6>(args);
1438     const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1439     Value* (&vVertexElements)[4] = std::get<8>(args);
1440
1441     // cast types
1442     Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1443     Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1444
1445     // have to do extra work for sign extending
1446     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
1447         (extendType == Instruction::CastOps::FPExt))
1448     {
1449         // is this PP float?
1450         bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1451
1452         Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1453         Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1454
1455         // shuffle mask
1456         Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1457                                      0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1458         Value* vi128XY = nullptr;
1459         if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1460             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1461             // after pshufb: group components together in each 128bit lane
1462             // 256i - 0    1    2    3    4    5    6    7
1463             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1464
1465             vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1466             // after PERMD: move and pack xy components into each 128bit lane
1467             // 256i - 0    1    2    3    4    5    6    7
1468             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1469         }
1470
1471         // do the same for zw components
1472         Value* vi128ZW = nullptr;
1473         if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1474             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1475             vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1476         }
1477
1478         // init denormalize variables if needed
1479         Instruction::CastOps IntToFpCast;
1480         Value* conversionFactor;
1481
1482         switch (conversionType)
1483         {
1484         case CONVERT_NORMALIZED:
1485             IntToFpCast = Instruction::CastOps::SIToFP;
1486             conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1487             break;
1488         case CONVERT_SSCALED:
1489             IntToFpCast = Instruction::CastOps::SIToFP;
1490             conversionFactor = VIMMED1((float)(1.0));
1491             break;
1492         case CONVERT_USCALED:
1493             SWR_ASSERT(0, "Type should not be sign extended!");
1494             conversionFactor = nullptr;
1495             break;
1496         default:
1497             SWR_ASSERT(conversionType == CONVERT_NONE);
1498             conversionFactor = nullptr;
1499             break;
1500         }
1501
1502         // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
1503         for (uint32_t i = 0; i < 4; i++)
1504         {
1505             if (isComponentEnabled(compMask, i))
1506             {
1507                 if (compCtrl[i] == ComponentControl::StoreSrc)
1508                 {
1509                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1510                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1511                     // if x or y, use vi128XY permute result, else use vi128ZW
1512                     Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1513
1514                     if (bFP) {
1515                         // extract 128 bit lanes to sign extend each component
1516                         vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1517                     }
1518                     else {
1519                         // extract 128 bit lanes to sign extend each component
1520                         vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1521
1522                         // denormalize if needed
1523                         if (conversionType != CONVERT_NONE) {
1524                             vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1525                         }
1526                     }
1527                     currentVertexElement++;
1528                 }
1529                 else
1530                 {
1531                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1532                 }
1533
1534                 if (currentVertexElement > 3)
1535                 {
1536                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1537                     // reset to the next vVertexElement to output
1538                     currentVertexElement = 0;
1539                 }
1540             }
1541         }
1542     }
1543     // else zero extend
1544     else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1545     {
1546         // pshufb masks for each component
1547         Value* vConstMask[2];
1548         if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
1549             // x/z shuffle mask
1550             vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1551                                      0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1552         }
1553
1554         if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
1555             // y/w shuffle mask
1556             vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1557                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1558         }
1559
1560         // init denormalize variables if needed
1561         Instruction::CastOps fpCast;
1562         Value* conversionFactor;
1563
1564         switch (conversionType)
1565         {
1566         case CONVERT_NORMALIZED:
1567             fpCast = Instruction::CastOps::UIToFP;
1568             conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1569             break;
1570         case CONVERT_USCALED:
1571             fpCast = Instruction::CastOps::UIToFP;
1572             conversionFactor = VIMMED1((float)(1.0f));
1573             break;
1574         case CONVERT_SSCALED:
1575             SWR_ASSERT(0, "Type should not be zero extended!");
1576             conversionFactor = nullptr;
1577             break;
1578         default:
1579             SWR_ASSERT(conversionType == CONVERT_NONE);
1580             conversionFactor = nullptr;
1581             break;
1582         }
1583
1584         // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1585         for (uint32_t i = 0; i < 4; i++)
1586         {
1587             if (isComponentEnabled(compMask, i))
1588             {
1589                 if (compCtrl[i] == ComponentControl::StoreSrc)
1590                 {
1591                     // select correct constMask for x/z or y/w pshufb
1592                     uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1593                     // if x or y, use vi128XY permute result, else use vi128ZW
1594                     uint32_t selectedGather = (i < 2) ? 0 : 1;
1595
1596                     vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1597                     // after pshufb mask for x channel; z uses the same shuffle from the second gather
1598                     // 256i - 0    1    2    3    4    5    6    7
1599                     //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1600
1601                     // denormalize if needed
1602                     if (conversionType != CONVERT_NONE)
1603                     {
1604                         vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1605                     }
1606                     currentVertexElement++;
1607                 }
1608                 else
1609                 {
1610                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1611                 }
1612
1613                 if (currentVertexElement > 3)
1614                 {
1615                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1616                     // reset to the next vVertexElement to output
1617                     currentVertexElement = 0;
1618                 }
1619             }
1620         }
1621     }
1622     else
1623     {
1624         SWR_ASSERT(0, "Unsupported conversion type");
1625     }
1626 }
1627
1628 //////////////////////////////////////////////////////////////////////////
1629 /// @brief Output a simdvertex worth of elements to the current outputElt
1630 /// @param pVtxOut - base address of VIN output struct
1631 /// @param outputElt - simdvertex offset in VIN to write to
1632 /// @param numEltsToStore - number of simdvertex rows to write out
1633 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1634 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1635 {
1636     SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
1637
1638     for(uint32_t c = 0; c < numEltsToStore; ++c)
1639     {
1640         // STORE expects FP32 x vWidth type, just bitcast if needed
1641         if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
1642 #if FETCH_DUMP_VERTEX
1643             PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
1644 #endif
1645             vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1646         }
1647 #if FETCH_DUMP_VERTEX
1648         else
1649         {
1650             PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
1651         }
1652 #endif
1653         // outputElt * 4 = offsetting by the size of a simdvertex
1654         // + c offsets to a 32bit x vWidth row within the current vertex
1655         Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1656         STORE(vVertexElements[c], dest);
1657     }
1658 }
1659
1660 //////////////////////////////////////////////////////////////////////////
1661 /// @brief Generates a constant vector of values based on the
1662 /// ComponentControl value
1663 /// @param ctrl - ComponentControl value
1664 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1665 {
1666     switch(ctrl)
1667     {
1668         case NoStore:   return VUNDEF_I();
1669         case Store0:    return VIMMED1(0);
1670         case Store1Fp:  return VIMMED1(1.0f);
1671         case Store1Int: return VIMMED1(1);
1672         case StoreVertexId:
1673         {
1674             Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1675             return VBROADCAST(pId);
1676         }
1677         case StoreInstanceId:
1678         {
1679             Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
1680             return VBROADCAST(pId);
1681         }
1682         case StoreSrc:
1683         default:        SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
1684     }
1685 }
1686
1687 //////////////////////////////////////////////////////////////////////////
1688 /// @brief Returns the enable mask for the specified component.
1689 /// @param enableMask - enable bits
1690 /// @param component - component to check if enabled.
1691 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1692 {
1693     switch (component)
1694     {
1695         // X
1696     case 0: return (enableMask & ComponentEnable::X);
1697         // Y
1698     case 1: return (enableMask & ComponentEnable::Y);
1699         // Z
1700     case 2: return (enableMask & ComponentEnable::Z);
1701         // W
1702     case 3: return (enableMask & ComponentEnable::W);
1703
1704     default: return false;
1705     }
1706 }
1707
1708
1709 //////////////////////////////////////////////////////////////////////////
1710 /// @brief JITs from fetch shader IR
1711 /// @param hJitMgr - JitManager handle
1712 /// @param func   - LLVM function IR
1713 /// @return PFN_FETCH_FUNC - pointer to fetch code
1714 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
1715 {
1716     const llvm::Function* func = (const llvm::Function*)hFunc;
1717     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1718     PFN_FETCH_FUNC pfnFetch;
1719
1720     pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
1721     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
1722     pJitMgr->mIsModuleFinalized = true;
1723
1724 #if defined(KNOB_SWRC_TRACING)
1725     char fName[1024];
1726     const char *funcName = func->getName().data();
1727     sprintf(fName, "%s.bin", funcName);
1728     FILE *fd = fopen(fName, "wb");
1729     fwrite((void *)pfnFetch, 1, 2048, fd);
1730     fclose(fd);
1731 #endif
1732
1733     return pfnFetch;
1734 }
1735
1736 //////////////////////////////////////////////////////////////////////////
1737 /// @brief JIT compiles fetch shader
1738 /// @param hJitMgr - JitManager handle
1739 /// @param state   - fetch state to build function from
1740 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
1741 {
1742     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1743
1744     pJitMgr->SetupNewModule();
1745
1746     FetchJit theJit(pJitMgr);
1747     HANDLE hFunc = theJit.Create(state);
1748
1749     return JitFetchFunc(hJitMgr, hFunc);
1750 }