mesa.git: src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_api.h"
31 #include "fetch_jit.h"
32 #include "builder.h"
33 #include "state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38
39 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
40
41 enum ConversionType
42 {
43 CONVERT_NONE,
44 CONVERT_NORMALIZED,
45 CONVERT_USCALED,
46 CONVERT_SSCALED,
47 };
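// CONVERT_NORMALIZED rescales the raw integer into [0,1] (UNORM) or [-1,1] (SNORM),
// e.g. by 1/255 or 1/127 for 8bpc and 1/32767 for 16bpc in the shuffle routines below.
// CONVERT_USCALED / CONVERT_SSCALED convert the integer to float without rescaling.
// CONVERT_NONE stores the gathered bits unchanged.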
48
49 //////////////////////////////////////////////////////////////////////////
50 /// Interface to Jitting a fetch shader
51 //////////////////////////////////////////////////////////////////////////
52 struct FetchJit : public Builder
53 {
54 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
55
56 Function* Create(const FETCH_COMPILE_STATE& fetchState);
57 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
58 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
59 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
60
61 // package up Shuffle*bpcGatherd args into a tuple for convenience
62 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
63 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
64 const uint32_t(&)[4], Value*, bool, uint32_t, bool, uint32_t> Shuffle8bpcArgs;
65 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
66
67 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
68 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
69 Value*, bool, uint32_t, bool, uint32_t> Shuffle16bpcArgs;
70 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
71
72 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
73
74 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
75
76 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
77 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
78
79 bool IsOddFormat(SWR_FORMAT format);
80 bool IsUniformFormat(SWR_FORMAT format);
81 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
82 void CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4]);
83 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
84
85 };
86
87 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
88 {
89 static std::size_t fetchNum = 0;
90
91 std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
92 fnName << fetchNum++;
93
94 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
95 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
96
97 IRB()->SetInsertPoint(entry);
98
99 auto argitr = fetch->getArgumentList().begin();
100
101 // Fetch shader arguments
102 Value* fetchInfo = &*argitr; ++argitr;
103 fetchInfo->setName("fetchInfo");
104 Value* pVtxOut = &*argitr;
105 pVtxOut->setName("vtxOutput");
106 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
107 // index 0 (just the pointer to the simdvertex structure)
108 // index 1 (which element of the simdvertex structure to offset to, in this case 0)
109 // so the indices being i32's doesn't matter
110 // TODO: generate this GEP with a VECTOR structure type so this makes sense
111 std::vector<Value*> vtxInputIndices(2, C(0));
112 // GEP
113 pVtxOut = GEP(pVtxOut, C(0));
114 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
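// e.g. with an 8-wide SIMD the two statements above lower to IR along these
// lines (illustrative only; the exact struct type depends on the simdvertex layout):
//   %base = getelementptr %simdvertex, %simdvertex* %vtxOutput, i32 0
//   %vtx  = bitcast %simdvertex* %base to <8 x float>*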
115
116 // SWR_FETCH_CONTEXT::pStreams
117 Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
118 streams->setName("pStreams");
119
120 // SWR_FETCH_CONTEXT::pIndices
121 Value* indices = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
122 indices->setName("pIndices");
123
124 // SWR_FETCH_CONTEXT::pLastIndex
125 Value* pLastIndex = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
126 pLastIndex->setName("pLastIndex");
127
128
129 Value* vIndices;
130 switch(fetchState.indexType)
131 {
132 case R8_UINT:
133 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
134 if(fetchState.bDisableIndexOOBCheck){
135 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
136 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
137 }
138 else{
139 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
140 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
141 }
142 break;
143 case R16_UINT:
144 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
145 if(fetchState.bDisableIndexOOBCheck){
146 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
147 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
148 }
149 else{
150 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
151 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
152 }
153 break;
154 case R32_UINT:
155 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
156 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
157 break; // incoming type is already 32bit int
158 default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break;
159 }
160
161 // store out vertex IDs
162 STORE(vIndices, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
163
164 // store out cut mask if enabled
165 if (fetchState.bEnableCutIndex)
166 {
167 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
168 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
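// per lane: cutMask[i] = (vIndices[i] == cutIndex) ? ~0 : 0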
169 STORE(cutMask, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
170 }
171
172 // Fetch attributes from memory and output to a simdvertex struct
173 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
174 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut)
175 : JitGatherVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut);
176
177 RET_VOID();
178
179 JitManager::DumpToFile(fetch, "src");
180
181 verifyFunction(*fetch);
182
183 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
184
185 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
186 setupPasses.add(createBreakCriticalEdgesPass());
187 setupPasses.add(createCFGSimplificationPass());
188 setupPasses.add(createEarlyCSEPass());
189 setupPasses.add(createPromoteMemoryToRegisterPass());
190
191 setupPasses.run(*fetch);
192
193 JitManager::DumpToFile(fetch, "se");
194
195 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
196
197 ///@todo Haven't touched these either. Need to remove some of these and add others.
198 optPasses.add(createCFGSimplificationPass());
199 optPasses.add(createEarlyCSEPass());
200 optPasses.add(createInstructionCombiningPass());
201 optPasses.add(createInstructionSimplifierPass());
202 optPasses.add(createConstantPropagationPass());
203 optPasses.add(createSCCPPass());
204 optPasses.add(createAggressiveDCEPass());
205
206 optPasses.run(*fetch);
207 optPasses.run(*fetch);
208
209 JitManager::DumpToFile(fetch, "opt");
210
211 return fetch;
212 }
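// Illustrative sketch (not part of the original fetch JIT): the pass setup
// above, expressed against LLVM's legacy pass manager directly. Header and
// factory-function locations vary between LLVM releases, so treat this as an
// assumption-laden outline rather than a drop-in equivalent.
//
//   #include "llvm/IR/LegacyPassManager.h"
//   #include "llvm/Transforms/Scalar.h"
//
//   void RunFetchPasses(llvm::Module* pModule, llvm::Function* pFunc)
//   {
//       llvm::legacy::FunctionPassManager fpm(pModule);
//       fpm.add(llvm::createEarlyCSEPass());
//       fpm.add(llvm::createPromoteMemoryToRegisterPass());
//       fpm.add(llvm::createCFGSimplificationPass());
//       fpm.doInitialization();
//       fpm.run(*pFunc);
//       fpm.doFinalization();
//   }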
213
214 //////////////////////////////////////////////////////////////////////////
215 /// @brief Loads attributes from memory using LOADs, shuffling the
216 /// components into SOA form.
217 /// *Note* currently does not support component control,
218 /// component packing, instancing, InstanceID SGVs, or VertexID SGVs
219 /// @param fetchState - info about attributes to be fetched from memory
220 /// @param streams - value pointer to the current vertex stream
221 /// @param vIndices - vector value of indices to load
222 /// @param pVtxOut - value pointer to output simdvertex struct
223 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut)
224 {
225 // Zack shuffles; a variant of the Charleston.
226
227 std::vector<Value*> vectors(16);
228 std::vector<Constant*> pMask(mVWidth);
229 for(uint32_t i = 0; i < mVWidth; ++i)
230 {
231 pMask[i] = (C(i < 4 ? i : 4));
232 }
233 Constant* promoteMask = ConstantVector::get(pMask);
234 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
235
236 Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
237 Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
238 Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
239 Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
240 curInstance->setName("curInstance");
241
242 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
243 {
244 Value* elements[4] = {0};
245 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
246 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
247 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
248 uint32_t numComponents = info.numComps;
249 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
250
251 vectors.clear();
252
253 Value *vCurIndices;
254 Value *startOffset;
255 if(ied.InstanceEnable)
256 {
257 Value* stepRate = C(ied.InstanceDataStepRate);
258
259 // prevent a div by 0 for 0 step rate
260 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
261 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
262
263 // calc the current offset into instanced data buffer
264 Value* calcInstance = UDIV(curInstance, stepRate);
265
266 // if step rate is 0, every instance gets instance 0
267 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
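// in scalar terms: instanceIndex = (stepRate != 0) ? (curInstance / stepRate) : 0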
268
269 vCurIndices = VBROADCAST(calcInstance);
270
271 startOffset = startInstance;
272 }
273 else
274 {
275 // offset indices by baseVertex
276 vCurIndices = ADD(vIndices, vBaseVertex);
277
278 startOffset = startVertex;
279 }
280
281 // load SWR_VERTEX_BUFFER_STATE::pData
282 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
283
284 // load SWR_VERTEX_BUFFER_STATE::pitch
285 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
286 stride = Z_EXT(stride, mInt64Ty);
287
288 // load SWR_VERTEX_BUFFER_STATE::size
289 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
290 size = Z_EXT(size, mInt64Ty);
291
292 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
293
294 // Load from the stream.
295 for(uint32_t lane = 0; lane < mVWidth; ++lane)
296 {
297 // Get index
298 Value* index = VEXTRACT(vCurIndices, C(lane));
299 index = Z_EXT(index, mInt64Ty);
300
301 Value* offset = MUL(index, stride);
302 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
303 offset = ADD(offset, startVertexOffset);
304
305 if (!fetchState.bDisableIndexOOBCheck) {
306 // check for out of bound access, including partial OOB, and mask them to 0
307 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
308 Value *oob = ICMP_ULE(endOffset, size);
309 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
310 }
311
312 Value* pointer = GEP(stream, offset);
313 // We always load a full 4-component lane here; unused components are simply ignored.
314 Value* vptr = 0;
315
316 // get a pointer to a 4 component attrib in default address space
317 switch(bpc)
318 {
319 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
320 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
321 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
322 default: SWR_ASSERT(false, "Unsupported underlying bpp!");
323 }
324
325 // load 4 components of attribute
326 Value* vec = ALIGNED_LOAD(vptr, 1, false);
327
328 // Convert To FP32 internally
329 switch(info.type[0])
330 {
331 case SWR_TYPE_UNORM:
332 switch(bpc)
333 {
334 case 8:
335 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
336 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
337 break;
338 case 16:
339 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
340 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
341 break;
342 default:
343 SWR_ASSERT(false, "Unsupported underlying type!");
344 break;
345 }
346 break;
347 case SWR_TYPE_SNORM:
348 switch(bpc)
349 {
350 case 8:
351 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
352 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
353 break;
354 case 16:
355 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
356 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
357 break;
358 default:
359 SWR_ASSERT(false, "Unsupported underlying type!");
360 break;
361 }
362 break;
363 case SWR_TYPE_UINT:
364 // Zero extend uint32_t types.
365 switch(bpc)
366 {
367 case 8:
368 case 16:
369 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
370 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
371 break;
372 case 32:
373 break; // Pass through unchanged.
374 default:
375 SWR_ASSERT(false, "Unsupported underlying type!");
376 break;
377 }
378 break;
379 case SWR_TYPE_SINT:
380 // Sign extend SINT types.
381 switch(bpc)
382 {
383 case 8:
384 case 16:
385 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
386 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
387 break;
388 case 32:
389 break; // Pass through unchanged.
390 default:
391 SWR_ASSERT(false, "Unsupported underlying type!");
392 break;
393 }
394 break;
395 case SWR_TYPE_FLOAT:
396 switch(bpc)
397 {
398 case 32:
399 break; // Pass through unchanged.
400 default:
401 SWR_ASSERT(false, "Unsupported underlying type!");
402 }
403 break;
404 case SWR_TYPE_USCALED:
405 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
406 break;
407 case SWR_TYPE_SSCALED:
408 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
409 break;
410 case SWR_TYPE_UNKNOWN:
411 case SWR_TYPE_UNUSED:
412 SWR_ASSERT(false, "Unsupported type %d!", info.type[0]);
413 }
414
415 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
416 // uwvec: 4 x F32, undef value
417 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
418 vectors.push_back(wvec);
419 }
420
421 std::vector<Constant*> v01Mask(mVWidth);
422 std::vector<Constant*> v23Mask(mVWidth);
423 std::vector<Constant*> v02Mask(mVWidth);
424 std::vector<Constant*> v13Mask(mVWidth);
425
426 // Concatenate the vectors together.
427 elements[0] = VUNDEF_F();
428 elements[1] = VUNDEF_F();
429 elements[2] = VUNDEF_F();
430 elements[3] = VUNDEF_F();
431 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
432 {
433 v01Mask[4 * b + 0] = C(0 + 4 * b);
434 v01Mask[4 * b + 1] = C(1 + 4 * b);
435 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
436 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
437
438 v23Mask[4 * b + 0] = C(2 + 4 * b);
439 v23Mask[4 * b + 1] = C(3 + 4 * b);
440 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
441 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
442
443 v02Mask[4 * b + 0] = C(0 + 4 * b);
444 v02Mask[4 * b + 1] = C(2 + 4 * b);
445 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
446 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
447
448 v13Mask[4 * b + 0] = C(1 + 4 * b);
449 v13Mask[4 * b + 1] = C(3 + 4 * b);
450 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
451 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
452
453 std::vector<Constant*> iMask(mVWidth);
454 for(uint32_t i = 0; i < mVWidth; ++i)
455 {
456 if(((4 * b) <= i) && (i < (4 * (b + 1))))
457 {
458 iMask[i] = C(i % 4 + mVWidth);
459 }
460 else
461 {
462 iMask[i] = C(i);
463 }
464 }
465 Constant* insertMask = ConstantVector::get(iMask);
466 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
467 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
468 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
469 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
470 }
471
472 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
473 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
474 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
475 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
476 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
477 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
478 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
479 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
480
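// write default values into any components the format does not supply;
// intentional case fallthrough fills the remaining elements with (0, 0, 0, 1)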
481 switch(numComponents + 1)
482 {
483 case 1: elements[0] = VIMMED1(0.0f);
484 case 2: elements[1] = VIMMED1(0.0f);
485 case 3: elements[2] = VIMMED1(0.0f);
486 case 4: elements[3] = VIMMED1(1.0f);
487 }
488
489 for(uint32_t c = 0; c < 4; ++c)
490 {
491 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
492 STORE(elements[c], dest);
493 }
494 }
495 }
496
497 // returns true for odd formats that require special gather handling
498 bool FetchJit::IsOddFormat(SWR_FORMAT format)
499 {
500 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
501 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32)
502 {
503 return true;
504 }
505 return false;
506 }
507
508 // format is uniform if all components are the same size and type
509 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
510 {
511 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
512 uint32_t bpc0 = info.bpc[0];
513 uint32_t type0 = info.type[0];
514
515 for (uint32_t c = 1; c < info.numComps; ++c)
516 {
517 if (bpc0 != info.bpc[c] || type0 != info.type[c])
518 {
519 return false;
520 }
521 }
522 return true;
523 }
524
525 // unpacks components based on format
526 // foreach component in the pixel
527 // mask off everything but this component
528 // shift component to LSB
529 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
530 {
531 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
532
533 uint32_t bitOffset = 0;
534 for (uint32_t c = 0; c < info.numComps; ++c)
535 {
536 uint32_t swizzledIndex = info.swizzle[c];
537 uint32_t compBits = info.bpc[c];
538 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
539 Value* comp = AND(vInput, bitmask);
540 comp = LSHR(comp, bitOffset);
541
542 result[swizzledIndex] = comp;
543 bitOffset += compBits;
544 }
545 }
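// Illustrative scalar sketch (not part of the original fetch JIT; the helper
// name is hypothetical): the same mask-and-shift unpack UnpackComponents emits
// above, for a packed pixel whose per-component bit widths are in bpc[]
// (destination swizzle omitted, bit widths assumed < 32).
static inline void UnpackComponentsScalarRef(uint32_t pixel, const uint32_t bpc[4],
                                             uint32_t numComps, uint32_t result[4])
{
    uint32_t bitOffset = 0;
    for (uint32_t c = 0; c < numComps; ++c)
    {
        uint32_t mask = ((1u << bpc[c]) - 1) << bitOffset; // isolate this component's bits
        result[c] = (pixel & mask) >> bitOffset;           // shift the component down to the LSB
        bitOffset += bpc[c];
    }
}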
546
547 // gather for odd component size formats
548 // gather SIMD full pixels per lane then shift/mask to move each component to its
549 // own vector
550 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4])
551 {
552 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
553
554 // only works if pixel size is <= 32bits
555 SWR_ASSERT(info.bpp <= 32);
556
557 Value* gather = VUNDEF_I();
558
559 // assign defaults
560 for (uint32_t comp = 0; comp < 4; ++comp)
561 {
562 result[comp] = VIMMED1((int)info.defaults[comp]);
563 }
564
565 // gather SIMD pixels
566 for (uint32_t e = 0; e < JM()->mVWidth; ++e)
567 {
568 Value* elemOffset = VEXTRACT(offsets, C(e));
569 Value* load = GEP(pBase, elemOffset);
570
571 // load the proper amount of data based on component size
572 switch (info.bpp)
573 {
574 case 8: load = POINTER_CAST(load, Type::getInt8PtrTy(JM()->mContext)); break;
575 case 16: load = POINTER_CAST(load, Type::getInt16PtrTy(JM()->mContext)); break;
576 case 32: load = POINTER_CAST(load, Type::getInt32PtrTy(JM()->mContext)); break;
577 default: SWR_ASSERT(0);
578 }
579
580 // load pixel
581 Value *val = LOAD(load);
582
583 // zero extend to 32bit integer
584 val = INT_CAST(val, mInt32Ty, false);
585
586 // store in simd lane
587 gather = VINSERT(gather, val, C(e));
588 }
589
590 UnpackComponents(format, gather, result);
591
592 // cast to fp32
593 result[0] = BITCAST(result[0], mSimdFP32Ty);
594 result[1] = BITCAST(result[1], mSimdFP32Ty);
595 result[2] = BITCAST(result[2], mSimdFP32Ty);
596 result[3] = BITCAST(result[3], mSimdFP32Ty);
597 }
598
599 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
600 {
601 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
602
603 for (uint32_t c = 0; c < info.numComps; ++c)
604 {
605 uint32_t compIndex = info.swizzle[c];
606
607 // skip any conversion on UNUSED components
608 if (info.type[c] == SWR_TYPE_UNUSED)
609 {
610 continue;
611 }
612
613 if (info.isNormalized[c])
614 {
615 if (info.type[c] == SWR_TYPE_SNORM)
616 {
617 /// @todo The most-negative value should clamp to -1.0f, e.g. the 5-bit value 10000b should map to -1.0f.
618
619 /// result = c * (1.0f / (2^(n-1) - 1))
620 uint32_t n = info.bpc[c];
621 uint32_t pow2 = 1 << (n - 1);
622 float scale = 1.0f / (float)(pow2 - 1);
623 Value *vScale = VIMMED1(scale);
624 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
625 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
626 texels[compIndex] = FMUL(texels[compIndex], vScale);
627 }
628 else
629 {
630 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
631
632 /// result = c * (1.0f / (2^n - 1))
633 uint32_t n = info.bpc[c];
634 uint32_t pow2 = 1 << n;
635 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
636 if (n == 24)
637 {
638 float scale = (float)(pow2 - 1);
639 Value* vScale = VIMMED1(scale);
640 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
641 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
642 texels[compIndex] = FDIV(texels[compIndex], vScale);
643 }
644 else
645 {
646 float scale = 1.0f / (float)(pow2 - 1);
647 Value *vScale = VIMMED1(scale);
648 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
649 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
650 texels[compIndex] = FMUL(texels[compIndex], vScale);
651 }
652 }
653 continue;
654 }
655 }
656 }
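// Illustrative scalar sketch (not part of the original fetch JIT; the helper
// name is hypothetical): the per-component normalization ConvertFormat emits
// above, assuming the raw value is already sign/zero extended to 32 bits and
// bpc < 32 (the 24bit full-divide special case is omitted).
static inline float NormalizeComponentRef(int32_t raw, uint32_t bpc, bool isSnorm)
{
    if (isSnorm)
    {
        // SNORM: result = c * (1.0f / (2^(n-1) - 1))
        return (float)raw * (1.0f / (float)((1u << (bpc - 1)) - 1));
    }
    // UNORM: result = c * (1.0f / (2^n - 1))
    return (float)(uint32_t)raw * (1.0f / (float)((1u << bpc) - 1));
}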
657
658 //////////////////////////////////////////////////////////////////////////
659 /// @brief Loads attributes from memory using AVX2 GATHER(s)
660 /// @param fetchState - info about attributes to be fetched from memory
661 /// @param fetchInfo - first argument passed to fetch shader
662 /// @param streams - value pointer to the current vertex stream
663 /// @param vIndices - vector value of indices to gather
664 /// @param pVtxOut - value pointer to output simdvertex struct
665 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo,
666 Value* streams, Value* vIndices, Value* pVtxOut)
667 {
668 uint32_t currentVertexElement = 0;
669 uint32_t outputElt = 0;
670 Value* vVertexElements[4];
671
672 Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
673 Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
674 Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
675 Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
676 curInstance->setName("curInstance");
677
678 for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
679 {
680 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
681 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
682 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
683 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
684
685 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
686
687 // VGATHER* takes an *i8 src pointer
688 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
689
690 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
691 Value *vStride = VBROADCAST(stride);
692
693 // max vertex index that is fully in bounds
694 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
695 maxVertex = LOAD(maxVertex);
696
697 Value *vCurIndices;
698 Value *startOffset;
699 if(ied.InstanceEnable)
700 {
701 Value* stepRate = C(ied.InstanceDataStepRate);
702
703 // prevent a div by 0 for 0 step rate
704 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
705 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
706
707 // calc the current offset into instanced data buffer
708 Value* calcInstance = UDIV(curInstance, stepRate);
709
710 // if step rate is 0, every instance gets instance 0
711 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
712
713 vCurIndices = VBROADCAST(calcInstance);
714
715 startOffset = startInstance;
716 }
717 else
718 {
719 // offset indices by baseVertex
720 vCurIndices = ADD(vIndices, vBaseVertex);
721
722 startOffset = startVertex;
723 }
724
725 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
726 // do 64bit address offset calculations.
727
728 // calculate byte offset to the start of the VB
729 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
730 pStreamBase = GEP(pStreamBase, baseOffset);
731
732 // if we have a start offset, subtract from max vertex. Used for OOB check
733 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
734 Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
735 // if we have a negative value, we're already OOB. clamp at 0.
736 maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
737
738 // Load the in bounds size of a partially valid vertex
739 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
740 partialInboundsSize = LOAD(partialInboundsSize);
741 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
742 Value* vBpp = VBROADCAST(C(info.Bpp));
743 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
744
745 // is the element <= the partially valid size
746 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
747
748 // are vertices partially OOB?
749 Value* vMaxVertex = VBROADCAST(maxVertex);
750 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
751
752 // are vertices fully in bounds?
753 Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
754
755 // blend in any partially OOB indices that have valid elements
756 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
757 vGatherMask = VMASK(vGatherMask);
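// per lane: gather when (index < maxVertex) ||
//           ((index == maxVertex) && (AlignedByteOffset + Bpp <= partialInboundsSize))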
758
759 // calculate the actual offsets into the VB
760 Value* vOffsets = MUL(vCurIndices, vStride);
761 vOffsets = ADD(vOffsets, vAlignmentOffsets);
762
763 // Packing and component control
764 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
765 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
766 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
767
768 // Special gather/conversion for formats without equal component sizes
769 if (IsOddFormat((SWR_FORMAT)ied.Format))
770 {
771 // Only full 4 component fetch is supported for odd formats
772 SWR_ASSERT(compMask == XYZW);
773 Value* pResults[4];
774 CreateGatherOddFormats((SWR_FORMAT)ied.Format, pStreamBase, vOffsets, pResults);
775 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
776
777 // check for InstanceID SGV
778 if (fetchState.InstanceIdEnable && (fetchState.InstanceIdElementOffset == nInputElt))
779 {
780 SWR_ASSERT(fetchState.InstanceIdComponentNumber < (sizeof(pResults) / sizeof(pResults[0])));
781
782 // Load a SIMD of InstanceIDs
783 pResults[fetchState.InstanceIdComponentNumber] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
784 }
785 // check for VertexID SGV
786 else if (fetchState.VertexIdEnable && (fetchState.VertexIdElementOffset == nInputElt))
787 {
788 SWR_ASSERT(fetchState.VertexIdComponentNumber < (sizeof(pResults) / sizeof(pResults[0])));
789
790 // Load a SIMD of VertexIDs
791 pResults[fetchState.VertexIdComponentNumber] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
792 }
793
794 StoreVertexElements(pVtxOut, outputElt++, 4, pResults);
795 currentVertexElement = 0;
796 }
797 else if(info.type[0] == SWR_TYPE_FLOAT)
798 {
799 ///@todo: support 64 bit vb accesses
800 Value* gatherSrc = VIMMED1(0.0f);
801
802 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
803 "Unsupported format for standard gather fetch.");
804
805 // Gather components from memory to store in a simdvertex structure
806 switch(bpc)
807 {
808 case 16:
809 {
810 Value* vGatherResult[2];
811 Value *vMask;
812
813 // if we have at least one component out of x or y to fetch
814 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
815 // save mask as it is zero'd out after each gather
816 vMask = vGatherMask;
817
818 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
819 // e.g. result of first 8x32bit integer gather for 16bit components
820 // 256i - 0 1 2 3 4 5 6 7
821 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
822 //
823 }
824
825 // if we have at least one component out of z or w to fetch
826 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
827 // offset base to the next components(zw) in the vertex to gather
828 pStreamBase = GEP(pStreamBase, C((char)4));
829 vMask = vGatherMask;
830
831 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
832 // e.g. result of second 8x32bit integer gather for 16bit components
833 // 256i - 0 1 2 3 4 5 6 7
834 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
835 //
836 }
837
838 // if we have at least one component to shuffle into place
839 if(compMask){
840 const bool instanceIdEnable = (fetchState.InstanceIdEnable) && (fetchState.InstanceIdElementOffset == nInputElt);
841 const bool vertexIdEnable = (fetchState.VertexIdEnable) && (fetchState.VertexIdElementOffset == nInputElt);
842
843 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
844 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, fetchInfo, instanceIdEnable,
845 fetchState.InstanceIdComponentNumber, vertexIdEnable, fetchState.VertexIdComponentNumber);
846
847 // Shuffle gathered components into place in simdvertex struct
848 Shuffle16bpcGather(args); // outputs to vVertexElements ref
849 }
850 }
851 break;
852 case 32:
853 {
854 for (uint32_t i = 0; i < 4; i++)
855 {
856 if (isComponentEnabled(compMask, i))
857 {
858 // check for InstanceID SGV
859 if ((fetchState.InstanceIdEnable) && (fetchState.InstanceIdElementOffset == nInputElt) && (fetchState.InstanceIdComponentNumber == currentVertexElement))
860 {
861 // Load a SIMD of InstanceIDs
862 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
863 }
864 // check for VertexID SGV
865 else if ((fetchState.VertexIdEnable) && (fetchState.VertexIdElementOffset == nInputElt) && (fetchState.VertexIdComponentNumber == currentVertexElement))
866 {
867 // Load a SIMD of VertexIDs
868 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
869 }
870 // if we need to gather the component
871 else if (compCtrl[i] == StoreSrc)
872 {
873 // save mask as it is zero'd out after each gather
874 Value *vMask = vGatherMask;
875
876 // Gather a SIMD of vertices
877 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
878 }
879 else
880 {
881 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
882 }
883
884 if (currentVertexElement > 3)
885 {
886 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
887 // reset to the next vVertexElement to output
888 currentVertexElement = 0;
889 }
890
891 }
892
893 // offset base to the next component in the vertex to gather
894 pStreamBase = GEP(pStreamBase, C((char)4));
895 }
896 }
897 break;
898 default:
899 SWR_ASSERT(0, "Tried to fetch invalid FP format");
900 break;
901 }
902 }
903 else
904 {
905 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
906 ConversionType conversionType = CONVERT_NONE;
907
908 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
909 "Unsupported format for standard gather fetch.");
910
911 switch(info.type[0])
912 {
913 case SWR_TYPE_UNORM:
914 conversionType = CONVERT_NORMALIZED;
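// fall through: UNORM shares UINT's zero extend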
915 case SWR_TYPE_UINT:
916 extendCastType = Instruction::CastOps::ZExt;
917 break;
918 case SWR_TYPE_SNORM:
919 conversionType = CONVERT_NORMALIZED;
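// fall through: SNORM shares SINT's sign extend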
920 case SWR_TYPE_SINT:
921 extendCastType = Instruction::CastOps::SExt;
922 break;
923 case SWR_TYPE_USCALED:
924 conversionType = CONVERT_USCALED;
925 extendCastType = Instruction::CastOps::UIToFP;
926 break;
927 case SWR_TYPE_SSCALED:
928 conversionType = CONVERT_SSCALED;
929 extendCastType = Instruction::CastOps::SIToFP;
930 break;
931 default:
932 break;
933 }
934
935 // value substituted when component of gather is masked
936 Value* gatherSrc = VIMMED1(0);
937
938 // Gather components from memory to store in a simdvertex structure
939 switch (bpc)
940 {
941 case 8:
942 {
943 // if we have at least one component to fetch
944 if(compMask)
945 {
946 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
947 // e.g. result of an 8x32bit integer gather for 8bit components
948 // 256i - 0 1 2 3 4 5 6 7
949 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
950
951 const bool instanceIdEnable = fetchState.InstanceIdEnable && (fetchState.InstanceIdElementOffset == nInputElt);
952 const bool vertexIdEnable = fetchState.VertexIdEnable && (fetchState.VertexIdElementOffset == nInputElt);
953
954 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
955 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle, fetchInfo,
956 instanceIdEnable, fetchState.InstanceIdComponentNumber, vertexIdEnable, fetchState.VertexIdComponentNumber);
957
958 // Shuffle gathered components into place in simdvertex struct
959 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
960 }
961 }
962 break;
963 case 16:
964 {
965 Value* vGatherResult[2];
966 Value *vMask;
967
968 // if we have at least one component out of x or y to fetch
969 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
970 // save mask as it is zero'd out after each gather
971 vMask = vGatherMask;
972
973 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
974 // e.g. result of first 8x32bit integer gather for 16bit components
975 // 256i - 0 1 2 3 4 5 6 7
976 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
977 //
978 }
979
980 // if we have at least one component out of z or w to fetch
981 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
982 // offset base to the next components(zw) in the vertex to gather
983 pStreamBase = GEP(pStreamBase, C((char)4));
984 vMask = vGatherMask;
985
986 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
987 // e.g. result of second 8x32bit integer gather for 16bit components
988 // 256i - 0 1 2 3 4 5 6 7
989 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
990 //
991 }
992
993 // if we have at least one component to shuffle into place
994 if(compMask){
995 const bool instanceIdEnable = fetchState.InstanceIdEnable && (fetchState.InstanceIdElementOffset == nInputElt);
996 const bool vertexIdEnable = fetchState.VertexIdEnable && (fetchState.VertexIdElementOffset == nInputElt);
997
998 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
999 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, fetchInfo, instanceIdEnable,
1000 fetchState.InstanceIdComponentNumber, vertexIdEnable, fetchState.VertexIdComponentNumber);
1001
1002 // Shuffle gathered components into place in simdvertex struct
1003 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1004 }
1005 }
1006 break;
1007 case 32:
1008 {
1009 SWR_ASSERT(conversionType == CONVERT_NONE);
1010
1011 // Gathered components into place in simdvertex struct
1012 for (uint32_t i = 0; i < 4; i++)
1013 {
1014 if (isComponentEnabled(compMask, i))
1015 {
1016 // check for InstanceID SGV
1017 if (fetchState.InstanceIdEnable && (fetchState.InstanceIdElementOffset == nInputElt) && (fetchState.InstanceIdComponentNumber == currentVertexElement))
1018 {
1019 // Load a SIMD of InstanceIDs
1020 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
1021 }
1022 // check for VertexID SGV
1023 else if (fetchState.VertexIdEnable && (fetchState.VertexIdElementOffset == nInputElt) && (fetchState.VertexIdComponentNumber == currentVertexElement))
1024 {
1025 // Load a SIMD of VertexIDs
1026 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
1027 }
1028 // if we need to gather the component
1029 else if (compCtrl[i] == StoreSrc)
1030 {
1031 // save mask as it is zero'd out after each gather
1032 Value *vMask = vGatherMask;
1033
1034 vVertexElements[currentVertexElement++] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1035
1036 // e.g. result of a single 8x32bit integer gather for 32bit components
1037 // 256i - 0 1 2 3 4 5 6 7
1038 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1039 }
1040 else
1041 {
1042 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1043 }
1044
1045 if (currentVertexElement > 3)
1046 {
1047 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1048 // reset to the next vVertexElement to output
1049 currentVertexElement = 0;
1050 }
1051
1052 }
1053
1054 // offset base to the next component in the vertex to gather
1055 pStreamBase = GEP(pStreamBase, C((char)4));
1056 }
1057 }
1058 break;
1059 }
1060 }
1061 }
1062
1063 // if we have a partially filled vVertexElement struct, output it
1064 if(currentVertexElement > 0){
1065 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1066 }
1067 }
1068
1069 //////////////////////////////////////////////////////////////////////////
1070 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1071 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1072 /// support
1073 /// @param pIndices - pointer to 8 bit indices
1074 /// @param pLastIndex - pointer to last valid index
1075 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1076 {
1077 // can fit 4 8 bit integers per vWidth lane
1078 Value* vIndices = VUNDEF_I();
1079
1080 // store 0 index on stack to be used to conditionally load from if index address is OOB
1081 Value* pZeroIndex = ALLOCA(mInt8Ty);
1082 STORE(C((uint8_t)0), pZeroIndex);
1083
1084 // Load a SIMD of index pointers
1085 for(int64_t lane = 0; lane < mVWidth; lane++)
1086 {
1087 // Calculate the address of the requested index
1088 Value *pIndex = GEP(pIndices, C(lane));
1089
1090 // check if the address is less than the max index,
1091 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1092
1093 // if valid, load the index. if not, load 0 from the stack
1094 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1095 Value *index = LOAD(pValid, "valid index");
1096
1097 // zero extend the index to 32 bits and insert it into the correct simd lane
1098 index = Z_EXT(index, mInt32Ty);
1099 vIndices = VINSERT(vIndices, index, lane);
1100 }
1101 return vIndices;
1102 }
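// Illustrative scalar sketch (not part of the original fetch JIT; the helper
// name is hypothetical): the per-lane bounds check performed above; lanes
// whose index address is out of bounds read an index of 0 instead.
static inline uint32_t LoadValidIndexRef(const uint8_t* pIndex, const uint8_t* pLastIndex)
{
    return (pIndex < pLastIndex) ? (uint32_t)(*pIndex) : 0u;
}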
1103
1104 //////////////////////////////////////////////////////////////////////////
1105 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1106 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1107 /// support
1108 /// @param pIndices - pointer to 16 bit indices
1109 /// @param pLastIndex - pointer to last valid index
1110 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1111 {
1112 // can fit 2 16 bit integers per vWidth lane
1113 Value* vIndices = VUNDEF_I();
1114
1115 // store 0 index on stack to be used to conditionally load from if index address is OOB
1116 Value* pZeroIndex = ALLOCA(mInt16Ty);
1117 STORE(C((uint16_t)0), pZeroIndex);
1118
1119 // Load a SIMD of index pointers
1120 for(int64_t lane = 0; lane < mVWidth; lane++)
1121 {
1122 // Calculate the address of the requested index
1123 Value *pIndex = GEP(pIndices, C(lane));
1124
1125 // check if the address is less than the max index,
1126 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1127
1128 // if valid, load the index. if not, load 0 from the stack
1129 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1130 Value *index = LOAD(pValid, "valid index");
1131
1132 // zero extend the index to 32 bits and insert it into the correct simd lane
1133 index = Z_EXT(index, mInt32Ty);
1134 vIndices = VINSERT(vIndices, index, lane);
1135 }
1136 return vIndices;
1137 }
1138
1139 //////////////////////////////////////////////////////////////////////////
1140 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1141 /// @param pIndices - pointer to 32 bit indices
1142 /// @param pLastIndex - pointer to last valid index
1143 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1144 {
1145 DataLayout dL(JM()->mpCurrentModule);
1146 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1147 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1148 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1149
1150 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1151 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1152 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1153 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1154
1155 // create a vector of index counts from the base index ptr passed into the fetch
1156 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1157 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1158
1159 // compare index count to the max valid index
1160 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1161 // vIndexOffsets 0 1 2 3 4 5 6 7
1162 // ------------------------------
1163 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1164 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1165 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1166 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1167
1168 // VMASKLOAD takes an *i8 src pointer
1169 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1170
1171 // Load the indices; OOB loads 0
1172 return MASKLOADD(pIndices,vIndexMask);
1173 }
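// Illustrative scalar sketch (not part of the original fetch JIT; the helper
// name is hypothetical): the masked-load predicate built above. Lane i loads
// its index only while i is below the number of 32bit indices remaining in
// the buffer; masked-off lanes are zero-filled by MASKLOADD.
static inline bool LaneLoadsIndexRef(const uint32_t* pIndices, const uint32_t* pLastIndex, int32_t lane)
{
    auto numIndicesLeft = pLastIndex - pIndices; // (endPtr - curPtr) / sizeof(index)
    return lane < numIndicesLeft;
}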
1174
1175 //////////////////////////////////////////////////////////////////////////
1176 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1177 /// denormalizes if needed, converts to F32 if needed, and positions in
1178 /// the proper SIMD rows to be output to the simdvertex structure
1179 /// @param args: (tuple of args, listed below)
1180 /// @param vGatherResult - 8 gathered 8bpc vertices
1181 /// @param pVtxOut - base pointer to output simdvertex struct
1182 /// @param extendType - sign extend or zero extend
1183 /// @param conversionType - conversion type (normalized, scaled, or none)
1184 /// @param currentVertexElement - reference to the current vVertexElement
1185 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1186 /// @param compMask - component packing mask
1187 /// @param compCtrl - component control val
1188 /// @param vVertexElements[4] - vertex components to output
1189 /// @param swizzle[4] - component swizzle location
1190 /// @param fetchInfo - fetch shader info
1191 /// @param instanceIdEnable - InstanceID enabled?
1192 /// @param instanceIdComponentNumber - InstanceID component override
1193 /// @param vertexIdEnable - VertexID enabled?
1194 /// @param vertexIdComponentNumber - VertexID component override
1195 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1196 {
1197 // Unpack tuple args
1198 Value*& vGatherResult = std::get<0>(args);
1199 Value* pVtxOut = std::get<1>(args);
1200 const Instruction::CastOps extendType = std::get<2>(args);
1201 const ConversionType conversionType = std::get<3>(args);
1202 uint32_t &currentVertexElement = std::get<4>(args);
1203 uint32_t &outputElt = std::get<5>(args);
1204 const ComponentEnable compMask = std::get<6>(args);
1205 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1206 Value* (&vVertexElements)[4] = std::get<8>(args);
1207 const uint32_t (&swizzle)[4] = std::get<9>(args);
1208 Value *fetchInfo = std::get<10>(args);
1209 const bool instanceIdEnable = std::get<11>(args);
1210 const uint32_t instanceIdComponentNumber = std::get<12>(args);
1211 const bool vertexIdEnable = std::get<13>(args);
1212 const uint32_t vertexIdComponentNumber = std::get<14>(args);
1213
1214 // cast types
1215 Type* vGatherTy = mSimdInt32Ty;
1216 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1217
1218 // have to do extra work for sign extending
1219 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1220 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1221 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1222
1223 // shuffle mask, including any swizzling
1224 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1225 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1226 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1227 char(y), char(y+4), char(y+8), char(y+12),
1228 char(z), char(z+4), char(z+8), char(z+12),
1229 char(w), char(w+4), char(w+8), char(w+12),
1230 char(x), char(x+4), char(x+8), char(x+12),
1231 char(y), char(y+4), char(y+8), char(y+12),
1232 char(z), char(z+4), char(z+8), char(z+12),
1233 char(w), char(w+4), char(w+8), char(w+12)});
1234
1235 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1236 // after pshufb: group components together in each 128bit lane
1237 // 256i - 0 1 2 3 4 5 6 7
1238 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1239
1240 Value* vi128XY = nullptr;
1241 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1242 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1243 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1244 // 256i - 0 1 2 3 4 5 6 7
1245 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1246 }
1247
1248 // do the same for zw components
1249 Value* vi128ZW = nullptr;
1250 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1251 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1252 }
1253
1254 // init denormalize variables if needed
1255 Instruction::CastOps fpCast;
1256 Value* conversionFactor;
1257
1258 switch (conversionType)
1259 {
1260 case CONVERT_NORMALIZED:
1261 fpCast = Instruction::CastOps::SIToFP;
1262 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1263 break;
1264 case CONVERT_SSCALED:
1265 fpCast = Instruction::CastOps::SIToFP;
1266 conversionFactor = VIMMED1((float)(1.0));
1267 break;
1268 case CONVERT_USCALED:
1269 SWR_ASSERT(0, "Type should not be sign extended!");
1270 conversionFactor = nullptr;
1271 break;
1272 default:
1273 SWR_ASSERT(conversionType == CONVERT_NONE);
1274 conversionFactor = nullptr;
1275 break;
1276 }
1277
1278 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1279 for (uint32_t i = 0; i < 4; i++)
1280 {
1281 if (isComponentEnabled(compMask, i))
1282 {
1283 // check for InstanceID SGV
1284 if (instanceIdEnable && (instanceIdComponentNumber == currentVertexElement))
1285 {
1286 // Load a SIMD of InstanceIDs
1287 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
1288 }
1289 // check for VertexID SGV
1290 else if (vertexIdEnable && (vertexIdComponentNumber == currentVertexElement))
1291 {
1292 // Load a SIMD of VertexIDs
1293 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
1294 }
1295 else if (compCtrl[i] == ComponentControl::StoreSrc)
1296 {
1297 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1298 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1299 // if x or y, use vi128XY permute result, else use vi128ZW
1300 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1301
1302 // sign extend
1303 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1304
1305 // denormalize if needed
1306 if (conversionType != CONVERT_NONE)
1307 {
1308 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1309 }
1310 currentVertexElement++;
1311 }
1312 else
1313 {
1314 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1315 }
1316
1317 if (currentVertexElement > 3)
1318 {
1319 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1320 // reset to the next vVertexElement to output
1321 currentVertexElement = 0;
1322 }
1323 }
1324 }
1325 }
1326 // else zero extend
1327 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1328 {
1329 // init denormalize variables if needed
1330 Instruction::CastOps fpCast;
1331 Value* conversionFactor;
1332
1333 switch (conversionType)
1334 {
1335 case CONVERT_NORMALIZED:
1336 fpCast = Instruction::CastOps::UIToFP;
1337 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1338 break;
1339 case CONVERT_USCALED:
1340 fpCast = Instruction::CastOps::UIToFP;
1341 conversionFactor = VIMMED1((float)(1.0));
1342 break;
1343 case CONVERT_SSCALED:
1344 SWR_ASSERT(0, "Type should not be zero extended!");
1345 conversionFactor = nullptr;
1346 break;
1347 default:
1348 SWR_ASSERT(conversionType == CONVERT_NONE);
1349 conversionFactor = nullptr;
1350 break;
1351 }
1352
1353 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1354 for (uint32_t i = 0; i < 4; i++)
1355 {
1356 if (isComponentEnabled(compMask, i))
1357 {
1358 // check for InstanceID SGV
1359 if (instanceIdEnable && (instanceIdComponentNumber == currentVertexElement))
1360 {
1361 // Load a SIMD of InstanceIDs
1362 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
1363 }
1364 // check for VertexID SGV
1365 else if (vertexIdEnable && (vertexIdComponentNumber == currentVertexElement))
1366 {
1367 // Load a SIMD of VertexIDs
1368 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
1369 }
1370 else if (compCtrl[i] == ComponentControl::StoreSrc)
1371 {
1372 // pshufb masks for each component
1373 Value* vConstMask;
1374 switch (swizzle[i])
1375 {
1376 case 0:
1377 // x shuffle mask
1378 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1379 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1380 break;
1381 case 1:
1382 // y shuffle mask
1383 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1384 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1385 break;
1386 case 2:
1387 // z shuffle mask
1388 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1389 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1390 break;
1391 case 3:
1392 // w shuffle mask
1393 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1394 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1395 break;
1396 default:
1397 vConstMask = nullptr;
1398 break;
1399 }
1400
1401 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1402 // after pshufb for x channel
1403 // 256i - 0 1 2 3 4 5 6 7
1404 // x000 x000 x000 x000 x000 x000 x000 x000
1405
1406 // denormalize if needed
1407 if (conversionType != CONVERT_NONE)
1408 {
1409 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1410 }
1411 currentVertexElement++;
1412 }
1413 else
1414 {
1415 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1416 }
1417
1418 if (currentVertexElement > 3)
1419 {
1420 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1421 // reset to the next vVertexElement to output
1422 currentVertexElement = 0;
1423 }
1424 }
1425 }
1426 }
1427 else
1428 {
1429 SWR_ASSERT(0, "Unsupported conversion type");
1430 }
1431 }
1432
1433 //////////////////////////////////////////////////////////////////////////
1434 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1435 /// denormalizes if needed, converts to F32 if needed, and positions in
1436 /// the proper SIMD rows to be output to the simdvertex structure
1437 /// @param args: (tuple of args, listed below)
1438 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1439 /// @param pVtxOut - base pointer to output simdvertex struct
1440 /// @param extendType - sign extend or zero extend
1441 /// @param conversionType - conversion type (normalized, scaled, or none)
1442 /// @param currentVertexElement - reference to the current vVertexElement
1443 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1444 /// @param compMask - component packing mask
1445 /// @param compCtrl - component control val
1446 /// @param vVertexElements[4] - vertex components to output
1447 /// @param fetchInfo - fetch shader info
1448 /// @param instanceIdEnable - InstanceID enabled?
1449 /// @param instanceIdComponentNumber - InstanceID component override
1450 /// @param vertexIdEnable - VertexID enabled?
1451 /// @param vertexIdComponentNumber - VertexID component override
1452 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1453 {
1454 // Unpack tuple args
1455 Value* (&vGatherResult)[2] = std::get<0>(args);
1456 Value* pVtxOut = std::get<1>(args);
1457 const Instruction::CastOps extendType = std::get<2>(args);
1458 const ConversionType conversionType = std::get<3>(args);
1459 uint32_t &currentVertexElement = std::get<4>(args);
1460 uint32_t &outputElt = std::get<5>(args);
1461 const ComponentEnable compMask = std::get<6>(args);
1462 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1463 Value* (&vVertexElements)[4] = std::get<8>(args);
1464 Value *fetchInfo = std::get<9>(args);
1465 const bool instanceIdEnable = std::get<10>(args);
1466 const uint32_t instanceIdComponentNumber = std::get<11>(args);
1467 const bool vertexIdEnable = std::get<12>(args);
1468 const uint32_t vertexIdComponentNumber = std::get<13>(args);
1469
1470 // cast types
1471 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1472 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
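// For example, on an 8-wide (AVX) build mVWidth == 8, so vGatherTy is <8 x i32>
// and v32x8Ty is <32 x i8>: the same 256 bits of gathered data reinterpreted as
// bytes so the PSHUFB shuffles below can rearrange the 16-bit components within
// each 128-bit lane.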
1473
1474 // have to do extra work for sign extending
1475 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
1476 (extendType == Instruction::CastOps::FPExt))
1477 {
1478 // is this PP float?
1479 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1480
1481 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1482 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1483
1484 // shuffle mask
1485 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1486 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1487 Value* vi128XY = nullptr;
1488 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1489 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1490 // after pshufb: group components together in each 128bit lane
1491 // 256i - 0 1 2 3 4 5 6 7
1492 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1493
1494 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1495 // after PERMD: move and pack xy components into each 128bit lane
1496 // 256i - 0 1 2 3 4 5 6 7
1497 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1498 }
1499
1500 // do the same for zw components
1501 Value* vi128ZW = nullptr;
1502 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1503 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1504 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1505 }
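// At this point (when the corresponding components are enabled) vi128XY holds all
// mVWidth X values packed into its low 128-bit lane and the Y values in its high
// lane, and vi128ZW holds Z/W the same way. The loop below can therefore VEXTRACT
// a single 128-bit lane to obtain the packed 16-bit values of one component.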
1506
1507 // init denormalize variables if needed
1508 Instruction::CastOps IntToFpCast;
1509 Value* conversionFactor;
1510
1511 switch (conversionType)
1512 {
1513 case CONVERT_NORMALIZED:
1514 IntToFpCast = Instruction::CastOps::SIToFP;
1515 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1516 break;
1517 case CONVERT_SSCALED:
1518 IntToFpCast = Instruction::CastOps::SIToFP;
1519 conversionFactor = VIMMED1((float)(1.0));
1520 break;
1521 case CONVERT_USCALED:
1522 SWR_ASSERT(0, "Type should not be sign extended!");
1523 conversionFactor = nullptr;
1524 break;
1525 default:
1526 SWR_ASSERT(conversionType == CONVERT_NONE);
1527 conversionFactor = nullptr;
1528 break;
1529 }
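// Worked example for CONVERT_NORMALIZED (SNORM16): a raw component of 16384
// sign-extends to 16384 and scales by 1/32767 to ~0.50002f, while -32768 scales
// to ~-1.00003f (no clamping is applied in this path).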
1530
1531 // sign extend all enabled components. Once vVertexElements is full, output to the current simdvertex
1532 for (uint32_t i = 0; i < 4; i++)
1533 {
1534 if (isComponentEnabled(compMask, i))
1535 {
1536 // check for InstanceID SGV
1537 if (instanceIdEnable && (instanceIdComponentNumber == currentVertexElement))
1538 {
1539 // Load a SIMD of InstanceIDs
1540 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
1541 }
1542 // check for VertexID SGV
1543 else if (vertexIdEnable && (vertexIdComponentNumber == currentVertexElement))
1544 {
1545 // Load a SIMD of VertexIDs
1546 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
1547 }
1548 else if (compCtrl[i] == ComponentControl::StoreSrc)
1549 {
1550 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1551 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1552 // if x or y, use vi128XY permute result, else use vi128ZW
1553 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1554
1555 if (bFP) {
1556 // extract 128 bit lanes and convert the packed FP16 values to FP32 (no sign extension needed)
1557 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1558 }
1559 else {
1560 // extract 128 bit lanes to sign extend each component
1561 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1562
1563 // denormalize if needed
1564 if (conversionType != CONVERT_NONE) {
1565 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1566 }
1567 }
1568 currentVertexElement++;
1569 }
1570 else
1571 {
1572 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1573 }
1574
1575 if (currentVertexElement > 3)
1576 {
1577 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1578 // reset to the next vVertexElement to output
1579 currentVertexElement = 0;
1580 }
1581 }
1582 }
1583 }
1584 // else zero extend
1585 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1586 {
1587 // pshufb masks for each component
1588 Value* vConstMask[2];
1589 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
1590 // x/z shuffle mask
1591 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1592 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1593 }
1594
1595 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
1596 // y/w shuffle mask
1597 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1598 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1599 }
1600
1601 // init denormalize variables if needed
1602 Instruction::CastOps fpCast;
1603 Value* conversionFactor;
1604
1605 switch (conversionType)
1606 {
1607 case CONVERT_NORMALIZED:
1608 fpCast = Instruction::CastOps::UIToFP;
1609 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1610 break;
1611 case CONVERT_USCALED:
1612 fpCast = Instruction::CastOps::UIToFP;
1613 conversionFactor = VIMMED1((float)(1.0f));
1614 break;
1615 case CONVERT_SSCALED:
1616 SWR_ASSERT(0, "Type should not be zero extended!");
1617 conversionFactor = nullptr;
1618 break;
1619 default:
1620 SWR_ASSERT(conversionType == CONVERT_NONE);
1621 conversionFactor = nullptr;
1622 break;
1623 }
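// Worked example for CONVERT_NORMALIZED (UNORM16): 65535 * (1/65535) = 1.0f and
// 32768 * (1/65535) is ~0.50001f; CONVERT_USCALED keeps the integer magnitude
// (e.g. a raw value of 300 becomes 300.0f).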
1624
1625 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1626 for (uint32_t i = 0; i < 4; i++)
1627 {
1628 if (isComponentEnabled(compMask, i))
1629 {
1630 // check for InstanceID SGV
1631 if (instanceIdEnable && (instanceIdComponentNumber == currentVertexElement))
1632 {
1633 // Load a SIMD of InstanceIDs
1634 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
1635 }
1636 // check for VertexID SGV
1637 else if (vertexIdEnable && (vertexIdComponentNumber == currentVertexElement))
1638 {
1639 // Load a SIMD of VertexIDs
1640 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
1641 }
1642 else if (compCtrl[i] == ComponentControl::StoreSrc)
1643 {
1644 // select correct constMask for x/z or y/w pshufb
1645 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1646 // if x or y, use vi128XY permute result, else use vi128ZW
1647 uint32_t selectedGather = (i < 2) ? 0 : 1;
1648
1649 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1650 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1651 // 256i - 0 1 2 3 4 5 6 7
1652 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1653
1654 // denormalize if needed
1655 if (conversionType != CONVERT_NONE)
1656 {
1657 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1658 }
1659 currentVertexElement++;
1660 }
1661 else
1662 {
1663 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1664 }
1665
1666 if (currentVertexElement > 3)
1667 {
1668 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1669 // reset to the next vVertexElement to output
1670 currentVertexElement = 0;
1671 }
1672 }
1673 }
1674 }
1675 else
1676 {
1677 SWR_ASSERT(0, "Unsupported conversion type");
1678 }
1679 }
1680
1681 //////////////////////////////////////////////////////////////////////////
1682 /// @brief Output a simdvertex worth of elements to the current outputElt
1683 /// @param pVtxOut - base address of VIN output struct
1684 /// @param outputElt - simdvertex offset in VIN to write to
1685 /// @param numEltsToStore - number of simdvertex rows to write out
1686 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1687 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1688 {
1689 for(uint32_t c = 0; c < numEltsToStore; ++c)
1690 {
1691 // STORE expects FP32 x vWidth type, just bitcast if needed
1692 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
1693 #if FETCH_DUMP_VERTEX
1694 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
1695 #endif
1696 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1697 }
1698 #if FETCH_DUMP_VERTEX
1699 else
1700 {
1701 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
1702 }
1703 #endif
1704 // outputElt * 4 = offsetting by the size of a simdvertex
1705 // + c offsets to a 32bit x vWidth row within the current vertex
1706 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1707 STORE(vVertexElements[c], dest);
1708 }
1709 }
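// Illustrative only: on an 8-wide build the GEP arithmetic above walks memory laid
// out like the hypothetical host-side mirror below (names are not from the SWR
// headers). "outputElt * 4 + c" selects vertex element 'outputElt', channel row 'c'.
//
//    struct SimdVertexElementSketch
//    {
//        float x[8];   // one 32bit x vWidth row per channel
//        float y[8];
//        float z[8];
//        float w[8];
//    };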
1710
1711 //////////////////////////////////////////////////////////////////////////
1712 /// @brief Generates a constant vector of values based on the
1713 /// ComponentControl value
1714 /// @param ctrl - ComponentControl value
1715 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1716 {
1717 switch(ctrl)
1718 {
1719 case NoStore: return VUNDEF_I();
1720 case Store0: return VIMMED1(0);
1721 case Store1Fp: return VIMMED1(1.0f);
1722 case Store1Int: return VIMMED1(1);
1723 case StoreSrc:
1724 default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
1725 }
1726 }
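// Typical use (not shown in this file): a two-component 16-bit format would
// normally be fetched with compCtrl = { StoreSrc, StoreSrc, Store0, Store1Fp },
// so the shuffle loops above emit the gathered X/Y while this helper fills Z
// with 0 and W with 1.0f.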
1727
1728 //////////////////////////////////////////////////////////////////////////
1729 /// @brief Returns the enable mask for the specified component.
1730 /// @param enableMask - enable bits
1731 /// @param component - component to check if enabled.
1732 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1733 {
1734 switch (component)
1735 {
1736 // X
1737 case 0: return (enableMask & ComponentEnable::X);
1738 // Y
1739 case 1: return (enableMask & ComponentEnable::Y);
1740 // Z
1741 case 2: return (enableMask & ComponentEnable::Z);
1742 // W
1743 case 3: return (enableMask & ComponentEnable::W);
1744
1745 default: return false;
1746 }
1747 }
1748
1749
1750 //////////////////////////////////////////////////////////////////////////
1751 /// @brief JITs from fetch shader IR
1752 /// @param hJitMgr - JitManager handle
1753 /// @param func - LLVM function IR
1754 /// @return PFN_FETCH_FUNC - pointer to fetch code
1755 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
1756 {
1757 const llvm::Function* func = (const llvm::Function*)hFunc;
1758 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1759 PFN_FETCH_FUNC pfnFetch;
1760
1761 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
1762 // MCJIT finalizes a module the first time code is JITted from it; once finalized, no new IR can be added to the module
1763 pJitMgr->mIsModuleFinalized = true;
1764
1765 #if defined(KNOB_SWRC_TRACING)
1766 char fName[1024];
1767 const char *funcName = func->getName().data();
1768 snprintf(fName, sizeof(fName), "%s.bin", funcName);
1769 FILE *fd = fopen(fName, "wb");
1770 if (fd)
1771 { fwrite((void *)pfnFetch, 1, 2048, fd); fclose(fd); }
1772 #endif
1773
1774 return pfnFetch;
1775 }
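// Note: because mIsModuleFinalized is set above, any later fetch shader has to be
// built in a fresh module; JitCompileFetch below calls SetupNewModule() for that
// reason.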
1776
1777 //////////////////////////////////////////////////////////////////////////
1778 /// @brief JIT compiles fetch shader
1779 /// @param hJitMgr - JitManager handle
1780 /// @param state - fetch state to build function from
1781 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
1782 {
1783 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1784
1785 pJitMgr->SetupNewModule();
1786
1787 FetchJit theJit(pJitMgr);
1788 HANDLE hFunc = theJit.Create(state);
1789
1790 return JitFetchFunc(hJitMgr, hFunc);
1791 }
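// Usage sketch (driver side, simplified; the call signature of the returned
// PFN_FETCH_FUNC is defined in the SWR headers, not here):
//
//    FETCH_COMPILE_STATE fetchState = {};
//    // ... fill fetchState from the current vertex layout and index type ...
//    PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, fetchState);
//    // pfnFetch is then cached with the draw state and called once per SIMD of vertices.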