swr: [rasterizer] Convert more SWR_ASSERT(false, ...) to SWR_INVALID(...)
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / fetch_jit.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "jit_api.h"
32 #include "fetch_jit.h"
33 #include "state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public Builder
56 {
57 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
58
59 Function* Create(const FETCH_COMPILE_STATE& fetchState);
60 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
61 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
62 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
63
64 // package up Shuffle*bpcGatherd args into a tuple for convenience
65 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
66 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
67 const uint32_t(&)[4]> Shuffle8bpcArgs;
68 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
69
70 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
71 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
72 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
73
74 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
75
76 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
77
78 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
79 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
80
81 bool IsOddFormat(SWR_FORMAT format);
82 bool IsUniformFormat(SWR_FORMAT format);
83 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
84 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
85 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
86
87 Value* mpFetchInfo;
88 };
89
90 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
91 {
92 static std::size_t fetchNum = 0;
93
94 std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
95 fnName << fetchNum++;
96
97 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
98 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
99
100 IRB()->SetInsertPoint(entry);
101
102 auto argitr = fetch->getArgumentList().begin();
103
104 // Fetch shader arguments
105 mpFetchInfo = &*argitr; ++argitr;
106 mpFetchInfo->setName("fetchInfo");
107 Value* pVtxOut = &*argitr;
108 pVtxOut->setName("vtxOutput");
109 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
110 // index 0 (just the pointer to the simdvertex structure)
111 // index 1 (which element of the simdvertex structure to offset to, in this case 0)
112 // so the indices being i32's doesn't matter
113 // TODO: generate this GEP with a VECTOR structure type so this makes sense
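// Illustrative only (not part of the generated IR): the GEP/BITCAST below just
// reinterpret pVtxOut as rows of simd-wide floats, roughly like a hypothetical
//   float (*pOut)[KNOB_SIMD_WIDTH] = (float (*)[KNOB_SIMD_WIDTH])pVtxOut;
// so each stored attribute component occupies one SIMD-wide row.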
114 std::vector<Value*> vtxInputIndices(2, C(0));
115 // GEP
116 pVtxOut = GEP(pVtxOut, C(0));
117 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
118
119 // SWR_FETCH_CONTEXT::pStreams
120 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
121 streams->setName("pStreams");
122
123 // SWR_FETCH_CONTEXT::pIndices
124 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
125 indices->setName("pIndices");
126
127 // SWR_FETCH_CONTEXT::pLastIndex
128 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
129 pLastIndex->setName("pLastIndex");
130
131
132 Value* vIndices;
133 switch(fetchState.indexType)
134 {
135 case R8_UINT:
136 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
137 if(fetchState.bDisableIndexOOBCheck){
138 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
139 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
140 }
141 else{
142 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
143 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
144 }
145 break;
146 case R16_UINT:
147 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
148 if(fetchState.bDisableIndexOOBCheck){
149 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
150 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
151 }
152 else{
153 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
154 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
155 }
156 break;
157 case R32_UINT:
158 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
159 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
160 break; // incoming type is already 32bit int
161 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
162 }
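// For example (illustrative, assuming an 8-wide SIMD): with R16_UINT indices and the
// OOB check enabled, each lane i loads pIndices[i] only when &pIndices[i] < pLastIndex,
// otherwise 0, and the result is zero extended to a simd of 32bit indices.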
163
164 Value* vVertexId = vIndices;
165 if (fetchState.bVertexIDOffsetEnable)
166 {
167 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
168 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
169 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
170 vVertexId = ADD(vIndices, vBaseVertex);
171 vVertexId = ADD(vVertexId, vStartVertex);
172 }
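// e.g. (illustrative): with indices {3, 4, ...}, BaseVertex = 10 and StartVertex = 0,
// the stored vertex IDs become {13, 14, ...}; per the note above this relies on one of
// BaseVertex/StartVertex being 0.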
173
174 // store out vertex IDs
175 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
176
177 // store out cut mask if enabled
178 if (fetchState.bEnableCutIndex)
179 {
180 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
181 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
182 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
183 }
184
185 // Fetch attributes from memory and output to a simdvertex struct
186 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
187 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
188 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
189
190 RET_VOID();
191
192 JitManager::DumpToFile(fetch, "src");
193
194 #if defined(_DEBUG)
195 verifyFunction(*fetch);
196 #endif
197
198 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
199
200 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
201 setupPasses.add(createBreakCriticalEdgesPass());
202 setupPasses.add(createCFGSimplificationPass());
203 setupPasses.add(createEarlyCSEPass());
204 setupPasses.add(createPromoteMemoryToRegisterPass());
205
206 setupPasses.run(*fetch);
207
208 JitManager::DumpToFile(fetch, "se");
209
210 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
211
212 ///@todo Haven't touched these either. Need to remove some of these and add others.
213 optPasses.add(createCFGSimplificationPass());
214 optPasses.add(createEarlyCSEPass());
215 optPasses.add(createInstructionCombiningPass());
216 optPasses.add(createInstructionSimplifierPass());
217 optPasses.add(createConstantPropagationPass());
218 optPasses.add(createSCCPPass());
219 optPasses.add(createAggressiveDCEPass());
220
221 optPasses.run(*fetch);
222 optPasses.run(*fetch);
223
224 JitManager::DumpToFile(fetch, "opt");
225
226 return fetch;
227 }
228
229 //////////////////////////////////////////////////////////////////////////
230 /// @brief Loads attributes from memory using LOADs, shuffling the
231 /// components into SOA form.
232 /// *Note* currently does not support component control,
233 /// component packing, instancing
234 /// @param fetchState - info about attributes to be fetched from memory
235 /// @param streams - value pointer to the current vertex stream
236 /// @param vIndices - vector value of indices to load
237 /// @param pVtxOut - value pointer to output simdvertex struct
238 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
239 {
240 // Zack shuffles; a variant of the Charleston.
241
242 std::vector<Value*> vectors(16);
243 std::vector<Constant*> pMask(mVWidth);
244 for(uint32_t i = 0; i < mVWidth; ++i)
245 {
246 pMask[i] = (C(i < 4 ? i : 4));
247 }
248 Constant* promoteMask = ConstantVector::get(pMask);
249 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
250
251 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
252 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
253 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
254 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
255 curInstance->setName("curInstance");
256
257 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
258 {
259 Value* elements[4] = {0};
260 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
261 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
262 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
263 uint32_t numComponents = info.numComps;
264 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
265
266 // load path doesn't support component packing
267 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
268
269 vectors.clear();
270
271 Value *vCurIndices;
272 Value *startOffset;
273 if(ied.InstanceEnable)
274 {
275 Value* stepRate = C(ied.InstanceDataStepRate);
276
277 // prevent a div by 0 for 0 step rate
278 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
279 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
280
281 // calc the current offset into instanced data buffer
282 Value* calcInstance = UDIV(curInstance, stepRate);
283
284 // if step rate is 0, every instance gets instance 0
285 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
286
287 vCurIndices = VBROADCAST(calcInstance);
288
289 startOffset = startInstance;
290 }
291 else
292 {
293 // offset indices by baseVertex
294 vCurIndices = ADD(vIndices, vBaseVertex);
295
296 startOffset = startVertex;
297 }
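// Worked example (illustrative): InstanceDataStepRate = 2 and curInstance = 5 gives
// calcInstance = 5 / 2 = 2, so instances 4 and 5 read the same instanced element; a step
// rate of 0 is clamped to 1 for the divide and then forced back to instance 0.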
298
299 // load SWR_VERTEX_BUFFER_STATE::pData
300 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
301
302 // load SWR_VERTEX_BUFFER_STATE::pitch
303 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
304 stride = Z_EXT(stride, mInt64Ty);
305
306 // load SWR_VERTEX_BUFFER_STATE::size
307 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
308 size = Z_EXT(size, mInt64Ty);
309
310 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
311
312 Value *minVertex = NULL;
313 Value *minVertexOffset = NULL;
314 if (fetchState.bPartialVertexBuffer) {
315 // fetch min index for low bounds checking
316 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
317 minVertex = LOAD(minVertex);
318 if (!fetchState.bDisableIndexOOBCheck) {
319 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
320 }
321 }
322
323 // Load from the stream.
324 for(uint32_t lane = 0; lane < mVWidth; ++lane)
325 {
326 // Get index
327 Value* index = VEXTRACT(vCurIndices, C(lane));
328
329 if (fetchState.bPartialVertexBuffer) {
330 // clamp below minvertex
331 Value *isBelowMin = ICMP_SLT(index, minVertex);
332 index = SELECT(isBelowMin, minVertex, index);
333 }
334
335 index = Z_EXT(index, mInt64Ty);
336
337 Value* offset = MUL(index, stride);
338 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
339 offset = ADD(offset, startVertexOffset);
340
341 if (!fetchState.bDisableIndexOOBCheck) {
342 // check for out of bound access, including partial OOB, and replace them with minVertex
343 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
344 Value *oob = ICMP_ULE(endOffset, size);
345 if (fetchState.bPartialVertexBuffer) {
346 offset = SELECT(oob, offset, minVertexOffset);
347 } else {
348 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
349 }
350 }
351
352 Value* pointer = GEP(stream, offset);
353 // We use a full-lane, but don't actually care.
354 Value* vptr = 0;
355
356 // get a pointer to a 4 component attrib in default address space
357 switch(bpc)
358 {
359 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
360 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
361 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
362 default: SWR_INVALID("Unsupported underlying bpp!");
363 }
364
365 // load 4 components of attribute
366 Value* vec = ALIGNED_LOAD(vptr, 1, false);
367
368 // Convert To FP32 internally
369 switch(info.type[0])
370 {
371 case SWR_TYPE_UNORM:
372 switch(bpc)
373 {
374 case 8:
375 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
376 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
377 break;
378 case 16:
379 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
380 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
381 break;
382 default:
383 SWR_INVALID("Unsupported underlying type!");
384 break;
385 }
386 break;
387 case SWR_TYPE_SNORM:
388 switch(bpc)
389 {
390 case 8:
391 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
392 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
393 break;
394 case 16:
395 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
396 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
397 break;
398 default:
399 SWR_INVALID("Unsupported underlying type!");
400 break;
401 }
402 break;
403 case SWR_TYPE_UINT:
404 // Zero extend UINT types.
405 switch(bpc)
406 {
407 case 8:
408 case 16:
409 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
410 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
411 break;
412 case 32:
413 break; // Pass through unchanged.
414 default:
415 SWR_INVALID("Unsupported underlying type!");
416 break;
417 }
418 break;
419 case SWR_TYPE_SINT:
420 // Sign extend SINT types.
421 switch(bpc)
422 {
423 case 8:
424 case 16:
425 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
426 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
427 break;
428 case 32:
429 break; // Pass through unchanged.
430 default:
431 SWR_INVALID("Unsupported underlying type!");
432 break;
433 }
434 break;
435 case SWR_TYPE_FLOAT:
436 switch(bpc)
437 {
438 case 32:
439 break; // Pass through unchanged.
440 default:
441 SWR_INVALID("Unsupported underlying type!");
442 }
443 break;
444 case SWR_TYPE_USCALED:
445 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
446 break;
447 case SWR_TYPE_SSCALED:
448 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
449 break;
450 case SWR_TYPE_SFIXED:
451 vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
452 break;
453 case SWR_TYPE_UNKNOWN:
454 case SWR_TYPE_UNUSED:
455 SWR_INVALID("Unsupported type %d!", info.type[0]);
456 }
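// Worked example (illustrative): for an 8bit UNORM component the 1/255 scale above maps
// raw 255 to 1.0f and raw 128 to ~0.502f; for 8bit SNORM the 1/128 scale maps raw -128
// to -1.0f and raw 127 to ~0.992f.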
457
458 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
459 // uwvec: 4 x F32, undef value
460 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
461 vectors.push_back(wvec);
462 }
463
464 std::vector<Constant*> v01Mask(mVWidth);
465 std::vector<Constant*> v23Mask(mVWidth);
466 std::vector<Constant*> v02Mask(mVWidth);
467 std::vector<Constant*> v13Mask(mVWidth);
468
469 // Concatenate the vectors together.
470 elements[0] = VUNDEF_F();
471 elements[1] = VUNDEF_F();
472 elements[2] = VUNDEF_F();
473 elements[3] = VUNDEF_F();
474 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
475 {
476 v01Mask[4 * b + 0] = C(0 + 4 * b);
477 v01Mask[4 * b + 1] = C(1 + 4 * b);
478 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
479 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
480
481 v23Mask[4 * b + 0] = C(2 + 4 * b);
482 v23Mask[4 * b + 1] = C(3 + 4 * b);
483 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
484 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
485
486 v02Mask[4 * b + 0] = C(0 + 4 * b);
487 v02Mask[4 * b + 1] = C(2 + 4 * b);
488 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
489 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
490
491 v13Mask[4 * b + 0] = C(1 + 4 * b);
492 v13Mask[4 * b + 1] = C(3 + 4 * b);
493 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
494 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
495
496 std::vector<Constant*> iMask(mVWidth);
497 for(uint32_t i = 0; i < mVWidth; ++i)
498 {
499 if(((4 * b) <= i) && (i < (4 * (b + 1))))
500 {
501 iMask[i] = C(i % 4 + mVWidth);
502 }
503 else
504 {
505 iMask[i] = C(i);
506 }
507 }
508 Constant* insertMask = ConstantVector::get(iMask);
509 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
510 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
511 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
512 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
513 }
514
515 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
516 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
517 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
518 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
519 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
520 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
521 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
522 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
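// The three rounds of shuffles above are a 4x4 AOS->SOA transpose. Illustrative 4-wide
// view: starting from per-vertex rows {x0 y0 z0 w0} .. {x3 y3 z3 w3}, the first round
// builds {x0 y0 x1 y1}, {x2 y2 x3 y3}, {z0 w0 z1 w1}, {z2 w2 z3 w3}, and the second
// round produces the per-component vectors {x0 x1 x2 x3}, {y0 y1 y2 y3}, etc.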
523
524 switch(numComponents + 1)
525 {
526 case 1: elements[0] = VIMMED1(0.0f);
527 case 2: elements[1] = VIMMED1(0.0f);
528 case 3: elements[2] = VIMMED1(0.0f);
529 case 4: elements[3] = VIMMED1(1.0f);
530 }
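// The fall-through above fills components the format doesn't supply with defaults, e.g.
// a 2-component format (numComponents = 2) enters at case 3 and sets elements[2] = 0.0f
// and elements[3] = 1.0f, leaving the fetched x/y untouched.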
531
532 for(uint32_t c = 0; c < 4; ++c)
533 {
534 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
535 STORE(elements[c], dest);
536 }
537 }
538 }
539
540 // returns true for odd formats that require special state/gather handling
541 bool FetchJit::IsOddFormat(SWR_FORMAT format)
542 {
543 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
544 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
545 {
546 return true;
547 }
548 return false;
549 }
550
551 // format is uniform if all components are the same size and type
552 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
553 {
554 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
555 uint32_t bpc0 = info.bpc[0];
556 uint32_t type0 = info.type[0];
557
558 for (uint32_t c = 1; c < info.numComps; ++c)
559 {
560 if (bpc0 != info.bpc[c] || type0 != info.type[c])
561 {
562 return false;
563 }
564 }
565 return true;
566 }
567
568 // unpacks components based on format
569 // foreach component in the pixel
570 // mask off everything but this component
571 // shift component to LSB
572 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
573 {
574 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
575
576 uint32_t bitOffset = 0;
577 for (uint32_t c = 0; c < info.numComps; ++c)
578 {
579 uint32_t swizzledIndex = info.swizzle[c];
580 uint32_t compBits = info.bpc[c];
581 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
582 Value* comp = AND(vInput, bitmask);
583 comp = LSHR(comp, bitOffset);
584
585 result[swizzledIndex] = comp;
586 bitOffset += compBits;
587 }
588 }
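// Illustrative example: for a packed 10/10/10/2 pixel the loop above computes
//   comp0 = (pixel & 0x3FF), comp1 = (pixel & 0xFFC00) >> 10,
//   comp2 = (pixel & 0x3FF00000) >> 20, comp3 = (pixel & 0xC0000000) >> 30,
// and each unpacked component lands in result[info.swizzle[c]].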
589
590 // gather for odd component size formats
591 // gather SIMD full pixels per lane, then shift/mask to move each component into its
592 // own vector
593 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4])
594 {
595 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
596
597 // only works if pixel size is <= 32bits
598 SWR_ASSERT(info.bpp <= 32);
599
600 Value* gather = VUNDEF_I();
601
602 // assign defaults
603 for (uint32_t comp = 0; comp < 4; ++comp)
604 {
605 result[comp] = VIMMED1((int)info.defaults[comp]);
606 }
607
608 // load the proper amount of data based on pixel size (bpp)
609 PointerType* pLoadTy = nullptr;
610 switch (info.bpp)
611 {
612 case 8: pLoadTy = Type::getInt8PtrTy(JM()->mContext); break;
613 case 16: pLoadTy = Type::getInt16PtrTy(JM()->mContext); break;
614 case 24:
615 case 32: pLoadTy = Type::getInt32PtrTy(JM()->mContext); break;
616 default: SWR_INVALID("Invalid bpp: %d", info.bpp);
617 }
618
619 // allocate temporary memory for masked off lanes
620 Value* pTmp = ALLOCA(pLoadTy->getElementType());
621
622 // gather SIMD pixels
623 for (uint32_t e = 0; e < JM()->mVWidth; ++e)
624 {
625 Value* pElemOffset = VEXTRACT(offsets, C(e));
626 Value* pLoad = GEP(pBase, pElemOffset);
627 Value* pLaneMask = VEXTRACT(pMask, C(e));
628
629 pLoad = POINTER_CAST(pLoad, pLoadTy);
630
631 // mask in tmp pointer for disabled lanes
632 pLoad = SELECT(pLaneMask, pLoad, pTmp);
633
634 // load pixel
635 Value *val = LOAD(pLoad);
636
637 // zero extend to 32bit integer
638 val = INT_CAST(val, mInt32Ty, false);
639
640 // store in simd lane
641 gather = VINSERT(gather, val, C(e));
642 }
643
644 UnpackComponents(format, gather, result);
645
646 // cast to fp32
647 result[0] = BITCAST(result[0], mSimdFP32Ty);
648 result[1] = BITCAST(result[1], mSimdFP32Ty);
649 result[2] = BITCAST(result[2], mSimdFP32Ty);
650 result[3] = BITCAST(result[3], mSimdFP32Ty);
651 }
652
653 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
654 {
655 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
656
657 for (uint32_t c = 0; c < info.numComps; ++c)
658 {
659 uint32_t compIndex = info.swizzle[c];
660
661 // skip any conversion on UNUSED components
662 if (info.type[c] == SWR_TYPE_UNUSED)
663 {
664 continue;
665 }
666
667 if (info.isNormalized[c])
668 {
669 if (info.type[c] == SWR_TYPE_SNORM)
670 {
671 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
672
673 /// result = c * (1.0f / (2^(n-1) - 1))
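/// e.g. (illustrative) 8bit SNORM: scale = 1/127, so raw 127 -> 1.0f and raw -127 -> -1.0f;
/// raw -128 computes to ~-1.008f until the todo above clamps the most-negative value.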
674 uint32_t n = info.bpc[c];
675 uint32_t pow2 = 1 << (n - 1);
676 float scale = 1.0f / (float)(pow2 - 1);
677 Value *vScale = VIMMED1(scale);
678 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
679 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
680 texels[compIndex] = FMUL(texels[compIndex], vScale);
681 }
682 else
683 {
684 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
685
686 /// result = c * (1.0f / (2^n - 1))
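/// e.g. (illustrative) 8bit UNORM: scale = 1/255, so raw 255 -> 1.0f and raw 128 -> ~0.502f;
/// the 24bit case below keeps precision by using a full divide instead.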
687 uint32_t n = info.bpc[c];
688 uint32_t pow2 = 1 << n;
689 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
690 if (n == 24)
691 {
692 float scale = (float)(pow2 - 1);
693 Value* vScale = VIMMED1(scale);
694 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
695 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
696 texels[compIndex] = FDIV(texels[compIndex], vScale);
697 }
698 else
699 {
700 float scale = 1.0f / (float)(pow2 - 1);
701 Value *vScale = VIMMED1(scale);
702 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
703 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
704 texels[compIndex] = FMUL(texels[compIndex], vScale);
705 }
706 }
707 continue;
708 }
709 }
710 }
711
712 //////////////////////////////////////////////////////////////////////////
713 /// @brief Loads attributes from memory using AVX2 GATHER(s)
714 /// @param fetchState - info about attributes to be fetched from memory
715 /// @param streams - value pointer to the current vertex stream
716 /// @param vIndices - vector value of indices to gather
717 /// @param pVtxOut - value pointer to output simdvertex struct
718 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
719 Value* streams, Value* vIndices, Value* pVtxOut)
720 {
721 uint32_t currentVertexElement = 0;
722 uint32_t outputElt = 0;
723 Value* vVertexElements[4];
724
725 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
726 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
727 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
728 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
729 curInstance->setName("curInstance");
730
731 for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
732 {
733 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
734
735 // skip element if all components are disabled
736 if (ied.ComponentPacking == ComponentEnable::NONE)
737 {
738 continue;
739 }
740
741 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
742 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
743 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
744
745 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
746
747 // VGATHER* takes an *i8 src pointer
748 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
749
750 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
751 Value *vStride = VBROADCAST(stride);
752
753 // max vertex index that is fully in bounds
754 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
755 maxVertex = LOAD(maxVertex);
756
757 Value *minVertex = NULL;
758 if (fetchState.bPartialVertexBuffer) {
759 // min vertex index for low bounds OOB checking
760 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
761 minVertex = LOAD(minVertex);
762 }
763
764 Value *vCurIndices;
765 Value *startOffset;
766 if(ied.InstanceEnable)
767 {
768 Value* stepRate = C(ied.InstanceDataStepRate);
769
770 // prevent a div by 0 for 0 step rate
771 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
772 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
773
774 // calc the current offset into instanced data buffer
775 Value* calcInstance = UDIV(curInstance, stepRate);
776
777 // if step rate is 0, every instance gets instance 0
778 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
779
780 vCurIndices = VBROADCAST(calcInstance);
781
782 startOffset = startInstance;
783 }
784 else
785 {
786 // offset indices by baseVertex
787 vCurIndices = ADD(vIndices, vBaseVertex);
788
789 startOffset = startVertex;
790 }
791
792 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
793 // do 64bit address offset calculations.
794
795 // calculate byte offset to the start of the VB
796 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
797 pStreamBase = GEP(pStreamBase, baseOffset);
798
799 // if we have a start offset, subtract from max vertex. Used for OOB check
800 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
801 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
802 // if we have a negative value, we're already OOB. clamp at 0.
803 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
804
805 if (fetchState.bPartialVertexBuffer) {
806 // similarly for min vertex
807 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
808 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
809 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
810 }
811
812 // Load the in bounds size of a partially valid vertex
813 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
814 partialInboundsSize = LOAD(partialInboundsSize);
815 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
816 Value* vBpp = VBROADCAST(C(info.Bpp));
817 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
818
819 // is the element <= the partially valid size
820 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
821
822 // override cur indices with 0 if pitch is 0
823 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
824 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
825
826 // are vertices partially OOB?
827 Value* vMaxVertex = VBROADCAST(maxVertex);
828 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
829
830 // are vertices fully in bounds?
831 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
832
833 Value *vGatherMask;
834 if (fetchState.bPartialVertexBuffer) {
835 // are vertices below minVertex limit?
836 Value *vMinVertex = VBROADCAST(minVertex);
837 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
838
839 // only fetch lanes that pass both tests
840 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
841 } else {
842 vGatherMask = vMaxGatherMask;
843 }
844
845 // blend in any partially OOB indices that have valid elements
846 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
847 Value* pMask = vGatherMask;
848 vGatherMask = VMASK(vGatherMask);
849
850 // calculate the actual offsets into the VB
851 Value* vOffsets = MUL(vCurIndices, vStride);
852 vOffsets = ADD(vOffsets, vAlignmentOffsets);
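// i.e. per lane (illustrative): byteOffset[i] = curIndex[i] * pitch + AlignedByteOffset,
// relative to pStreamBase, which already includes startOffset * pitch.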
853
854 // Packing and component control
855 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
856 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
857 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
858
859 // Special gather/conversion for formats without equal component sizes
860 if (IsOddFormat((SWR_FORMAT)ied.Format))
861 {
862 Value* pResults[4];
863 CreateGatherOddFormats((SWR_FORMAT)ied.Format, pMask, pStreamBase, vOffsets, pResults);
864 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
865
866 for (uint32_t c = 0; c < 4; ++c)
867 {
868 if (isComponentEnabled(compMask, c))
869 {
870 vVertexElements[currentVertexElement++] = pResults[c];
871 if (currentVertexElement > 3)
872 {
873 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
874 // reset to the next vVertexElement to output
875 currentVertexElement = 0;
876 }
877 }
878 }
879 }
880 else if(info.type[0] == SWR_TYPE_FLOAT)
881 {
882 ///@todo: support 64 bit vb accesses
883 Value* gatherSrc = VIMMED1(0.0f);
884
885 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
886 "Unsupported format for standard gather fetch.");
887
888 // Gather components from memory to store in a simdvertex structure
889 switch(bpc)
890 {
891 case 16:
892 {
893 Value* vGatherResult[2];
894 Value *vMask;
895
896 // if we have at least one component out of x or y to fetch
897 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
898 // save mask as it is zero'd out after each gather
899 vMask = vGatherMask;
900
901 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
902 // e.g. result of first 8x32bit integer gather for 16bit components
903 // 256i - 0 1 2 3 4 5 6 7
904 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
905 //
906 }
907
908 // if we have at least one component out of z or w to fetch
909 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
910 // offset base to the next components(zw) in the vertex to gather
911 pStreamBase = GEP(pStreamBase, C((char)4));
912 vMask = vGatherMask;
913
914 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
915 // e.g. result of second 8x32bit integer gather for 16bit components
916 // 256i - 0 1 2 3 4 5 6 7
917 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
918 //
919 }
920
921 // if we have at least one component to shuffle into place
922 if(compMask){
923 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
924 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
925
926 // Shuffle gathered components into place in simdvertex struct
927 Shuffle16bpcGather(args); // outputs to vVertexElements ref
928 }
929 }
930 break;
931 case 32:
932 {
933 for (uint32_t i = 0; i < 4; i++)
934 {
935 if (isComponentEnabled(compMask, i))
936 {
937 // if we need to gather the component
938 if (compCtrl[i] == StoreSrc)
939 {
940 // save mask as it is zero'd out after each gather
941 Value *vMask = vGatherMask;
942
943 // Gather a SIMD of vertices
944 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
945 }
946 else
947 {
948 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
949 }
950
951 if (currentVertexElement > 3)
952 {
953 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
954 // reset to the next vVertexElement to output
955 currentVertexElement = 0;
956 }
957
958 }
959
960 // offset base to the next component in the vertex to gather
961 pStreamBase = GEP(pStreamBase, C((char)4));
962 }
963 }
964 break;
965 case 64:
966 {
967 for (uint32_t i = 0; i < 4; i++)
968 {
969 if (isComponentEnabled(compMask, i))
970 {
971 // if we need to gather the component
972 if (compCtrl[i] == StoreSrc)
973 {
974 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
975 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
976 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
977 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
978 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
979 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
980
981 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
982 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
983
984 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
985
986 Value* pGatherLo = GATHERPD(vZeroDouble,
987 pStreamBase, vOffsetsLo, vMaskLo, C((char)1));
988 Value* pGatherHi = GATHERPD(vZeroDouble,
989 pStreamBase, vOffsetsHi, vMaskHi, C((char)1));
990
991 pGatherLo = VCVTPD2PS(pGatherLo);
992 pGatherHi = VCVTPD2PS(pGatherHi);
993
994 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
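// (illustrative) the two 4-wide double gathers cover lanes 0-3 and 4-7; after VCVTPD2PS
// each holds 4 floats and the shuffle above rebuilds the 8-wide float vector in lane order.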
995
996 vVertexElements[currentVertexElement++] = pGather;
997 }
998 else
999 {
1000 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1001 }
1002
1003 if (currentVertexElement > 3)
1004 {
1005 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1006 // reset to the next vVertexElement to output
1007 currentVertexElement = 0;
1008 }
1009
1010 }
1011
1012 // offset base to the next component in the vertex to gather
1013 pStreamBase = GEP(pStreamBase, C((char)8));
1014 }
1015 }
1016 break;
1017 default:
1018 SWR_INVALID("Tried to fetch invalid FP format");
1019 break;
1020 }
1021 }
1022 else
1023 {
1024 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
1025 ConversionType conversionType = CONVERT_NONE;
1026
1027 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1028 "Unsupported format for standard gather fetch.");
1029
1030 switch(info.type[0])
1031 {
1032 case SWR_TYPE_UNORM:
1033 conversionType = CONVERT_NORMALIZED;
1034 case SWR_TYPE_UINT:
1035 extendCastType = Instruction::CastOps::ZExt;
1036 break;
1037 case SWR_TYPE_SNORM:
1038 conversionType = CONVERT_NORMALIZED;
1039 case SWR_TYPE_SINT:
1040 extendCastType = Instruction::CastOps::SExt;
1041 break;
1042 case SWR_TYPE_USCALED:
1043 conversionType = CONVERT_USCALED;
1044 extendCastType = Instruction::CastOps::UIToFP;
1045 break;
1046 case SWR_TYPE_SSCALED:
1047 conversionType = CONVERT_SSCALED;
1048 extendCastType = Instruction::CastOps::SIToFP;
1049 break;
1050 case SWR_TYPE_SFIXED:
1051 conversionType = CONVERT_SFIXED;
1052 extendCastType = Instruction::CastOps::SExt;
1053 break;
1054 default:
1055 break;
1056 }
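// Note: the missing breaks above are intentional fall-throughs; UNORM shares the ZExt
// cast with UINT and SNORM shares the SExt cast with SINT, with conversionType (set
// before falling through) selecting the extra normalize step.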
1057
1058 // value substituted when component of gather is masked
1059 Value* gatherSrc = VIMMED1(0);
1060
1061 // Gather components from memory to store in a simdvertex structure
1062 switch (bpc)
1063 {
1064 case 8:
1065 {
1066 // if we have at least one component to fetch
1067 if(compMask)
1068 {
1069 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
1070 // e.g. result of an 8x32bit integer gather for 8bit components
1071 // 256i - 0 1 2 3 4 5 6 7
1072 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1073
1074 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1075 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1076
1077 // Shuffle gathered components into place in simdvertex struct
1078 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
1079 }
1080 }
1081 break;
1082 case 16:
1083 {
1084 Value* vGatherResult[2];
1085 Value *vMask;
1086
1087 // if we have at least one component out of x or y to fetch
1088 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1089 // save mask as it is zero'd out after each gather
1090 vMask = vGatherMask;
1091
1092 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1093 // e.g. result of first 8x32bit integer gather for 16bit components
1094 // 256i - 0 1 2 3 4 5 6 7
1095 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1096 //
1097 }
1098
1099 // if we have at least one component out of z or w to fetch
1100 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1101 // offset base to the next components(zw) in the vertex to gather
1102 pStreamBase = GEP(pStreamBase, C((char)4));
1103 vMask = vGatherMask;
1104
1105 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1106 // e.g. result of second 8x32bit integer gather for 16bit components
1107 // 256i - 0 1 2 3 4 5 6 7
1108 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1109 //
1110 }
1111
1112 // if we have at least one component to shuffle into place
1113 if(compMask){
1114 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1115 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1116
1117 // Shuffle gathered components into place in simdvertex struct
1118 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1119 }
1120 }
1121 break;
1122 case 32:
1123 {
1124 // Gather components into place in the simdvertex struct
1125 for (uint32_t i = 0; i < 4; i++)
1126 {
1127 if (isComponentEnabled(compMask, i))
1128 {
1129 // if we need to gather the component
1130 if (compCtrl[i] == StoreSrc)
1131 {
1132 // save mask as it is zero'd out after each gather
1133 Value *vMask = vGatherMask;
1134
1135 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1136
1137 if (conversionType == CONVERT_USCALED)
1138 {
1139 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1140 }
1141 else if (conversionType == CONVERT_SSCALED)
1142 {
1143 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1144 }
1145 else if (conversionType == CONVERT_SFIXED)
1146 {
1147 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1148 }
1149
1150 vVertexElements[currentVertexElement++] = pGather;
1151 // e.g. result of a single 8x32bit integer gather for 32bit components
1152 // 256i - 0 1 2 3 4 5 6 7
1153 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1154 }
1155 else
1156 {
1157 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1158 }
1159
1160 if (currentVertexElement > 3)
1161 {
1162 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1163 // reset to the next vVertexElement to output
1164 currentVertexElement = 0;
1165 }
1166
1167 }
1168
1169 // offset base to the next component in the vertex to gather
1170 pStreamBase = GEP(pStreamBase, C((char)4));
1171 }
1172 }
1173 break;
1174 }
1175 }
1176 }
1177
1178 // if we have a partially filled vVertexElement struct, output it
1179 if(currentVertexElement > 0){
1180 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1181 }
1182 }
1183
1184 //////////////////////////////////////////////////////////////////////////
1185 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1186 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1187 /// support
1188 /// @param pIndices - pointer to 8 bit indices
1189 /// @param pLastIndex - pointer to last valid index
1190 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1191 {
1192 // can fit 4 8 bit integers per vWidth lane
1193 Value* vIndices = VUNDEF_I();
1194
1195 // store 0 index on stack to be used to conditionally load from if index address is OOB
1196 Value* pZeroIndex = ALLOCA(mInt8Ty);
1197 STORE(C((uint8_t)0), pZeroIndex);
1198
1199 // Load a SIMD of index pointers
1200 for(int64_t lane = 0; lane < mVWidth; lane++)
1201 {
1202 // Calculate the address of the requested index
1203 Value *pIndex = GEP(pIndices, C(lane));
1204
1205 // check if the address is less than the max index,
1206 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1207
1208 // if valid, load the index. if not, load 0 from the stack
1209 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1210 Value *index = LOAD(pValid, "valid index");
1211
1212 // zero extend index to 32 bits and insert into the correct simd lane
1213 index = Z_EXT(index, mInt32Ty);
1214 vIndices = VINSERT(vIndices, index, lane);
1215 }
1216 return vIndices;
1217 }
1218
1219 //////////////////////////////////////////////////////////////////////////
1220 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1221 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1222 /// support
1223 /// @param pIndices - pointer to 16 bit indices
1224 /// @param pLastIndex - pointer to last valid index
1225 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1226 {
1227 // can fit 2 16 bit integers per vWidth lane
1228 Value* vIndices = VUNDEF_I();
1229
1230 // store 0 index on stack to be used to conditionally load from if index address is OOB
1231 Value* pZeroIndex = ALLOCA(mInt16Ty);
1232 STORE(C((uint16_t)0), pZeroIndex);
1233
1234 // Load a SIMD of index pointers
1235 for(int64_t lane = 0; lane < mVWidth; lane++)
1236 {
1237 // Calculate the address of the requested index
1238 Value *pIndex = GEP(pIndices, C(lane));
1239
1240 // check if the address is less than the max index,
1241 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1242
1243 // if valid, load the index. if not, load 0 from the stack
1244 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1245 Value *index = LOAD(pValid, "valid index");
1246
1247 // zero extend index to 32 bits and insert into the correct simd lane
1248 index = Z_EXT(index, mInt32Ty);
1249 vIndices = VINSERT(vIndices, index, lane);
1250 }
1251 return vIndices;
1252 }
1253
1254 //////////////////////////////////////////////////////////////////////////
1255 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1256 /// @param pIndices - pointer to 32 bit indices
1257 /// @param pLastIndex - pointer to last valid index
1258 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1259 {
1260 DataLayout dL(JM()->mpCurrentModule);
1261 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1262 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1263 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1264
1265 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1266 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1267 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1268 numIndicesLeft = SDIV(numIndicesLeft, C(4));
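// (4 == sizeof a 32bit index; the byte distance to pLastIndex becomes an index count)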
1269
1270 // create a vector of index counts from the base index ptr passed into the fetch
1271 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1272 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1273
1274 // compare index count to the max valid index
1275 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1276 // vIndexOffsets 0 1 2 3 4 5 6 7
1277 // ------------------------------
1278 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1279 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1280 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1281 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1282
1283 // VMASKLOAD takes an *i8 src pointer
1284 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1285
1286 // Load the indices; OOB loads 0
1287 return MASKLOADD(pIndices,vIndexMask);
1288 }
1289
1290 //////////////////////////////////////////////////////////////////////////
1291 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1292 /// denormalizes if needed, converts to F32 if needed, and positions in
1293 /// the proper SIMD rows to be output to the simdvertex structure
1294 /// @param args: (tuple of args, listed below)
1295 /// @param vGatherResult - 8 gathered 8bpc vertices
1296 /// @param pVtxOut - base pointer to output simdvertex struct
1297 /// @param extendType - sign extend or zero extend
1298 /// @param conversionType - conversion to apply (normalized, scaled, etc.)
1299 /// @param currentVertexElement - reference to the current vVertexElement
1300 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1301 /// @param compMask - component packing mask
1302 /// @param compCtrl - component control val
1303 /// @param vVertexElements[4] - vertex components to output
1304 /// @param swizzle[4] - component swizzle location
1305 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1306 {
1307 // Unpack tuple args
1308 Value*& vGatherResult = std::get<0>(args);
1309 Value* pVtxOut = std::get<1>(args);
1310 const Instruction::CastOps extendType = std::get<2>(args);
1311 const ConversionType conversionType = std::get<3>(args);
1312 uint32_t &currentVertexElement = std::get<4>(args);
1313 uint32_t &outputElt = std::get<5>(args);
1314 const ComponentEnable compMask = std::get<6>(args);
1315 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1316 Value* (&vVertexElements)[4] = std::get<8>(args);
1317 const uint32_t (&swizzle)[4] = std::get<9>(args);
1318
1319 // cast types
1320 Type* vGatherTy = mSimdInt32Ty;
1321 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1322
1323 // have to do extra work for sign extending
1324 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1325 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1326 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1327
1328 // shuffle mask, including any swizzling
1329 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1330 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1331 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1332 char(y), char(y+4), char(y+8), char(y+12),
1333 char(z), char(z+4), char(z+8), char(z+12),
1334 char(w), char(w+4), char(w+8), char(w+12),
1335 char(x), char(x+4), char(x+8), char(x+12),
1336 char(y), char(y+4), char(y+8), char(y+12),
1337 char(z), char(z+4), char(z+8), char(z+12),
1338 char(w), char(w+4), char(w+8), char(w+12)});
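// Illustrative example: with the identity swizzle {0,1,2,3} this mask gathers bytes
// {0,4,8,12} into the x slot, {1,5,9,13} into y, and so on within each 128bit half; a
// swizzled format (e.g. BGRA ordering) just changes which source byte feeds each slot.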
1339
1340 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1341 // after pshufb: group components together in each 128bit lane
1342 // 256i - 0 1 2 3 4 5 6 7
1343 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1344
1345 Value* vi128XY = nullptr;
1346 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1347 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1348 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1349 // 256i - 0 1 2 3 4 5 6 7
1350 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1351 }
1352
1353 // do the same for zw components
1354 Value* vi128ZW = nullptr;
1355 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1356 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1357 }
1358
1359 // init denormalize variables if needed
1360 Instruction::CastOps fpCast;
1361 Value* conversionFactor;
1362
1363 switch (conversionType)
1364 {
1365 case CONVERT_NORMALIZED:
1366 fpCast = Instruction::CastOps::SIToFP;
1367 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1368 break;
1369 case CONVERT_SSCALED:
1370 fpCast = Instruction::CastOps::SIToFP;
1371 conversionFactor = VIMMED1((float)(1.0));
1372 break;
1373 case CONVERT_USCALED:
1374 SWR_INVALID("Type should not be sign extended!");
1375 conversionFactor = nullptr;
1376 break;
1377 default:
1378 SWR_ASSERT(conversionType == CONVERT_NONE);
1379 conversionFactor = nullptr;
1380 break;
1381 }
1382
1383 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1384 for (uint32_t i = 0; i < 4; i++)
1385 {
1386 if (isComponentEnabled(compMask, i))
1387 {
1388 if (compCtrl[i] == ComponentControl::StoreSrc)
1389 {
1390 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1391 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1392 // if x or y, use vi128XY permute result, else use vi128ZW
1393 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1394
1395 // sign extend
1396 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1397
1398 // denormalize if needed
1399 if (conversionType != CONVERT_NONE)
1400 {
1401 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1402 }
1403 currentVertexElement++;
1404 }
1405 else
1406 {
1407 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1408 }
1409
1410 if (currentVertexElement > 3)
1411 {
1412 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1413 // reset to the next vVertexElement to output
1414 currentVertexElement = 0;
1415 }
1416 }
1417 }
1418 }
1419 // else zero extend
1420 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1421 {
1422 // init denormalize variables if needed
1423 Instruction::CastOps fpCast;
1424 Value* conversionFactor;
1425
1426 switch (conversionType)
1427 {
1428 case CONVERT_NORMALIZED:
1429 fpCast = Instruction::CastOps::UIToFP;
1430 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1431 break;
1432 case CONVERT_USCALED:
1433 fpCast = Instruction::CastOps::UIToFP;
1434 conversionFactor = VIMMED1((float)(1.0));
1435 break;
1436 case CONVERT_SSCALED:
1437 SWR_INVALID("Type should not be zero extended!");
1438 conversionFactor = nullptr;
1439 break;
1440 default:
1441 SWR_ASSERT(conversionType == CONVERT_NONE);
1442 conversionFactor = nullptr;
1443 break;
1444 }
1445
1446 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1447 for (uint32_t i = 0; i < 4; i++)
1448 {
1449 if (isComponentEnabled(compMask, i))
1450 {
1451 if (compCtrl[i] == ComponentControl::StoreSrc)
1452 {
1453 // pshufb masks for each component
1454 Value* vConstMask;
1455 switch (swizzle[i])
1456 {
1457 case 0:
1458 // x shuffle mask
1459 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1460 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1461 break;
1462 case 1:
1463 // y shuffle mask
1464 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1465 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1466 break;
1467 case 2:
1468 // z shuffle mask
1469 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1470 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1471 break;
1472 case 3:
1473 // w shuffle mask
1474 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1475 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1476 break;
1477 default:
1478 vConstMask = nullptr;
1479 break;
1480 }
1481
1482 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1483 // after pshufb for x channel
1484 // 256i - 0 1 2 3 4 5 6 7
1485 // x000 x000 x000 x000 x000 x000 x000 x000
1486
1487 // denormalize if needed
1488 if (conversionType != CONVERT_NONE)
1489 {
1490 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1491 }
1492 currentVertexElement++;
1493 }
1494 else
1495 {
1496 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1497 }
1498
1499 if (currentVertexElement > 3)
1500 {
1501 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1502 // reset to the next vVertexElement to output
1503 currentVertexElement = 0;
1504 }
1505 }
1506 }
1507 }
1508 else
1509 {
1510 SWR_INVALID("Unsupported conversion type");
1511 }
1512 }
1513
1514 //////////////////////////////////////////////////////////////////////////
1515 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1516 /// denormalizes if needed, converts to F32 if needed, and positions in
1517 /// the proper SIMD rows to be output to the simdvertex structure
1518 /// @param args: (tuple of args, listed below)
1519 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1520 /// @param pVtxOut - base pointer to output simdvertex struct
1521 /// @param extendType - sign extend or zero extend
1522 /// @param conversionType - conversion to apply (normalized, scaled, etc.)
1523 /// @param currentVertexElement - reference to the current vVertexElement
1524 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1525 /// @param compMask - component packing mask
1526 /// @param compCtrl - component control val
1527 /// @param vVertexElements[4] - vertex components to output
1528 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1529 {
1530 // Unpack tuple args
1531 Value* (&vGatherResult)[2] = std::get<0>(args);
1532 Value* pVtxOut = std::get<1>(args);
1533 const Instruction::CastOps extendType = std::get<2>(args);
1534 const ConversionType conversionType = std::get<3>(args);
1535 uint32_t &currentVertexElement = std::get<4>(args);
1536 uint32_t &outputElt = std::get<5>(args);
1537 const ComponentEnable compMask = std::get<6>(args);
1538 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1539 Value* (&vVertexElements)[4] = std::get<8>(args);
1540
1541 // cast types
1542 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1543 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1544
1545 // have to do extra work for sign extending
1546 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
1547 (extendType == Instruction::CastOps::FPExt))
1548 {
1549 // is this a half-precision (FP16) input that needs FPExt rather than integer sign extension?
1550 bool bFP = (extendType == Instruction::CastOps::FPExt);
1551
1552 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1553 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1554
1555 // shuffle mask
1556 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1557 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1558 Value* vi128XY = nullptr;
1559 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1560 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1561 // after pshufb: group components together in each 128bit lane
1562 // 256i - 0 1 2 3 4 5 6 7
1563 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1564
1565 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1566 // after PERMD: move and pack xy components into each 128bit lane
1567 // 256i - 0 1 2 3 4 5 6 7
1568 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1569 }
1570
1571 // do the same for zw components
1572 Value* vi128ZW = nullptr;
1573 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1574 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1575 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
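// after pshufb + PERMD (same pattern as xy above): zw packed into 128bit lanes
// 256i - 0 1 2 3 4 5 6 7
// zzzz zzzz zzzz zzzz wwww wwww wwww wwww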
1576 }
1577
1578 // init denormalize variables if needed
1579 Instruction::CastOps IntToFpCast;
1580 Value* conversionFactor;
1581
1582 switch (conversionType)
1583 {
1584 case CONVERT_NORMALIZED:
1585 IntToFpCast = Instruction::CastOps::SIToFP;
1586 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1587 break;
1588 case CONVERT_SSCALED:
1589 IntToFpCast = Instruction::CastOps::SIToFP;
1590 conversionFactor = VIMMED1(1.0f);
1591 break;
1592 case CONVERT_USCALED:
1593 SWR_INVALID("Type should not be sign extended!");
1594 conversionFactor = nullptr;
1595 break;
1596 default:
1597 SWR_ASSERT(conversionType == CONVERT_NONE);
1598 conversionFactor = nullptr;
1599 break;
1600 }
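// Illustrative math for the factors above: with CONVERT_NORMALIZED a gathered
// SNORM16 value of 32767 becomes 32767 * (1.0f / 32767.0f) = 1.0f after the
// SIToFP cast and FMUL below, while CONVERT_SSCALED multiplies by 1.0f and so
// keeps the raw integer magnitude as a float.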
1601
1602 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1603 for (uint32_t i = 0; i < 4; i++)
1604 {
1605 if (isComponentEnabled(compMask, i))
1606 {
1607 if (compCtrl[i] == ComponentControl::StoreSrc)
1608 {
1609 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1610 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1611 // if x or y, use vi128XY permute result, else use vi128ZW
1612 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1613
1614 if (bFP) {
1615 // extract 128 bit lanes and convert each packed half-precision component to float
1616 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1617 }
1618 else {
1619 // extract 128 bit lanes to sign extend each component
1620 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1621
1622 // denormalize if needed
1623 if (conversionType != CONVERT_NONE) {
1624 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1625 }
1626 }
1627 currentVertexElement++;
1628 }
1629 else
1630 {
1631 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1632 }
1633
1634 if (currentVertexElement > 3)
1635 {
1636 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1637 // reset to the next vVertexElement to output
1638 currentVertexElement = 0;
1639 }
1640 }
1641 }
1642 }
1643 // else zero extend
1644 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1645 {
1646 // pshufb masks for each component
1647 Value* vConstMask[2];
1648 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
1649 // x/z shuffle mask
1650 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1651 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1652 }
1653
1654 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
1655 // y/w shuffle mask
1656 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1657 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1658 }
1659
1660 // init denormalize variables if needed
1661 Instruction::CastOps fpCast;
1662 Value* conversionFactor;
1663
1664 switch (conversionType)
1665 {
1666 case CONVERT_NORMALIZED:
1667 fpCast = Instruction::CastOps::UIToFP;
1668 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1669 break;
1670 case CONVERT_USCALED:
1671 fpCast = Instruction::CastOps::UIToFP;
1672 conversionFactor = VIMMED1(1.0f);
1673 break;
1674 case CONVERT_SSCALED:
1675 SWR_INVALID("Type should not be zero extended!");
1676 conversionFactor = nullptr;
1677 break;
1678 default:
1679 SWR_ASSERT(conversionType == CONVERT_NONE);
1680 conversionFactor = nullptr;
1681 break;
1682 }
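// Same idea for the unsigned factors: CONVERT_NORMALIZED maps a UNORM16 value
// of 65535 to 65535 * (1.0f / 65535.0f) = 1.0f via the UIToFP cast below, and
// CONVERT_USCALED simply converts the raw unsigned value to float.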
1683
1684 // shuffle enabled components into the lower word of each 32bit lane, zero-extending to 32 bits
1685 for (uint32_t i = 0; i < 4; i++)
1686 {
1687 if (isComponentEnabled(compMask, i))
1688 {
1689 if (compCtrl[i] == ComponentControl::StoreSrc)
1690 {
1691 // select correct constMask for x/z or y/w pshufb
1692 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1693 // if x or y, read from the first gather result, else (z or w) read from the second
1694 uint32_t selectedGather = (i < 2) ? 0 : 1;
1695
1696 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1697 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1698 // 256i - 0 1 2 3 4 5 6 7
1699 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1700
1701 // denormalize if needed
1702 if (conversionType != CONVERT_NONE)
1703 {
1704 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1705 }
1706 currentVertexElement++;
1707 }
1708 else
1709 {
1710 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1711 }
1712
1713 if (currentVertexElement > 3)
1714 {
1715 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1716 // reset to the next vVertexElement to output
1717 currentVertexElement = 0;
1718 }
1719 }
1720 }
1721 }
1722 else
1723 {
1724 SWR_INVALID("Unsupported conversion type");
1725 }
1726 }
1727
1728 //////////////////////////////////////////////////////////////////////////
1729 /// @brief Output a simdvertex worth of elements to the current outputElt
1730 /// @param pVtxOut - base address of VIN output struct
1731 /// @param outputElt - simdvertex offset in VIN to write to
1732 /// @param numEltsToStore - number of simdvertex rows to write out
1733 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1734 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1735 {
1736 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
1737
1738 for(uint32_t c = 0; c < numEltsToStore; ++c)
1739 {
1740 // STORE expects FP32 x vWidth type, just bitcast if needed
1741 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
1742 #if FETCH_DUMP_VERTEX
1743 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
1744 #endif
1745 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1746 }
1747 #if FETCH_DUMP_VERTEX
1748 else
1749 {
1750 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
1751 }
1752 #endif
1753 // outputElt * 4 = offsetting by the size of a simdvertex
1754 // + c offsets to a 32bit x vWidth row within the current vertex
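// e.g. outputElt = 1, c = 2 gives GEP index 1 * 4 + 2 = 6, i.e. the third
// component row of the second simdvertex in the output buffer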
1755 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1756 STORE(vVertexElements[c], dest);
1757 }
1758 }
1759
1760 //////////////////////////////////////////////////////////////////////////
1761 /// @brief Generates a constant vector of values based on the
1762 /// ComponentControl value
1763 /// @param ctrl - ComponentControl value
1764 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1765 {
1766 switch(ctrl)
1767 {
1768 case NoStore: return VUNDEF_I();
1769 case Store0: return VIMMED1(0);
1770 case Store1Fp: return VIMMED1(1.0f);
1771 case Store1Int: return VIMMED1(1);
1772 case StoreVertexId:
1773 {
1774 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1775 return VBROADCAST(pId);
1776 }
1777 case StoreInstanceId:
1778 {
1779 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
1780 return VBROADCAST(pId);
1781 }
1782 case StoreSrc:
1783 default: SWR_INVALID("Invalid component control"); return VUNDEF_I();
1784 }
1785 }
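// For example, GenerateCompCtrlVector(Store1Fp) broadcasts 1.0f to every SIMD
// lane and Store0 broadcasts integer 0, which is how non-StoreSrc components
// are typically filled out to a full (x, y, z, w) vertex element.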
1786
1787 //////////////////////////////////////////////////////////////////////////
1788 /// @brief Returns the enable mask for the specified component.
1789 /// @param enableMask - enable bits
1790 /// @param component - component to check if enabled.
1791 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1792 {
1793 switch (component)
1794 {
1795 // X
1796 case 0: return (enableMask & ComponentEnable::X);
1797 // Y
1798 case 1: return (enableMask & ComponentEnable::Y);
1799 // Z
1800 case 2: return (enableMask & ComponentEnable::Z);
1801 // W
1802 case 3: return (enableMask & ComponentEnable::W);
1803
1804 default: return false;
1805 }
1806 }
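// For example, with enableMask = (ComponentEnable::X | ComponentEnable::Z),
// components 0 and 2 report enabled while components 1 and 3 do not.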
1807
1808
1809 //////////////////////////////////////////////////////////////////////////
1810 /// @brief JITs from fetch shader IR
1811 /// @param hJitMgr - JitManager handle
1812 /// @param func - LLVM function IR
1813 /// @return PFN_FETCH_FUNC - pointer to fetch code
1814 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
1815 {
1816 const llvm::Function* func = (const llvm::Function*)hFunc;
1817 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1818 PFN_FETCH_FUNC pfnFetch;
1819
1820 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
1821 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
1822 pJitMgr->mIsModuleFinalized = true;
1823
1824 #if defined(KNOB_SWRC_TRACING)
1825 char fName[1024];
1826 const char *funcName = func->getName().data();
1827 sprintf(fName, "%s.bin", funcName);
1828 FILE *fd = fopen(fName, "wb");
1829 fwrite((void *)pfnFetch, 1, 2048, fd);
1830 fclose(fd);
1831 #endif
1832
1833 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
1834
1835 return pfnFetch;
1836 }
1837
1838 //////////////////////////////////////////////////////////////////////////
1839 /// @brief JIT compiles fetch shader
1840 /// @param hJitMgr - JitManager handle
1841 /// @param state - fetch state to build function from
1842 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
1843 {
1844 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1845
1846 pJitMgr->SetupNewModule();
1847
1848 FetchJit theJit(pJitMgr);
1849 HANDLE hFunc = theJit.Create(state);
1850
1851 return JitFetchFunc(hJitMgr, hFunc);
1852 }
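
//////////////////////////////////////////////////////////////////////////
/// Rough usage sketch (illustrative only; the real call sites live in the
/// SWR core and the PFN_FETCH_FUNC argument types come from fetch_jit.h;
/// BuildFetchState below is a hypothetical helper):
///
///   FETCH_COMPILE_STATE state = BuildFetchState(vertexLayout);
///   PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, state);
///   // Per SIMD batch: fill a SWR_FETCH_CONTEXT with index/vertex buffer
///   // pointers, then call pfnFetch to write one simdvertex worth of
///   // fetched, converted attributes.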